Rewrite SLP induction vectorization
author Richard Biener <rguenther@suse.de>
Mon, 2 Nov 2020 11:38:04 +0000 (12:38 +0100)
committer Richard Biener <rguenther@suse.de>
Mon, 2 Nov 2020 14:58:14 +0000 (15:58 +0100)
This rewrites SLP induction vectorization to handle different
inductions in the different SLP lanes.  It also changes SLP
build to represent the initial value (but not the cycle) so
it can be enhanced to handle outer loop vectorization later.

Note this makes gcc.dg/vect/costmodel/x86_64/costmodel-pr30843.c FAIL
because it removes one CSE optimization that no longer works
with non-uniform initial values and steps.  I'll look into recovering
it once outer loop vectorization of inductions works.

It might now be a bit friendlier to variable-length vectors,
but then again we are building the step vector from scalars ...

2020-11-02  Richard Biener  <rguenther@suse.de>

* tree.h (build_real_from_wide): Declare.
* tree.c (build_real_from_wide): New function.
* tree-vect-slp.c (vect_build_slp_tree_2): Remove
restriction on induction vectorization, represent
the initial value.
* tree-vect-loop.c (vect_model_induction_cost): Inline ...
(vectorizable_induction): ... here.  Rewrite SLP
code generation.

* gcc.dg/vect/slp-49.c: New testcase.

gcc/testsuite/gcc.dg/vect/slp-49.c [new file with mode: 0644]
gcc/tree-vect-loop.c
gcc/tree-vect-slp.c
gcc/tree.c
gcc/tree.h

diff --git a/gcc/testsuite/gcc.dg/vect/slp-49.c b/gcc/testsuite/gcc.dg/vect/slp-49.c
new file mode 100644 (file)
index 0000000..3f53baf
--- /dev/null
@@ -0,0 +1,38 @@
+/* { dg-require-effective-target vect_int } */
+
+#include "tree-vect.h"
+
+int a[1024];
+
+void __attribute__((noipa))
+foo(int k)
+{
+  int j = 5;
+  for (int i = 0; i < 512; ++i)
+    {
+      a[2*i] = j;
+      a[2*i+1] = k;
+      j++;
+      k+=3;
+    }
+}
+
+int
+main()
+{
+  check_vect ();
+
+  foo (17);
+
+  for (int i = 0; i < 512; ++i)
+    {
+      if (a[2*i] != 5 + i
+         || a[2*i+1] != 17 + 3 * i)
+       __builtin_abort ();
+    }
+
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump "vectorizing stmts using SLP" "vect" } } */
+/* { dg-final { scan-tree-dump "Loop contains only SLP stmts" "vect" } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 105ea61ddbe5f6607b235fd345051022d9d49506..fcea28935bc6d0daac8f03ed0f82b04a41798f00 100644 (file)
@@ -4443,34 +4443,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
 }
 
 
-/* Function vect_model_induction_cost.
-
-   Models cost for induction operations.  */
-
-static void
-vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
-                          stmt_vector_for_cost *cost_vec)
-{
-  unsigned inside_cost, prologue_cost;
-
-  if (PURE_SLP_STMT (stmt_info))
-    return;
-
-  /* loop cost for vec_loop.  */
-  inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
-                                 stmt_info, 0, vect_body);
-
-  /* prologue cost for vec_init and vec_step.  */
-  prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
-                                   stmt_info, 0, vect_prologue);
-
-  if (dump_enabled_p ())
-    dump_printf_loc (MSG_NOTE, vect_location,
-                     "vect_model_induction_cost: inside_cost = %d, "
-                     "prologue_cost = %d .\n", inside_cost, prologue_cost);
-}
-
-
 
 /* Function get_initial_def_for_reduction
 
@@ -7796,7 +7768,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 
   if (slp_node && !nunits.is_constant ())
     {
-      /* The current SLP code creates the initial value element-by-element.  */
+      /* The current SLP code creates the step value element-by-element.  */
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "SLP induction not supported for variable-length"
@@ -7806,9 +7778,46 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      unsigned inside_cost = 0, prologue_cost = 0;
+      if (slp_node)
+       {
+         /* We eventually need to set a vector type on invariant
+            arguments.  */
+         unsigned j;
+         slp_tree child;
+         FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
+           if (!vect_maybe_update_slp_op_vectype
+               (child, SLP_TREE_VECTYPE (slp_node)))
+             {
+               if (dump_enabled_p ())
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  "incompatible vector types for "
+                                  "invariants\n");
+               return false;
+             }
+         /* loop cost for vec_loop.  */
+         inside_cost
+           = record_stmt_cost (cost_vec,
+                               SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
+                               vector_stmt, stmt_info, 0, vect_body);
+       }
+      else /* if (!slp_node) */
+       {
+         /* loop cost for vec_loop.  */
+         inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
+                                         stmt_info, 0, vect_body);
+         /* prologue cost for vec_init and vec_step.  */
+         prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
+                                           stmt_info, 0, vect_prologue);
+       }
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "vect_model_induction_cost: inside_cost = %d, "
+                        "prologue_cost = %d .\n", inside_cost,
+                        prologue_cost);
+
       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
       DUMP_VECT_SCOPE ("vectorizable_induction");
-      vect_model_induction_cost (stmt_info, ncopies, cost_vec);
       return true;
     }
 
@@ -7827,98 +7836,66 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
 
   pe = loop_preheader_edge (iv_loop);
-  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
-                                    loop_preheader_edge (iv_loop));
-
-  stmts = NULL;
-  if (!nested_in_vect_loop)
-    {
-      /* Convert the initial value to the IV update type.  */
-      tree new_type = TREE_TYPE (step_expr);
-      init_expr = gimple_convert (&stmts, new_type, init_expr);
-
-      /* If we are using the loop mask to "peel" for alignment then we need
-        to adjust the start value here.  */
-      tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
-      if (skip_niters != NULL_TREE)
-       {
-         if (FLOAT_TYPE_P (vectype))
-           skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
-                                       skip_niters);
-         else
-           skip_niters = gimple_convert (&stmts, new_type, skip_niters);
-         tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
-                                        skip_niters, step_expr);
-         init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
-                                   init_expr, skip_step);
-       }
-    }
-
-  if (stmts)
-    {
-      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
-      gcc_assert (!new_bb);
-    }
-
   /* Find the first insertion point in the BB.  */
   basic_block bb = gimple_bb (phi);
   si = gsi_after_labels (bb);
 
   /* For SLP induction we have to generate several IVs as for example
-     with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
-     [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
-     [VF*S, VF*S, VF*S, VF*S] for all.  */
+     with group size 3 we need
+       [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
+       [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
   if (slp_node)
     {
       /* Enforced above.  */
       unsigned int const_nunits = nunits.to_constant ();
 
-      /* Generate [VF*S, VF*S, ... ].  */
-      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
-       {
-         expr = build_int_cst (integer_type_node, vf);
-         expr = fold_convert (TREE_TYPE (step_expr), expr);
-       }
-      else
-       expr = build_int_cst (TREE_TYPE (step_expr), vf);
-      new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
-                             expr, step_expr);
-      if (! CONSTANT_CLASS_P (new_name))
-       new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
-                                    TREE_TYPE (step_expr), NULL);
-      new_vec = build_vector_from_val (step_vectype, new_name);
-      vec_step = vect_init_vector (loop_vinfo, stmt_info,
-                                  new_vec, step_vectype, NULL);
+      /* The initial values are vectorized, but any lanes > group_size
+        need adjustment.  */
+      slp_tree init_node
+       = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
 
-      /* Now generate the IVs.  */
+      /* Gather steps.  Since we do not vectorize inductions as
+        cycles we have to reconstruct the step from SCEV data.  */
       unsigned group_size = SLP_TREE_LANES (slp_node);
+      tree *steps = XALLOCAVEC (tree, group_size);
+      stmt_vec_info phi_info;
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
+       steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
+
+      /* Now generate the IVs.  */
       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
-      unsigned elts = const_nunits * nvects;
-      /* Compute the number of distinct IVs we need.  First reduce
-        group_size if it is a multiple of const_nunits so we get
-        one IV for a group_size of 4 but const_nunits 2.  */
-      unsigned group_sizep = group_size;
-      if (group_sizep % const_nunits == 0)
-       group_sizep = group_sizep / const_nunits;
-      unsigned nivs = least_common_multiple (group_sizep,
+      gcc_assert ((const_nunits * nvects) % group_size == 0);
+      unsigned nivs = least_common_multiple (group_size,
                                             const_nunits) / const_nunits;
-      gcc_assert (elts % group_size == 0);
-      tree elt = init_expr;
+      unsigned lup_mul = (nvects * const_nunits) / group_size;
+      tree stept = TREE_TYPE (step_vectype);
+      tree lupdate_mul
+       = build_vector_from_val (step_vectype,
+                                SCALAR_FLOAT_TYPE_P (stept)
+                                ? build_real_from_wide (stept, lup_mul,
+                                                        UNSIGNED)
+                                : build_int_cstu (stept, lup_mul));
       unsigned ivn;
+      auto_vec<tree> vec_steps;
       for (ivn = 0; ivn < nivs; ++ivn)
        {
          tree_vector_builder elts (step_vectype, const_nunits, 1);
-         stmts = NULL;
+         tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
          for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
            {
-             if (ivn*const_nunits + eltn >= group_size
-                 && (ivn * const_nunits + eltn) % group_size == 0)
-               elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
-                                   elt, step_expr);
+             tree elt = steps[(ivn*const_nunits + eltn) % group_size];
              elts.quick_push (elt);
+             unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
+             mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
+                                  ? build_real_from_wide (stept,
+                                                          mul_elt, UNSIGNED)
+                                  : build_int_cstu (stept, mul_elt));
            }
-         vec_init = gimple_build_vector (&stmts, &elts);
-         vec_init = gimple_convert (&stmts, vectype, vec_init);
+         stmts = NULL;
+         vec_step = gimple_build_vector (&stmts, &elts);
+         vec_step = gimple_convert (&stmts, step_vectype, vec_step);
+         vec_steps.safe_push (vec_step);
+         tree step_mul = gimple_build_vector (&stmts, &mul_elts);
          if (stmts)
            {
              new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
@@ -7926,65 +7903,81 @@ vectorizable_induction (loop_vec_info loop_vinfo,
            }
 
          /* Create the induction-phi that defines the induction-operand.  */
-         vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
+         vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
+                                           "vec_iv_");
          induction_phi = create_phi_node (vec_dest, iv_loop->header);
          induc_def = PHI_RESULT (induction_phi);
 
          /* Create the iv update inside the loop  */
-         gimple_seq stmts = NULL;
+         stmts = NULL;
+         tree up = gimple_build (&stmts, MULT_EXPR, step_vectype,
+                                 vec_step, lupdate_mul);
+         if (stmts)
+           {
+             new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+             gcc_assert (!new_bb);
+           }
+         stmts = NULL;
          vec_def = gimple_convert (&stmts, step_vectype, induc_def);
          vec_def = gimple_build (&stmts,
-                                 PLUS_EXPR, step_vectype, vec_def, vec_step);
+                                 PLUS_EXPR, step_vectype, vec_def, up);
          vec_def = gimple_convert (&stmts, vectype, vec_def);
          gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
+         add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
+                      UNKNOWN_LOCATION);
+
+         vec_init = vect_get_slp_vect_def (init_node, ivn);
+         if (!integer_zerop (step_mul))
+           {
+             stmts = NULL;
+             vec_def = gimple_convert (&stmts, step_vectype, vec_init);
+             up = gimple_build (&stmts, MULT_EXPR, step_vectype,
+                                vec_step, step_mul);
+             vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
+                                     vec_def, up);
+             vec_init = gimple_convert (&stmts, vectype, vec_def);
+             new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+             gcc_assert (!new_bb);
+           }
 
          /* Set the arguments of the phi node:  */
          add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
-         add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
-                      UNKNOWN_LOCATION);
 
          SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
        }
-      /* Fill up to the number of vectors we need for the whole group.  */
-      nivs = least_common_multiple (group_size,
-                                   const_nunits) / const_nunits;
-      for (; ivn < nivs; ++ivn)
-       SLP_TREE_VEC_STMTS (slp_node)
-         .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
 
-      /* Re-use IVs when we can.  */
+      /* Re-use IVs when we can.  We are generating further vector
+        stmts by adding VF' * stride to the IVs generated above.  */
       if (ivn < nvects)
        {
          unsigned vfp
            = least_common_multiple (group_size, const_nunits) / group_size;
-         /* Generate [VF'*S, VF'*S, ... ].  */
-         if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
-           {
-             expr = build_int_cst (integer_type_node, vfp);
-             expr = fold_convert (TREE_TYPE (step_expr), expr);
-           }
-         else
-           expr = build_int_cst (TREE_TYPE (step_expr), vfp);
-         new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
-                                 expr, step_expr);
-         if (! CONSTANT_CLASS_P (new_name))
-           new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
-                                        TREE_TYPE (step_expr), NULL);
-         new_vec = build_vector_from_val (step_vectype, new_name);
-         vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec,
-                                      step_vectype, NULL);
+         tree lupdate_mul
+           = build_vector_from_val (step_vectype,
+                                    SCALAR_FLOAT_TYPE_P (stept)
+                                    ? build_real_from_wide (stept,
+                                                            vfp, UNSIGNED)
+                                    : build_int_cstu (stept, vfp));
          for (; ivn < nvects; ++ivn)
            {
              gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
-             tree def;
-             if (gimple_code (iv) == GIMPLE_PHI)
-               def = gimple_phi_result (iv);
-             else
-               def = gimple_assign_lhs (iv);
+             tree def = gimple_get_lhs (iv);
              gimple_seq stmts = NULL;
+             if (ivn < 2*nivs)
+               {
+                 vec_steps[ivn - nivs]
+                   = gimple_build (&stmts, MULT_EXPR, step_vectype,
+                                   vec_steps[ivn - nivs], lupdate_mul);
+                 if (stmts)
+                   {
+                     new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+                     gcc_assert (!new_bb);
+                   }
+               }
+             stmts = NULL;
              def = gimple_convert (&stmts, step_vectype, def);
-             def = gimple_build (&stmts,
-                                 PLUS_EXPR, step_vectype, def, vec_step);
+             def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
+                                 def, vec_steps[ivn % nivs]);
              def = gimple_convert (&stmts, vectype, def);
              if (gimple_code (iv) == GIMPLE_PHI)
                gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
@@ -8001,6 +7994,39 @@ vectorizable_induction (loop_vec_info loop_vinfo,
       return true;
     }
 
+  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
+                                    loop_preheader_edge (iv_loop));
+
+  stmts = NULL;
+  if (!nested_in_vect_loop)
+    {
+      /* Convert the initial value to the IV update type.  */
+      tree new_type = TREE_TYPE (step_expr);
+      init_expr = gimple_convert (&stmts, new_type, init_expr);
+
+      /* If we are using the loop mask to "peel" for alignment then we need
+        to adjust the start value here.  */
+      tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+      if (skip_niters != NULL_TREE)
+       {
+         if (FLOAT_TYPE_P (vectype))
+           skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
+                                       skip_niters);
+         else
+           skip_niters = gimple_convert (&stmts, new_type, skip_niters);
+         tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
+                                        skip_niters, step_expr);
+         init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
+                                   init_expr, skip_step);
+       }
+    }
+
+  if (stmts)
+    {
+      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+      gcc_assert (!new_bb);
+    }
+
   /* Create the vector that holds the initial_value of the induction.  */
   if (nested_in_vect_loop)
     {
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 45e33c031d2882cfe23e42acd7fbcfb6d704b928..63a59c0c8ed388f24d782048fa0187283403ba59 100644 (file)
@@ -1441,20 +1441,14 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
          return NULL;
 
        vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
-       /* Induction from different IVs is not supported.  */
        if (def_type == vect_induction_def)
          {
-           stmt_vec_info other_info;
-           FOR_EACH_VEC_ELT (stmts, i, other_info)
-             if (stmt_info != other_info)
-               return NULL;
-
-           /* Induction PHIs are leafs.  */
-           (*tree_size)++;
-           node = vect_create_new_slp_node (node, stmts, nops);
-           SLP_TREE_VECTYPE (node) = vectype;
-           SLP_TREE_CHILDREN (node).quick_grow_cleared (nops);
-           return node;
+           /* Induction PHIs are not cycles but walk the initial
+              value.  */
+           class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+           if (nested_in_vect_loop_p (loop, stmt_info))
+             loop = loop->inner;
+           skip_args[loop_latch_edge (loop)->dest_idx] = true;
          }
        else if (def_type == vect_reduction_def
                 || def_type == vect_double_reduction_def
diff --git a/gcc/tree.c b/gcc/tree.c
index 81f867ddded8a3cd1fc1de2e7b531b0ce865ac3c..739c288378aff8af128562eb47ed48f6e4e7264e 100644 (file)
@@ -2250,6 +2250,22 @@ build_real_from_int_cst (tree type, const_tree i)
   return v;
 }
 
+/* Return a new REAL_CST node whose type is TYPE
+   and whose value is the integer value I which has sign SGN.  */
+
+tree
+build_real_from_wide (tree type, const wide_int_ref &i, signop sgn)
+{
+  REAL_VALUE_TYPE d;
+
+  /* Clear all bits of the real value type so that we can later do
+     bitwise comparisons to see if two values are the same.  */
+  memset (&d, 0, sizeof d);
+
+  real_from_integer (&d, TYPE_MODE (type), i, sgn);
+  return build_real (type, d);
+}
+
 /* Return a newly constructed STRING_CST node whose value is the LEN
    characters at STR when STR is nonnull, or all zeros otherwise.
    Note that for a C string literal, LEN should include the trailing NUL.
diff --git a/gcc/tree.h b/gcc/tree.h
index 7f0aa5b8d1d13dd45773b6d04ef94f7c388df97b..04e564c979898f1c5141ec3255b74afafa4e5e36 100644 (file)
@@ -4429,6 +4429,7 @@ extern tree build_constructor_from_vec (tree, const vec<tree, va_gc> *);
 extern tree build_constructor_va (tree, int, ...);
 extern tree build_clobber (tree);
 extern tree build_real_from_int_cst (tree, const_tree);
+extern tree build_real_from_wide (tree, const wide_int_ref &, signop);
 extern tree build_complex (tree, tree, tree);
 extern tree build_complex_inf (tree, bool);
 extern tree build_each_one_cst (tree);