tree-optimization/95866 - avoid using scalar ops for vectorized shift
author Richard Biener <rguenther@suse.de>
Thu, 25 Jun 2020 09:21:20 +0000 (11:21 +0200)
committer Richard Biener <rguenther@suse.de>
Thu, 25 Jun 2020 10:29:52 +0000 (12:29 +0200)
This avoids using the original scalar SSA operand when vectorizing
a shift whose shift operand is itself vectorized but is known to
have the same value in every vector component, so that a
vector-by-scalar shift can be used.  Using the scalar SSA operand
causes a possibly long chain of scalar computation to be retained,
so it is better to simply extract lane zero from the available
vectorized shift operand.

2020-06-25  Richard Biener  <rguenther@suse.de>

PR tree-optimization/95866
* tree-vect-stmts.c (vectorizable_shift): Reject incompatible
vectorized shift operands.  For scalar shifts use lane zero
of a vectorized shift operand.

* gcc.dg/vect/bb-slp-pr95866.c: New testcase.

gcc/testsuite/gcc.dg/vect/bb-slp-pr95866.c [new file with mode: 0644]
gcc/tree-vect-stmts.c

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr95866.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr95866.c
new file mode 100644
index 0000000..5de4671
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr95866.c
@@ -0,0 +1,17 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_shift } */
+
+int x[4];
+int j[4];
+void foo()
+{
+  x[0] = (x[0] << j[0]) + j[0];
+  x[1] = (x[1] << j[0]) + j[1];
+  x[2] = (x[2] << j[0]) + j[2];
+  x[3] = (x[3] << j[0]) + j[3];
+}
+
+/* The scalar shift argument should be extracted from the available vector.  */
+/* { dg-final { scan-tree-dump "BIT_FIELD_REF" "slp2" } } */
+/* { dg-final { scan-tree-dump "basic block vectorized" "slp2" } } */
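diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index de7d77f3872b33c5621830fd2738f8d3925a1a4a..edd28534cb0fb5f0e9d27efbf15d62cf7942cb2c 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c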
@@ -5413,6 +5413,15 @@ vectorizable_shift (vec_info *vinfo,
                = (!op1_vectype
                   || !tree_nop_conversion_p (TREE_TYPE (vectype),
                                              TREE_TYPE (op1)));
+             if (incompatible_op1_vectype_p
+                 && dt[1] == vect_internal_def)
+               {
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "unusable type for last operand in"
+                                    " vector/vector shift/rotate.\n");
+                 return false;
+               }
             }
         }
     }
@@ -5457,7 +5466,7 @@ vectorizable_shift (vec_info *vinfo,
     {
       if (slp_node
          && (!vect_maybe_update_slp_op_vectype (slp_op0, vectype)
-             || (!scalar_shift_arg
+             || ((!scalar_shift_arg || dt[1] == vect_internal_def)
                  && (!incompatible_op1_vectype_p
                      || dt[1] == vect_constant_def)
                  && !vect_maybe_update_slp_op_vectype
@@ -5499,6 +5508,7 @@ vectorizable_shift (vec_info *vinfo,
 
   if (incompatible_op1_vectype_p && !slp_node)
     {
+      gcc_assert (!scalar_shift_arg && was_scalar_shift_arg);
       op1 = fold_convert (TREE_TYPE (vectype), op1);
       if (dt[1] != vect_constant_def)
        op1 = vect_init_vector (vinfo, stmt_info, op1,
@@ -5508,7 +5518,7 @@ vectorizable_shift (vec_info *vinfo,
   /* Handle def.  */
   vec_dest = vect_create_destination_var (scalar_dest, vectype);
 
-  if (scalar_shift_arg)
+  if (scalar_shift_arg && dt[1] != vect_internal_def)
     {
       /* Vector shl and shr insn patterns can be defined with scalar
         operand 2 (shift operand).  In this case, use constant or loop
@@ -5533,7 +5543,7 @@ vectorizable_shift (vec_info *vinfo,
            vec_oprnds1.quick_push (vec_oprnd1);
        }
     }
-  else if (slp_node && incompatible_op1_vectype_p)
+  else if (!scalar_shift_arg && slp_node && incompatible_op1_vectype_p)
     {
       if (was_scalar_shift_arg)
        {
@@ -5566,6 +5576,20 @@ vectorizable_shift (vec_info *vinfo,
   FOR_EACH_VEC_ELT (vec_oprnds0, i, vop0)
     {
       vop1 = vec_oprnds1[i];
+      /* For internal defs where we need to use a scalar shift arg
+        extract the first lane.  */
+      if (scalar_shift_arg && dt[1] == vect_internal_def)
+       {
+         new_temp = make_ssa_name (TREE_TYPE (TREE_TYPE (vop1)));
+         gassign *new_stmt
+           = gimple_build_assign (new_temp,
+                                  build3 (BIT_FIELD_REF, TREE_TYPE (new_temp),
+                                          vop1,
+                                          TYPE_SIZE (TREE_TYPE (new_temp)),
+                                          bitsize_zero_node));
+         vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+         vop1 = new_temp;
+       }
       gassign *new_stmt = gimple_build_assign (vec_dest, code, vop0, vop1);
       new_temp = make_ssa_name (vec_dest, new_stmt);
       gimple_assign_set_lhs (new_stmt, new_temp);