re PR tree-optimization/52252 (An opportunity for x86 gcc vectorizer (gain up to...
authorEvgeny Stupachenko <evstupac@gmail.com>
Wed, 7 May 2014 12:10:22 +0000 (12:10 +0000)
committerKirill Yukhin <kyukhin@gcc.gnu.org>
Wed, 7 May 2014 12:10:22 +0000 (12:10 +0000)
gcc/
* tree-vect-data-refs.c (vect_grouped_load_supported): New
check for loads group of length 3.
(vect_permute_load_chain): New permutations for loads group of
length 3.
* tree-vect-stmts.c (vect_model_load_cost): Change cost
of vec_perm_shuffle for the new permutations.

gcc/testsuite/
PR tree-optimization/52252
* gcc.dg/vect/pr52252-ld.c: Test on loads group of size 3.

From-SVN: r210155

gcc/ChangeLog
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.dg/vect/pr52252-ld.c [new file with mode: 0644]
gcc/tree-vect-data-refs.c
gcc/tree-vect-stmts.c

index 76339eeede1ee4ac186b78cecc2e9310526f86ec..000a5fb4dcbc4b6875055159d83cfc7d00f21ae7 100644 (file)
@@ -1,3 +1,12 @@
+2014-05-07  Evgeny Stupachenko  <evstupac@gmail.com>
+
+       * tree-vect-data-refs.c (vect_grouped_load_supported): New
+       check for loads group of length 3.
+       (vect_permute_load_chain): New permutations for loads group of
+       length 3.
+       * tree-vect-stmts.c (vect_model_load_cost): Change cost
+       of vec_perm_shuffle for the new permutations.
+
 2014-05-07  Alan Lawrence  <alan.lawrence@arm.com>
 
        * config/aarch64/arm_neon.h (vtrn1_f32, vtrn1_p8, vtrn1_p16, vtrn1_s8,
index 14e0e02fd99b09c1b5b443c1513ab3bef7f8707b..ea89f0614808b2fe2ea02d2d2498242a23d20985 100644 (file)
@@ -1,3 +1,8 @@
+2014-05-07  Evgeny Stupachenko  <evstupac@gmail.com>
+
+       PR tree-optimization/52252
+       * gcc.dg/vect/pr52252-ld.c: Test on loads group of size 3.
+
 2014-05-07  Alan Lawrence  <alan.lawrence@arm.com>
 
        * gcc.target/aarch64/simd/vrev16p8_1.c: New file.
diff --git a/gcc/testsuite/gcc.dg/vect/pr52252-ld.c b/gcc/testsuite/gcc.dg/vect/pr52252-ld.c
new file mode 100644 (file)
index 0000000..6e3cb52
--- /dev/null
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -g -ftree-vectorize -mssse3 -fdump-tree-vect-details" { target { i?86-*-* x86_64-*-* } } } */
+
+#define byte unsigned char
+
+void
+matrix_mul (byte *in, byte *out, int size)
+{
+  int i;
+  for (i = 0; i < size; i++)
+    {
+      byte in0 = in[0];
+      byte in1 = in[1];
+      byte in2 = in[2];
+      byte out0, out1, out2, out3;
+      out0 = in0 + in1;
+      out1 = in0 + in2;
+      out2 = in1 + in2;
+      out3 = in0 + in1 + in2;
+      out[0] = out0;
+      out[1] = out1;
+      out[2] = out2;
+      out[3] = out3;
+      in += 3;
+      out += 4;
+    }
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
index d48e3cdcfca918e599ff444cefba0148a582c764..c48640550e7d705fa6a0e173e1d1007c546f5f62 100644 (file)
@@ -4810,36 +4810,76 @@ vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
 {
   enum machine_mode mode = TYPE_MODE (vectype);
 
-  /* vect_permute_load_chain requires the group size to be a power of two.  */
-  if (exact_log2 (count) == -1)
+  /* vect_permute_load_chain requires the group size to be equal to 3 or
+     be a power of two.  */
+  if (count != 3 && exact_log2 (count) == -1)
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                         "the size of the group of accesses"
-                         " is not a power of 2\n");
+                        "the size of the group of accesses"
+                        " is not a power of 2 or not equal to 3\n");
       return false;
     }
 
   /* Check that the permutation is supported.  */
   if (VECTOR_MODE_P (mode))
     {
-      unsigned int i, nelt = GET_MODE_NUNITS (mode);
+      unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
 
-      for (i = 0; i < nelt; i++)
-       sel[i] = i * 2;
-      if (can_vec_perm_p (mode, false, sel))
+      if (count == 3)
        {
+         unsigned int k;
+         for (k = 0; k < 3; k++)
+           {
+             for (i = 0; i < nelt; i++)
+               if (3 * i + k < 2 * nelt)
+                 sel[i] = 3 * i + k;
+               else
+                 sel[i] = 0;
+             if (!can_vec_perm_p (mode, false, sel))
+               {
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "shuffle of 3 loads is not supported by"
+                                    " target\n");
+                   return false;
+               }
+             for (i = 0, j = 0; i < nelt; i++)
+               if (3 * i + k < 2 * nelt)
+                 sel[i] = i;
+               else
+                 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
+             if (!can_vec_perm_p (mode, false, sel))
+               {
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "shuffle of 3 loads is not supported by"
+                                    " target\n");
+                 return false;
+               }
+           }
+         return true;
+       }
+      else
+       {
+         /* If length is not equal to 3 then only power of 2 is supported.  */
+         gcc_assert (exact_log2 (count) != -1);
          for (i = 0; i < nelt; i++)
-           sel[i] = i * 2 + 1;
+           sel[i] = i * 2;
          if (can_vec_perm_p (mode, false, sel))
-           return true;
-       }
+           {
+             for (i = 0; i < nelt; i++)
+               sel[i] = i * 2 + 1;
+             if (can_vec_perm_p (mode, false, sel))
+               return true;
+           }
+        }
     }
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                     "extract even/odd not supported by target\n");
+                    "extract even/odd not supported by target\n");
   return false;
 }
 
@@ -4857,8 +4897,9 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
 /* Function vect_permute_load_chain.
 
    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
-   a power of 2, generate extract_even/odd stmts to reorder the input data
-   correctly.  Return the final references for loads in RESULT_CHAIN.
+   a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
+   the input data correctly.  Return the final references for loads in
+   RESULT_CHAIN.
 
    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
    The input is 4 vectors each containing 8 elements. We assign a number to each
@@ -4939,6 +4980,7 @@ vect_permute_load_chain (vec<tree> dr_chain,
 {
   tree data_ref, first_vect, second_vect;
   tree perm_mask_even, perm_mask_odd;
+  tree perm3_mask_low, perm3_mask_high;
   gimple perm_stmt;
   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
   unsigned int i, j, log_length = exact_log2 (length);
@@ -4949,44 +4991,97 @@ vect_permute_load_chain (vec<tree> dr_chain,
   memcpy (result_chain->address (), dr_chain.address (),
          length * sizeof (tree));
 
-  for (i = 0; i < nelt; ++i)
-    sel[i] = i * 2;
-  perm_mask_even = vect_gen_perm_mask (vectype, sel);
-  gcc_assert (perm_mask_even != NULL);
-
-  for (i = 0; i < nelt; ++i)
-    sel[i] = i * 2 + 1;
-  perm_mask_odd = vect_gen_perm_mask (vectype, sel);
-  gcc_assert (perm_mask_odd != NULL);
-
-  for (i = 0; i < log_length; i++)
+  if (length == 3)
     {
-      for (j = 0; j < length; j += 2)
-       {
-         first_vect = dr_chain[j];
-         second_vect = dr_chain[j+1];
+      unsigned int k;
 
-         /* data_ref = permute_even (first_data_ref, second_data_ref);  */
-         data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
+      for (k = 0; k < 3; k++)
+       {
+         for (i = 0; i < nelt; i++)
+           if (3 * i + k < 2 * nelt)
+             sel[i] = 3 * i + k;
+           else
+             sel[i] = 0;
+         perm3_mask_low = vect_gen_perm_mask (vectype, sel);
+         gcc_assert (perm3_mask_low != NULL);
+
+         for (i = 0, j = 0; i < nelt; i++)
+           if (3 * i + k < 2 * nelt)
+             sel[i] = i;
+           else
+             sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
+
+         perm3_mask_high = vect_gen_perm_mask (vectype, sel);
+         gcc_assert (perm3_mask_high != NULL);
+
+         first_vect = dr_chain[0];
+         second_vect = dr_chain[1];
+
+         /* Create interleaving stmt (low part of):
+            low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
+                                                            ...}>  */
+         data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_low");
          perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
                                                    first_vect, second_vect,
-                                                   perm_mask_even);
+                                                   perm3_mask_low);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-         (*result_chain)[j/2] = data_ref;
 
-         /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
-         data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
+         /* Create interleaving stmt (high part of):
+            high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
+                                                             ...}>  */
+         first_vect = data_ref;
+         second_vect = dr_chain[2];
+         data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_high");
          perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
                                                    first_vect, second_vect,
-                                                   perm_mask_odd);
+                                                   perm3_mask_high);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-         (*result_chain)[j/2+length/2] = data_ref;
+         (*result_chain)[k] = data_ref;
        }
-      memcpy (dr_chain.address (), result_chain->address (),
-             length * sizeof (tree));
     }
-}
+  else
+    {
+      /* If length is not equal to 3 then only power of 2 is supported.  */
+      gcc_assert (exact_log2 (length) != -1);
+
+      for (i = 0; i < nelt; ++i)
+       sel[i] = i * 2;
+      perm_mask_even = vect_gen_perm_mask (vectype, sel);
+      gcc_assert (perm_mask_even != NULL);
+
+      for (i = 0; i < nelt; ++i)
+       sel[i] = i * 2 + 1;
+      perm_mask_odd = vect_gen_perm_mask (vectype, sel);
+      gcc_assert (perm_mask_odd != NULL);
 
+      for (i = 0; i < log_length; i++)
+       {
+         for (j = 0; j < length; j += 2)
+           {
+             first_vect = dr_chain[j];
+             second_vect = dr_chain[j+1];
+
+             /* data_ref = permute_even (first_data_ref, second_data_ref);  */
+             data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
+             perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+                                                       first_vect, second_vect,
+                                                       perm_mask_even);
+             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+             (*result_chain)[j/2] = data_ref;
+
+             /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
+             data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
+             perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
+                                                       first_vect, second_vect,
+                                                       perm_mask_odd);
+             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+             (*result_chain)[j/2+length/2] = data_ref;
+           }
+         memcpy (dr_chain.address (), result_chain->address (),
+                 length * sizeof (tree));
+       }
+    }
+}
 
 /* Function vect_transform_grouped_load.
 
index b8547cba962db95eba1279de13d1e92b4730684f..ec9cc68bfabb07d919494c2464d58ced1e4aacb4 100644 (file)
@@ -1091,10 +1091,11 @@ vect_model_load_cost (stmt_vec_info stmt_info, int ncopies,
      include the cost of the permutes.  */
   if (!load_lanes_p && group_size > 1)
     {
-      /* Uses an even and odd extract operations for each needed permute.  */
-      int nstmts = ncopies * exact_log2 (group_size) * group_size;
-      inside_cost += record_stmt_cost (body_cost_vec, nstmts, vec_perm,
-                                      stmt_info, 0, vect_body);
+      /* Uses an even and odd extract operations or shuffle operations
+        for each needed permute.  */
+      int nstmts = ncopies * ceil_log2 (group_size) * group_size;
+      inside_cost = record_stmt_cost (body_cost_vec, nstmts, vec_perm,
+                                     stmt_info, 0, vect_body);
 
       if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location,