Avoid unnecessary peeling for gaps with LD3
author     Richard Sandiford <richard.sandiford@arm.com>
Tue, 24 May 2016 10:15:36 +0000 (10:15 +0000)
committer  Richard Sandiford <rsandifo@gcc.gnu.org>
Tue, 24 May 2016 10:15:36 +0000 (10:15 +0000)
vectorizable_load forces peeling for gaps if the vectorisation factor
is not a multiple of the group size, since in that case we'd normally load
beyond the original scalar accesses but drop the excess elements as part
of a following permute:

          if (loop_vinfo
              && ! STMT_VINFO_STRIDED_P (stmt_info)
              && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
                  || (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0)))

This isn't necessary for LOAD_LANES though, since it loads only the
data needed and does the permute itself.
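For example, in a loop like the one below (the same loop the new test
uses), each iteration reads a group of three elements.  Assuming 128-bit
vector registers, the vectorisation factor for int is 4, which is not a
multiple of the group size 3, so the old check forced peeling for gaps
even though an AArch64 LD3 loads exactly the twelve elements those four
iterations need:

    void
    f (int *__restrict a, int *__restrict b)
    {
      /* Group size 3: each iteration loads b[3*i], b[3*i+1] and b[3*i+2].
         With V4SI vectors (VF 4, assumed above), LD3 fetches three vectors
         from twelve consecutive ints and de-interleaves them, so nothing
         beyond the original scalar accesses is loaded.  */
      for (int i = 0; i < 96; ++i)
        a[i] = b[i * 3] + b[i * 3 + 1] + b[i * 3 + 2];
    }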

Tested on aarch64-linux-gnu and x86_64-linux-gnu.

gcc/
* tree-vect-stmts.c (vectorizable_load): Reorder checks so that
load_lanes/grouped_load classification comes first.  Don't check
whether the vectorization factor is a multiple of the group size
for load_lanes.

gcc/testsuite/
* gcc.dg/vect/vect-load-lanes-peeling-1.c: New test.

From-SVN: r236632

gcc/ChangeLog
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.dg/vect/vect-load-lanes-peeling-1.c [new file with mode: 0644]
gcc/tree-vect-stmts.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index a61b6cd97d0a289349d34bf8388236f4d93551e7..8f726b2c6497afa2f11df29d3eebc8a9835b3d6f 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2016-05-24  Richard Sandiford  <richard.sandiford@arm.com>
+
+       * tree-vect-stmts.c (vectorizable_load): Reorder checks so that
+       load_lanes/grouped_load classification comes first.  Don't check
+       whether the vectorization factor is a multiple of the group size
+       for load_lanes.
+
 2016-05-24  Richard Sandiford  <richard.sandiford@arm.com>
 
        * tree-vect-data-refs.c (vect_analyze_group_access_1): Set
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 65589fa4effe87a94c5358850bafc5bf60f0afe5..cb78cc2c5805b918ea25b7756e968577f91b1c0e 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,7 @@
+2016-05-24  Richard Sandiford  <richard.sandiford@arm.com>
+
+       * gcc.dg/vect/vect-load-lanes-peeling-1.c: New test.
+
 2016-05-24  Richard Biener  <rguenther@suse.de>
 
        PR middle-end/70434
diff --git a/gcc/testsuite/gcc.dg/vect/vect-load-lanes-peeling-1.c b/gcc/testsuite/gcc.dg/vect/vect-load-lanes-peeling-1.c
new file mode 100644
index 0000000..c9cd104
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/vect-load-lanes-peeling-1.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_int } */
+/* { dg-require-effective-target vect_load_lanes } */
+
+void
+f (int *__restrict a, int *__restrict b)
+{
+  for (int i = 0; i < 96; ++i)
+    a[i] = b[i * 3] + b[i * 3 + 1] + b[i * 3 + 2];
+}
+
+/* { dg-final { scan-tree-dump-not "Data access with gaps" "vect" } } */
+/* { dg-final { scan-tree-dump-not "epilog loop required" "vect" } } */
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index f66e18062087c9bc039470d0ad27c5fc8780af56..1252d33510472d85fc0f258145e8ba2379e78315 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -6303,6 +6303,17 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       gcc_assert (!nested_in_vect_loop && !STMT_VINFO_GATHER_SCATTER_P (stmt_info));
 
       first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
+      group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
+
+      if (!slp
+         && !PURE_SLP_STMT (stmt_info)
+         && !STMT_VINFO_STRIDED_P (stmt_info))
+       {
+         if (vect_load_lanes_supported (vectype, group_size))
+           load_lanes_p = true;
+         else if (!vect_grouped_load_supported (vectype, group_size))
+           return false;
+       }
 
       /* If this is single-element interleaving with an element distance
          that leaves unused vector loads around punt - we at least create
@@ -6330,7 +6341,7 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       if (loop_vinfo
          && ! STMT_VINFO_STRIDED_P (stmt_info)
          && (GROUP_GAP (vinfo_for_stmt (first_stmt)) != 0
-             || (!slp && vf % GROUP_SIZE (vinfo_for_stmt (first_stmt)) != 0)))
+             || (!slp && !load_lanes_p && vf % group_size != 0)))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -6350,8 +6361,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
       if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ())
        slp_perm = true;
 
-      group_size = GROUP_SIZE (vinfo_for_stmt (first_stmt));
-
       /* ???  The following is overly pessimistic (as well as the loop
          case above) in the case we can statically determine the excess
         elements loaded are within the bounds of a decl that is accessed.
@@ -6364,16 +6373,6 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
          return false;
        }
 
-      if (!slp
-         && !PURE_SLP_STMT (stmt_info)
-         && !STMT_VINFO_STRIDED_P (stmt_info))
-       {
-         if (vect_load_lanes_supported (vectype, group_size))
-           load_lanes_p = true;
-         else if (!vect_grouped_load_supported (vectype, group_size))
-           return false;
-       }
-
       /* Invalidate assumptions made by dependence analysis when vectorization
         on the unrolled body effectively re-orders stmts.  */
       if (!PURE_SLP_STMT (stmt_info)