Allow single-element interleaving for non-power-of-2 strides
author: Richard Sandiford <richard.sandiford@linaro.org>
Sat, 13 Jan 2018 18:00:31 +0000 (18:00 +0000)
committer: Richard Sandiford <rsandifo@gcc.gnu.org>
Sat, 13 Jan 2018 18:00:31 +0000 (18:00 +0000)
This allows LD3 to be used for isolated a[i * 3] accesses, in a similar
way to the current a[i * 2] and a[i * 4] for LD2 and LD4 respectively.
Given the problems with the cost model underestimating the cost of
elementwise accesses, the patch continues to reject the VMAT_ELEMENTWISE
cases that are currently rejected.

2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
    Alan Hayward  <alan.hayward@arm.com>
    David Sherwood  <david.sherwood@arm.com>

gcc/
* tree-vect-data-refs.c (vect_analyze_group_access_1): Allow
single-element interleaving even if the size is not a power of 2.
* tree-vect-stmts.c (get_load_store_type): Disallow elementwise
accesses for single-element interleaving if the group size is
not a power of 2.

gcc/testsuite/
* gcc.target/aarch64/sve/struct_vect_18.c: New test.
* gcc.target/aarch64/sve/struct_vect_18_run.c: Likewise.
* gcc.target/aarch64/sve/struct_vect_19.c: Likewise.
* gcc.target/aarch64/sve/struct_vect_19_run.c: Likewise.

Co-Authored-By: Alan Hayward <alan.hayward@arm.com>
Co-Authored-By: David Sherwood <david.sherwood@arm.com>
From-SVN: r256634

gcc/ChangeLog
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19_run.c [new file with mode: 0644]
gcc/tree-vect-data-refs.c
gcc/tree-vect-stmts.c

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 169cae0b06edca5cbc067dfa7fe162298bb21afe..73bfb41ce16129c0033e6bc77dd799d6c3826ac6 100644 (file)
@@ -1,3 +1,13 @@
+2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
+           Alan Hayward  <alan.hayward@arm.com>
+           David Sherwood  <david.sherwood@arm.com>
+
+       * tree-vect-data-refs.c (vect_analyze_group_access_1): Allow
+       single-element interleaving even if the size is not a power of 2.
+       * tree-vect-stmts.c (get_load_store_type): Disallow elementwise
+       accesses for single-element interleaving if the group size is
+       not a power of 2.
+
 2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
            Alan Hayward  <alan.hayward@arm.com>
            David Sherwood  <david.sherwood@arm.com>
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index df572fda5ba7177fe21b0090da48aa796268d699..72da41973ead1b9fd1e333a75414d6ae2c1205c1 100644 (file)
@@ -1,3 +1,12 @@
+2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
+           Alan Hayward  <alan.hayward@arm.com>
+           David Sherwood  <david.sherwood@arm.com>
+
+       * gcc.target/aarch64/sve/struct_vect_18.c: New test.
+       * gcc.target/aarch64/sve/struct_vect_18_run.c: Likewise.
+       * gcc.target/aarch64/sve/struct_vect_19.c: Likewise.
+       * gcc.target/aarch64/sve/struct_vect_19_run.c: Likewise.
+
 2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
            Alan Hayward  <alan.hayward@arm.com>
            David Sherwood  <david.sherwood@arm.com>
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18.c
new file mode 100644 (file)
index 0000000..67b08d1
--- /dev/null
@@ -0,0 +1,44 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define N 2000
+
+#define TEST_LOOP(NAME, TYPE)                                  \
+  void __attribute__ ((noinline, noclone))                     \
+  NAME (TYPE *restrict dest, TYPE *restrict src)               \
+  {                                                            \
+    for (int i = 0; i < N; ++i)                                        \
+      dest[i] += src[i * 3];                                   \
+  }
+
+#define TEST(NAME) \
+  TEST_LOOP (NAME##_i8, signed char) \
+  TEST_LOOP (NAME##_i16, unsigned short) \
+  TEST_LOOP (NAME##_f32, float) \
+  TEST_LOOP (NAME##_f64, double)
+
+TEST (test)
+
+/* Check the vectorized loop.  */
+/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld3b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld3h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld3w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1d\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld3d\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1d\t} 1 } } */
+
+/* Check the scalar tail.  */
+/* { dg-final { scan-assembler-times {\tldrb\tw} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrb\tw} 1 } } */
+/* { dg-final { scan-assembler-times {\tldrh\tw} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrh\tw} 1 } } */
+/* { dg-final { scan-assembler-times {\tldr\ts} 2 } } */
+/* { dg-final { scan-assembler-times {\tstr\ts} 1 } } */
+/* { dg-final { scan-assembler-times {\tldr\td} 2 } } */
+/* { dg-final { scan-assembler-times {\tstr\td} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18_run.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_18_run.c
new file mode 100644 (file)
index 0000000..9698216
--- /dev/null
@@ -0,0 +1,36 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "struct_vect_18.c"
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, TYPE)                          \
+  {                                                    \
+    TYPE out[N];                                       \
+    TYPE in[N * 3];                                    \
+    for (int i = 0; i < N; ++i)                                \
+      {                                                        \
+       out[i] = i * 7 / 2;                             \
+       asm volatile ("" ::: "memory");                 \
+      }                                                        \
+    for (int i = 0; i < N * 3; ++i)                    \
+      {                                                        \
+       in[i] = i * 9 / 2;                              \
+       asm volatile ("" ::: "memory");                 \
+      }                                                        \
+    NAME (out, in);                                    \
+    for (int i = 0; i < N; ++i)                                \
+      {                                                        \
+       TYPE expected = i * 7 / 2 + in[i * 3];          \
+       if (out[i] != expected)                         \
+         __builtin_abort ();                           \
+       asm volatile ("" ::: "memory");                 \
+      }                                                        \
+  }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  TEST (test);
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19.c
new file mode 100644 (file)
index 0000000..3754190
--- /dev/null
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define TEST_LOOP(NAME, TYPE)                                  \
+  void __attribute__ ((noinline, noclone))                     \
+  NAME (TYPE *restrict dest, TYPE *restrict src, int n)                \
+  {                                                            \
+    for (int i = 0; i < n; ++i)                                        \
+      dest[i] += src[i * 3];                                   \
+  }
+
+#define TEST(NAME) \
+  TEST_LOOP (NAME##_i8, signed char) \
+  TEST_LOOP (NAME##_i16, unsigned short) \
+  TEST_LOOP (NAME##_f32, float) \
+  TEST_LOOP (NAME##_f64, double)
+
+TEST (test)
+
+/* Check the vectorized loop.  */
+/* { dg-final { scan-assembler-times {\tld1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld3b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1b\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld3h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld3w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1w\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1d\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld3d\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1d\t} 1 } } */
+
+/* Check the scalar tail.  */
+/* { dg-final { scan-assembler-times {\tldrb\tw} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrb\tw} 1 } } */
+/* { dg-final { scan-assembler-times {\tldrh\tw} 2 } } */
+/* { dg-final { scan-assembler-times {\tstrh\tw} 1 } } */
+/* { dg-final { scan-assembler-times {\tldr\ts} 2 } } */
+/* { dg-final { scan-assembler-times {\tstr\ts} 1 } } */
+/* { dg-final { scan-assembler-times {\tldr\td} 2 } } */
+/* { dg-final { scan-assembler-times {\tstr\td} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19_run.c b/gcc/testsuite/gcc.target/aarch64/sve/struct_vect_19_run.c
new file mode 100644 (file)
index 0000000..1d0325f
--- /dev/null
@@ -0,0 +1,45 @@
+/* { dg-do run { target aarch64_sve_hw } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "struct_vect_19.c"
+
+#define N 1000
+
+#undef TEST_LOOP
+#define TEST_LOOP(NAME, TYPE)                  \
+  {                                            \
+    TYPE out[N];                               \
+    TYPE in[N * 3];                            \
+    int counts[] = { 0, 1, N - 1 };            \
+    for (int j = 0; j < 3; ++j)                        \
+      {                                                \
+       int count = counts[j];                  \
+       for (int i = 0; i < N; ++i)             \
+         {                                     \
+           out[i] = i * 7 / 2;                 \
+           asm volatile ("" ::: "memory");     \
+         }                                     \
+       for (int i = 0; i < N * 3; ++i)         \
+         {                                     \
+           in[i] = i * 9 / 2;                  \
+           asm volatile ("" ::: "memory");     \
+         }                                     \
+       NAME (out, in, count);                  \
+       for (int i = 0; i < N; ++i)             \
+         {                                     \
+           TYPE expected = i * 7 / 2;          \
+           if (i < count)                      \
+             expected += in[i * 3];            \
+           if (out[i] != expected)             \
+             __builtin_abort ();               \
+           asm volatile ("" ::: "memory");     \
+         }                                     \
+      }                                                \
+  }
+
+int __attribute__ ((optimize (1)))
+main (void)
+{
+  TEST (test);
+  return 0;
+}
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 23b10844ffcfe01318e4bee89de6a0ea7263b9bf..59462be4295e4b55f2a4881e9cdea8b27742d034 100644 (file)
@@ -2427,11 +2427,10 @@ vect_analyze_group_access_1 (struct data_reference *dr)
         element of the group that is accessed in the loop.  */
 
       /* Gaps are supported only for loads. STEP must be a multiple of the type
-        size.  The size of the group must be a power of 2.  */
+        size.  */
       if (DR_IS_READ (dr)
          && (dr_step % type_size) == 0
-         && groupsize > 0
-         && pow2p_hwi (groupsize))
+         && groupsize > 0)
        {
          GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
          GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 5bb61319b669ee27133a6acdeeba04869da2491c..e4d20514c00db75a299f366bb77076b5e3b9d197 100644 (file)
@@ -2176,7 +2176,10 @@ get_load_store_type (gimple *stmt, tree vectype, bool slp, bool masked_p,
      cost of using elementwise accesses.  This check preserves the
      traditional behavior until that can be fixed.  */
   if (*memory_access_type == VMAT_ELEMENTWISE
-      && !STMT_VINFO_STRIDED_P (stmt_info))
+      && !STMT_VINFO_STRIDED_P (stmt_info)
+      && !(stmt == GROUP_FIRST_ELEMENT (stmt_info)
+          && !GROUP_NEXT_ELEMENT (stmt_info)
+          && !pow2p_hwi (GROUP_SIZE (stmt_info))))
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,