Allow gather loads to be used for grouped accesses
author     Richard Sandiford <richard.sandiford@linaro.org>
           Sat, 13 Jan 2018 18:01:49 +0000 (18:01 +0000)
committer  Richard Sandiford <rsandifo@gcc.gnu.org>
           Sat, 13 Jan 2018 18:01:49 +0000 (18:01 +0000)
Following on from the previous patch for strided accesses, this patch
allows gather loads to be used with grouped accesses if we would
otherwise need to fall back to VMAT_ELEMENTWISE.  However, as the
comment says, this is restricted to single-element groups for now:

 ??? Although the code can handle all group sizes correctly,
 it probably isn't a win to use separate strided accesses based
 on nearby locations.  Or, even if it's a win over scalar code,
 it might not be a win over vectorizing at a lower VF, if that
 allows us to use contiguous accesses.

Single-element groups are an important special case though, and
supporting them means that the code is less sensitive to GCC's
classification of single accesses with constant steps as "grouped"
and ones with variable steps as "strided".
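
As an illustration (not part of the patch itself), the kind of loop this
change targets has the same shape as the new strided_load_*.c tests
below: the constant element step means the single load from SRC is
classified as a "grouped" access, and with this patch it can still be
implemented as an SVE gather load rather than falling back to
VMAT_ELEMENTWISE.  The function name and the step of 5 below are purely
illustrative:

  void
  f (int *restrict dest, int *restrict src, int n)
  {
    for (int i = 0; i < n; ++i)
      /* Constant step of 5 elements: a single-element "grouped" access
         that can now be vectorized with a gather load (e.g. an ld1w
         with an "sxtw 2" offset for 32-bit elements on SVE).  */
      dest[i] += src[i * 5];
  }

A variable step (src[i * stride] for a run-time STRIDE) would instead
be classified as "strided" and was already handled by the previous
patch.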

2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
    Alan Hayward  <alan.hayward@arm.com>
    David Sherwood  <david.sherwood@arm.com>

gcc/
* tree-vectorizer.h (vect_gather_scatter_fn_p): Declare.
* tree-vect-data-refs.c (vect_gather_scatter_fn_p): Make public.
* tree-vect-stmts.c (vect_truncate_gather_scatter_offset): New
function.
(vect_use_strided_gather_scatters_p): Take a masked_p argument.
Use vect_truncate_gather_scatter_offset if we can't treat the
operation as a normal gather load or scatter store.
(get_group_load_store_type): Take the gather_scatter_info
as argument.  Try using a gather load or scatter store for
single-element groups.
(get_load_store_type): Update calls to get_group_load_store_type
and vect_use_strided_gather_scatters_p.

gcc/testsuite/
* gcc.target/aarch64/sve/reduc_strict_3.c: Expect FADDA to be used
for double_reduc1.
* gcc.target/aarch64/sve/strided_load_4.c: New test.
* gcc.target/aarch64/sve/strided_load_5.c: Likewise.
* gcc.target/aarch64/sve/strided_load_6.c: Likewise.
* gcc.target/aarch64/sve/strided_load_7.c: Likewise.

Co-Authored-By: Alan Hayward <alan.hayward@arm.com>
Co-Authored-By: David Sherwood <david.sherwood@arm.com>
From-SVN: r256642

gcc/ChangeLog
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c
gcc/testsuite/gcc.target/aarch64/sve/strided_load_4.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/strided_load_5.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/strided_load_6.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/strided_load_7.c [new file with mode: 0644]
gcc/tree-vect-data-refs.c
gcc/tree-vect-stmts.c
gcc/tree-vectorizer.h

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index fb7a205be00936a699c37da18bb353c76517a3c8..c660ff13f0df3a48c0e533196845bb1c55611c70 100644 (file)
@@ -1,3 +1,20 @@
+2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
+           Alan Hayward  <alan.hayward@arm.com>
+           David Sherwood  <david.sherwood@arm.com>
+
+       * tree-vectorizer.h (vect_gather_scatter_fn_p): Declare.
+       * tree-vect-data-refs.c (vect_gather_scatter_fn_p): Make public.
+       * tree-vect-stmts.c (vect_truncate_gather_scatter_offset): New
+       function.
+       (vect_use_strided_gather_scatters_p): Take a masked_p argument.
+       Use vect_truncate_gather_scatter_offset if we can't treat the
+       operation as a normal gather load or scatter store.
+       (get_group_load_store_type): Take the gather_scatter_info
+       as argument.  Try using a gather load or scatter store for
+       single-element groups.
+       (get_load_store_type): Update calls to get_group_load_store_type
+       and vect_use_strided_gather_scatters_p.
+
 2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
            Alan Hayward  <alan.hayward@arm.com>
            David Sherwood  <david.sherwood@arm.com>
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 30bcb7a057c2e12895b18426acbf3545a15cebdf..20d84c268403ed1653a35ef748834d2be270b59e 100644 (file)
@@ -1,3 +1,14 @@
+2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
+           Alan Hayward  <alan.hayward@arm.com>
+           David Sherwood  <david.sherwood@arm.com>
+
+       * gcc.target/aarch64/sve/reduc_strict_3.c: Expect FADDA to be used
+       for double_reduc1.
+       * gcc.target/aarch64/sve/strided_load_4.c: New test.
+       * gcc.target/aarch64/sve/strided_load_5.c: Likewise.
+       * gcc.target/aarch64/sve/strided_load_6.c: Likewise.
+       * gcc.target/aarch64/sve/strided_load_7.c: Likewise.
+
 2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
            Alan Hayward  <alan.hayward@arm.com>
            David Sherwood  <david.sherwood@arm.com>
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c
index a28145febce65b443f3b84d12e61139b1aa23ac7..a718e9d2ebfabae80b4d00d4733cfe0706778c2d 100644 (file)
@@ -118,14 +118,11 @@ double_reduc3 (float *restrict i, float *restrict j)
   return l * k;
 }
 
-/* We can't yet handle double_reduc1.  */
-/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s} 3 } } */
+/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s} 4 } } */
 /* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d} 9 } } */
 /* 1 reduction each for double_reduc{1,2} and 2 for double_reduc3.  Each one
    is reported three times, once for SVE, once for 128-bit AdvSIMD and once
    for 64-bit AdvSIMD.  */
 /* { dg-final { scan-tree-dump-times "Detected double reduction" 12 "vect" } } */
-/* double_reduc2 has 2 reductions and slp_non_chained_reduc has 3.
-   double_reduc1 is reported 3 times (SVE, 128-bit AdvSIMD, 64-bit AdvSIMD)
-   before failing.  */
-/* { dg-final { scan-tree-dump-times "Detected reduction" 12 "vect" } } */
+/* double_reduc2 has 2 reductions and slp_non_chained_reduc has 3.  */
+/* { dg-final { scan-tree-dump-times "Detected reduction" 10 "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_4.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_4.c
new file mode 100644 (file)
index 0000000..0eff384
--- /dev/null
@@ -0,0 +1,33 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, NAME, SCALE)                      \
+  void __attribute__ ((noinline, noclone))                     \
+  f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest,            \
+                          DATA_TYPE *restrict src, int n)      \
+  {                                                            \
+    for (int i = 0; i < n; ++i)                                        \
+      dest[i] += src[i * SCALE];                               \
+  }
+
+#define TEST_TYPE(T, DATA_TYPE)                        \
+  T (DATA_TYPE, 5, 5)                          \
+  T (DATA_TYPE, 7, 7)                          \
+  T (DATA_TYPE, 11, 11)                                \
+  T (DATA_TYPE, 200, 200)                      \
+  T (DATA_TYPE, m100, -100)
+
+#define TEST_ALL(T)                            \
+  TEST_TYPE (T, int32_t)                       \
+  TEST_TYPE (T, uint32_t)                      \
+  TEST_TYPE (T, float)                         \
+  TEST_TYPE (T, int64_t)                       \
+  TEST_TYPE (T, uint64_t)                      \
+  TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw 2\]\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_5.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_5.c
new file mode 100644 (file)
index 0000000..415b466
--- /dev/null
@@ -0,0 +1,34 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=256 --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, NAME, SCALE)                      \
+  void __attribute__ ((noinline, noclone))                     \
+  f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest,            \
+                         DATA_TYPE *restrict src, long n)      \
+  {                                                            \
+    for (long i = 0; i < n; ++i)                               \
+      dest[i] += src[i * SCALE];                               \
+  }
+
+#define TEST_TYPE(T, DATA_TYPE)                        \
+  T (DATA_TYPE, 5, 5)                          \
+  T (DATA_TYPE, 7, 7)                          \
+  T (DATA_TYPE, 11, 11)                                \
+  T (DATA_TYPE, 200, 200)                      \
+  T (DATA_TYPE, m100, -100)
+
+#define TEST_ALL(T)                            \
+  TEST_TYPE (T, int32_t)                       \
+  TEST_TYPE (T, uint32_t)                      \
+  TEST_TYPE (T, float)                         \
+  TEST_TYPE (T, int64_t)                       \
+  TEST_TYPE (T, uint64_t)                      \
+  TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 12 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_6.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_6.c
new file mode 100644 (file)
index 0000000..9e00015
--- /dev/null
@@ -0,0 +1,7 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable --save-temps" } */
+
+#include "strided_load_5.c"
+
+/* { dg-final { scan-assembler-not {\[x[0-9]+, z[0-9]+\.s} } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/strided_load_7.c b/gcc/testsuite/gcc.target/aarch64/sve/strided_load_7.c
new file mode 100644 (file)
index 0000000..3a36367
--- /dev/null
@@ -0,0 +1,34 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(DATA_TYPE, NAME, SCALE)                      \
+  void __attribute__ ((noinline, noclone))                     \
+  f_##DATA_TYPE##_##NAME (DATA_TYPE *restrict dest,            \
+                         DATA_TYPE *restrict src)              \
+  {                                                            \
+    for (long i = 0; i < 1000; ++i)                            \
+      dest[i] += src[i * SCALE];                               \
+  }
+
+#define TEST_TYPE(T, DATA_TYPE)                        \
+  T (DATA_TYPE, 5, 5)                          \
+  T (DATA_TYPE, 7, 7)                          \
+  T (DATA_TYPE, 11, 11)                                \
+  T (DATA_TYPE, 200, 200)                      \
+  T (DATA_TYPE, m100, -100)
+
+#define TEST_ALL(T)                            \
+  TEST_TYPE (T, int32_t)                       \
+  TEST_TYPE (T, uint32_t)                      \
+  TEST_TYPE (T, float)                         \
+  TEST_TYPE (T, int64_t)                       \
+  TEST_TYPE (T, uint64_t)                      \
+  TEST_TYPE (T, double)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw\]\n} 12 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, sxtw\]\n} 3 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d\]\n} 15 } } */
diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index 69721a9a1f967991b50236bdc46b635137b2a96b..daa8b0c36d3fea7d0ffb5831c76f65556efddcd5 100644 (file)
@@ -3312,7 +3312,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
    Return true if the function is supported, storing the function
    id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
 
-static bool
+bool
 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
                          tree memory_type, unsigned int offset_bits,
                          signop offset_sign, int scale,
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index 079cbddfd5ebb77876b2e1c7e0a8bb5d363ac00b..df588347789184bca1a726d469561f379048446e 100644 (file)
@@ -1849,18 +1849,117 @@ prepare_load_store_mask (tree mask_type, tree loop_mask, tree vec_mask,
   return and_res;
 }
 
+/* Determine whether we can use a gather load or scatter store to vectorize
+   strided load or store STMT by truncating the current offset to a smaller
+   width.  We need to be able to construct an offset vector:
+
+     { 0, X, X*2, X*3, ... }
+
+   without loss of precision, where X is STMT's DR_STEP.
+
+   Return true if this is possible, describing the gather load or scatter
+   store in GS_INFO.  MASKED_P is true if the load or store is conditional.  */
+
+static bool
+vect_truncate_gather_scatter_offset (gimple *stmt, loop_vec_info loop_vinfo,
+                                    bool masked_p,
+                                    gather_scatter_info *gs_info)
+{
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
+  tree step = DR_STEP (dr);
+  if (TREE_CODE (step) != INTEGER_CST)
+    {
+      /* ??? Perhaps we could use range information here?  */
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "cannot truncate variable step.\n");
+      return false;
+    }
+
+  /* Get the number of bits in an element.  */
+  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  scalar_mode element_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype));
+  unsigned int element_bits = GET_MODE_BITSIZE (element_mode);
+
+  /* Set COUNT to the upper limit on the number of elements - 1.
+     Start with the maximum vectorization factor.  */
+  unsigned HOST_WIDE_INT count = vect_max_vf (loop_vinfo) - 1;
+
+  /* Try lowering COUNT to the number of scalar latch iterations.  */
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  widest_int max_iters;
+  if (max_loop_iterations (loop, &max_iters)
+      && max_iters < count)
+    count = max_iters.to_shwi ();
+
+  /* Try scales of 1 and the element size.  */
+  int scales[] = { 1, vect_get_scalar_dr_size (dr) };
+  bool overflow_p = false;
+  for (int i = 0; i < 2; ++i)
+    {
+      int scale = scales[i];
+      widest_int factor;
+      if (!wi::multiple_of_p (wi::to_widest (step), scale, SIGNED, &factor))
+       continue;
+
+      /* See whether we can calculate (COUNT - 1) * STEP / SCALE
+        in OFFSET_BITS bits.  */
+      widest_int range = wi::mul (count, factor, SIGNED, &overflow_p);
+      if (overflow_p)
+       continue;
+      signop sign = range >= 0 ? UNSIGNED : SIGNED;
+      if (wi::min_precision (range, sign) > element_bits)
+       {
+         overflow_p = true;
+         continue;
+       }
+
+      /* See whether the target supports the operation.  */
+      tree memory_type = TREE_TYPE (DR_REF (dr));
+      if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
+                                    memory_type, element_bits, sign, scale,
+                                    &gs_info->ifn, &gs_info->element_type))
+       continue;
+
+      tree offset_type = build_nonstandard_integer_type (element_bits,
+                                                        sign == UNSIGNED);
+
+      gs_info->decl = NULL_TREE;
+      /* Logically the sum of DR_BASE_ADDRESS, DR_INIT and DR_OFFSET,
+        but we don't need to store that here.  */
+      gs_info->base = NULL_TREE;
+      gs_info->offset = fold_convert (offset_type, step);
+      gs_info->offset_dt = vect_unknown_def_type;
+      gs_info->offset_vectype = NULL_TREE;
+      gs_info->scale = scale;
+      gs_info->memory_type = memory_type;
+      return true;
+    }
+
+  if (overflow_p && dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "truncating gather/scatter offset to %d bits"
+                    " might change its value.\n", element_bits);
+
+  return false;
+}
+
 /* Return true if we can use gather/scatter internal functions to
    vectorize STMT, which is a grouped or strided load or store.
-   When returning true, fill in GS_INFO with the information required
-   to perform the operation.  */
+   MASKED_P is true if the load or store is conditional.  When returning
+   true, fill in GS_INFO with the information required to perform the
+   operation.  */
 
 static bool
 vect_use_strided_gather_scatters_p (gimple *stmt, loop_vec_info loop_vinfo,
+                                   bool masked_p,
                                    gather_scatter_info *gs_info)
 {
   if (!vect_check_gather_scatter (stmt, loop_vinfo, gs_info)
       || gs_info->decl)
-    return false;
+    return vect_truncate_gather_scatter_offset (stmt, loop_vinfo,
+                                               masked_p, gs_info);
 
   scalar_mode element_mode = SCALAR_TYPE_MODE (gs_info->element_type);
   unsigned int element_bits = GET_MODE_BITSIZE (element_mode);
@@ -1951,7 +2050,8 @@ vect_get_store_rhs (gimple *stmt)
 static bool
 get_group_load_store_type (gimple *stmt, tree vectype, bool slp,
                           bool masked_p, vec_load_store_type vls_type,
-                          vect_memory_access_type *memory_access_type)
+                          vect_memory_access_type *memory_access_type,
+                          gather_scatter_info *gs_info)
 {
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   vec_info *vinfo = stmt_info->vinfo;
@@ -2073,6 +2173,20 @@ get_group_load_store_type (gimple *stmt, tree vectype, bool slp,
              overrun_p = would_overrun_p;
            }
        }
+
+      /* As a last resort, try using a gather load or scatter store.
+
+        ??? Although the code can handle all group sizes correctly,
+        it probably isn't a win to use separate strided accesses based
+        on nearby locations.  Or, even if it's a win over scalar code,
+        it might not be a win over vectorizing at a lower VF, if that
+        allows us to use contiguous accesses.  */
+      if (*memory_access_type == VMAT_ELEMENTWISE
+         && single_element_p
+         && loop_vinfo
+         && vect_use_strided_gather_scatters_p (stmt, loop_vinfo,
+                                                masked_p, gs_info))
+       *memory_access_type = VMAT_GATHER_SCATTER;
     }
 
   if (vls_type != VLS_LOAD && first_stmt == stmt)
@@ -2200,14 +2314,15 @@ get_load_store_type (gimple *stmt, tree vectype, bool slp, bool masked_p,
   else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
       if (!get_group_load_store_type (stmt, vectype, slp, masked_p, vls_type,
-                                     memory_access_type))
+                                     memory_access_type, gs_info))
        return false;
     }
   else if (STMT_VINFO_STRIDED_P (stmt_info))
     {
       gcc_assert (!slp);
       if (loop_vinfo
-         && vect_use_strided_gather_scatters_p (stmt, loop_vinfo, gs_info))
+         && vect_use_strided_gather_scatters_p (stmt, loop_vinfo,
+                                                masked_p, gs_info))
        *memory_access_type = VMAT_GATHER_SCATTER;
       else
        *memory_access_type = VMAT_ELEMENTWISE;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index c661578403a6fab60d2dfe0cb93242b4043c3196..903e56e3431b147d0133b6c90f07f14b98ddd141 100644 (file)
@@ -1455,6 +1455,8 @@ extern bool vect_verify_datarefs_alignment (loop_vec_info);
 extern bool vect_slp_analyze_and_verify_instance_alignment (slp_instance);
 extern bool vect_analyze_data_ref_accesses (vec_info *);
 extern bool vect_prune_runtime_alias_test_list (loop_vec_info);
+extern bool vect_gather_scatter_fn_p (bool, bool, tree, tree, unsigned int,
+                                     signop, int, internal_fn *, tree *);
 extern bool vect_check_gather_scatter (gimple *, loop_vec_info,
                                       gather_scatter_info *);
 extern bool vect_analyze_data_refs (vec_info *, poly_uint64 *);