Add support for in-order addition reduction using SVE FADDA
author     Richard Sandiford <richard.sandiford@linaro.org>
           Sat, 13 Jan 2018 18:01:24 +0000 (18:01 +0000)
committer  Richard Sandiford <rsandifo@gcc.gnu.org>
           Sat, 13 Jan 2018 18:01:24 +0000 (18:01 +0000)
This patch adds support for in-order floating-point addition reductions,
which are suitable even in strict IEEE mode.

Previously vect_is_simple_reduction would reject any cases that forbid
reassociation.  The idea is instead to tentatively accept them as
"FOLD_LEFT_REDUCTIONs" and only fail later if there is no support
for them.  Although this patch only handles the particular case of plus
and minus on floating-point types, there's no reason in principle why
we couldn't handle other cases.

The reductions use a new fold_left_plus_optab if available, otherwise
they fall back to elementwise additions or subtractions.
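
For illustration, the kind of loop this targets looks like the new
gcc.dg/vect/vect-reduc-in-order tests (the function name and bound in
this sketch are placeholders, not part of the patch):

    double
    reduc_plus_double (double *a, double *b, int n)
    {
      double r = 0, q = 3;
      for (int i = 0; i < n; i++)
        {
          r += a[i];   /* in-order plus reduction -> IFN_FOLD_LEFT_PLUS */
          q -= b[i];   /* minus is handled by adding the negated vector */
        }
      return r * q;
    }

Without -fassociative-math such loops previously had to stay scalar;
with the patch they can be vectorized using an in-order (fold-left)
reduction.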

The vect_force_simple_reduction change makes it easier for parloops
to read the type of reduction.
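
As a rough sketch of the fold_left_plus_<m> semantics (the md.texi
entry added below is the authoritative description): operand 0 gets the
result of adding each element of vector operand 2, strictly from left
to right, to scalar operand 1:

    double
    fold_left_plus (double acc, const double *vec, int nelts)
    {
      for (int i = 0; i < nelts; i++)
        acc += vec[i];   /* element i is added before element i + 1 */
      return acc;
    }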

2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
    Alan Hayward  <alan.hayward@arm.com>
    David Sherwood  <david.sherwood@arm.com>

gcc/
* optabs.def (fold_left_plus_optab): New optab.
* doc/md.texi (fold_left_plus_@var{m}): Document.
* internal-fn.def (IFN_FOLD_LEFT_PLUS): New internal function.
* internal-fn.c (fold_left_direct): Define.
(expand_fold_left_optab_fn): Likewise.
(direct_fold_left_optab_supported_p): Likewise.
* fold-const-call.c (fold_const_fold_left): New function.
(fold_const_call): Use it to fold CFN_FOLD_LEFT_PLUS.
* tree-parloops.c (valid_reduction_p): New function.
(gather_scalar_reductions): Use it.
* tree-vectorizer.h (FOLD_LEFT_REDUCTION): New vect_reduction_type.
(vect_finish_replace_stmt): Declare.
* tree-vect-loop.c (fold_left_reduction_fn): New function.
(needs_fold_left_reduction_p): New function, split out from...
(vect_is_simple_reduction): ...here.  Accept reductions that
forbid reassociation, but give them type FOLD_LEFT_REDUCTION.
(vect_force_simple_reduction): Also store the reduction type in
the assignment's STMT_VINFO_REDUC_TYPE.
(vect_model_reduction_cost): Handle FOLD_LEFT_REDUCTION.
(merge_with_identity): New function.
(vect_expand_fold_left): Likewise.
(vectorize_fold_left_reduction): Likewise.
(vectorizable_reduction): Handle FOLD_LEFT_REDUCTION.  Leave the
scalar phi in place for it.  Check for target support and reject
cases that would reassociate the operation.  Defer the transform
phase to vectorize_fold_left_reduction.
* config/aarch64/aarch64.md (UNSPEC_FADDA): New unspec.
* config/aarch64/aarch64-sve.md (fold_left_plus_<mode>): New expander.
(*fold_left_plus_<mode>, *pred_fold_left_plus_<mode>): New insns.

gcc/testsuite/
* gcc.dg/vect/no-fast-math-vect16.c: Expect the test to pass and
check for a message about using in-order reductions.
* gcc.dg/vect/pr79920.c: Expect both loops to be vectorized and
check for a message about using in-order reductions.
* gcc.dg/vect/trapv-vect-reduc-4.c: Expect all three loops to be
vectorized and check for a message about using in-order reductions.
Expect targets with variable-length vectors to fall back to the
fixed-length minimum.
* gcc.dg/vect/vect-reduc-6.c: Expect the loop to be vectorized and
check for a message about using in-order reductions.
* gcc.dg/vect/vect-reduc-in-order-1.c: New test.
* gcc.dg/vect/vect-reduc-in-order-2.c: Likewise.
* gcc.dg/vect/vect-reduc-in-order-3.c: Likewise.
* gcc.dg/vect/vect-reduc-in-order-4.c: Likewise.
* gcc.target/aarch64/sve/reduc_strict_1.c: New test.
* gcc.target/aarch64/sve/reduc_strict_1_run.c: Likewise.
* gcc.target/aarch64/sve/reduc_strict_2.c: Likewise.
* gcc.target/aarch64/sve/reduc_strict_2_run.c: Likewise.
* gcc.target/aarch64/sve/reduc_strict_3.c: Likewise.
* gcc.target/aarch64/sve/slp_13.c: Add floating-point types.
* gfortran.dg/vect/vect-8.f90: Expect 22 loops to be vectorized if
vect_fold_left_plus.

Co-Authored-By: Alan Hayward <alan.hayward@arm.com>
Co-Authored-By: David Sherwood <david.sherwood@arm.com>
From-SVN: r256639

27 files changed:
gcc/ChangeLog
gcc/config/aarch64/aarch64-sve.md
gcc/config/aarch64/aarch64.md
gcc/doc/md.texi
gcc/fold-const-call.c
gcc/internal-fn.c
gcc/internal-fn.def
gcc/optabs.def
gcc/testsuite/ChangeLog
gcc/testsuite/gcc.dg/vect/no-fast-math-vect16.c
gcc/testsuite/gcc.dg/vect/pr79920.c
gcc/testsuite/gcc.dg/vect/trapv-vect-reduc-4.c
gcc/testsuite/gcc.dg/vect/vect-reduc-6.c
gcc/testsuite/gcc.dg/vect/vect-reduc-in-order-1.c [new file with mode: 0644]
gcc/testsuite/gcc.dg/vect/vect-reduc-in-order-2.c [new file with mode: 0644]
gcc/testsuite/gcc.dg/vect/vect-reduc-in-order-3.c [new file with mode: 0644]
gcc/testsuite/gcc.dg/vect/vect-reduc-in-order-4.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_1.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_1_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_2.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_2_run.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c [new file with mode: 0644]
gcc/testsuite/gcc.target/aarch64/sve/slp_13.c
gcc/testsuite/gfortran.dg/vect/vect-8.f90
gcc/tree-parloops.c
gcc/tree-vect-loop.c
gcc/tree-vectorizer.h

index 2edd769c58827cdaf3fdbc323209864989f49d89..a5daf352ca1b84138bae0a05d3ebf4a9874b5248 100644 (file)
@@ -1,3 +1,37 @@
+2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
+           Alan Hayward  <alan.hayward@arm.com>
+           David Sherwood  <david.sherwood@arm.com>
+
+       * optabs.def (fold_left_plus_optab): New optab.
+       * doc/md.texi (fold_left_plus_@var{m}): Document.
+       * internal-fn.def (IFN_FOLD_LEFT_PLUS): New internal function.
+       * internal-fn.c (fold_left_direct): Define.
+       (expand_fold_left_optab_fn): Likewise.
+       (direct_fold_left_optab_supported_p): Likewise.
+       * fold-const-call.c (fold_const_fold_left): New function.
+       (fold_const_call): Use it to fold CFN_FOLD_LEFT_PLUS.
+       * tree-parloops.c (valid_reduction_p): New function.
+       (gather_scalar_reductions): Use it.
+       * tree-vectorizer.h (FOLD_LEFT_REDUCTION): New vect_reduction_type.
+       (vect_finish_replace_stmt): Declare.
+       * tree-vect-loop.c (fold_left_reduction_fn): New function.
+       (needs_fold_left_reduction_p): New function, split out from...
+       (vect_is_simple_reduction): ...here.  Accept reductions that
+       forbid reassociation, but give them type FOLD_LEFT_REDUCTION.
+       (vect_force_simple_reduction): Also store the reduction type in
+       the assignment's STMT_VINFO_REDUC_TYPE.
+       (vect_model_reduction_cost): Handle FOLD_LEFT_REDUCTION.
+       (merge_with_identity): New function.
+       (vect_expand_fold_left): Likewise.
+       (vectorize_fold_left_reduction): Likewise.
+       (vectorizable_reduction): Handle FOLD_LEFT_REDUCTION.  Leave the
+       scalar phi in place for it.  Check for target support and reject
+       cases that would reassociate the operation.  Defer the transform
+       phase to vectorize_fold_left_reduction.
+       * config/aarch64/aarch64.md (UNSPEC_FADDA): New unspec.
+       * config/aarch64/aarch64-sve.md (fold_left_plus_<mode>): New expander.
+       (*fold_left_plus_<mode>, *pred_fold_left_plus_<mode>): New insns.
+
 2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
 
        * tree-if-conv.c (predicate_mem_writes): Remove redundant
index f4bd987d8070ee75bc8a611cf5bfa9939d05bd65..08956b9ce4ba4ddd2d7b6873b11e6ce14d59a57e 100644 (file)
   "<bit_reduc_op>\t%<Vetype>0, %1, %2.<Vetype>"
 )
 
+;; Unpredicated in-order FP reductions.
+(define_expand "fold_left_plus_<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand")
+       (unspec:<VEL> [(match_dup 3)
+                      (match_operand:<VEL> 1 "register_operand")
+                      (match_operand:SVE_F 2 "register_operand")]
+                     UNSPEC_FADDA))]
+  "TARGET_SVE"
+  {
+    operands[3] = force_reg (<VPRED>mode, CONSTM1_RTX (<VPRED>mode));
+  }
+)
+
+;; In-order FP reductions predicated with PTRUE.
+(define_insn "*fold_left_plus_<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand" "=w")
+       (unspec:<VEL> [(match_operand:<VPRED> 1 "register_operand" "Upl")
+                      (match_operand:<VEL> 2 "register_operand" "0")
+                      (match_operand:SVE_F 3 "register_operand" "w")]
+                     UNSPEC_FADDA))]
+  "TARGET_SVE"
+  "fadda\t%<Vetype>0, %1, %<Vetype>0, %3.<Vetype>"
+)
+
+;; Predicated form of the above in-order reduction.
+(define_insn "*pred_fold_left_plus_<mode>"
+  [(set (match_operand:<VEL> 0 "register_operand" "=w")
+       (unspec:<VEL>
+         [(match_operand:<VEL> 1 "register_operand" "0")
+          (unspec:SVE_F
+            [(match_operand:<VPRED> 2 "register_operand" "Upl")
+             (match_operand:SVE_F 3 "register_operand" "w")
+             (match_operand:SVE_F 4 "aarch64_simd_imm_zero")]
+            UNSPEC_SEL)]
+         UNSPEC_FADDA))]
+  "TARGET_SVE"
+  "fadda\t%<Vetype>0, %2, %<Vetype>0, %3.<Vetype>"
+)
+
 ;; Unpredicated floating-point addition.
 (define_expand "add<mode>3"
   [(set (match_operand:SVE_F 0 "register_operand")
index c38f2011efd46cce94e7666775b14136fa5eb55f..4a73ccc7c3f60af01d180ae2461dd6cb11333432 100644 (file)
     UNSPEC_STN
     UNSPEC_INSR
     UNSPEC_CLASTB
+    UNSPEC_FADDA
 ])
 
 (define_c_enum "unspecv" [
index d31577bed6dff445ada247c40b891513ce64cfed..4527b44d1af1b026d06125a4079c5196c33ecf33 100644 (file)
@@ -5236,6 +5236,14 @@ has mode @var{m} and operands 0 and 1 have the mode appropriate for
 one element of @var{m}.  Operand 2 has the usual mask mode for vectors
 of mode @var{m}; see @code{TARGET_VECTORIZE_GET_MASK_MODE}.
 
+@cindex @code{fold_left_plus_@var{m}} instruction pattern
+@item @code{fold_left_plus_@var{m}}
+Take scalar operand 1 and successively add each element from vector
+operand 2.  Store the result in scalar operand 0.  The vector has
+mode @var{m} and the scalars have the mode appropriate for one
+element of @var{m}.  The operation is strictly in-order: there is
+no reassociation.
+
 @cindex @code{sdot_prod@var{m}} instruction pattern
 @item @samp{sdot_prod@var{m}}
 @cindex @code{udot_prod@var{m}} instruction pattern
index 60acf96bbf3bfe1d9cc8b1fd7b512abd5f3f9348..fcf4a14ebaa75ca3a8fb4e4df83a724a79769140 100644 (file)
@@ -1195,6 +1195,28 @@ fold_const_call (combined_fn fn, tree type, tree arg)
     }
 }
 
+/* Fold a call to IFN_FOLD_LEFT_<CODE> (ARG0, ARG1), returning a value
+   of type TYPE.  */
+
+static tree
+fold_const_fold_left (tree type, tree arg0, tree arg1, tree_code code)
+{
+  if (TREE_CODE (arg1) != VECTOR_CST)
+    return NULL_TREE;
+
+  unsigned HOST_WIDE_INT nelts;
+  if (!VECTOR_CST_NELTS (arg1).is_constant (&nelts))
+    return NULL_TREE;
+
+  for (unsigned HOST_WIDE_INT i = 0; i < nelts; i++)
+    {
+      arg0 = const_binop (code, type, arg0, VECTOR_CST_ELT (arg1, i));
+      if (arg0 == NULL_TREE || !CONSTANT_CLASS_P (arg0))
+       return NULL_TREE;
+    }
+  return arg0;
+}
+
 /* Try to evaluate:
 
       *RESULT = FN (*ARG0, *ARG1)
@@ -1500,6 +1522,9 @@ fold_const_call (combined_fn fn, tree type, tree arg0, tree arg1)
        }
       return NULL_TREE;
 
+    case CFN_FOLD_LEFT_PLUS:
+      return fold_const_fold_left (type, arg0, arg1, PLUS_EXPR);
+
     default:
       return fold_const_call_1 (fn, type, arg0, arg1);
     }
index 0cc59e84514355bed13cdf34ac2ae551ed152621..42cdf1345e560b297cf1b83e3fcc5b88e2109c84 100644 (file)
@@ -92,6 +92,7 @@ init_internal_fns ()
 #define cond_binary_direct { 1, 1, true }
 #define while_direct { 0, 2, false }
 #define fold_extract_direct { 2, 2, false }
+#define fold_left_direct { 1, 1, false }
 
 const direct_internal_fn_info direct_internal_fn_array[IFN_LAST + 1] = {
 #define DEF_INTERNAL_FN(CODE, FLAGS, FNSPEC) not_direct,
@@ -2897,6 +2898,9 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
 #define expand_fold_extract_optab_fn(FN, STMT, OPTAB) \
   expand_direct_optab_fn (FN, STMT, OPTAB, 3)
 
+#define expand_fold_left_optab_fn(FN, STMT, OPTAB) \
+  expand_direct_optab_fn (FN, STMT, OPTAB, 2)
+
 /* RETURN_TYPE and ARGS are a return type and argument list that are
    in principle compatible with FN (which satisfies direct_internal_fn_p).
    Return the types that should be used to determine whether the
@@ -2980,6 +2984,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
 #define direct_mask_store_lanes_optab_supported_p multi_vector_optab_supported_p
 #define direct_while_optab_supported_p convert_optab_supported_p
 #define direct_fold_extract_optab_supported_p direct_optab_supported_p
+#define direct_fold_left_optab_supported_p direct_optab_supported_p
 
 /* Return the optab used by internal function FN.  */
 
index 44330466cfdb1ba7f8f2d5ff98ae09529168e052..36bcf885bf78d000420cc9e3319a3a2d2e684f18 100644 (file)
@@ -58,6 +58,8 @@ along with GCC; see the file COPYING3.  If not see
 
    - cond_binary: a conditional binary optab, such as add<mode>cc
 
+   - fold_left: for scalar = FN (scalar, vector), keyed off the vector mode
+
    DEF_INTERNAL_SIGNED_OPTAB_FN defines an internal function that
    maps to one of two optabs, depending on the signedness of an input.
    SIGNED_OPTAB and UNSIGNED_OPTAB are the optabs for signed and
@@ -162,6 +164,8 @@ DEF_INTERNAL_OPTAB_FN (EXTRACT_LAST, ECF_CONST | ECF_NOTHROW,
 DEF_INTERNAL_OPTAB_FN (FOLD_EXTRACT_LAST, ECF_CONST | ECF_NOTHROW,
                       fold_extract_last, fold_extract)
 
+DEF_INTERNAL_OPTAB_FN (FOLD_LEFT_PLUS, ECF_CONST | ECF_NOTHROW,
+                      fold_left_plus, fold_left)
 
 /* Unary math functions.  */
 DEF_INTERNAL_FLT_FN (ACOS, ECF_CONST, acos, unary)
index e19d4a50c963396d518c69c350660adba07fb42e..7fab96f2d6659bf7681741cd72c21da33ed4cdf8 100644 (file)
@@ -306,6 +306,7 @@ OPTAB_D (reduc_umin_scal_optab, "reduc_umin_scal_$a")
 OPTAB_D (reduc_and_scal_optab,  "reduc_and_scal_$a")
 OPTAB_D (reduc_ior_scal_optab,  "reduc_ior_scal_$a")
 OPTAB_D (reduc_xor_scal_optab,  "reduc_xor_scal_$a")
+OPTAB_D (fold_left_plus_optab, "fold_left_plus_$a")
 
 OPTAB_D (extract_last_optab, "extract_last_$a")
 OPTAB_D (fold_extract_last_optab, "fold_extract_last_$a")
index b393ceff26cecf49da53e252f9af7637774fc50d..8c8987639e65df9a3b1b3ebe6823758208b737a2 100644 (file)
@@ -1,3 +1,30 @@
+2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
+           Alan Hayward  <alan.hayward@arm.com>
+           David Sherwood  <david.sherwood@arm.com>
+
+       * gcc.dg/vect/no-fast-math-vect16.c: Expect the test to pass and
+       check for a message about using in-order reductions.
+       * gcc.dg/vect/pr79920.c: Expect both loops to be vectorized and
+       check for a message about using in-order reductions.
+       * gcc.dg/vect/trapv-vect-reduc-4.c: Expect all three loops to be
+       vectorized and check for a message about using in-order reductions.
+       Expect targets with variable-length vectors to fall back to the
+       fixed-length minimum.
+       * gcc.dg/vect/vect-reduc-6.c: Expect the loop to be vectorized and
+       check for a message about using in-order reductions.
+       * gcc.dg/vect/vect-reduc-in-order-1.c: New test.
+       * gcc.dg/vect/vect-reduc-in-order-2.c: Likewise.
+       * gcc.dg/vect/vect-reduc-in-order-3.c: Likewise.
+       * gcc.dg/vect/vect-reduc-in-order-4.c: Likewise.
+       * gcc.target/aarch64/sve/reduc_strict_1.c: New test.
+       * gcc.target/aarch64/sve/reduc_strict_1_run.c: Likewise.
+       * gcc.target/aarch64/sve/reduc_strict_2.c: Likewise.
+       * gcc.target/aarch64/sve/reduc_strict_2_run.c: Likewise.
+       * gcc.target/aarch64/sve/reduc_strict_3.c: Likewise.
+       * gcc.target/aarch64/sve/slp_13.c: Add floating-point types.
+       * gfortran.dg/vect/vect-8.f90: Expect 22 loops to be vectorized if
+       vect_fold_left_plus.
+
 2018-01-13  Richard Sandiford  <richard.sandiford@linaro.org>
 
        * gcc.target/aarch64/sve/spill_1.c: Also test that no predicates
index def8efb20c403d8cfb8abdc485039bbaef8220d2..2445d605534cc096d6f9a0c560ef961aa29db433 100644 (file)
@@ -33,5 +33,5 @@ int main (void)
   return main1 ();
 }
 
-/* Requires fast-math.  */
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { xfail *-*-* } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {using an in-order \(fold-left\) reduction} 1 "vect" } } */
index c066b91e73f2597a0bac8ddeeb55a22bd78fcaf9..9ba0fb3e4f52703739f5e485cf1d576b37e92f54 100644 (file)
@@ -1,5 +1,5 @@
 /* { dg-do run } */
-/* { dg-additional-options "-O3" } */
+/* { dg-additional-options "-O3 -fno-fast-math" } */
 
 #include "tree-vect.h"
 
@@ -41,4 +41,5 @@ int main()
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target { vect_double && { vect_perm && vect_hw_misalign } } } } } */
+/* { dg-final { scan-tree-dump-times {using an in-order \(fold-left\) reduction} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { vect_double && { vect_perm && vect_hw_misalign } } } } } */
index d19d42cc0489b47ba1f623506eedb00063ee2c63..8a57eb69a91c3f48dce4385ae6e4058d2369383e 100644 (file)
@@ -46,5 +46,8 @@ int main (void)
   return 0;
 }
 
-/* { dg-final { scan-tree-dump-times "Detected reduction\\." 2 "vect"  } } */
-/* { dg-final { scan-tree-dump-times "vectorized 2 loops" 1 "vect" { target { ! vect_no_int_min_max } } } } */
+/* We can't handle the first loop with variable-length vectors and so
+   fall back to the fixed-length minimum instead.  */
+/* { dg-final { scan-tree-dump-times "Detected reduction\\." 3 "vect" { xfail vect_variable_length } } } */
+/* { dg-final { scan-tree-dump-times "vectorized 3 loops" 1 "vect" { target { ! vect_no_int_min_max } } } } */
+/* { dg-final { scan-tree-dump-times {using an in-order \(fold-left\) reduction} 1 "vect" } } */
index 1bbf10c7607af68772e45fccac7a0507d80ba73e..f80d5ba8114d69f99c9b87a6a465aba245b3a3f7 100644 (file)
@@ -1,4 +1,5 @@
 /* { dg-require-effective-target vect_float } */
+/* { dg-additional-options "-fno-fast-math" } */
 
 #include <stdarg.h>
 #include "tree-vect.h"
@@ -48,6 +49,5 @@ int main (void)
   return 0;
 }
 
-/* need -ffast-math to vectorizer these loops.  */
-/* ARM NEON passes -ffast-math to these tests, so expect this to fail.  */
-/* { dg-final { scan-tree-dump-times "vectorized 0 loops" 1 "vect" { xfail arm_neon_ok } } } */
+/* { dg-final { scan-tree-dump-times {using an in-order \(fold-left\) reduction} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-in-order-1.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-in-order-1.c
new file mode 100644 (file)
index 0000000..cc73d52
--- /dev/null
@@ -0,0 +1,42 @@
+/* { dg-do run { xfail { { i?86-*-* x86_64-*-* } && ia32 } } } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-add-options ieee } */
+/* { dg-additional-options "-fno-fast-math" } */
+
+#include "tree-vect.h"
+
+#define N (VECTOR_BITS * 17)
+
+double __attribute__ ((noinline, noclone))
+reduc_plus_double (double *a, double *b)
+{
+  double r = 0, q = 3;
+  for (int i = 0; i < N; i++)
+    {
+      r += a[i];
+      q -= b[i];
+    }
+  return r * q;
+}
+
+int __attribute__ ((optimize (1)))
+main ()
+{
+  double a[N];
+  double b[N];
+  double r = 0, q = 3;
+  for (int i = 0; i < N; i++)
+    {
+      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
+      b[i] = (i * 0.3) * (i & 1 ? 1 : -1);
+      r += a[i];
+      q -= b[i];
+      asm volatile ("" ::: "memory");
+    }
+  double res = reduc_plus_double (a, b);
+  if (res != r * q)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times {using an in-order \(fold-left\) reduction} 2 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-in-order-2.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-in-order-2.c
new file mode 100644 (file)
index 0000000..dea6cdd
--- /dev/null
@@ -0,0 +1,44 @@
+/* { dg-do run { xfail { { i?86-*-* x86_64-*-* } && ia32 } } } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-add-options ieee } */
+/* { dg-additional-options "-fno-fast-math" } */
+
+#include "tree-vect.h"
+
+#define N (VECTOR_BITS * 17)
+
+double __attribute__ ((noinline, noclone))
+reduc_plus_double (double *restrict a, int n)
+{
+  double res = 0.0;
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < N; j++)
+      res += a[i];
+  return res;
+}
+
+int __attribute__ ((optimize (1)))
+main ()
+{
+  int n = 19;
+  double a[N];
+  double r = 0;
+  for (int i = 0; i < N; i++)
+    {
+      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
+      asm volatile ("" ::: "memory");
+    }
+  for (int i = 0; i < n; i++)
+    for (int j = 0; j < N; j++)
+      {
+       r += a[i];
+       asm volatile ("" ::: "memory");
+      }
+  double res = reduc_plus_double (a, n);
+  if (res != r)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump {in-order double reduction not supported} "vect" } } */
+/* { dg-final { scan-tree-dump-times {using an in-order \(fold-left\) reduction} 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-in-order-3.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-in-order-3.c
new file mode 100644 (file)
index 0000000..037213e
--- /dev/null
@@ -0,0 +1,42 @@
+/* { dg-do run { xfail { { i?86-*-* x86_64-*-* } && ia32 } } } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-add-options ieee } */
+/* { dg-additional-options "-fno-fast-math" } */
+
+#include "tree-vect.h"
+
+#define N (VECTOR_BITS * 17)
+
+double __attribute__ ((noinline, noclone))
+reduc_plus_double (double *a)
+{
+  double r = 0;
+  for (int i = 0; i < N; i += 4)
+    {
+      r += a[i] * 2.0;
+      r += a[i + 1] * 3.0;
+      r += a[i + 2] * 4.0;
+      r += a[i + 3] * 5.0;
+    }
+  return r;
+}
+
+int __attribute__ ((optimize (1)))
+main ()
+{
+  double a[N];
+  double r = 0;
+  for (int i = 0; i < N; i++)
+    {
+      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
+      r += a[i] * (i % 4 + 2);
+      asm volatile ("" ::: "memory");
+    }
+  double res = reduc_plus_double (a);
+  if (res != r)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times {using an in-order \(fold-left\) reduction} 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times {vectorizing stmts using SLP} 1 "vect" } } */
diff --git a/gcc/testsuite/gcc.dg/vect/vect-reduc-in-order-4.c b/gcc/testsuite/gcc.dg/vect/vect-reduc-in-order-4.c
new file mode 100644 (file)
index 0000000..46d0f3a
--- /dev/null
@@ -0,0 +1,45 @@
+/* { dg-do run { xfail { { i?86-*-* x86_64-*-* } && ia32 } } } */
+/* { dg-require-effective-target vect_double } */
+/* { dg-add-options ieee } */
+/* { dg-additional-options "-fno-fast-math" } */
+
+#include "tree-vect.h"
+
+#define N (VECTOR_BITS * 17)
+
+double __attribute__ ((noinline, noclone))
+reduc_plus_double (double *a)
+{
+  double r1 = 0;
+  double r2 = 0;
+  double r3 = 0;
+  double r4 = 0;
+  for (int i = 0; i < N; i += 4)
+    {
+      r1 += a[i];
+      r2 += a[i + 1];
+      r3 += a[i + 2];
+      r4 += a[i + 3];
+    }
+  return r1 * r2 * r3 * r4;
+}
+
+int __attribute__ ((optimize (1)))
+main ()
+{
+  double a[N];
+  double r[4] = {};
+  for (int i = 0; i < N; i++)
+    {
+      a[i] = (i * 0.1) * (i & 1 ? 1 : -1);
+      r[i % 4] += a[i];
+      asm volatile ("" ::: "memory");
+    }
+  double res = reduc_plus_double (a);
+  if (res != r[0] * r[1] * r[2] * r[3])
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump {in-order unchained SLP reductions not supported} "vect" } } */
+/* { dg-final { scan-tree-dump-not {vectorizing stmts using SLP} "vect" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_1.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_1.c
new file mode 100644 (file)
index 0000000..4c75807
--- /dev/null
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define NUM_ELEMS(TYPE) ((int)(5 * (256 / sizeof (TYPE)) + 3))
+
+#define DEF_REDUC_PLUS(TYPE)                   \
+  TYPE __attribute__ ((noinline, noclone))     \
+  reduc_plus_##TYPE (TYPE *a, TYPE *b)         \
+  {                                            \
+    TYPE r = 0, q = 3;                         \
+    for (int i = 0; i < NUM_ELEMS (TYPE); i++) \
+      {                                                \
+       r += a[i];                              \
+       q -= b[i];                              \
+      }                                                \
+    return r * q;                              \
+  }
+
+#define TEST_ALL(T) \
+  T (_Float16) \
+  T (float) \
+  T (double)
+
+TEST_ALL (DEF_REDUC_PLUS)
+
+/* { dg-final { scan-assembler-times {\tfadda\th[0-9]+, p[0-7], h[0-9]+, z[0-9]+\.h} 2 } } */
+/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s} 2 } } */
+/* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d} 2 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_1_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_1_run.c
new file mode 100644 (file)
index 0000000..40e0cf0
--- /dev/null
@@ -0,0 +1,29 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#include "reduc_strict_1.c"
+
+#define TEST_REDUC_PLUS(TYPE)                  \
+  {                                            \
+    TYPE a[NUM_ELEMS (TYPE)];                  \
+    TYPE b[NUM_ELEMS (TYPE)];                  \
+    TYPE r = 0, q = 3;                         \
+    for (int i = 0; i < NUM_ELEMS (TYPE); i++) \
+      {                                                \
+       a[i] = (i * 0.1) * (i & 1 ? 1 : -1);    \
+       b[i] = (i * 0.3) * (i & 1 ? 1 : -1);    \
+       r += a[i];                              \
+       q -= b[i];                              \
+       asm volatile ("" ::: "memory");         \
+      }                                                \
+    TYPE res = reduc_plus_##TYPE (a, b);       \
+    if (res != r * q)                          \
+      __builtin_abort ();                      \
+  }
+
+int __attribute__ ((optimize (1)))
+main ()
+{
+  TEST_ALL (TEST_REDUC_PLUS);
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_2.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_2.c
new file mode 100644 (file)
index 0000000..d32efc2
--- /dev/null
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+#define NUM_ELEMS(TYPE) ((int) (5 * (256 / sizeof (TYPE)) + 3))
+
+#define DEF_REDUC_PLUS(TYPE)                                   \
+void __attribute__ ((noinline, noclone))                       \
+reduc_plus_##TYPE (TYPE (*restrict a)[NUM_ELEMS (TYPE)],       \
+                  TYPE *restrict r, int n)                     \
+{                                                              \
+  for (int i = 0; i < n; i++)                                  \
+    {                                                          \
+      r[i] = 0;                                                        \
+      for (int j = 0; j < NUM_ELEMS (TYPE); j++)               \
+        r[i] += a[i][j];                                       \
+    }                                                          \
+}
+
+#define TEST_ALL(T) \
+  T (_Float16) \
+  T (float) \
+  T (double)
+
+TEST_ALL (DEF_REDUC_PLUS)
+
+/* { dg-final { scan-assembler-times {\tfadda\th[0-9]+, p[0-7], h[0-9]+, z[0-9]+\.h} 1 } } */
+/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s} 1 } } */
+/* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d} 1 } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_2_run.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_2_run.c
new file mode 100644 (file)
index 0000000..e59f640
--- /dev/null
@@ -0,0 +1,31 @@
+/* { dg-do run { target { aarch64_sve_hw } } } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline" } */
+
+#include "reduc_strict_2.c"
+
+#define NROWS 5
+
+#define TEST_REDUC_PLUS(TYPE)                                  \
+  {                                                            \
+    TYPE a[NROWS][NUM_ELEMS (TYPE)];                           \
+    TYPE r[NROWS];                                             \
+    TYPE expected[NROWS] = {};                                 \
+    for (int i = 0; i < NROWS; ++i)                            \
+      for (int j = 0; j < NUM_ELEMS (TYPE); ++j)               \
+       {                                                       \
+         a[i][j] = (i * 0.1 + j * 0.6) * (j & 1 ? 1 : -1);     \
+         expected[i] += a[i][j];                               \
+         asm volatile ("" ::: "memory");                       \
+       }                                                       \
+    reduc_plus_##TYPE (a, r, NROWS);                           \
+    for (int i = 0; i < NROWS; ++i)                            \
+      if (r[i] != expected[i])                                 \
+       __builtin_abort ();                                     \
+  }
+
+int __attribute__ ((optimize (1)))
+main ()
+{
+  TEST_ALL (TEST_REDUC_PLUS);
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c b/gcc/testsuite/gcc.target/aarch64/sve/reduc_strict_3.c
new file mode 100644 (file)
index 0000000..a28145f
--- /dev/null
@@ -0,0 +1,131 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fno-inline -msve-vector-bits=256 -fdump-tree-vect-details" } */
+
+double mat[100][4];
+double mat2[100][8];
+double mat3[100][12];
+double mat4[100][3];
+
+double
+slp_reduc_plus (int n)
+{
+  double tmp = 0.0;
+  for (int i = 0; i < n; i++)
+    {
+      tmp = tmp + mat[i][0];
+      tmp = tmp + mat[i][1];
+      tmp = tmp + mat[i][2];
+      tmp = tmp + mat[i][3];
+    }
+  return tmp;
+}
+
+double
+slp_reduc_plus2 (int n)
+{
+  double tmp = 0.0;
+  for (int i = 0; i < n; i++)
+    {
+      tmp = tmp + mat2[i][0];
+      tmp = tmp + mat2[i][1];
+      tmp = tmp + mat2[i][2];
+      tmp = tmp + mat2[i][3];
+      tmp = tmp + mat2[i][4];
+      tmp = tmp + mat2[i][5];
+      tmp = tmp + mat2[i][6];
+      tmp = tmp + mat2[i][7];
+    }
+  return tmp;
+}
+
+double
+slp_reduc_plus3 (int n)
+{
+  double tmp = 0.0;
+  for (int i = 0; i < n; i++)
+    {
+      tmp = tmp + mat3[i][0];
+      tmp = tmp + mat3[i][1];
+      tmp = tmp + mat3[i][2];
+      tmp = tmp + mat3[i][3];
+      tmp = tmp + mat3[i][4];
+      tmp = tmp + mat3[i][5];
+      tmp = tmp + mat3[i][6];
+      tmp = tmp + mat3[i][7];
+      tmp = tmp + mat3[i][8];
+      tmp = tmp + mat3[i][9];
+      tmp = tmp + mat3[i][10];
+      tmp = tmp + mat3[i][11];
+    }
+  return tmp;
+}
+
+void
+slp_non_chained_reduc (int n, double * restrict out)
+{
+  for (int i = 0; i < 3; i++)
+    out[i] = 0;
+
+  for (int i = 0; i < n; i++)
+    {
+      out[0] = out[0] + mat4[i][0];
+      out[1] = out[1] + mat4[i][1];
+      out[2] = out[2] + mat4[i][2];
+    }
+}
+
+/* Strict FP reductions shouldn't be used for the outer loops, only the
+   inner loops.  */
+
+float
+double_reduc1 (float (*restrict i)[16])
+{
+  float l = 0;
+
+  for (int a = 0; a < 8; a++)
+    for (int b = 0; b < 8; b++)
+      l += i[b][a];
+  return l;
+}
+
+float
+double_reduc2 (float *restrict i)
+{
+  float l = 0;
+
+  for (int a = 0; a < 8; a++)
+    for (int b = 0; b < 16; b++)
+      {
+        l += i[b * 4];
+        l += i[b * 4 + 1];
+        l += i[b * 4 + 2];
+        l += i[b * 4 + 3];
+      }
+  return l;
+}
+
+float
+double_reduc3 (float *restrict i, float *restrict j)
+{
+  float k = 0, l = 0;
+
+  for (int a = 0; a < 8; a++)
+    for (int b = 0; b < 8; b++)
+      {
+        k += i[b];
+        l += j[b];
+      }
+  return l * k;
+}
+
+/* We can't yet handle double_reduc1.  */
+/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s} 3 } } */
+/* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d} 9 } } */
+/* 1 reduction each for double_reduc{1,2} and 2 for double_reduc3.  Each one
+   is reported three times, once for SVE, once for 128-bit AdvSIMD and once
+   for 64-bit AdvSIMD.  */
+/* { dg-final { scan-tree-dump-times "Detected double reduction" 12 "vect" } } */
+/* double_reduc2 has 2 reductions and slp_non_chained_reduc has 3.
+   double_reduc1 is reported 3 times (SVE, 128-bit AdvSIMD, 64-bit AdvSIMD)
+   before failing.  */
+/* { dg-final { scan-tree-dump-times "Detected reduction" 12 "vect" } } */
index 5b875bfcc32c3d9e44329542f171529a053e5dfe..0b2a7ad57e37175b47ef82a00550dd0470e9af78 100644 (file)
@@ -1,5 +1,6 @@
 /* { dg-do compile } */
-/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */
+/* The cost model thinks that the double loop isn't a win for SVE-128.  */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable -fno-vect-cost-model" } */
 
 #include <stdint.h>
 
@@ -24,7 +25,10 @@ vec_slp_##TYPE (TYPE *restrict a, int n)                     \
   T (int32_t)                                  \
   T (uint32_t)                                 \
   T (int64_t)                                  \
-  T (uint64_t)
+  T (uint64_t)                                 \
+  T (_Float16)                                 \
+  T (float)                                    \
+  T (double)
 
 TEST_ALL (VEC_PERM)
 
@@ -32,21 +36,25 @@ TEST_ALL (VEC_PERM)
 /* ??? We don't treat the uint loops as SLP.  */
 /* The loop should be fully-masked.  */
 /* { dg-final { scan-assembler-times {\tld1b\t} 2 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tld1h\t} 2 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tld1w\t} 2 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tld1w\t} 1 } } */
-/* { dg-final { scan-assembler-times {\tld1d\t} 2 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\tld1d\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\t} 3 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tld1w\t} 3 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tld1w\t} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1d\t} 3 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\tld1d\t} 2 } } */
 /* { dg-final { scan-assembler-not {\tldr} { xfail *-*-* } } } */
 
 /* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b} 4 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 4 { xfail *-*-* } } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 4 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 4 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h} 6 { xfail *-*-* } } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s} 6 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d} 6 } } */
 
 /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.b\n} 2 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.h\n} 2 { xfail *-*-* } } } */
 /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.s\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tuaddv\td[0-9]+, p[0-7], z[0-9]+\.d\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tfadda\th[0-9]+, p[0-7], h[0-9]+, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-not {\tfadd\n} } } */
 
 /* { dg-final { scan-assembler-not {\tuqdec} } } */
index 8e18be5eebd2ab2746c8155cdf7bab076d641db5..0ac5f1c390b365bdb04e49b8e94b10bcb15f0eea 100644 (file)
@@ -704,5 +704,5 @@ CALL track('KERNEL  ')
 RETURN
 END SUBROUTINE kernel
 
-! { dg-final { scan-tree-dump-times "vectorized 21 loops" 1 "vect" { target { vect_intdouble_cvt } } } }
+! { dg-final { scan-tree-dump-times "vectorized 22 loops" 1 "vect" { target vect_intdouble_cvt } } }
 ! { dg-final { scan-tree-dump-times "vectorized 17 loops" 1 "vect" { target { ! vect_intdouble_cvt } } } }
index a872f8c87537d900c5b1e1ad195e1d5a7709981a..e44ad5e6861f31795d9a66ba1604109cea205279 100644 (file)
@@ -2531,6 +2531,19 @@ set_reduc_phi_uids (reduction_info **slot, void *data ATTRIBUTE_UNUSED)
   return 1;
 }
 
+/* Return true if the type of reduction performed by STMT is suitable
+   for this pass.  */
+
+static bool
+valid_reduction_p (gimple *stmt)
+{
+  /* Parallelization would reassociate the operation, which isn't
+     allowed for in-order reductions.  */
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  vect_reduction_type reduc_type = STMT_VINFO_REDUC_TYPE (stmt_info);
+  return reduc_type != FOLD_LEFT_REDUCTION;
+}
+
 /* Detect all reductions in the LOOP, insert them into REDUCTION_LIST.  */
 
 static void
@@ -2564,7 +2577,7 @@ gather_scalar_reductions (loop_p loop, reduction_info_table_type *reduction_list
       gimple *reduc_stmt
        = vect_force_simple_reduction (simple_loop_info, phi,
                                       &double_reduc, true);
-      if (!reduc_stmt)
+      if (!reduc_stmt || !valid_reduction_p (reduc_stmt))
        continue;
 
       if (double_reduc)
@@ -2610,7 +2623,8 @@ gather_scalar_reductions (loop_p loop, reduction_info_table_type *reduction_list
                = vect_force_simple_reduction (simple_loop_info, inner_phi,
                                               &double_reduc, true);
              gcc_assert (!double_reduc);
-             if (inner_reduc_stmt == NULL)
+             if (inner_reduc_stmt == NULL
+                 || !valid_reduction_p (inner_reduc_stmt))
                continue;
 
              build_new_reduction (reduction_list, double_reduc_stmts[i], phi);
index 4b9226ff83c83d4ec953f9193963e5c32cc9963d..7fc2215b12aa8d014e5ab5fbe9b8f3f2bc02d402 100644 (file)
@@ -2576,6 +2576,22 @@ vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
     }
 }
 
+/* Return true if there is an in-order reduction function for CODE, storing
+   it in *REDUC_FN if so.  */
+
+static bool
+fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
+{
+  switch (code)
+    {
+    case PLUS_EXPR:
+      *reduc_fn = IFN_FOLD_LEFT_PLUS;
+      return true;
+
+    default:
+      return false;
+    }
+}
 
 /* Function reduction_fn_for_scalar_code
 
@@ -2882,6 +2898,42 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
   return true;
 }
 
+/* Return true if we need an in-order reduction for operation CODE
+   on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
+   overflow must wrap.  */
+
+static bool
+needs_fold_left_reduction_p (tree type, tree_code code,
+                            bool need_wrapping_integral_overflow)
+{
+  /* CHECKME: check for !flag_finite_math_only too?  */
+  if (SCALAR_FLOAT_TYPE_P (type))
+    switch (code)
+      {
+      case MIN_EXPR:
+      case MAX_EXPR:
+       return false;
+
+      default:
+       return !flag_associative_math;
+      }
+
+  if (INTEGRAL_TYPE_P (type))
+    {
+      if (!operation_no_trapping_overflow (type, code))
+       return true;
+      if (need_wrapping_integral_overflow
+         && !TYPE_OVERFLOW_WRAPS (type)
+         && operation_can_overflow (code))
+       return true;
+      return false;
+    }
+
+  if (SAT_FIXED_POINT_TYPE_P (type))
+    return true;
+
+  return false;
+}
 
 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
    reduction operation CODE has a handled computation expression.  */
@@ -3308,58 +3360,18 @@ vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
       return NULL;
     }
 
-  /* Check that it's ok to change the order of the computation.
+  /* Check whether it's ok to change the order of the computation.
      Generally, when vectorizing a reduction we change the order of the
      computation.  This may change the behavior of the program in some
      cases, so we need to check that this is ok.  One exception is when
      vectorizing an outer-loop: the inner-loop is executed sequentially,
      and therefore vectorizing reductions in the inner-loop during
      outer-loop vectorization is safe.  */
-
-  if (*v_reduc_type != COND_REDUCTION
-      && check_reduction)
-    {
-      /* CHECKME: check for !flag_finite_math_only too?  */
-      if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
-       {
-         /* Changing the order of operations changes the semantics.  */
-         if (dump_enabled_p ())
-           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
-                       "reduction: unsafe fp math optimization: ");
-         return NULL;
-       }
-      else if (INTEGRAL_TYPE_P (type))
-       {
-         if (!operation_no_trapping_overflow (type, code))
-           {
-             /* Changing the order of operations changes the semantics.  */
-             if (dump_enabled_p ())
-               report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
-                               "reduction: unsafe int math optimization"
-                               " (overflow traps): ");
-             return NULL;
-           }
-         if (need_wrapping_integral_overflow
-             && !TYPE_OVERFLOW_WRAPS (type)
-             && operation_can_overflow (code))
-           {
-             /* Changing the order of operations changes the semantics.  */
-             if (dump_enabled_p ())
-               report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
-                               "reduction: unsafe int math optimization"
-                               " (overflow doesn't wrap): ");
-             return NULL;
-           }
-       }
-      else if (SAT_FIXED_POINT_TYPE_P (type))
-       {
-         /* Changing the order of operations changes the semantics.  */
-         if (dump_enabled_p ())
-         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
-                         "reduction: unsafe fixed-point math optimization: ");
-         return NULL;
-       }
-    }
+  if (check_reduction
+      && *v_reduc_type == TREE_CODE_REDUCTION
+      && needs_fold_left_reduction_p (type, code,
+                                     need_wrapping_integral_overflow))
+    *v_reduc_type = FOLD_LEFT_REDUCTION;
 
   /* Reduction is safe. We're dealing with one of the following:
      1) integer arithmetic and no trapv
@@ -3525,6 +3537,7 @@ vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
       reduc_def_info = vinfo_for_stmt (def);
+      STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
     }
   return def;
@@ -4076,14 +4089,27 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
 
   code = gimple_assign_rhs_code (orig_stmt);
 
-  if (reduction_type == EXTRACT_LAST_REDUCTION)
+  if (reduction_type == EXTRACT_LAST_REDUCTION
+      || reduction_type == FOLD_LEFT_REDUCTION)
     {
       /* No extra instructions needed in the prologue.  */
       prologue_cost = 0;
 
-      /* Count NCOPIES FOLD_EXTRACT_LAST operations.  */
-      inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
-                                  stmt_info, 0, vect_body);
+      if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
+       /* Count one reduction-like operation per vector.  */
+       inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
+                                    stmt_info, 0, vect_body);
+      else
+       {
+         /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
+         unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
+         inside_cost = add_stmt_cost (target_cost_data,  nelements,
+                                      vec_to_scalar, stmt_info, 0,
+                                      vect_body);
+         inside_cost += add_stmt_cost (target_cost_data,  nelements,
+                                       scalar_stmt, stmt_info, 0,
+                                       vect_body);
+       }
     }
   else
     {
@@ -4149,7 +4175,8 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
                                          scalar_stmt, stmt_info, 0,
                                          vect_epilogue);
        }
-      else if (reduction_type == EXTRACT_LAST_REDUCTION)
+      else if (reduction_type == EXTRACT_LAST_REDUCTION
+              || reduction_type == FOLD_LEFT_REDUCTION)
        /* No extra instructions need in the epilogue.  */
        ;
       else
@@ -6025,6 +6052,197 @@ vect_finalize_reduction:
     }
 }
 
+/* Return a vector of type VECTYPE that is equal to the vector select
+   operation "MASK ? VEC : IDENTITY".  Insert the select statements
+   before GSI.  */
+
+static tree
+merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
+                    tree vec, tree identity)
+{
+  tree cond = make_temp_ssa_name (vectype, NULL, "cond");
+  gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
+                                         mask, vec, identity);
+  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
+  return cond;
+}
+
+/* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
+   order, starting with LHS.  Insert the extraction statements before GSI and
+   associate the new scalar SSA names with variable SCALAR_DEST.
+   Return the SSA name for the result.  */
+
+static tree
+vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
+                      tree_code code, tree lhs, tree vector_rhs)
+{
+  tree vectype = TREE_TYPE (vector_rhs);
+  tree scalar_type = TREE_TYPE (vectype);
+  tree bitsize = TYPE_SIZE (scalar_type);
+  unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
+  unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
+
+  for (unsigned HOST_WIDE_INT bit_offset = 0;
+       bit_offset < vec_size_in_bits;
+       bit_offset += element_bitsize)
+    {
+      tree bitpos = bitsize_int (bit_offset);
+      tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
+                        bitsize, bitpos);
+
+      gassign *stmt = gimple_build_assign (scalar_dest, rhs);
+      rhs = make_ssa_name (scalar_dest, stmt);
+      gimple_assign_set_lhs (stmt, rhs);
+      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
+
+      stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
+      tree new_name = make_ssa_name (scalar_dest, stmt);
+      gimple_assign_set_lhs (stmt, new_name);
+      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
+      lhs = new_name;
+    }
+  return lhs;
+}
+
+/* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
+   statement that sets the live-out value.  REDUC_DEF_STMT is the phi
+   statement.  CODE is the operation performed by STMT and OPS are
+   its scalar operands.  REDUC_INDEX is the index of the operand in
+   OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
+   implements in-order reduction, or IFN_LAST if we should open-code it.
+   VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
+   that should be used to control the operation in a fully-masked loop.  */
+
+static bool
+vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
+                              gimple **vec_stmt, slp_tree slp_node,
+                              gimple *reduc_def_stmt,
+                              tree_code code, internal_fn reduc_fn,
+                              tree ops[3], tree vectype_in,
+                              int reduc_index, vec_loop_masks *masks)
+{
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
+  gimple *new_stmt = NULL;
+
+  int ncopies;
+  if (slp_node)
+    ncopies = 1;
+  else
+    ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
+
+  gcc_assert (!nested_in_vect_loop_p (loop, stmt));
+  gcc_assert (ncopies == 1);
+  gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
+  gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
+  gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
+             == FOLD_LEFT_REDUCTION);
+
+  if (slp_node)
+    gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
+                         TYPE_VECTOR_SUBPARTS (vectype_in)));
+
+  tree op0 = ops[1 - reduc_index];
+
+  int group_size = 1;
+  gimple *scalar_dest_def;
+  auto_vec<tree> vec_oprnds0;
+  if (slp_node)
+    {
+      vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
+      group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
+      scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
+    }
+  else
+    {
+      tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
+      vec_oprnds0.create (1);
+      vec_oprnds0.quick_push (loop_vec_def0);
+      scalar_dest_def = stmt;
+    }
+
+  tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
+  tree scalar_type = TREE_TYPE (scalar_dest);
+  tree reduc_var = gimple_phi_result (reduc_def_stmt);
+
+  int vec_num = vec_oprnds0.length ();
+  gcc_assert (vec_num == 1 || slp_node);
+  tree vec_elem_type = TREE_TYPE (vectype_out);
+  gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
+
+  tree vector_identity = NULL_TREE;
+  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    vector_identity = build_zero_cst (vectype_out);
+
+  tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
+  int i;
+  tree def0;
+  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
+    {
+      tree mask = NULL_TREE;
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+       mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
+
+      /* Handle MINUS by adding the negative.  */
+      if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
+       {
+         tree negated = make_ssa_name (vectype_out);
+         new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
+         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
+         def0 = negated;
+       }
+
+      if (mask)
+       def0 = merge_with_identity (gsi, mask, vectype_out, def0,
+                                   vector_identity);
+
+      /* On the first iteration the input is simply the scalar phi
+        result, and for subsequent iterations it is the output of
+        the preceding operation.  */
+      if (reduc_fn != IFN_LAST)
+       {
+         new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
+         /* For chained SLP reductions the output of the previous reduction
+            operation serves as the input of the next. For the final statement
+            the output cannot be a temporary - we reuse the original
+            scalar destination of the last statement.  */
+         if (i != vec_num - 1)
+           {
+             gimple_set_lhs (new_stmt, scalar_dest_var);
+             reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
+             gimple_set_lhs (new_stmt, reduc_var);
+           }
+       }
+      else
+       {
+         reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
+                                            reduc_var, def0);
+         new_stmt = SSA_NAME_DEF_STMT (reduc_var);
+         /* Remove the statement, so that we can use the same code paths
+            as for statements that we've just created.  */
+         gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
+         gsi_remove (&tmp_gsi, false);
+       }
+
+      if (i == vec_num - 1)
+       {
+         gimple_set_lhs (new_stmt, scalar_dest);
+         vect_finish_replace_stmt (scalar_dest_def, new_stmt);
+       }
+      else
+       vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
+
+      if (slp_node)
+       SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
+    }
+
+  if (!slp_node)
+    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
+
+  return true;
+}
 
 /* Function is_nonwrapping_integer_induction.
 
@@ -6203,6 +6421,12 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
          return true;
        }
 
+      if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
+       /* Leave the scalar phi in place.  Note that checking
+          STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
+          for reductions involving a single statement.  */
+       return true;
+
       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
        reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
@@ -6434,6 +6658,14 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
      directy used in stmt.  */
   if (reduc_index == -1)
     {
+      if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "in-order reduction chain without SLP.\n");
+         return false;
+       }
+
       if (orig_stmt)
        reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
       else
@@ -6687,7 +6919,9 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
 
   vect_reduction_type reduction_type
     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
-  if (orig_stmt && reduction_type == TREE_CODE_REDUCTION)
+  if (orig_stmt
+      && (reduction_type == TREE_CODE_REDUCTION
+         || reduction_type == FOLD_LEFT_REDUCTION))
     {
       /* This is a reduction pattern: get the vectype from the type of the
          reduction variable, and get the tree-code from orig_stmt.  */
@@ -6734,10 +6968,13 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
   reduc_fn = IFN_LAST;
 
   if (reduction_type == TREE_CODE_REDUCTION
+      || reduction_type == FOLD_LEFT_REDUCTION
       || reduction_type == INTEGER_INDUC_COND_REDUCTION
       || reduction_type == CONST_COND_REDUCTION)
     {
-      if (reduction_fn_for_scalar_code (orig_code, &reduc_fn))
+      if (reduction_type == FOLD_LEFT_REDUCTION
+         ? fold_left_reduction_fn (orig_code, &reduc_fn)
+         : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
        {
          if (reduc_fn != IFN_LAST
              && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
@@ -6803,6 +7040,41 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
       = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
                                      GROUP_FIRST_ELEMENT (stmt_info) != NULL);
 
+  if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
+    {
+      /* We can't support in-order reductions of code such as this:
+
+          for (int i = 0; i < n1; ++i)
+            for (int j = 0; j < n2; ++j)
+              l += a[j];
+
+        since GCC effectively transforms the loop when vectorizing:
+
+          for (int i = 0; i < n1 / VF; ++i)
+            for (int j = 0; j < n2; ++j)
+              for (int k = 0; k < VF; ++k)
+                l += a[j];
+
+        which is a reassociation of the original operation.  */
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "in-order double reduction not supported.\n");
+
+      return false;
+    }
+
+  if (reduction_type == FOLD_LEFT_REDUCTION
+      && slp_node
+      && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
+    {
+      /* We cannot use in-order reductions in this case because there is
+        an implicit reassociation of the operations involved.  */
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "in-order unchained SLP reductions not supported.\n");
+      return false;
+    }
+
   /* For double reductions, and for SLP reductions with a neutral value,
      we construct a variable-length initial vector by loading a vector
      full of the neutral value and then shift-and-inserting the start
@@ -6976,9 +7248,10 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
        vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
        {
-         if (cond_fn == IFN_LAST
-             || !direct_internal_fn_supported_p (cond_fn, vectype_in,
-                                                 OPTIMIZE_FOR_SPEED))
+         if (reduction_type != FOLD_LEFT_REDUCTION
+             && (cond_fn == IFN_LAST
+                 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
+                                                     OPTIMIZE_FOR_SPEED)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -6998,6 +7271,10 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
            vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
                                   vectype_in);
        }
+      if (dump_enabled_p ()
+         && reduction_type == FOLD_LEFT_REDUCTION)
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "using an in-order (fold-left) reduction.\n");
       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
       return true;
     }
@@ -7013,6 +7290,11 @@ vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
 
   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
 
+  if (reduction_type == FOLD_LEFT_REDUCTION)
+    return vectorize_fold_left_reduction
+      (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
+       reduc_fn, ops, vectype_in, reduc_index, masks);
+
   if (reduction_type == EXTRACT_LAST_REDUCTION)
     {
       gcc_assert (!slp_node);
index 22dfbed332ca51691db4b088dd0c558709486564..003d4accedb6b7e0bcdb0349fba29a75c654f2c2 100644 (file)
@@ -74,7 +74,15 @@ enum vect_reduction_type {
 
        for (int i = 0; i < VF; ++i)
          res = cond[i] ? val[i] : res;  */
-  EXTRACT_LAST_REDUCTION
+  EXTRACT_LAST_REDUCTION,
+
+  /* Use a folding reduction within the loop to implement:
+
+       for (int i = 0; i < VF; ++i)
+        res = res OP val[i];
+
+     (with no reassociation).  */
+  FOLD_LEFT_REDUCTION
 };
 
 #define VECTORIZABLE_CYCLE_DEF(D) (((D) == vect_reduction_def)           \
@@ -1390,6 +1398,7 @@ extern void vect_model_load_cost (stmt_vec_info, int, vect_memory_access_type,
 extern unsigned record_stmt_cost (stmt_vector_for_cost *, int,
                                  enum vect_cost_for_stmt, stmt_vec_info,
                                  int, enum vect_cost_model_location);
+extern void vect_finish_replace_stmt (gimple *, gimple *);
 extern void vect_finish_stmt_generation (gimple *, gimple *,
                                          gimple_stmt_iterator *);
 extern bool vect_mark_stmts_to_be_vectorized (loop_vec_info);