[Vectorizer] Support masking fold left reductions

author Alejandro Martinez <alejandro.martinezvicente@arm.com>

Tue, 18 Jun 2019 08:09:00 +0000 (08:09 +0000)

committer Alejandro Martinez <alejandro@gcc.gnu.org>

Tue, 18 Jun 2019 08:09:00 +0000 (08:09 +0000)
author Alejandro Martinez <alejandro.martinezvicente@arm.com>
Tue, 18 Jun 2019 08:09:00 +0000 (08:09 +0000)
committer Alejandro Martinez <alejandro@gcc.gnu.org>
Tue, 18 Jun 2019 08:09:00 +0000 (08:09 +0000)
diff --git a/gcc/ChangeLog b/gcc/ChangeLog

index 9a46f93d89d2a4b898f10efb19f8ace5e0ba265d..51f9cd22ebc70702802cb5b3c4407912b8f14c7f 100644 (file)
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,17 @@
+2019-06-18  Alejandro Martinez  <alejandro.martinezvicente@arm.com>
+
+       * config/aarch64/aarch64-sve.md (mask_fold_left_plus_<mode>): Renamed
+       from "*fold_left_plus_<mode>", updated operands order.
+       * doc/md.texi (mask_fold_left_plus_@var{m}): Documented new optab.
+       * internal-fn.c (mask_fold_left_direct): New define.
+       (expand_mask_fold_left_optab_fn): Likewise.
+       (direct_mask_fold_left_optab_supported_p): Likewise.
+       * internal-fn.def (MASK_FOLD_LEFT_PLUS): New internal function.
+       * optabs.def (mask_fold_left_plus_optab): New optab.
+       * tree-vect-loop.c (get_masked_reduction_fn): New function to get a
+       masked internal_fn for a reduction ifn.
+       (vectorize_fold_left_reduction): Add support for masking reductions.
+
  2019-06-18  Kewen Lin  <linkw@gcc.gnu.org>
  
         PR middle-end/80791
diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md

index ba39134e0ae6a265a670fe8825cffa74b9ecefb9..806823f321f6ec6db15ee8ca7ae30b691585607c 100644 (file)
--- a/gcc/config/aarch64/aarch64-sve.md
+++ b/gcc/config/aarch64/aarch64-sve.md
@@ -2180,14 +2180,14 @@
  )
  
  ;; In-order FP reductions predicated with PTRUE.
-(define_insn "*fold_left_plus_<mode>"
+(define_insn "mask_fold_left_plus_<mode>"
    [(set (match_operand:<VEL> 0 "register_operand" "=w")
-       (unspec:<VEL> [(match_operand:<VPRED> 1 "register_operand" "Upl")
-                      (match_operand:<VEL> 2 "register_operand" "0")
-                      (match_operand:SVE_F 3 "register_operand" "w")]
+       (unspec:<VEL> [(match_operand:<VPRED> 3 "register_operand" "Upl")
+                      (match_operand:<VEL> 1 "register_operand" "0")
+                      (match_operand:SVE_F 2 "register_operand" "w")]
                       UNSPEC_FADDA))]
    "TARGET_SVE"
-  "fadda\t%<Vetype>0, %1, %<Vetype>0, %3.<Vetype>"
+  "fadda\t%<Vetype>0, %3, %<Vetype>0, %2.<Vetype>"
  )
  
  ;; Predicated form of the above in-order reduction.
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi

index 844fb6ef18dd4d8176977248e8407ee5263ed072..40c2b8b20cc0e94d80c1e059a728496d1d2ed390 100644 (file)
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5417,6 +5417,11 @@ mode @var{m} and the scalars have the mode appropriate for one
  element of @var{m}.  The operation is strictly in-order: there is
  no reassociation.
  
+@cindex @code{mask_fold_left_plus_@var{m}} instruction pattern
+@item @samp{mask_fold_left_plus_@var{m}}
+Like @samp{fold_left_plus_@var{m}}, but takes an additional mask operand
+(operand 3) that specifies which elements of the source vector should be added.
+
  @cindex @code{sdot_prod@var{m}} instruction pattern
  @item @samp{sdot_prod@var{m}}
  @cindex @code{udot_prod@var{m}} instruction pattern
diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c

index 04081f36c4d31ecfba4099e50412345c67e1f58f..90f8e567d57cf6a6cc14fe688c9c146fecac2991 100644 (file)
--- a/gcc/internal-fn.c
+++ b/gcc/internal-fn.c
@@ -117,6 +117,7 @@ init_internal_fns ()
  #define while_direct { 0, 2, false }
  #define fold_extract_direct { 2, 2, false }
  #define fold_left_direct { 1, 1, false }
+#define mask_fold_left_direct { 1, 1, false }
  
  const direct_internal_fn_info direct_internal_fn_array[IFN_LAST + 1] = {
  #define DEF_INTERNAL_FN(CODE, FLAGS, FNSPEC) not_direct,
@@ -3000,6 +3001,9 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab)
  #define expand_fold_left_optab_fn(FN, STMT, OPTAB) \
    expand_direct_optab_fn (FN, STMT, OPTAB, 2)
  
+#define expand_mask_fold_left_optab_fn(FN, STMT, OPTAB) \
+  expand_direct_optab_fn (FN, STMT, OPTAB, 3)
+
  /* RETURN_TYPE and ARGS are a return type and argument list that are
     in principle compatible with FN (which satisfies direct_internal_fn_p).
     Return the types that should be used to determine whether the
@@ -3088,6 +3092,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types,
  #define direct_while_optab_supported_p convert_optab_supported_p
  #define direct_fold_extract_optab_supported_p direct_optab_supported_p
  #define direct_fold_left_optab_supported_p direct_optab_supported_p
+#define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
  
  /* Return the optab used by internal function FN.  */
  
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def

index 016301a58d83d7128817824d7c7ef92825c7e03e..906d74b1d08b9f14f3cf81aff10f902d472ef326 100644 (file)
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -199,6 +199,9 @@ DEF_INTERNAL_OPTAB_FN (FOLD_EXTRACT_LAST, ECF_CONST | ECF_NOTHROW,
  DEF_INTERNAL_OPTAB_FN (FOLD_LEFT_PLUS, ECF_CONST | ECF_NOTHROW,
                        fold_left_plus, fold_left)
  
+DEF_INTERNAL_OPTAB_FN (MASK_FOLD_LEFT_PLUS, ECF_CONST | ECF_NOTHROW,
+                      mask_fold_left_plus, mask_fold_left)
+
  /* Unary math functions.  */
  DEF_INTERNAL_FLT_FN (ACOS, ECF_CONST, acos, unary)
  DEF_INTERNAL_FLT_FN (ACOSH, ECF_CONST, acosh, unary)
diff --git a/gcc/optabs.def b/gcc/optabs.def

index 8af3a2f43fd99a36b69999d3fb50b9a6d5379964..75c8a0aee2ac9a7fc928d38f9575288893d42521 100644 (file)
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -323,6 +323,7 @@ OPTAB_D (reduc_and_scal_optab,  "reduc_and_scal_$a")
  OPTAB_D (reduc_ior_scal_optab,  "reduc_ior_scal_$a")
  OPTAB_D (reduc_xor_scal_optab,  "reduc_xor_scal_$a")
  OPTAB_D (fold_left_plus_optab, "fold_left_plus_$a")
+OPTAB_D (mask_fold_left_plus_optab, "mask_fold_left_plus_$a")
  
  OPTAB_D (extract_last_optab, "extract_last_$a")
  OPTAB_D (fold_extract_last_optab, "fold_extract_last_$a")
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog

index 27a522e0140a478cb2d025270b2a4ce9c1940551..56c4e7f127171117858c0ca5c2065cf8aea0acda 100644 (file)
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,6 @@
+2019-06-18  Alejandro Martinez  <alejandro.martinezvicente@arm.com>
+       * gcc.target/aarch64/sve/fadda_1.c: New test.
+
  2019-06-17  Jakub Jelinek  <jakub@redhat.com>
  
         * gcc.dg/vect/vect-simd-8.c: New test.
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fadda_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fadda_1.c

new file mode 100644 (file)

index 0000000..158cd6c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/fadda_1.c
@@ -0,0 +1,20 @@
+/* { dg-do assemble { target aarch64_asm_sve_ok } } */
+/* { dg-options "-O2 -ftree-vectorize --save-temps" } */
+
+#define DO_OPS(TYPE)                                   \
+TYPE fold_##TYPE (TYPE *src, int count)                        \
+{                                                      \
+  TYPE res = 0;                                                \
+  for (int i = 0; i < count; ++i)                      \
+    res += src[i];                                     \
+  return res;                                          \
+}
+
+DO_OPS (_Float16)
+DO_OPS (float)
+DO_OPS (double)
+
+/* { dg-final { scan-assembler-times {\tfadda\th[0-9]+, p[0-7], h[0-9]+, z[0-9]+\.h\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d\n} 1 } } */
+/* { dg-final { scan-assembler-not "sel" } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c

index 92a7c29d077c078ea0b6f61f7cd1108f897598d8..a27eda675ad61c392a596f2dd321eccde0e669f5 100644 (file)
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -5916,6 +5916,30 @@ vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
    return lhs;
  }
  
+/* Return the masked internal function equivalent to REDUC_FN, or IFN_LAST if
+   there is none or it is not supported for vector input type VECTYPE_IN.  */
+
+static internal_fn
+get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
+{
+  internal_fn mask_reduc_fn;
+
+  switch (reduc_fn)
+    {
+    case IFN_FOLD_LEFT_PLUS:
+      mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
+      break;
+
+    default:
+      return IFN_LAST;
+    }
+
+  if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
+                                     OPTIMIZE_FOR_SPEED))
+    return mask_reduc_fn;
+  return IFN_LAST;
+}
+
  /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
     statement that sets the live-out value.  REDUC_DEF_STMT is the phi
     statement.  CODE is the operation performed by STMT_INFO and OPS are
@@ -5938,6 +5962,7 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info,
    struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
    tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
    stmt_vec_info new_stmt_info = NULL;
+  internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
  
    int ncopies;
    if (slp_node)
@@ -6014,16 +6039,21 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info,
           def0 = negated;
         }
  
-      if (mask)
+      if (mask && mask_reduc_fn == IFN_LAST)
         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
                                     vector_identity);
  
        /* On the first iteration the input is simply the scalar phi
          result, and for subsequent iterations it is the output of
          the preceding operation.  */
-      if (reduc_fn != IFN_LAST)
+      if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
         {
-         new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
+         if (mask && mask_reduc_fn != IFN_LAST)
+           new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
+                                                  def0, mask);
+         else
+           new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
+                                                  def0);
           /* For chained SLP reductions the output of the previous reduction
              operation serves as the input of the next. For the final statement
              the output cannot be a temporary - we reuse the original
author	Alejandro Martinez <alejandro.martinezvicente@arm.com>
	Tue, 18 Jun 2019 08:09:00 +0000 (08:09 +0000)
committer	Alejandro Martinez <alejandro@gcc.gnu.org>
	Tue, 18 Jun 2019 08:09:00 +0000 (08:09 +0000)
gcc/ChangeLog		patch \| blob \| history
gcc/config/aarch64/aarch64-sve.md		patch \| blob \| history
gcc/doc/md.texi		patch \| blob \| history
gcc/internal-fn.c		patch \| blob \| history
gcc/internal-fn.def		patch \| blob \| history
gcc/optabs.def		patch \| blob \| history
gcc/testsuite/ChangeLog		patch \| blob \| history
gcc/testsuite/gcc.target/aarch64/sve/fadda_1.c	[new file with mode: 0644]	patch \| blob
gcc/tree-vect-loop.c		patch \| blob \| history