From: Alejandro Martinez Date: Tue, 18 Jun 2019 08:09:00 +0000 (+0000) Subject: [Vectorizer] Support masking fold left reductions X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=bce29d65ebe1316d15ec7582a1d257ef1be163f7;p=gcc.git [Vectorizer] Support masking fold left reductions This patch adds support in the vectorizer for masking fold left reductions. This avoids the need to insert a conditional assignment with some identity value. From-SVN: r272407 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 9a46f93d89d..51f9cd22ebc 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,17 @@ +2019-06-18 Alejandro Martinez + + * config/aarch64/aarch64-sve.md (mask_fold_left_plus_): Renamed + from "*fold_left_plus_", updated operand order. + * doc/md.texi (mask_fold_left_plus_@var{m}): Documented new optab. + * internal-fn.c (mask_fold_left_direct): New define. + (expand_mask_fold_left_optab_fn): Likewise. + (direct_mask_fold_left_optab_supported_p): Likewise. + * internal-fn.def (MASK_FOLD_LEFT_PLUS): New internal function. + * optabs.def (mask_fold_left_plus_optab): New optab. + * tree-vect-loop.c (get_masked_reduction_fn): New function to get a + masked internal_fn for a reduction ifn. + (vectorize_fold_left_reduction): Add support for masking reductions. + 2019-06-18 Kewen Lin PR middle-end/80791 diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index ba39134e0ae..806823f321f 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -2180,14 +2180,14 @@ ) ;; In-order FP reductions predicated with PTRUE. 
-(define_insn "*fold_left_plus_" +(define_insn "mask_fold_left_plus_" [(set (match_operand: 0 "register_operand" "=w") - (unspec: [(match_operand: 1 "register_operand" "Upl") - (match_operand: 2 "register_operand" "0") - (match_operand:SVE_F 3 "register_operand" "w")] + (unspec: [(match_operand: 3 "register_operand" "Upl") + (match_operand: 1 "register_operand" "0") + (match_operand:SVE_F 2 "register_operand" "w")] UNSPEC_FADDA))] "TARGET_SVE" - "fadda\t%0, %1, %0, %3." + "fadda\t%0, %3, %0, %2." ) ;; Predicated form of the above in-order reduction. diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi index 844fb6ef18d..40c2b8b20cc 100644 --- a/gcc/doc/md.texi +++ b/gcc/doc/md.texi @@ -5417,6 +5417,11 @@ mode @var{m} and the scalars have the mode appropriate for one element of @var{m}. The operation is strictly in-order: there is no reassociation. +@cindex @code{mask_fold_left_plus_@var{m}} instruction pattern +@item @code{mask_fold_left_plus_@var{m}} +Like @samp{fold_left_plus_@var{m}}, but takes an additional mask operand +(operand 3) that specifies which elements of the source vector should be added. 
+ @cindex @code{sdot_prod@var{m}} instruction pattern @item @samp{sdot_prod@var{m}} @cindex @code{udot_prod@var{m}} instruction pattern diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index 04081f36c4d..90f8e567d57 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -117,6 +117,7 @@ init_internal_fns () #define while_direct { 0, 2, false } #define fold_extract_direct { 2, 2, false } #define fold_left_direct { 1, 1, false } +#define mask_fold_left_direct { 1, 1, false } const direct_internal_fn_info direct_internal_fn_array[IFN_LAST + 1] = { #define DEF_INTERNAL_FN(CODE, FLAGS, FNSPEC) not_direct, @@ -3000,6 +3001,9 @@ expand_while_optab_fn (internal_fn, gcall *stmt, convert_optab optab) #define expand_fold_left_optab_fn(FN, STMT, OPTAB) \ expand_direct_optab_fn (FN, STMT, OPTAB, 2) +#define expand_mask_fold_left_optab_fn(FN, STMT, OPTAB) \ + expand_direct_optab_fn (FN, STMT, OPTAB, 3) + /* RETURN_TYPE and ARGS are a return type and argument list that are in principle compatible with FN (which satisfies direct_internal_fn_p). Return the types that should be used to determine whether the @@ -3088,6 +3092,7 @@ multi_vector_optab_supported_p (convert_optab optab, tree_pair types, #define direct_while_optab_supported_p convert_optab_supported_p #define direct_fold_extract_optab_supported_p direct_optab_supported_p #define direct_fold_left_optab_supported_p direct_optab_supported_p +#define direct_mask_fold_left_optab_supported_p direct_optab_supported_p /* Return the optab used by internal function FN. */ diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def index 016301a58d8..906d74b1d08 100644 --- a/gcc/internal-fn.def +++ b/gcc/internal-fn.def @@ -199,6 +199,9 @@ DEF_INTERNAL_OPTAB_FN (FOLD_EXTRACT_LAST, ECF_CONST | ECF_NOTHROW, DEF_INTERNAL_OPTAB_FN (FOLD_LEFT_PLUS, ECF_CONST | ECF_NOTHROW, fold_left_plus, fold_left) +DEF_INTERNAL_OPTAB_FN (MASK_FOLD_LEFT_PLUS, ECF_CONST | ECF_NOTHROW, + mask_fold_left_plus, mask_fold_left) + /* Unary math functions. 
*/ DEF_INTERNAL_FLT_FN (ACOS, ECF_CONST, acos, unary) DEF_INTERNAL_FLT_FN (ACOSH, ECF_CONST, acosh, unary) diff --git a/gcc/optabs.def b/gcc/optabs.def index 8af3a2f43fd..75c8a0aee2a 100644 --- a/gcc/optabs.def +++ b/gcc/optabs.def @@ -323,6 +323,7 @@ OPTAB_D (reduc_and_scal_optab, "reduc_and_scal_$a") OPTAB_D (reduc_ior_scal_optab, "reduc_ior_scal_$a") OPTAB_D (reduc_xor_scal_optab, "reduc_xor_scal_$a") OPTAB_D (fold_left_plus_optab, "fold_left_plus_$a") +OPTAB_D (mask_fold_left_plus_optab, "mask_fold_left_plus_$a") OPTAB_D (extract_last_optab, "extract_last_$a") OPTAB_D (fold_extract_last_optab, "fold_extract_last_$a") diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 27a522e0140..56c4e7f1271 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,6 @@ +2019-06-18 Alejandro Martinez + * gcc.target/aarch64/sve/fadda_1.c: New test. + 2019-06-17 Jakub Jelinek * gcc.dg/vect/vect-simd-8.c: New test. diff --git a/gcc/testsuite/gcc.target/aarch64/sve/fadda_1.c b/gcc/testsuite/gcc.target/aarch64/sve/fadda_1.c new file mode 100644 index 00000000000..158cd6c8470 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/fadda_1.c @@ -0,0 +1,20 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O2 -ftree-vectorize --save-temps" } */ + +#define DO_OPS(TYPE) \ +TYPE fold_##TYPE (TYPE *src, int count) \ +{ \ + TYPE res = 0; \ + for (int i = 0; i < count; ++i) \ + res += src[i]; \ + return res; \ +} + +DO_OPS (_Float16) +DO_OPS (float) +DO_OPS (double) + +/* { dg-final { scan-assembler-times {\tfadda\th[0-9]+, p[0-7], h[0-9]+, z[0-9]+\.h\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfadda\ts[0-9]+, p[0-7], s[0-9]+, z[0-9]+\.s\n} 1 } } */ +/* { dg-final { scan-assembler-times {\tfadda\td[0-9]+, p[0-7], d[0-9]+, z[0-9]+\.d\n} 1 } } */ +/* { dg-final { scan-assembler-not "sel" } } */ diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 92a7c29d077..a27eda675ad 100644 --- a/gcc/tree-vect-loop.c +++ 
b/gcc/tree-vect-loop.c @@ -5916,6 +5916,30 @@ vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest, return lhs; } +/* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the + type of the vector input. */ + +static internal_fn +get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in) +{ + internal_fn mask_reduc_fn; + + switch (reduc_fn) + { + case IFN_FOLD_LEFT_PLUS: + mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS; + break; + + default: + return IFN_LAST; + } + + if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in, + OPTIMIZE_FOR_SPEED)) + return mask_reduc_fn; + return IFN_LAST; +} + /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the statement that sets the live-out value. REDUC_DEF_STMT is the phi statement. CODE is the operation performed by STMT_INFO and OPS are @@ -5938,6 +5962,7 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info, struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); tree vectype_out = STMT_VINFO_VECTYPE (stmt_info); stmt_vec_info new_stmt_info = NULL; + internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in); int ncopies; if (slp_node) @@ -6014,16 +6039,21 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info, def0 = negated; } - if (mask) + if (mask && mask_reduc_fn == IFN_LAST) def0 = merge_with_identity (gsi, mask, vectype_out, def0, vector_identity); /* On the first iteration the input is simply the scalar phi result, and for subsequent iterations it is the output of the preceding operation. 
*/ - if (reduc_fn != IFN_LAST) + if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST)) { - new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0); + if (mask && mask_reduc_fn != IFN_LAST) + new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var, + def0, mask); + else + new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, + def0); /* For chained SLP reductions the output of the previous reduction operation serves as the input of the next. For the final statement the output cannot be a temporary - we reuse the original