From 997636716c5dde7d59d026726a6f58918069f122 Mon Sep 17 00:00:00 2001 From: Alejandro Martinez Date: Tue, 28 May 2019 13:48:44 +0000 Subject: [PATCH] Current vectoriser doesn't support masked loads for SLP. The current vectoriser doesn't support masked loads for SLP. We should add that support, to allow loops like: void f (int *restrict x, int *restrict y, int *restrict z, int n) { for (int i = 0; i < n; i += 2) { x[i] = y[i] ? z[i] : 1; x[i + 1] = y[i + 1] ? z[i + 1] : 2; } } to be vectorized using contiguous loads rather than LD2 and ST2. This patch was motivated by SVE, but it is completely generic and should apply to any architecture with masked loads. From-SVN: r271704 --- gcc/ChangeLog | 20 +++++ gcc/internal-fn.c | 2 +- gcc/testsuite/ChangeLog | 5 ++ .../gcc.target/aarch64/sve/mask_load_slp_1.c | 90 +++++++++++++++++++ gcc/tree-data-ref.c | 2 +- gcc/tree-vect-data-refs.c | 18 +++- gcc/tree-vect-loop.c | 47 ++++++++++ gcc/tree-vect-slp.c | 29 ++++-- gcc/tree-vect-stmts.c | 32 ++++--- gcc/tree-vectorizer.c | 1 + gcc/tree-vectorizer.h | 6 +- 11 files changed, 229 insertions(+), 23 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 8065ee2675e..f48289f0f56 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,23 @@ +2019-05-28 Alejandro Martinez + + * internal-fn.c: Marked mask_load_direct as vectorizable. + * tree-data-ref.c (data_ref_compare_tree): Fixed comment typo. + * tree-vect-data-refs.c (can_group_stmts_p): Allow masked loads to be + combined even if their masks differ, when the allow_slp_p param is set. + (vect_analyze_data_ref_accesses): Mark SLP-only vectorizable groups. + * tree-vect-loop.c (vect_dissolve_slp_only_groups): New function to + dissolve SLP-only vectorizable groups when SLP has been discarded. + (vect_analyze_loop_2): Call vect_dissolve_slp_only_groups when needed. + * tree-vect-slp.c (vect_get_and_check_slp_defs): Check masked load + masks. 
+ (vect_build_slp_tree_1): Fixed comment typo. + (vect_build_slp_tree_2): Include masks from masked loads in SLP tree. + * tree-vect-stmts.c (vectorizable_load): Allow vectorization of masked + loads for SLP only. + * tree-vectorizer.h (_stmt_vec_info): Added flag for SLP-only + vectorization. + * tree-vectorizer.c (vec_info::new_stmt_vec_info): Likewise. + 2019-05-28 Rainer Orth * config/alpha/alpha.c [TARGET_ABI_OSF] (alpha_output_mi_thunk_osf): diff --git a/gcc/internal-fn.c b/gcc/internal-fn.c index 04081f36c4d..3051a7aa72d 100644 --- a/gcc/internal-fn.c +++ b/gcc/internal-fn.c @@ -100,7 +100,7 @@ init_internal_fns () /* Create static initializers for the information returned by direct_internal_fn. */ #define not_direct { -2, -2, false } -#define mask_load_direct { -1, 2, false } +#define mask_load_direct { -1, 2, true } #define load_lanes_direct { -1, -1, false } #define mask_load_lanes_direct { -1, -1, false } #define gather_load_direct { -1, -1, false } diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index cffbfb9d297..5f4be276bdb 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2019-05-28 Alejandro Martinez + + * gcc.target/aarch64/sve/mask_load_slp_1.c: New test for SLP + vectorized masked loads. + 2019-05-28 Jeff Law * testsuite/gcc.target/sh/pr50749-qihisi-predec-3.c: Disable diff --git a/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c new file mode 100644 index 00000000000..78c70b2be32 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/mask_load_slp_1.c @@ -0,0 +1,90 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include + +#define MASK_SLP_2(TYPE_COND, ALT_VAL) \ +void __attribute__ ((noinline, noclone)) \ +mask_slp_##TYPE_COND##_2_##ALT_VAL (int *restrict x, int *restrict y, \ + TYPE_COND *restrict z, int n) \ +{ \ + for (int i = 0; i < n; i += 2) \ + { \ + x[i] = y[i] ? z[i] : 1; \ + x[i + 1] = y[i + 1] ? 
z[i + 1] : ALT_VAL; \ + } \ +} + +#define MASK_SLP_4(TYPE_COND, ALT_VAL) \ +void __attribute__ ((noinline, noclone)) \ +mask_slp_##TYPE_COND##_4_##ALT_VAL (int *restrict x, int *restrict y, \ + TYPE_COND *restrict z, int n) \ +{ \ + for (int i = 0; i < n; i += 4) \ + { \ + x[i] = y[i] ? z[i] : 1; \ + x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL; \ + x[i + 2] = y[i + 2] ? z[i + 2] : 1; \ + x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL; \ + } \ +} + +#define MASK_SLP_8(TYPE_COND, ALT_VAL) \ +void __attribute__ ((noinline, noclone)) \ +mask_slp_##TYPE_COND##_8_##ALT_VAL (int *restrict x, int *restrict y, \ + TYPE_COND *restrict z, int n) \ +{ \ + for (int i = 0; i < n; i += 8) \ + { \ + x[i] = y[i] ? z[i] : 1; \ + x[i + 1] = y[i + 1] ? z[i + 1] : ALT_VAL; \ + x[i + 2] = y[i + 2] ? z[i + 2] : 1; \ + x[i + 3] = y[i + 3] ? z[i + 3] : ALT_VAL; \ + x[i + 4] = y[i + 4] ? z[i + 4] : 1; \ + x[i + 5] = y[i + 5] ? z[i + 5] : ALT_VAL; \ + x[i + 6] = y[i + 6] ? z[i + 6] : 1; \ + x[i + 7] = y[i + 7] ? z[i + 7] : ALT_VAL; \ + } \ +} + +#define MASK_SLP_FAIL(TYPE_COND) \ +void __attribute__ ((noinline, noclone)) \ +mask_slp_##TYPE_COND##_FAIL (int *restrict x, int *restrict y, \ + TYPE_COND *restrict z, int n) \ +{ \ + for (int i = 0; i < n; i += 2) \ + { \ + x[i] = y[i] ? z[i] : 1; \ + x[i + 1] = y[i + 1] ? 
z[i + 1] : x[z[i + 1]]; \ + } \ +} + +MASK_SLP_2(int8_t, 1) +MASK_SLP_2(int8_t, 2) +MASK_SLP_2(int, 1) +MASK_SLP_2(int, 2) +MASK_SLP_2(int64_t, 1) +MASK_SLP_2(int64_t, 2) + +MASK_SLP_4(int8_t, 1) +MASK_SLP_4(int8_t, 2) +MASK_SLP_4(int, 1) +MASK_SLP_4(int, 2) +MASK_SLP_4(int64_t, 1) +MASK_SLP_4(int64_t, 2) + +MASK_SLP_8(int8_t, 1) +MASK_SLP_8(int8_t, 2) +MASK_SLP_8(int, 1) +MASK_SLP_8(int, 2) +MASK_SLP_8(int64_t, 1) +MASK_SLP_8(int64_t, 2) + +MASK_SLP_FAIL(int8_t) +MASK_SLP_FAIL(int) +MASK_SLP_FAIL(int64_t) + +/* { dg-final { scan-assembler-not {\tld2w\t} } } */ +/* { dg-final { scan-assembler-not {\tst2w\t} } } */ +/* { dg-final { scan-assembler-times {\tld1w\t} 48 } } */ +/* { dg-final { scan-assembler-times {\tst1w\t} 40 } } */ diff --git a/gcc/tree-data-ref.c b/gcc/tree-data-ref.c index 67b960d5c6d..4dc03efd1de 100644 --- a/gcc/tree-data-ref.c +++ b/gcc/tree-data-ref.c @@ -1271,7 +1271,7 @@ create_data_ref (edge nest, loop_p loop, tree memref, gimple *stmt, return dr; } -/* A helper function computes order between two tree epxressions T1 and T2. +/* A helper function computes order between two tree expressions T1 and T2. This is used in comparator functions sorting objects based on the order of tree expressions. The function returns -1, 0, or 1. */ diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index d71a39ffd78..55d87f8f59f 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -2863,10 +2863,12 @@ strip_conversion (tree op) } /* Return true if vectorizable_* routines can handle statements STMT1_INFO - and STMT2_INFO being in a single group. */ + and STMT2_INFO being in a single group. When ALLOW_SLP_P, masked loads can + be grouped in SLP mode. 
*/ static bool -can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info) +can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info, + bool allow_slp_p) { if (gimple_assign_single_p (stmt1_info->stmt)) return gimple_assign_single_p (stmt2_info->stmt); @@ -2888,7 +2890,8 @@ can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info) like those created by build_mask_conversion. */ tree mask1 = gimple_call_arg (call1, 2); tree mask2 = gimple_call_arg (call2, 2); - if (!operand_equal_p (mask1, mask2, 0)) + if (!operand_equal_p (mask1, mask2, 0) + && (ifn == IFN_MASK_STORE || !allow_slp_p)) { mask1 = strip_conversion (mask1); if (!mask1) @@ -2974,7 +2977,7 @@ vect_analyze_data_ref_accesses (vec_info *vinfo) || data_ref_compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb)) != 0 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0 - || !can_group_stmts_p (stmtinfo_a, stmtinfo_b)) + || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true)) break; /* Check that the data-refs have the same constant size. */ @@ -3059,6 +3062,13 @@ vect_analyze_data_ref_accesses (vec_info *vinfo) DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b; lastinfo = stmtinfo_b; + STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a) + = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false); + + if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)) + dump_printf_loc (MSG_NOTE, vect_location, + "Load suitable for SLP vectorization only.\n"); + if (init_b == init_prev && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)) && dump_enabled_p ()) diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index e1229a51c48..4942c6937e0 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -1774,6 +1774,50 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs, return opt_result::success (); } +/* Look for SLP-only access groups and turn each individual access into its own + group. 
*/ +static void +vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo) +{ + unsigned int i; + struct data_reference *dr; + + DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups"); + + vec datarefs = loop_vinfo->shared->datarefs; + FOR_EACH_VEC_ELT (datarefs, i, dr) + { + gcc_assert (DR_REF (dr)); + stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr)); + + /* Check if the load is a part of an interleaving chain. */ + if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) + { + stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info); + unsigned int group_size = DR_GROUP_SIZE (first_element); + + /* Check if SLP-only groups. */ + if (!STMT_SLP_TYPE (stmt_info) + && STMT_VINFO_SLP_VECT_ONLY (first_element)) + { + /* Dissolve the group. */ + STMT_VINFO_SLP_VECT_ONLY (first_element) = false; + + stmt_vec_info vinfo = first_element; + while (vinfo) + { + stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo); + DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo; + DR_GROUP_NEXT_ELEMENT (vinfo) = NULL; + DR_GROUP_SIZE (vinfo) = 1; + DR_GROUP_GAP (vinfo) = group_size - 1; + vinfo = next; + } + } + } + } +} + /* Function vect_analyze_loop_2. Apply a set of analyses on LOOP, and create a loop_vec_info struct @@ -1990,6 +2034,9 @@ start_over: } } + /* Dissolve SLP-only groups. */ + vect_dissolve_slp_only_groups (loop_vinfo); + /* Scan all the remaining operations in the loop that are not subject to SLP and make sure they are vectorizable. */ ok = vect_analyze_loop_operations (loop_vinfo); diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 2810228f9a5..884db33c8ec 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -325,6 +325,14 @@ vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char *swap, { internal_fn ifn = gimple_call_internal_fn (stmt); commutative_op = first_commutative_argument (ifn); + + /* Masked load, only look at mask. */ + if (ifn == IFN_MASK_LOAD) + { + number_of_oprnds = 1; + /* Mask operand index. 
*/ + first_op_idx = 5; + } } } else if (gassign *stmt = dyn_cast (stmt_info->stmt)) @@ -626,7 +634,7 @@ vect_two_operations_perm_ok_p (vec stmts, is false then this indicates the comparison could not be carried out or the stmts will never be vectorized by SLP. - Note COND_EXPR is possibly ismorphic to another one after swapping its + Note COND_EXPR is possibly isomorphic to another one after swapping its operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to the first stmt by swapping the two operands of comparison; set SWAP[i] to 2 if stmt I is isormorphic to the first stmt by inverting the code @@ -1146,14 +1154,23 @@ vect_build_slp_tree_2 (vec_info *vinfo, &this_max_nunits, matches, &two_operators)) return NULL; - /* If the SLP node is a load, terminate the recursion. */ + /* If the SLP node is a load, terminate the recursion unless masked. */ if (STMT_VINFO_GROUPED_ACCESS (stmt_info) && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) { - *max_nunits = this_max_nunits; - (*tree_size)++; - node = vect_create_new_slp_node (stmts); - return node; + if (gcall *stmt = dyn_cast (stmt_info->stmt)) + { + /* Masked load. */ + gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)); + nops = 1; + } + else + { + *max_nunits = this_max_nunits; + (*tree_size)++; + node = vect_create_new_slp_node (stmts); + return node; + } } /* Get at the operands, verifying they are compatible. 
*/ diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 4ed60808a65..21046931243 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -7622,14 +7622,6 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, if (!scalar_dest) return false; - if (slp_node != NULL) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "SLP of masked loads not supported.\n"); - return false; - } - int mask_index = internal_fn_mask_index (ifn); if (mask_index >= 0) { @@ -7712,6 +7704,15 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info); group_size = DR_GROUP_SIZE (first_stmt_info); + /* Refuse non-SLP vectorization of SLP-only groups. */ + if (!slp && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "cannot vectorize load in non-SLP mode.\n"); + return false; + } + if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()) slp_perm = true; @@ -8389,8 +8390,19 @@ vectorizable_load (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi, simd_lane_access_p, byte_offset, bump); if (mask) - vec_mask = vect_get_vec_def_for_operand (mask, stmt_info, - mask_vectype); + { + if (slp_node) + { + auto_vec ops (1); + auto_vec > vec_defs (1); + ops.quick_push (mask); + vect_get_slp_defs (ops, slp_node, &vec_defs); + vec_mask = vec_defs[0][0]; + } + else + vec_mask = vect_get_vec_def_for_operand (mask, stmt_info, + mask_vectype); + } } else { diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index d27104933a9..4f6c65faf64 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -641,6 +641,7 @@ vec_info::new_stmt_vec_info (gimple *stmt) STMT_VINFO_VECTORIZABLE (res) = true; STMT_VINFO_VEC_REDUCTION_TYPE (res) = TREE_CODE_REDUCTION; STMT_VINFO_VEC_CONST_COND_REDUC_CODE (res) = ERROR_MARK; + STMT_VINFO_SLP_VECT_ONLY (res) = false; if (gimple_code 
(stmt) == GIMPLE_PHI && is_loop_header_bb_p (gimple_bb (stmt))) diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index d5fd4690b1d..4db30ccc22b 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -396,7 +396,7 @@ typedef struct _loop_vec_info : public vec_info { /* Condition under which this loop is analyzed and versioned. */ tree num_iters_assumptions; - /* Threshold of number of iterations below which vectorzation will not be + /* Threshold of number of iterations below which vectorization will not be performed. It is calculated from MIN_PROFITABLE_ITERS and PARAM_MIN_VECT_LOOP_BOUND. */ unsigned int th; @@ -946,6 +946,9 @@ struct _stmt_vec_info { and OPERATION_BITS without changing the result. */ unsigned int operation_precision; signop operation_sign; + + /* True if this is only suitable for SLP vectorization. */ + bool slp_vect_only_p; }; /* Information about a gather/scatter call. */ @@ -1041,6 +1044,7 @@ STMT_VINFO_BB_VINFO (stmt_vec_info stmt_vinfo) #define STMT_VINFO_NUM_SLP_USES(S) (S)->num_slp_uses #define STMT_VINFO_REDUC_TYPE(S) (S)->reduc_type #define STMT_VINFO_REDUC_DEF(S) (S)->reduc_def +#define STMT_VINFO_SLP_VECT_ONLY(S) (S)->slp_vect_only_p #define DR_GROUP_FIRST_ELEMENT(S) \ (gcc_checking_assert ((S)->dr_aux.dr), (S)->first_element) -- 2.30.2