From bc484e250990393e887f7239157cc85ce6fadcce Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Mon, 24 Feb 2020 15:36:40 +0100 Subject: [PATCH] move permutation validity check This delays the SLP permutation check to vectorizable_load and optimizes permutations only after all SLP instances have been generated and the vectorization factor is determined. 2020-05-08 Richard Biener * tree-vectorizer.h (vec_info::slp_loads): New. (vect_optimize_slp): Declare. * tree-vect-slp.c (vect_attempt_slp_rearrange_stmts): Do nothing when there are no loads. (vect_gather_slp_loads): Gather loads into a vector. (vect_supported_load_permutation_p): Remove. (vect_analyze_slp_instance): Do not verify permutation validity here. (vect_analyze_slp): Optimize permutations of reductions after all SLP instances have been gathered and gather all loads. (vect_optimize_slp): New function split out from vect_supported_load_permutation_p. Elide some permutations. (vect_slp_analyze_bb_1): Call vect_optimize_slp. * tree-vect-loop.c (vect_analyze_loop_2): Likewise. * tree-vect-stmts.c (vectorizable_load): Check whether the load can be permuted. When generating code assert we can. * gcc.dg/vect/bb-slp-pr68892.c: Adjust for not supported SLP permutations becoming builds from scalars. * gcc.dg/vect/bb-slp-pr78205.c: Likewise. * gcc.dg/vect/bb-slp-34.c: Likewise. --- gcc/ChangeLog | 20 ++ gcc/testsuite/ChangeLog | 7 + gcc/testsuite/gcc.dg/vect/bb-slp-34.c | 3 +- gcc/testsuite/gcc.dg/vect/bb-slp-pr68892.c | 7 +- gcc/testsuite/gcc.dg/vect/bb-slp-pr78205.c | 6 +- gcc/tree-vect-loop.c | 3 + gcc/tree-vect-slp.c | 262 ++++++++------------- gcc/tree-vect-stmts.c | 48 +++- gcc/tree-vectorizer.h | 4 +- 9 files changed, 180 insertions(+), 180 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 14605950a8b..21eabf82113 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,23 @@ +2020-05-08 Richard Biener + + * tree-vectorizer.h (vec_info::slp_loads): New. + (vect_optimize_slp): Declare. + * tree-vect-slp.c (vect_attempt_slp_rearrange_stmts): Do + nothing when there are no loads. + (vect_gather_slp_loads): Gather loads into a vector. + (vect_supported_load_permutation_p): Remove. + (vect_analyze_slp_instance): Do not verify permutation + validity here. + (vect_analyze_slp): Optimize permutations of reductions + after all SLP instances have been gathered and gather + all loads. + (vect_optimize_slp): New function split out from + vect_supported_load_permutation_p. Elide some permutations. + (vect_slp_analyze_bb_1): Call vect_optimize_slp. + * tree-vect-loop.c (vect_analyze_loop_2): Likewise. + * tree-vect-stmts.c (vectorizable_load): Check whether + the load can be permuted. When generating code assert we can. + 2020-05-08 Richard Biener * tree-ssa-sccvn.c (rpo_avail): Change type to diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index f54ebc89446..9618eaab2ab 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,10 @@ +2020-05-08 Richard Biener + + * gcc.dg/vect/bb-slp-pr68892.c: Adjust for not supported + SLP permutations becoming builds from scalars. + * gcc.dg/vect/bb-slp-pr78205.c: Likewise. + * gcc.dg/vect/bb-slp-34.c: Likewise. + 2020-05-08 Nathan Sidwell * c-c++-common/raw-string-6.c: Adjust EOF error location. diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-34.c b/gcc/testsuite/gcc.dg/vect/bb-slp-34.c index ffd6ce25f64..c51c7706adc 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-34.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-34.c @@ -32,5 +32,4 @@ int main() return 0; } -/* ??? XFAILed because we access "excess" elements with the permutation. */ -/* { dg-final { scan-tree-dump "basic block vectorized" "slp2" { target vect_perm xfail { ! { aarch64*-*-* arm*-*-* } } } } } */ +/* { dg-final { scan-tree-dump "basic block vectorized" "slp2" { target vect_perm } } } */ diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr68892.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr68892.c index 216883fc0c4..8cd3a6a1274 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-pr68892.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr68892.c @@ -13,7 +13,8 @@ void foo(void) b[3] = a[3][0]; } -/* ??? The profitability check is not reached because we give up on the - gaps we access earlier. */ +/* ??? Due to the gaps we fall back to scalar loads which makes the + vectorization profitable. */ /* { dg-final { scan-tree-dump "not profitable" "slp2" { xfail *-*-* } } } */ -/* { dg-final { scan-tree-dump-times "Basic block will be vectorized" 0 "slp2" } } */ +/* { dg-final { scan-tree-dump-times "BB vectorization with gaps at the end of a load is not supported" 1 "slp2" } } */ +/* { dg-final { scan-tree-dump-times "Basic block will be vectorized" 1 "slp2" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr78205.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr78205.c index e02502a3fc1..f5dc5340792 100644 --- a/gcc/testsuite/gcc.dg/vect/bb-slp-pr78205.c +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr78205.c @@ -19,7 +19,9 @@ void foo () } /* We may not vectorize the store to x[] as it accesses c out-of bounds - but we do want to vectorize the other two store groups. */ + but we do want to vectorize the other two store groups. But we may + end up using scalar loads to vectorize the last group. */ /* { dg-final { scan-tree-dump-times "basic block vectorized" 1 "slp2" } } */ -/* { dg-final { scan-tree-dump-times "x\\\[\[0-1\]\\\] = " 2 "optimized" } } */ +/* { dg-final { scan-tree-dump-times "BB vectorization with gaps at the end of a load is not supported" 1 "slp2" } } */ +/* { dg-final { scan-tree-dump-times " = c\\\[4\\\];" 1 "optimized" } } */ diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index c4c3cc9ecaa..64463b874c7 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -2049,6 +2049,9 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts) /* Update the vectorization factor based on the SLP decision. */ vect_update_vf_for_slp (loop_vinfo); + + /* Optimize the SLP graph with the vectorization factor fixed. */ + vect_optimize_slp (loop_vinfo); } bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo); diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index c097840b09a..f9ad0821fa0 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -1815,6 +1815,9 @@ vect_attempt_slp_rearrange_stmts (slp_instance slp_instn) unsigned int lidx; slp_tree node, load; + if (SLP_INSTANCE_LOADS (slp_instn).is_empty ()) + return false; + /* Compare all the permutation sequences to the first one. We know that at least one load is permuted. */ node = SLP_INSTANCE_LOADS (slp_instn)[0]; @@ -1889,7 +1892,7 @@ vect_attempt_slp_rearrange_stmts (slp_instance slp_instn) /* Gather loads in the SLP graph NODE and populate the INST loads array. */ static void -vect_gather_slp_loads (slp_instance inst, slp_tree node, +vect_gather_slp_loads (vec &loads, slp_tree node, hash_set &visited) { if (visited.add (node)) @@ -1902,14 +1905,14 @@ vect_gather_slp_loads (slp_instance inst, slp_tree node, stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; if (STMT_VINFO_GROUPED_ACCESS (stmt_info) && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))) - SLP_INSTANCE_LOADS (inst).safe_push (node); + loads.safe_push (node); } else { unsigned i; slp_tree child; FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) - vect_gather_slp_loads (inst, child, visited); + vect_gather_slp_loads (loads, child, visited); } } @@ -1917,137 +1920,7 @@ static void vect_gather_slp_loads (slp_instance inst, slp_tree node) { hash_set visited; - vect_gather_slp_loads (inst, node, visited); -} - -/* Check if the required load permutations in the SLP instance - SLP_INSTN are supported. */ - -static bool -vect_supported_load_permutation_p (vec_info *vinfo, slp_instance slp_instn) -{ - unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn); - unsigned int i, j, k, next; - slp_tree node; - - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_NOTE, vect_location, "Load permutation "); - FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) - if (node->load_permutation.exists ()) - FOR_EACH_VEC_ELT (node->load_permutation, j, next) - dump_printf (MSG_NOTE, "%d ", next); - else - for (k = 0; k < group_size; ++k) - dump_printf (MSG_NOTE, "%d ", k); - dump_printf (MSG_NOTE, "\n"); - } - - /* In case of reduction every load permutation is allowed, since the order - of the reduction statements is not important (as opposed to the case of - grouped stores). The only condition we need to check is that all the - load nodes are of the same size and have the same permutation (and then - rearrange all the nodes of the SLP instance according to this - permutation). */ - - /* Check that all the load nodes are of the same size. */ - /* ??? Can't we assert this? */ - FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) - if (SLP_TREE_SCALAR_STMTS (node).length () != (unsigned) group_size) - return false; - - node = SLP_INSTANCE_TREE (slp_instn); - stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; - - /* Reduction (there are no data-refs in the root). - In reduction chain the order of the loads is not important. */ - if (!STMT_VINFO_DATA_REF (stmt_info) - && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)) - vect_attempt_slp_rearrange_stmts (slp_instn); - - /* In basic block vectorization we allow any subchain of an interleaving - chain. - FORNOW: not supported in loop SLP because of realignment compications. */ - if (is_a (vinfo)) - { - /* Check whether the loads in an instance form a subchain and thus - no permutation is necessary. */ - FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) - { - if (!SLP_TREE_LOAD_PERMUTATION (node).exists ()) - continue; - bool subchain_p = true; - stmt_vec_info next_load_info = NULL; - stmt_vec_info load_info; - FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info) - { - if (j != 0 - && (next_load_info != load_info - || DR_GROUP_GAP (load_info) != 1)) - { - subchain_p = false; - break; - } - next_load_info = DR_GROUP_NEXT_ELEMENT (load_info); - } - if (subchain_p) - SLP_TREE_LOAD_PERMUTATION (node).release (); - else - { - stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (node)[0]; - group_info = DR_GROUP_FIRST_ELEMENT (group_info); - unsigned HOST_WIDE_INT nunits; - unsigned k, maxk = 0; - FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), j, k) - if (k > maxk) - maxk = k; - /* In BB vectorization we may not actually use a loaded vector - accessing elements in excess of DR_GROUP_SIZE. */ - tree vectype = STMT_VINFO_VECTYPE (group_info); - if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits) - || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1))) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "BB vectorization with gaps at the end of " - "a load is not supported\n"); - return false; - } - - /* Verify the permutation can be generated. */ - vec tem; - unsigned n_perms; - if (!vect_transform_slp_perm_load (vinfo, node, tem, NULL, - 1, true, &n_perms)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, - vect_location, - "unsupported load permutation\n"); - return false; - } - } - } - return true; - } - - /* For loop vectorization verify we can generate the permutation. Be - conservative about the vectorization factor, there are permutations - that will use three vector inputs only starting from a specific factor - and the vectorization factor is not yet final. - ??? The SLP instance unrolling factor might not be the maximum one. */ - unsigned n_perms; - poly_uint64 test_vf - = force_common_multiple (SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), - LOOP_VINFO_VECT_FACTOR - (as_a (vinfo))); - FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) - if (node->load_permutation.exists () - && !vect_transform_slp_perm_load (vinfo, node, vNULL, NULL, test_vf, - true, &n_perms)) - return false; - - return true; + vect_gather_slp_loads (SLP_INSTANCE_LOADS (inst), node, visited); } @@ -2289,7 +2162,7 @@ vect_analyze_slp_instance (vec_info *vinfo, "SLP size %u vs. limit %u.\n", tree_size, max_tree_size); - /* Compute the load permutation. */ + /* Check whether any load is possibly permuted. */ slp_tree load_node; bool loads_permuted = false; FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (new_instance), i, load_node) @@ -2298,43 +2171,15 @@ vect_analyze_slp_instance (vec_info *vinfo, continue; unsigned j; stmt_vec_info load_info; - bool this_load_permuted = false; - stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT - (SLP_TREE_SCALAR_STMTS (load_node)[0]); FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info) if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j) { - this_load_permuted = true; + loads_permuted = true; break; } - if (!this_load_permuted - /* The load requires permutation when unrolling exposes - a gap either because the group is larger than the SLP - group-size or because there is a gap between the groups. */ - && (known_eq (unrolling_factor, 1U) - || (group_size == DR_GROUP_SIZE (first_stmt_info) - && DR_GROUP_GAP (first_stmt_info) == 0))) - { - SLP_TREE_LOAD_PERMUTATION (load_node).release (); - continue; - } - loads_permuted = true; - } - - if (loads_permuted) - { - if (!vect_supported_load_permutation_p (vinfo, new_instance)) - { - if (dump_enabled_p ()) - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "Build SLP failed: unsupported load " - "permutation %G", stmt_info->stmt); - vect_free_slp_instance (new_instance, false); - return false; - } } - /* If the loads and stores can be handled with load/store-lan + /* If the loads and stores can be handled with load/store-lane instructions do not generate this SLP instance. */ if (is_a (vinfo) && loads_permuted @@ -2514,9 +2359,93 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) vect_free_slp_tree ((*it).second, false); delete bst_map; + /* Optimize permutations in SLP reductions. */ + slp_instance instance; + FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance) + { + slp_tree node = SLP_INSTANCE_TREE (instance); + stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; + /* Reduction (there are no data-refs in the root). + In reduction chain the order of the loads is not important. */ + if (!STMT_VINFO_DATA_REF (stmt_info) + && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)) + vect_attempt_slp_rearrange_stmts (instance); + } + + /* Gather all loads in the SLP graph. */ + hash_set visited; + FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance) + vect_gather_slp_loads (vinfo->slp_loads, SLP_INSTANCE_TREE (instance), + visited); + return opt_result::success (); } +void +vect_optimize_slp (vec_info *vinfo) +{ + slp_tree node; + unsigned i; + FOR_EACH_VEC_ELT (vinfo->slp_loads, i, node) + { + if (!SLP_TREE_LOAD_PERMUTATION (node).exists ()) + continue; + + /* In basic block vectorization we allow any subchain of an interleaving + chain. + FORNOW: not in loop SLP because of realignment complications. */ + if (is_a (vinfo)) + { + bool subchain_p = true; + stmt_vec_info next_load_info = NULL; + stmt_vec_info load_info; + unsigned j; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info) + { + if (j != 0 + && (next_load_info != load_info + || DR_GROUP_GAP (load_info) != 1)) + { + subchain_p = false; + break; + } + next_load_info = DR_GROUP_NEXT_ELEMENT (load_info); + } + if (subchain_p) + { + SLP_TREE_LOAD_PERMUTATION (node).release (); + continue; + } + } + else + { + stmt_vec_info load_info; + bool this_load_permuted = false; + unsigned j; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info) + if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j) + { + this_load_permuted = true; + break; + } + stmt_vec_info first_stmt_info + = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]); + if (!this_load_permuted + /* The load requires permutation when unrolling exposes + a gap either because the group is larger than the SLP + group-size or because there is a gap between the groups. */ + && (known_eq (LOOP_VINFO_VECT_FACTOR (as_a (vinfo)), 1U) + || ((SLP_TREE_SCALAR_STMTS (node).length () + == DR_GROUP_SIZE (first_stmt_info)) + && DR_GROUP_GAP (first_stmt_info) == 0))) + { + SLP_TREE_LOAD_PERMUTATION (node).release (); + continue; + } + } + } +} + /* For each possible SLP instance decide whether to SLP it and calculate overall unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at @@ -3180,6 +3109,9 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal) return false; } + /* Optimize permutations. */ + vect_optimize_slp (bb_vinfo); + vect_record_base_alignments (bb_vinfo); /* Analyze and verify the alignment of data references and the diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c index 2349d317206..f7f19fee1bb 100644 --- a/gcc/tree-vect-stmts.c +++ b/gcc/tree-vect-stmts.c @@ -8781,7 +8781,44 @@ vectorizable_load (vec_info *vinfo, } if (slp && SLP_TREE_LOAD_PERMUTATION (slp_node).exists ()) - slp_perm = true; + { + slp_perm = true; + + if (!loop_vinfo) + { + /* In BB vectorization we may not actually use a loaded vector + accessing elements in excess of DR_GROUP_SIZE. */ + stmt_vec_info group_info = SLP_TREE_SCALAR_STMTS (slp_node)[0]; + group_info = DR_GROUP_FIRST_ELEMENT (group_info); + unsigned HOST_WIDE_INT nunits; + unsigned j, k, maxk = 0; + FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (slp_node), j, k) + if (k > maxk) + maxk = k; + tree vectype = STMT_VINFO_VECTYPE (group_info); + if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nunits) + || maxk >= (DR_GROUP_SIZE (group_info) & ~(nunits - 1))) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "BB vectorization with gaps at the end of " + "a load is not supported\n"); + return false; + } + } + + auto_vec tem; + unsigned n_perms; + if (!vect_transform_slp_perm_load (vinfo, slp_node, tem, NULL, vf, + true, &n_perms)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, + vect_location, + "unsupported load permutation\n"); + return false; + } + } /* Invalidate assumptions made by dependence analysis when vectorization on the unrolled body effectively re-orders stmts. */ @@ -9896,12 +9933,9 @@ vectorizable_load (vec_info *vinfo, if (slp_perm) { unsigned n_perms; - if (!vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, gsi, vf, - false, &n_perms)) - { - dr_chain.release (); - return false; - } + bool ok = vect_transform_slp_perm_load (vinfo, slp_node, dr_chain, + gsi, vf, false, &n_perms); + gcc_assert (ok); } else { diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index da6d37ac199..aa8bd33b9d1 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -324,8 +324,9 @@ public: /* The mapping of GIMPLE UID to stmt_vec_info. */ vec stmt_vec_infos; - /* All SLP instances. */ + /* The SLP graph. */ auto_vec slp_instances; + auto_vec slp_loads; /* Maps base addresses to an innermost_loop_behavior that gives the maximum known alignment for that base. */ @@ -1858,6 +1859,7 @@ extern void vect_schedule_slp (vec_info *); extern opt_result vect_analyze_slp (vec_info *, unsigned); extern bool vect_make_slp_decision (loop_vec_info); extern void vect_detect_hybrid_slp (loop_vec_info); +extern void vect_optimize_slp (vec_info *); extern void vect_get_slp_defs (vec_info *, slp_tree, vec > *, unsigned n = -1U); extern bool vect_slp_bb (basic_block); -- 2.30.2