From 126ed72b9f48f8530b194532cc281fb761690435 Mon Sep 17 00:00:00 2001 From: Richard Biener <rguenther@suse.de> Date: Wed, 30 Sep 2020 17:08:01 +0200 Subject: [PATCH] optimize permutes in SLP, remove vect_attempt_slp_rearrange_stmts This introduces a permute optimization phase for SLP which is intended to cover the existing permute eliding for SLP reductions plus commonizing the easy cases. It currently uses graphds to compute a postorder on the reverse SLP graph and it handles all cases vect_attempt_slp_rearrange_stmts did (hopefully - I've adjusted most testcases that triggered it a few days ago). It restricts itself to moving around bijective permutations to simplify things for now, mainly around constant nodes. As a prerequisite it makes the SLP graph cyclic (ugh). It looks like it would pay off to compute a PRE/POST order visit array once and elide all the recursive SLP graph walks and their visited hash-set, at least as long as we do not change the SLP graph during such a walk. I do not like using graphds too much but at least I don't have to re-implement yet another RPO walk, so maybe it isn't too bad. It now computes permute placement during iteration and thus should get cycles more obviously correct. Richard. 2020-10-06 Richard Biener <rguenther@suse.de> * tree-vect-data-refs.c (vect_slp_analyze_instance_dependence): Use SLP_TREE_REPRESENTATIVE. * tree-vectorizer.h (_slp_tree::vertex): New member used for graphds interfacing. * tree-vect-slp.c (vect_build_slp_tree_2): Allocate space for PHI SLP children. (vect_analyze_slp_backedges): New function filling in SLP node children for PHIs that correspond to backedge values. (vect_analyze_slp): Call vect_analyze_slp_backedges for the graph. (vect_slp_analyze_node_operations): Deal with a cyclic graph. (vect_schedule_slp_instance): Likewise. (vect_schedule_slp): Likewise. (slp_copy_subtree): Remove. (vect_slp_rearrange_stmts): Likewise. (vect_attempt_slp_rearrange_stmts): Likewise. (vect_slp_build_vertices): New functions. (vect_slp_permute): Likewise. (vect_slp_perms_eq): Likewise. (vect_optimize_slp): Remove special code to elide permutations with SLP reductions. Implement generic permute optimization. * gcc.dg/vect/bb-slp-50.c: New testcase. * gcc.dg/vect/bb-slp-51.c: Likewise. --- gcc/testsuite/gcc.dg/vect/bb-slp-50.c | 20 + gcc/testsuite/gcc.dg/vect/bb-slp-51.c | 20 + gcc/tree-vect-data-refs.c | 2 +- gcc/tree-vect-slp.c | 685 +++++++++++++++++--------- gcc/tree-vectorizer.h | 2 + 5 files changed, 503 insertions(+), 226 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-50.c create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-51.c diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-50.c b/gcc/testsuite/gcc.dg/vect/bb-slp-50.c new file mode 100644 index 00000000000..80216be4ebf --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-50.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_double } */ + +double a[2]; +double b[2]; +double c[2]; +double d[2]; +double e[2]; +void foo(double x) +{ + double tembc0 = b[1] + c[1]; + double tembc1 = b[0] + c[0]; + double temde0 = d[0] + e[1]; + double temde1 = d[1] + e[0]; + a[0] = tembc0 + temde0; + a[1] = tembc1 + temde1; +} + +/* We should common the permutation on the tembc{0,1} operands. 
*/ +/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\n\\r\]* = VEC_PERM_EXPR" 2 "slp2" } } */ diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-51.c b/gcc/testsuite/gcc.dg/vect/bb-slp-51.c new file mode 100644 index 00000000000..1481018428e --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-51.c @@ -0,0 +1,20 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target vect_double } */ + +double a[2]; +double b[2]; +double c[2]; +double e[2]; +void foo(double x) +{ + double tembc0 = b[1] + c[1]; + double tembc1 = b[0] + c[0]; + double temde0 = 5 + e[1]; + double temde1 = 11 + e[0]; + a[0] = tembc0 + temde0; + a[1] = tembc1 + temde1; +} + +/* We should common the permutations on the tembc{0,1} and temde{0,1} + operands. */ +/* { dg-final { scan-tree-dump-times "add new stmt: \[^\\r\\n\]* VEC_PERM_EXPR" 1 "slp2" } } */ diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c index 676182c0888..3ff3088641a 100644 --- a/gcc/tree-vect-data-refs.c +++ b/gcc/tree-vect-data-refs.c @@ -841,7 +841,7 @@ vect_slp_analyze_instance_dependence (vec_info *vinfo, slp_instance instance) /* The stores of this instance are at the root of the SLP tree. */ slp_tree store = SLP_INSTANCE_TREE (instance); - if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0])) + if (! STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (store))) store = NULL; /* Verify we can sink stores to the vectorized stmt insert location. */ diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 8acef6f3cef..ff0ecda801b 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -1256,8 +1256,8 @@ vect_build_slp_tree_2 (vec_info *vinfo, if (gimple_assign_rhs_code (stmt) == COND_EXPR) nops++; } - else if (is_a <gphi *> (stmt_info->stmt)) - nops = 0; + else if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt)) + nops = gimple_phi_num_args (phi); else return NULL; @@ -1294,7 +1294,7 @@ vect_build_slp_tree_2 (vec_info *vinfo, else return NULL; (*tree_size)++; - node = vect_create_new_slp_node (stmts, 0); + node = vect_create_new_slp_node (stmts, nops); SLP_TREE_VECTYPE (node) = vectype; return node; } @@ -1812,188 +1812,6 @@ vect_mark_slp_stmts_relevant (slp_tree node) vect_mark_slp_stmts_relevant (node, visited); } -/* Copy the SLP subtree rooted at NODE. 
*/ - -static slp_tree -slp_copy_subtree (slp_tree node, hash_map &map) -{ - unsigned i; - - bool existed_p; - slp_tree ©_ref = map.get_or_insert (node, &existed_p); - if (existed_p) - return copy_ref; - - copy_ref = new _slp_tree; - slp_tree copy = copy_ref; - SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node); - SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node); - SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node); - SLP_TREE_LANES (copy) = SLP_TREE_LANES (node); - copy->max_nunits = node->max_nunits; - SLP_TREE_REF_COUNT (copy) = 0; - if (SLP_TREE_SCALAR_STMTS (node).exists ()) - SLP_TREE_SCALAR_STMTS (copy) = SLP_TREE_SCALAR_STMTS (node).copy (); - if (SLP_TREE_SCALAR_OPS (node).exists ()) - SLP_TREE_SCALAR_OPS (copy) = SLP_TREE_SCALAR_OPS (node).copy (); - if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) - SLP_TREE_LOAD_PERMUTATION (copy) = SLP_TREE_LOAD_PERMUTATION (node).copy (); - if (SLP_TREE_LANE_PERMUTATION (node).exists ()) - SLP_TREE_LANE_PERMUTATION (copy) = SLP_TREE_LANE_PERMUTATION (node).copy (); - if (SLP_TREE_CHILDREN (node).exists ()) - SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node).copy (); - gcc_assert (!SLP_TREE_VEC_STMTS (node).exists ()); - - slp_tree child; - FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (copy), i, child) - { - SLP_TREE_CHILDREN (copy)[i] = slp_copy_subtree (child, map); - SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (copy)[i])++; - } - return copy; -} - -/* Rearrange the statements of NODE according to PERMUTATION. */ - -static void -vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size, - vec permutation, - hash_set &visited) -{ - unsigned int i; - slp_tree child; - - if (visited.add (node)) - return; - - FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) - vect_slp_rearrange_stmts (child, group_size, permutation, visited); - - if (SLP_TREE_SCALAR_STMTS (node).exists ()) - { - gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ()); - vec tmp_stmts; - tmp_stmts.create (group_size); - tmp_stmts.quick_grow (group_size); - stmt_vec_info stmt_info; - FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) - tmp_stmts[permutation[i]] = stmt_info; - SLP_TREE_SCALAR_STMTS (node).release (); - SLP_TREE_SCALAR_STMTS (node) = tmp_stmts; - } - if (SLP_TREE_SCALAR_OPS (node).exists ()) - { - gcc_assert (group_size == SLP_TREE_SCALAR_OPS (node).length ()); - vec tmp_ops; - tmp_ops.create (group_size); - tmp_ops.quick_grow (group_size); - tree op; - FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op) - tmp_ops[permutation[i]] = op; - SLP_TREE_SCALAR_OPS (node).release (); - SLP_TREE_SCALAR_OPS (node) = tmp_ops; - } - if (SLP_TREE_LANE_PERMUTATION (node).exists ()) - { - gcc_assert (group_size == SLP_TREE_LANE_PERMUTATION (node).length ()); - for (i = 0; i < group_size; ++i) - SLP_TREE_LANE_PERMUTATION (node)[i].second - = permutation[SLP_TREE_LANE_PERMUTATION (node)[i].second]; - } -} - - -/* Attempt to reorder stmts in a reduction chain so that we don't - require any load permutation. Return true if that was possible, - otherwise return false. */ - -static bool -vect_attempt_slp_rearrange_stmts (slp_instance slp_instn) -{ - unsigned int i, j; - unsigned int lidx; - slp_tree node, load; - - if (SLP_INSTANCE_LOADS (slp_instn).is_empty ()) - return false; - - /* Compare all the permutation sequences to the first one. We know - that at least one load is permuted. 
*/ - node = SLP_INSTANCE_LOADS (slp_instn)[0]; - if (!SLP_TREE_LOAD_PERMUTATION (node).exists ()) - return false; - unsigned int group_size = SLP_TREE_LOAD_PERMUTATION (node).length (); - for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i) - { - if (!SLP_TREE_LOAD_PERMUTATION (load).exists () - || SLP_TREE_LOAD_PERMUTATION (load).length () != group_size) - return false; - FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (load), j, lidx) - if (lidx != SLP_TREE_LOAD_PERMUTATION (node)[j]) - return false; - } - - /* Check that the loads in the first sequence are different and there - are no gaps between them and that there is an actual permutation. */ - bool any_permute = false; - auto_sbitmap load_index (group_size); - bitmap_clear (load_index); - FOR_EACH_VEC_ELT (node->load_permutation, i, lidx) - { - if (lidx != i) - any_permute = true; - if (lidx >= group_size) - return false; - if (bitmap_bit_p (load_index, lidx)) - return false; - - bitmap_set_bit (load_index, lidx); - } - if (!any_permute) - return false; - for (i = 0; i < group_size; i++) - if (!bitmap_bit_p (load_index, i)) - return false; - - /* This permutation is valid for reduction. Since the order of the - statements in the nodes is not important unless they are memory - accesses, we can rearrange the statements in all the nodes - according to the order of the loads. */ - - /* We have to unshare the SLP tree we modify. */ - hash_map map; - slp_tree unshared = slp_copy_subtree (SLP_INSTANCE_TREE (slp_instn), map); - vect_free_slp_tree (SLP_INSTANCE_TREE (slp_instn)); - SLP_TREE_REF_COUNT (unshared)++; - SLP_INSTANCE_TREE (slp_instn) = unshared; - FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) - SLP_INSTANCE_LOADS (slp_instn)[i] = *map.get (node); - node = SLP_INSTANCE_LOADS (slp_instn)[0]; - - /* Do the actual re-arrangement. */ - hash_set visited; - vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size, - node->load_permutation, visited); - - /* We are done, no actual permutations need to be generated. */ - poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn); - FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node) - { - stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; - first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info); - /* But we have to keep those permutations that are required because - of handling of gaps. */ - if (known_eq (unrolling_factor, 1U) - || (group_size == DR_GROUP_SIZE (first_stmt_info) - && DR_GROUP_GAP (first_stmt_info) == 0)) - SLP_TREE_LOAD_PERMUTATION (node).release (); - else - for (j = 0; j < SLP_TREE_LOAD_PERMUTATION (node).length (); ++j) - SLP_TREE_LOAD_PERMUTATION (node)[j] = j; - } - - return true; -} /* Gather loads in the SLP graph NODE and populate the INST loads array. */ @@ -2458,6 +2276,53 @@ vect_analyze_slp_instance (vec_info *vinfo, return false; } +/* Fill in backedge SLP children in the SLP graph. 
*/ + +static void +vect_analyze_slp_backedges (vec_info *vinfo, slp_tree node, + scalar_stmts_to_slp_tree_map_t *bst_map, + hash_set<slp_tree> &visited) +{ + if (SLP_TREE_DEF_TYPE (node) != vect_internal_def + || visited.add (node)) + return; + + slp_tree child; + unsigned i; + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) + if (child) + vect_analyze_slp_backedges (vinfo, child, bst_map, visited); + + if (gphi *phi = dyn_cast <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt)) + for (unsigned i = 0; i < gimple_phi_num_args (phi); ++i) + { + auto_vec<stmt_vec_info> stmts; + unsigned j; + stmt_vec_info phi_info; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, phi_info) + { + tree def = gimple_phi_arg_def (as_a <gphi *> (phi_info->stmt), i); + stmt_vec_info def_info = vinfo->lookup_def (def); + if (!def_info) + break; + stmts.safe_push (vect_stmt_to_vectorize (def_info)); + } + if (j != SLP_TREE_LANES (node)) + continue; + slp_tree *edge_def = bst_map->get (stmts); + if (edge_def) + { + /* ??? We're currently not recording non-backedge children + of PHIs like external reduction initial values. Avoid + NULL entries in SLP_TREE_CHILDREN for those and thus + for now simply only record backedge defs at a + SLP_TREE_CHILDREN index possibly not matching that of + the corresponding PHI argument index. */ + SLP_TREE_CHILDREN (node).quick_push (*edge_def); + (*edge_def)->refcnt++; + } + } +} /* Check if there are stmts in the loop can be vectorized using SLP. Build SLP trees of packed scalar stmts if SLP is possible. */ @@ -2509,6 +2374,13 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) max_tree_size); } + /* Fill in backedges. */ + slp_instance instance; + hash_set<slp_tree> visited; + FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance) + vect_analyze_slp_backedges (vinfo, SLP_INSTANCE_TREE (instance), + bst_map, visited); + /* The map keeps a reference on SLP nodes built, release that. */ for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin (); it != bst_map->end (); ++it) @@ -2519,34 +2391,396 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size) return opt_result::success (); } +/* Fill the vertices and leafs vector with all nodes in the SLP graph. */ + +static void +vect_slp_build_vertices (hash_set<slp_tree> &visited, slp_tree node, + vec<slp_tree> &vertices, vec<int> &leafs) +{ + unsigned i; + slp_tree child; + + if (visited.add (node)) + return; + + node->vertex = vertices.length (); + vertices.safe_push (node); + if (SLP_TREE_CHILDREN (node).is_empty ()) + leafs.safe_push (node->vertex); + else + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) + vect_slp_build_vertices (visited, child, vertices, leafs); +} + +/* Fill the vertices and leafs vector with all nodes in the SLP graph. */ + +static void +vect_slp_build_vertices (vec_info *info, vec<slp_tree> &vertices, + vec<int> &leafs) +{ + hash_set<slp_tree> visited; + unsigned i; + slp_instance instance; + FOR_EACH_VEC_ELT (info->slp_instances, i, instance) + vect_slp_build_vertices (visited, SLP_INSTANCE_TREE (instance), vertices, + leafs); +} + +/* Apply (reverse) bijective PERM to VEC. 
*/ + +template <typename T> +static void +vect_slp_permute (vec<unsigned> perm, + vec<T> &vec, bool reverse) +{ + auto_vec<T> saved; + saved.create (vec.length ()); + for (unsigned i = 0; i < vec.length (); ++i) + saved.quick_push (vec[i]); + + if (reverse) + { + for (unsigned i = 0; i < vec.length (); ++i) + vec[perm[i]] = saved[i]; + for (unsigned i = 0; i < vec.length (); ++i) + gcc_assert (vec[perm[i]] == saved[i]); + } + else + { + for (unsigned i = 0; i < vec.length (); ++i) + vec[i] = saved[perm[i]]; + for (unsigned i = 0; i < vec.length (); ++i) + gcc_assert (vec[i] == saved[perm[i]]); + } +} + +/* Return whether permutations PERM_A and PERM_B as recorded in the + PERMS vector are equal. */ + +static bool +vect_slp_perms_eq (const vec<vec<unsigned> > &perms, + int perm_a, int perm_b) +{ + return (perm_a == perm_b + || (perms[perm_a].length () == perms[perm_b].length () + && memcmp (&perms[perm_a][0], &perms[perm_b][0], + sizeof (unsigned) * perms[perm_a].length ()) == 0)); } + +/* Optimize the SLP graph of VINFO. */ + void vect_optimize_slp (vec_info *vinfo) { - /* Optimize permutations in SLP reductions. */ - slp_instance instance; + if (vinfo->slp_instances.is_empty ()) + return; + + slp_tree node; unsigned i; - FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance) + auto_vec<slp_tree> vertices; + auto_vec<int> leafs; + vect_slp_build_vertices (vinfo, vertices, leafs); + + struct graph *slpg = new_graph (vertices.length ()); + FOR_EACH_VEC_ELT (vertices, i, node) { - slp_tree node = SLP_INSTANCE_TREE (instance); - stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0]; - /* Reduction (there are no data-refs in the root). - In reduction chain the order of the loads is not important. */ - if (!STMT_VINFO_DATA_REF (stmt_info) - && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) - && !SLP_INSTANCE_ROOT_STMT (instance)) - vect_attempt_slp_rearrange_stmts (instance); + unsigned j; + slp_tree child; + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) + add_edge (slpg, i, child->vertex); } - /* Gather all loads in the SLP graph. */ - auto_vec<slp_tree> slp_loads; - hash_set<slp_tree> visited; - FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance) - vect_gather_slp_loads (slp_loads, SLP_INSTANCE_TREE (instance), - visited); + /* Compute (reverse) postorder on the inverted graph. */ + auto_vec<int> ipo; + graphds_dfs (slpg, &leafs[0], leafs.length (), &ipo, false, NULL, NULL); - slp_tree node; - FOR_EACH_VEC_ELT (slp_loads, i, node) + auto_sbitmap n_visited (vertices.length ()); + auto_sbitmap n_materialize (vertices.length ()); + auto_vec<int> n_perm (vertices.length ()); + auto_vec<vec<unsigned> > perms; + + bitmap_clear (n_visited); + bitmap_clear (n_materialize); + n_perm.quick_grow_cleared (vertices.length ()); + perms.safe_push (vNULL); /* zero is no permute */ + + /* Produce initial permutations. */ + for (i = 0; i < leafs.length (); ++i) + { + int idx = leafs[i]; + slp_tree node = vertices[idx]; + + /* Handle externals and constants optimistically throughout the + iteration. */ + if (SLP_TREE_DEF_TYPE (node) == vect_external_def + || SLP_TREE_DEF_TYPE (node) == vect_constant_def) + continue; + + /* Loads are the only thing generating permutes and leafs do not + change across iterations. */ + bitmap_set_bit (n_visited, idx); + if (!SLP_TREE_LOAD_PERMUTATION (node).exists ()) + continue; + + /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the + node unpermuted, record this permute. 
*/ + stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node); + if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt)) + continue; + dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt); + unsigned imin = DR_GROUP_SIZE (dr_stmt) + 1, imax = 0; + bool any_permute = false; + for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j) + { + unsigned idx = SLP_TREE_LOAD_PERMUTATION (node)[j]; + imin = MIN (imin, idx); + imax = MAX (imax, idx); + if (idx - SLP_TREE_LOAD_PERMUTATION (node)[0] != j) + any_permute = true; + } + /* If there's no permute no need to split one out. */ + if (!any_permute) + continue; + /* If the span doesn't match we'd disrupt VF computation, avoid + that for now. */ + if (imax - imin + 1 != SLP_TREE_LANES (node)) + continue; + + /* For now only handle true permutes, like + vect_attempt_slp_rearrange_stmts did. This allows us to be lazy + when permuting constants and invariants keeping the permute + bijective. */ + auto_sbitmap load_index (SLP_TREE_LANES (node)); + bitmap_clear (load_index); + for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j) + bitmap_set_bit (load_index, SLP_TREE_LOAD_PERMUTATION (node)[j] - imin); + unsigned j; + for (j = 0; j < SLP_TREE_LANES (node); ++j) + if (!bitmap_bit_p (load_index, j)) + break; + if (j != SLP_TREE_LANES (node)) + continue; + + vec<unsigned> perm = vNULL; + perm.safe_grow (SLP_TREE_LANES (node), true); + for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j) + perm[j] = SLP_TREE_LOAD_PERMUTATION (node)[j] - imin; + perms.safe_push (perm); + n_perm[idx] = perms.length () - 1; + } + + /* Propagate permutes along the graph and compute materialization points. */ + bool changed; + unsigned iteration = 0; + do + { + changed = false; + ++iteration; + + for (i = vertices.length (); i > 0 ; --i) + { + int idx = ipo[i-1]; + slp_tree node = vertices[idx]; + /* For leafs there's nothing to do - we've seeded permutes + on those above. */ + if (SLP_TREE_DEF_TYPE (node) != vect_internal_def) + continue; + + bitmap_set_bit (n_visited, idx); + + /* We cannot move a permute across a store. */ + if (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node)) + && DR_IS_WRITE + (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (node)))) + continue; + + int perm = -1; + for (graph_edge *succ = slpg->vertices[idx].succ; + succ; succ = succ->succ_next) + { + int succ_idx = succ->dest; + /* Handle unvisited nodes optimistically. */ + /* ??? But for constants once we want to handle non-bijective + permutes we have to verify the permute, when unifying lanes, + will not unify different constants. For example see + gcc.dg/vect/bb-slp-14.c for a case that would break. */ + if (!bitmap_bit_p (n_visited, succ_idx)) + continue; + int succ_perm = n_perm[succ_idx]; + /* Once we materialize the succ's permutation its output lanes + appear unpermuted to us. */ + if (bitmap_bit_p (n_materialize, succ_idx)) + succ_perm = 0; + if (perm == -1) + perm = succ_perm; + else if (succ_perm == 0) + { + perm = 0; + break; + } + else if (!vect_slp_perms_eq (perms, perm, succ_perm)) + { + perm = 0; + break; + } + } + + if (perm == -1) + /* Pick up pre-computed leaf values. */ + perm = n_perm[idx]; + else if (!vect_slp_perms_eq (perms, perm, n_perm[idx])) + { + if (iteration > 1) + /* Make sure we eventually converge. */ + gcc_checking_assert (perm == 0); + n_perm[idx] = perm; + if (perm == 0) + bitmap_clear_bit (n_materialize, idx); + changed = true; + } + + if (perm == 0) + continue; + + /* Elide pruning at materialization points in the first + iteration so every node was visited once at least. 
*/ + if (iteration == 1) + continue; + + /* Decide on permute materialization. Look whether there's + a use (pred) edge that is permuted differently than us. + In that case mark ourselves so the permutation is applied. */ + bool all_preds_permuted = slpg->vertices[idx].pred != NULL; + for (graph_edge *pred = slpg->vertices[idx].pred; + pred; pred = pred->pred_next) + { + gcc_checking_assert (bitmap_bit_p (n_visited, pred->src)); + int pred_perm = n_perm[pred->src]; + if (!vect_slp_perms_eq (perms, perm, pred_perm)) + { + all_preds_permuted = false; + break; + } + } + if (!all_preds_permuted) + { + if (!bitmap_bit_p (n_materialize, idx)) + changed = true; + bitmap_set_bit (n_materialize, idx); + } + } + } + while (changed || iteration == 1); + + /* Materialize. */ + for (i = 0; i < vertices.length (); ++i) + { + int perm = n_perm[i]; + if (perm <= 0) + continue; + + slp_tree node = vertices[i]; + + /* First permute invariant/external original successors. */ + unsigned j; + slp_tree child; + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child) + { + if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) + continue; + + /* If the vector is uniform there's nothing to do. */ + if (vect_slp_tree_uniform_p (child)) + continue; + + /* We can end up sharing some externals via two_operator + handling. Be prepared to unshare those. */ + if (child->refcnt != 1) + { + gcc_assert (slpg->vertices[child->vertex].pred->pred_next); + SLP_TREE_CHILDREN (node)[j] = child + = vect_create_new_slp_node + (SLP_TREE_SCALAR_OPS (child).copy ()); + } + vect_slp_permute (perms[perm], + SLP_TREE_SCALAR_OPS (child), true); + } + + if (bitmap_bit_p (n_materialize, i)) + { + if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) + /* For loads simply drop the permutation, the load permutation + already performs the desired permutation. */ + ; + else + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "inserting permute node in place of %p\n", + node); + + /* Make a copy of NODE and in-place change it to a + VEC_PERM node to permute the lanes of the copy. */ + slp_tree copy = new _slp_tree; + SLP_TREE_CHILDREN (copy) = SLP_TREE_CHILDREN (node); + SLP_TREE_CHILDREN (node) = vNULL; + SLP_TREE_SCALAR_STMTS (copy) + = SLP_TREE_SCALAR_STMTS (node).copy (); + vect_slp_permute (perms[perm], + SLP_TREE_SCALAR_STMTS (copy), true); + gcc_assert (!SLP_TREE_SCALAR_OPS (node).exists ()); + SLP_TREE_REPRESENTATIVE (copy) = SLP_TREE_REPRESENTATIVE (node); + gcc_assert (!SLP_TREE_LOAD_PERMUTATION (node).exists ()); + SLP_TREE_LANE_PERMUTATION (copy) + = SLP_TREE_LANE_PERMUTATION (node); + SLP_TREE_LANE_PERMUTATION (node) = vNULL; + SLP_TREE_VECTYPE (copy) = SLP_TREE_VECTYPE (node); + copy->refcnt = 1; + copy->max_nunits = node->max_nunits; + SLP_TREE_DEF_TYPE (copy) = SLP_TREE_DEF_TYPE (node); + SLP_TREE_LANES (copy) = SLP_TREE_LANES (node); + SLP_TREE_CODE (copy) = SLP_TREE_CODE (node); + + /* Now turn NODE into a VEC_PERM. */ + SLP_TREE_CHILDREN (node).safe_push (copy); + SLP_TREE_LANE_PERMUTATION (node).create (SLP_TREE_LANES (node)); + for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j) + SLP_TREE_LANE_PERMUTATION (node) + .quick_push (std::make_pair (0, perms[perm][j])); + SLP_TREE_CODE (node) = VEC_PERM_EXPR; + } + } + else + { + /* Apply the reverse permutation to our stmts. */ + vect_slp_permute (perms[perm], + SLP_TREE_SCALAR_STMTS (node), true); + /* And to the load permutation, which we can simply + make regular by design. */ + if (SLP_TREE_LOAD_PERMUTATION (node).exists ()) + { + /* ??? 
When we handle non-bijective permutes the idea + is that we can force the load-permutation to be + { min, min + 1, min + 2, ... max }. But then the + scalar defs might no longer match the lane content + which means wrong-code with live lane vectorization. + So we possibly have to have NULL entries for those. */ + vect_slp_permute (perms[perm], + SLP_TREE_LOAD_PERMUTATION (node), true); + } + } + } + + /* Free the perms vector used for propagation. */ + while (!perms.is_empty ()) + perms.pop ().release (); + free_graph (slpg); + + + /* Now elide load permutations that are not necessary. */ + for (i = 0; i < leafs.length (); ++i) { + node = vertices[i]; if (!SLP_TREE_LOAD_PERMUTATION (node).exists ()) continue; @@ -2593,7 +2827,8 @@ vect_optimize_slp (vec_info *vinfo) /* The load requires permutation when unrolling exposes a gap either because the group is larger than the SLP group-size or because there is a gap between the groups. */ - && (known_eq (LOOP_VINFO_VECT_FACTOR (as_a <loop_vec_info> (vinfo)), 1U) + && (known_eq (LOOP_VINFO_VECT_FACTOR + (as_a <loop_vec_info> (vinfo)), 1U) || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info)) && DR_GROUP_GAP (first_stmt_info) == 0))) { @@ -2975,12 +3210,9 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node, return true; /* If we already analyzed the exact same set of scalar stmts we're done. - We share the generated vector stmts for those. - The SLP graph is acyclic so not caching whether we failed or succeeded - doesn't result in any issue since we throw away the lvisited set - when we fail. */ + We share the generated vector stmts for those. */ if (visited.contains (node) - || lvisited.contains (node)) + || lvisited.add (node)) return true; bool res = true; @@ -2993,12 +3225,10 @@ vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node, } if (res) - { - res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance, - cost_vec); - if (res) - lvisited.add (node); - } + res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance, + cost_vec); + if (!res) + lvisited.remove (node); /* When the node can be vectorized cost invariant nodes it references. This is not done in DFS order to allow the refering node @@ -4685,7 +4915,8 @@ vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi, static void vect_schedule_slp_instance (vec_info *vinfo, - slp_tree node, slp_instance instance) + slp_tree node, slp_instance instance, + hash_set<slp_tree> &visited) { gimple_stmt_iterator si; int i; @@ -4712,8 +4943,12 @@ vect_schedule_slp_instance (vec_info *vinfo, return; } + /* ??? If we'd have a way to mark backedges that would be cheaper. */ + if (visited.add (node)) + return; + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) - vect_schedule_slp_instance (vinfo, child, instance); + vect_schedule_slp_instance (vinfo, child, instance, visited); gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0); SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node)); @@ -4737,14 +4972,13 @@ vect_schedule_slp_instance (vec_info *vinfo, last_stmt_info = vect_find_last_scalar_stmt_in_slp (node); si = gsi_for_stmt (last_stmt_info->stmt); } - else if (SLP_TREE_CHILDREN (node).is_empty ()) + else if ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node)) + == cycle_phi_info_type) + || (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node)) + == induc_vec_info_type)) { - /* This happens for reduction and induction PHIs where we do not use the + /* For reduction and induction PHIs we do not use the insertion iterator. 
*/ - gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node)) - == cycle_phi_info_type - || (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node)) - == induc_vec_info_type)); si = gsi_none (); } else @@ -4957,6 +5191,7 @@ vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances) slp_instance instance; unsigned int i; + hash_set<slp_tree> visited; FOR_EACH_VEC_ELT (slp_instances, i, instance) { slp_tree node = SLP_INSTANCE_TREE (instance); @@ -4971,7 +5206,7 @@ vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances) SLP_INSTANCE_TREE (instance)); } /* Schedule the tree of INSTANCE. */ - vect_schedule_slp_instance (vinfo, node, instance); + vect_schedule_slp_instance (vinfo, node, instance, visited); if (SLP_INSTANCE_ROOT_STMT (instance)) vectorize_slp_instance_root_stmt (node, instance); diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 38daa05aebb..2a8c4a56a8a 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -161,6 +161,8 @@ struct _slp_tree { unsigned int lanes; /* The operation of this node. */ enum tree_code code; + + int vertex; }; -- 2.30.2
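(Illustrative note, not part of the patch.)  A minimal standalone C sketch of the lane shuffling that the new vect_slp_permute helper performs, using hypothetical names and a fixed upper bound on the lane count: the forward form gathers vec[i] = saved[perm[i]], the reverse form scatters vec[perm[i]] = saved[i], and for a bijective PERM the two compose to the identity, which is what lets the optimization move a permutation around and later undo it.

#include <assert.h>
#include <string.h>

/* Hypothetical stand-in for vect_slp_permute: apply the bijective
   permutation PERM of length N to VEC, either forward or in reverse.
   Assumes N <= 16 for the local copy.  */
static void
permute_lanes (const unsigned *perm, int *vec, unsigned n, int reverse)
{
  int saved[16];
  memcpy (saved, vec, n * sizeof (int));
  for (unsigned i = 0; i < n; ++i)
    if (reverse)
      vec[perm[i]] = saved[i];	/* scatter, undoes the forward form */
    else
      vec[i] = saved[perm[i]];	/* gather, the forward form */
}

int
main (void)
{
  unsigned perm[4] = { 1, 0, 3, 2 };
  int lanes[4] = { 10, 11, 12, 13 };
  permute_lanes (perm, lanes, 4, 0);
  assert (lanes[0] == 11 && lanes[1] == 10
	  && lanes[2] == 13 && lanes[3] == 12);
  permute_lanes (perm, lanes, 4, 1);
  assert (lanes[0] == 10 && lanes[1] == 11
	  && lanes[2] == 12 && lanes[3] == 13);
  return 0;
}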