From 47ddf4c7b1d4471cb9534f27844ab5e4279c2168 Mon Sep 17 00:00:00 2001 From: Richard Biener Date: Tue, 8 Sep 2020 14:49:59 +0200 Subject: [PATCH] tree-optimization/96043 - BB vectorization costing improvement This makes the BB vectorizer cost independent SLP subgraphs separately. While on pristine trunk and for x86_64 I failed to distill a testcase where the vectorizer would think _any_ basic-block vectorization opportunity is not profitable I do have pending work that would make the cost savings of a profitable opportunity make another independently not profitable opportunity vectorized. 2020-09-08 Richard Biener PR tree-optimization/96043 * tree-vectorizer.h (_slp_instance::cost_vec): New. (_slp_instance::subgraph_entries): Likewise. (BB_VINFO_TARGET_COST_DATA): Remove. * tree-vect-slp.c (vect_free_slp_instance): Free cost_vec and subgraph_entries. (vect_analyze_slp_instance): Initialize them. (vect_slp_analyze_operations): Defer passing costs to the target, instead record them in the SLP graph entry. (get_ultimate_leader): New helper for graph partitioning. (vect_bb_partition_graph_r): Likewise. (vect_bb_partition_graph): New function to partition the SLP graph into independently costable parts. (vect_bb_vectorization_profitable_p): Adjust to work on a subgraph. (vect_bb_vectorization_profitable_p): New wrapper, discarding non-profitable vectorization of subgraphs. (vect_slp_analyze_bb_1): Call vect_bb_partition_graph before costing. * gcc.dg/vect/costmodel/x86_64/costmodel-pr69297.c: Adjust. --- .../vect/costmodel/x86_64/costmodel-pr69297.c | 20 +- gcc/tree-vect-slp.c | 185 ++++++++++++++++-- gcc/tree-vectorizer.h | 8 +- 3 files changed, 194 insertions(+), 19 deletions(-) diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr69297.c b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr69297.c index e65a30c06d6..ef74785f6a8 100644 --- a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr69297.c +++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr69297.c @@ -74,10 +74,28 @@ foo (int* diff) d[13] = m[12] - m[13]; d[14] = m[14] + m[15]; d[15] = m[15] - m[14]; + /* The following obviously profitable part should not make + the former unprofitable one profitable. */ + diff[16 + 16] = diff[16]; + diff[17 + 16] = diff[17]; + diff[18 + 16] = diff[18]; + diff[19 + 16] = diff[19]; + diff[20 + 16] = diff[20]; + diff[21 + 16] = diff[21]; + diff[22 + 16] = diff[22]; + diff[23 + 16] = diff[23]; + diff[24 + 16] = diff[24]; + diff[25 + 16] = diff[25]; + diff[26 + 16] = diff[26]; + diff[27 + 16] = diff[27]; + diff[28 + 16] = diff[28]; + diff[29 + 16] = diff[29]; + diff[30 + 16] = diff[30]; + diff[31 + 16] = diff[31]; for (k=0; k<16; k++) satd += abs(d[k]); return satd; } /* { dg-final { scan-tree-dump "vectorization is not profitable" "slp1" } } */ -/* { dg-final { scan-tree-dump-not "basic block vectorized" "slp1" } } */ +/* { dg-final { scan-tree-dump-times "Vectorizing SLP tree" 1 "slp1" } } */ diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 7860fe3f32b..458a2c5bb30 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -126,6 +126,8 @@ vect_free_slp_instance (slp_instance instance, bool final_p) { vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p); SLP_INSTANCE_LOADS (instance).release (); + instance->subgraph_entries.release (); + instance->cost_vec.release (); free (instance); } @@ -2263,6 +2265,8 @@ vect_analyze_slp_instance (vec_info *vinfo, SLP_INSTANCE_LOADS (new_instance) = vNULL; SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL; new_instance->reduc_phis = NULL; + new_instance->cost_vec = vNULL; + new_instance->subgraph_entries = vNULL; vect_gather_slp_loads (new_instance, node); if (dump_enabled_p ()) @@ -3133,8 +3137,8 @@ vect_slp_analyze_operations (vec_info *vinfo) visited.add (*x); i++; - add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec); - cost_vec.release (); + /* Remember the SLP graph entry cost for later. */ + instance->cost_vec = cost_vec; } } @@ -3142,18 +3146,108 @@ vect_slp_analyze_operations (vec_info *vinfo) if (bb_vec_info bb_vinfo = dyn_cast (vinfo)) { hash_set svisited; - stmt_vector_for_cost cost_vec; - cost_vec.create (2); for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i) vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance), - instance, &cost_vec, svisited); - add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec); - cost_vec.release (); + instance, &instance->cost_vec, svisited); } return !vinfo->slp_instances.is_empty (); } +/* Get the SLP instance leader from INSTANCE_LEADER thereby transitively + closing the eventual chain. */ + +static slp_instance +get_ultimate_leader (slp_instance instance, + hash_map &instance_leader) +{ + auto_vec chain; + slp_instance *tem; + while (*(tem = instance_leader.get (instance)) != instance) + { + chain.safe_push (tem); + instance = *tem; + } + while (!chain.is_empty ()) + *chain.pop () = instance; + return instance; +} + +/* Worker of vect_bb_partition_graph, recurse on NODE. */ + +static void +vect_bb_partition_graph_r (bb_vec_info bb_vinfo, + slp_instance instance, slp_tree node, + hash_map &stmt_to_instance, + hash_map &instance_leader) +{ + stmt_vec_info stmt_info; + unsigned i; + bool all = true; + FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info) + { + bool existed_p; + slp_instance &stmt_instance + = stmt_to_instance.get_or_insert (stmt_info, &existed_p); + if (!existed_p) + all = false; + else if (stmt_instance != instance) + { + /* If we're running into a previously marked stmt make us the + leader of the current ultimate leader. This keeps the + leader chain acyclic and works even when the current instance + connects two previously independent graph parts. */ + slp_instance stmt_leader + = get_ultimate_leader (stmt_instance, instance_leader); + if (stmt_leader != instance) + instance_leader.put (stmt_leader, instance); + } + stmt_instance = instance; + } + /* If not all stmts had been visited we have to recurse on children. */ + if (all) + return; + + slp_tree child; + FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child) + if (SLP_TREE_DEF_TYPE (child) == vect_internal_def) + vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance, + instance_leader); +} + +/* Partition the SLP graph into pieces that can be costed independently. */ + +static void +vect_bb_partition_graph (bb_vec_info bb_vinfo) +{ + DUMP_VECT_SCOPE ("vect_bb_partition_graph"); + + /* First walk the SLP graph assigning each involved scalar stmt a + corresponding SLP graph entry and upon visiting a previously + marked stmt, make the stmts leader the current SLP graph entry. */ + hash_map stmt_to_instance; + hash_map instance_leader; + slp_instance instance; + for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i) + { + instance_leader.put (instance, instance); + vect_bb_partition_graph_r (bb_vinfo, + instance, SLP_INSTANCE_TREE (instance), + stmt_to_instance, instance_leader); + } + + /* Then collect entries to each independent subgraph. */ + for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i) + { + slp_instance leader = get_ultimate_leader (instance, instance_leader); + leader->subgraph_entries.safe_push (instance); + if (dump_enabled_p () + && leader != instance) + dump_printf_loc (MSG_NOTE, vect_location, + "instance %p is leader of %p\n", + leader, instance); + } +} /* Compute the scalar cost of the SLP node NODE and its children and return it. Do not account defs that are marked in LIFE and @@ -3261,18 +3355,22 @@ vect_bb_slp_scalar_cost (vec_info *vinfo, } } -/* Check if vectorization of the basic block is profitable. */ +/* Check if vectorization of the basic block is profitable for the + subgraph denoted by SLP_INSTANCES. */ static bool -vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo) +vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo, + vec slp_instances) { - vec slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo); slp_instance instance; int i; unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0; unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0; - /* Calculate scalar cost. */ + void *vect_target_cost_data = init_cost (NULL); + + /* Calculate scalar cost and sum the cost for the vector stmts + previously collected. */ stmt_vector_for_cost scalar_costs; scalar_costs.create (0); hash_set visited; @@ -3284,22 +3382,25 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo) vect_bb_slp_scalar_cost (bb_vinfo, SLP_INSTANCE_TREE (instance), &life, &scalar_costs, visited); + add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec); + instance->cost_vec.release (); } /* Unset visited flag. */ stmt_info_for_cost *si; FOR_EACH_VEC_ELT (scalar_costs, i, si) gimple_set_visited (si->stmt_info->stmt, false); - void *target_cost_data = init_cost (NULL); - add_stmt_costs (bb_vinfo, target_cost_data, &scalar_costs); + void *scalar_target_cost_data = init_cost (NULL); + add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs); scalar_costs.release (); unsigned dummy; - finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy); - destroy_cost_data (target_cost_data); + finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy); + destroy_cost_data (scalar_target_cost_data); - /* Complete the target-specific cost calculation. */ - finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost, + /* Complete the target-specific vector cost calculation. */ + finish_cost (vect_target_cost_data, &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost); + destroy_cost_data (vect_target_cost_data); vec_outside_cost = vec_prologue_cost + vec_epilogue_cost; @@ -3324,6 +3425,54 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo) return true; } +/* For each SLP subgraph determine profitability and remove parts not so. + Returns true if any profitable to vectorize subgraph remains. */ + +static bool +vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo) +{ + slp_instance instance; + unsigned i; + + auto_vec subgraphs (BB_VINFO_SLP_INSTANCES (bb_vinfo).length ()); + FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance) + if (!instance->subgraph_entries.is_empty ()) + subgraphs.quick_push (instance); + BB_VINFO_SLP_INSTANCES (bb_vinfo).truncate (0); + for (i = 0; i < subgraphs.length ();) + { + instance = subgraphs[i]; + if (!vect_bb_vectorization_profitable_p (bb_vinfo, + instance->subgraph_entries)) + { + /* ??? We need to think of providing better dump/opt-report + locations here. */ + if (dump_enabled_p ()) + { + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: vectorization is not " + "profitable.\n"); + } + slp_instance entry; + unsigned j; + FOR_EACH_VEC_ELT (instance->subgraph_entries, j, entry) + if (entry != instance) + vect_free_slp_instance (entry, false); + vect_free_slp_instance (instance, false); + subgraphs.ordered_remove (i); + } + else + { + slp_instance entry; + unsigned j; + FOR_EACH_VEC_ELT (instance->subgraph_entries, j, entry) + BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (entry); + ++i; + } + } + return !BB_VINFO_SLP_INSTANCES (bb_vinfo).is_empty (); +} + /* Find any vectorizable constructors and add them to the grouped_store array. */ @@ -3465,6 +3614,8 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal, return false; } + vect_bb_partition_graph (bb_vinfo); + /* Cost model: check if the vectorization is worthwhile. */ if (!unlimited_cost_model (NULL) && !vect_bb_vectorization_profitable_p (bb_vinfo)) diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index fb1111c8615..8bf33137395 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -183,6 +183,13 @@ public: /* The SLP node containing the reduction PHIs. */ slp_tree reduc_phis; + + /* Vector cost of this entry to the SLP graph. */ + stmt_vector_for_cost cost_vec; + + /* If this instance is the main entry of a subgraph the set of + entries into the same subgraph, including itself. */ + vec<_slp_instance *> subgraph_entries; } *slp_instance; @@ -913,7 +920,6 @@ public: #define BB_VINFO_SLP_INSTANCES(B) (B)->slp_instances #define BB_VINFO_DATAREFS(B) (B)->shared->datarefs #define BB_VINFO_DDRS(B) (B)->shared->ddrs -#define BB_VINFO_TARGET_COST_DATA(B) (B)->target_cost_data static inline bb_vec_info vec_info_for_bb (basic_block bb) -- 2.30.2