From 054fc495fac5478a119715d10e1dd76442851bb5 Mon Sep 17 00:00:00 2001
From: Richard Biener
Date: Fri, 11 Sep 2020 09:57:18 +0200
Subject: [PATCH] improve BB vectorization dump locations

This tries to improve BB vectorization dumps by providing more
precise locations.  Currently the vect_location is simply the
very last stmt in a basic-block that has a location.  So for

double a[4], b[4];
int x[4], y[4];
void foo()
{
  a[0] = b[0]; // line 5
  a[1] = b[1];
  a[2] = b[2];
  a[3] = b[3];
  x[0] = y[0]; // line 9
  x[1] = y[1];
  x[2] = y[2];
  x[3] = y[3];
} // line 13

we show the user with -O3 -fopt-info-vec

t.c:13:1: optimized: basic block part vectorized using 16 byte vectors

while with the patch we point to both independently vectorized
opportunities:

t.c:5:8: optimized: basic block part vectorized using 16 byte vectors
t.c:9:8: optimized: basic block part vectorized using 16 byte vectors

There's the possibility that the location regresses in case the
root stmt in the SLP instance has no location.  For an SLP subgraph
with multiple entries the location also chooses one entry at random,
not sure in which case we want to dump both.

Still, as the plan is to extend the basic-block vectorization
scope from single basic-block to multiple ones this is a first
step to preserve something sensible.

Implementation-wise this makes both costing and code-generation
happen on the subgraphs as analyzed.

2020-09-11  Richard Biener

	* tree-vectorizer.h (_slp_instance::location): New method.
	(vect_schedule_slp): Adjust prototype.
	* tree-vectorizer.c (vec_info::remove_stmt): Adjust
	the BB region begin if we removed the stmt it points to.
	* tree-vect-loop.c (vect_transform_loop): Adjust.
	* tree-vect-slp.c (_slp_instance::location): Implement.
	(vect_analyze_slp_instance): For BB vectorization set
	vect_location to that of the instance.
	(vect_slp_analyze_operations): Likewise.
	(vect_bb_vectorization_profitable_p): Remove wrapper.
	(vect_slp_analyze_bb_1): Remove cost check here.
(vect_slp_region): Cost check and code generate subgraphs separately, report optimized locations and missed optimizations due to profitability for each of them. (vect_schedule_slp): Get the vector of SLP graph entries to vectorize as argument. --- gcc/tree-vect-loop.c | 2 +- gcc/tree-vect-slp.c | 138 +++++++++++++++++++----------------------- gcc/tree-vectorizer.c | 8 ++- gcc/tree-vectorizer.h | 4 +- 4 files changed, 73 insertions(+), 79 deletions(-) diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c index 80e78f7adf4..c95ec5ad267 100644 --- a/gcc/tree-vect-loop.c +++ b/gcc/tree-vect-loop.c @@ -9018,7 +9018,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call) if (!loop_vinfo->slp_instances.is_empty ()) { DUMP_VECT_SCOPE ("scheduling SLP instances"); - vect_schedule_slp (loop_vinfo); + vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo)); } /* FORNOW: the vectorizer supports only loops which body consist diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c index 35bde9bcb9d..519cd6a7254 100644 --- a/gcc/tree-vect-slp.c +++ b/gcc/tree-vect-slp.c @@ -117,6 +117,18 @@ vect_free_slp_tree (slp_tree node, bool final_p) delete node; } +/* Return a location suitable for dumpings related to the SLP instance. */ + +dump_user_location_t +_slp_instance::location () const +{ + if (root_stmt) + return root_stmt->stmt; + else + return SLP_TREE_SCALAR_STMTS (root)[0]->stmt; +} + + /* Free the memory allocated for the SLP instance. FINAL_P is true if we have vectorized the instance or if we have made a final decision not to vectorize the statements in any way. 
*/ @@ -2121,6 +2133,8 @@ vect_analyze_slp_instance (vec_info *vinfo, vec scalar_stmts; bool constructor = false; + if (is_a (vinfo)) + vect_location = stmt_info->stmt; if (STMT_VINFO_GROUPED_ACCESS (stmt_info)) { scalar_type = TREE_TYPE (DR_REF (dr)); @@ -3120,6 +3134,8 @@ vect_slp_analyze_operations (vec_info *vinfo) hash_set lvisited; stmt_vector_for_cost cost_vec; cost_vec.create (2); + if (is_a (vinfo)) + vect_location = instance->location (); if (!vect_slp_analyze_node_operations (vinfo, SLP_INSTANCE_TREE (instance), instance, visited, lvisited, @@ -3157,8 +3173,11 @@ vect_slp_analyze_operations (vec_info *vinfo) { hash_set svisited; for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i) - vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance), - instance, &instance->cost_vec, svisited); + { + vect_location = instance->location (); + vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance), + instance, &instance->cost_vec, svisited); + } } return !vinfo->slp_instances.is_empty (); @@ -3435,54 +3454,6 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo, return true; } -/* For each SLP subgraph determine profitability and remove parts not so. - Returns true if any profitable to vectorize subgraph remains. */ - -static bool -vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo) -{ - slp_instance instance; - unsigned i; - - auto_vec subgraphs (BB_VINFO_SLP_INSTANCES (bb_vinfo).length ()); - FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance) - if (!instance->subgraph_entries.is_empty ()) - subgraphs.quick_push (instance); - BB_VINFO_SLP_INSTANCES (bb_vinfo).truncate (0); - for (i = 0; i < subgraphs.length ();) - { - instance = subgraphs[i]; - if (!vect_bb_vectorization_profitable_p (bb_vinfo, - instance->subgraph_entries)) - { - /* ??? We need to think of providing better dump/opt-report - locations here. 
*/ - if (dump_enabled_p ()) - { - dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, - "not vectorized: vectorization is not " - "profitable.\n"); - } - slp_instance entry; - unsigned j; - FOR_EACH_VEC_ELT (instance->subgraph_entries, j, entry) - if (entry != instance) - vect_free_slp_instance (entry, false); - vect_free_slp_instance (instance, false); - subgraphs.ordered_remove (i); - } - else - { - slp_instance entry; - unsigned j; - FOR_EACH_VEC_ELT (instance->subgraph_entries, j, entry) - BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (entry); - ++i; - } - } - return !BB_VINFO_SLP_INSTANCES (bb_vinfo).is_empty (); -} - /* Find any vectorizable constructors and add them to the grouped_store array. */ @@ -3590,6 +3561,7 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal, dependence in the SLP instances. */ for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); ) { + vect_location = instance->location (); if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance) || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance)) { @@ -3626,14 +3598,6 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal, vect_bb_partition_graph (bb_vinfo); - /* Cost model: check if the vectorization opportunities are worthwhile. 
*/ - if (!unlimited_cost_model (NULL) - && !vect_bb_vectorization_profitable_p (bb_vinfo)) - return false; - - if (dump_enabled_p ()) - dump_printf_loc (MSG_NOTE, vect_location, - "Basic block will be vectorized using SLP\n"); return true; } @@ -3686,22 +3650,48 @@ vect_slp_region (gimple_stmt_iterator region_begin, } bb_vinfo->shared->check_datarefs (); - vect_schedule_slp (bb_vinfo); - unsigned HOST_WIDE_INT bytes; - if (dump_enabled_p ()) + unsigned i; + slp_instance instance; + FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance) { - if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes)) - dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, - "basic block part vectorized using %wu byte " - "vectors\n", bytes); - else - dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, - "basic block part vectorized using variable " - "length vectors\n"); - } + if (instance->subgraph_entries.is_empty ()) + continue; - vectorized = true; + vect_location = instance->location (); + if (!unlimited_cost_model (NULL) + && !vect_bb_vectorization_profitable_p + (bb_vinfo, instance->subgraph_entries)) + { + if (dump_enabled_p ()) + dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, + "not vectorized: vectorization is not " + "profitable.\n"); + continue; + } + + if (!vectorized && dump_enabled_p ()) + dump_printf_loc (MSG_NOTE, vect_location, + "Basic block will be vectorized " + "using SLP\n"); + vectorized = true; + + vect_schedule_slp (bb_vinfo, instance->subgraph_entries); + + unsigned HOST_WIDE_INT bytes; + if (dump_enabled_p ()) + { + if (GET_MODE_SIZE + (bb_vinfo->vector_mode).is_constant (&bytes)) + dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, + "basic block part vectorized using %wu " + "byte vectors\n", bytes); + else + dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, + "basic block part vectorized using " + "variable length vectors\n"); + } + } } else { @@ -4828,16 +4818,14 @@ vectorize_slp_instance_root_stmt 
(slp_tree node, slp_instance instance) gsi_replace (&rgsi, rstmt, true); } -/* Generate vector code for all SLP instances in the loop/basic block. */ +/* Generate vector code for SLP_INSTANCES in the loop/basic block. */ void -vect_schedule_slp (vec_info *vinfo) +vect_schedule_slp (vec_info *vinfo, vec slp_instances) { - vec slp_instances; slp_instance instance; unsigned int i; - slp_instances = vinfo->slp_instances; FOR_EACH_VEC_ELT (slp_instances, i, instance) { slp_tree node = SLP_INSTANCE_TREE (instance); diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index 3c60f30ed8f..bbe2de56365 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -603,9 +603,13 @@ vec_info::remove_stmt (stmt_vec_info stmt_info) { gcc_assert (!stmt_info->pattern_stmt_p); set_vinfo_for_stmt (stmt_info->stmt, NULL); - gimple_stmt_iterator si = gsi_for_stmt (stmt_info->stmt); unlink_stmt_vdef (stmt_info->stmt); - gsi_remove (&si, true); + gimple_stmt_iterator si = gsi_for_stmt (stmt_info->stmt); + gimple_stmt_iterator *psi = &si; + if (bb_vec_info bb_vinfo = dyn_cast (this)) + if (gsi_stmt (bb_vinfo->region_begin) == stmt_info->stmt) + psi = &bb_vinfo->region_begin; + gsi_remove (psi, true); release_defs (stmt_info->stmt); free_stmt_vec_info (stmt_info); } diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 8bf33137395..6c29ee6cfed 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -190,6 +190,8 @@ public: /* If this instance is the main entry of a subgraph the set of entries into the same subgraph, including itself. 
*/ vec<_slp_instance *> subgraph_entries; + + dump_user_location_t location () const; } *slp_instance; @@ -2027,7 +2029,7 @@ extern bool vect_transform_slp_perm_load (vec_info *, slp_tree, vec, gimple_stmt_iterator *, poly_uint64, bool, unsigned *); extern bool vect_slp_analyze_operations (vec_info *); -extern void vect_schedule_slp (vec_info *); +extern void vect_schedule_slp (vec_info *, vec); extern opt_result vect_analyze_slp (vec_info *, unsigned); extern bool vect_make_slp_decision (loop_vec_info); extern void vect_detect_hybrid_slp (loop_vec_info); -- 2.30.2