improve BB vectorization dump locations
authorRichard Biener <rguenther@suse.de>
Fri, 11 Sep 2020 07:57:18 +0000 (09:57 +0200)
committerRichard Biener <rguenther@suse.de>
Fri, 11 Sep 2020 09:29:18 +0000 (11:29 +0200)
This tries to improve BB vectorization dumps by providing more
precise locations.  Currently the vect_location is simply the
very last stmt in a basic-block that has a location.  So for

double a[4], b[4];
int x[4], y[4];
void foo()
{
  a[0] = b[0]; // line 5
  a[1] = b[1];
  a[2] = b[2];
  a[3] = b[3];
  x[0] = y[0]; // line 9
  x[1] = y[1];
  x[2] = y[2];
  x[3] = y[3];
} // line 13

we show the user with -O3 -fopt-info-vec

t.c:13:1: optimized: basic block part vectorized using 16 byte vectors

while with the patch we point to both independently vectorized
opportunities:

t.c:5:8: optimized: basic block part vectorized using 16 byte vectors
t.c:9:8: optimized: basic block part vectorized using 16 byte vectors

there's the possibility that the location regresses in case the
root stmt in the SLP instance has no location.  For a SLP subgraph
with multiple entries the location also chooses one entry at random,
not sure in which case we want to dump both.

Still as the plan is to extend the basic-block vectorization
scope from single basic-block to multiple ones this is a first
step to preserve something sensible.

Implementation-wise this makes both costing and code-generation
happen on the subgraphs as analyzed.

2020-09-11  Richard Biener  <rguenther@suse.de>

* tree-vectorizer.h (_slp_instance::location): New method.
(vect_schedule_slp): Adjust prototype.
* tree-vectorizer.c (vec_info::remove_stmt): Adjust
the BB region begin if we removed the stmt it points to.
* tree-vect-loop.c (vect_transform_loop): Adjust.
* tree-vect-slp.c (_slp_instance::location): Implement.
(vect_analyze_slp_instance): For BB vectorization set
vect_location to that of the instance.
(vect_slp_analyze_operations): Likewise.
(vect_bb_vectorization_profitable_p): Remove wrapper.
(vect_slp_analyze_bb_1): Remove cost check here.
(vect_slp_region): Cost check and code generate subgraphs separately,
report optimized locations and missed optimizations due to
profitability for each of them.
(vect_schedule_slp): Get the vector of SLP graph entries to
vectorize as argument.

gcc/tree-vect-loop.c
gcc/tree-vect-slp.c
gcc/tree-vectorizer.c
gcc/tree-vectorizer.h

index 80e78f7adf4d1cbc279a63831d72b57176d58470..c95ec5ad267c4cb65fa26fe93d70cfe02cfa9ca6 100644 (file)
@@ -9018,7 +9018,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
   if (!loop_vinfo->slp_instances.is_empty ())
     {
       DUMP_VECT_SCOPE ("scheduling SLP instances");
-      vect_schedule_slp (loop_vinfo);
+      vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
     }
 
   /* FORNOW: the vectorizer supports only loops which body consist
index 35bde9bcb9dd7bd643b3b9e107bb2dddaaaf9b58..519cd6a725415447aeea0904a8576a909eeed9ac 100644 (file)
@@ -117,6 +117,18 @@ vect_free_slp_tree (slp_tree node, bool final_p)
   delete node;
 }
 
+/* Return a location suitable for dumpings related to the SLP instance.  */
+
+dump_user_location_t
+_slp_instance::location () const
+{
+  if (root_stmt)
+    return root_stmt->stmt;
+  else
+    return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
+}
+
+
 /* Free the memory allocated for the SLP instance.  FINAL_P is true if we
    have vectorized the instance or if we have made a final decision not
    to vectorize the statements in any way.  */
@@ -2121,6 +2133,8 @@ vect_analyze_slp_instance (vec_info *vinfo,
   vec<stmt_vec_info> scalar_stmts;
   bool constructor = false;
 
+  if (is_a <bb_vec_info> (vinfo))
+    vect_location = stmt_info->stmt;
   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
     {
       scalar_type = TREE_TYPE (DR_REF (dr));
@@ -3120,6 +3134,8 @@ vect_slp_analyze_operations (vec_info *vinfo)
       hash_set<slp_tree> lvisited;
       stmt_vector_for_cost cost_vec;
       cost_vec.create (2);
+      if (is_a <bb_vec_info> (vinfo))
+       vect_location = instance->location ();
       if (!vect_slp_analyze_node_operations (vinfo,
                                             SLP_INSTANCE_TREE (instance),
                                             instance, visited, lvisited,
@@ -3157,8 +3173,11 @@ vect_slp_analyze_operations (vec_info *vinfo)
     {
       hash_set<stmt_vec_info> svisited;
       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
-       vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
-                                    instance, &instance->cost_vec, svisited);
+       {
+         vect_location = instance->location ();
+         vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
+                                      instance, &instance->cost_vec, svisited);
+       }
     }
 
   return !vinfo->slp_instances.is_empty ();
@@ -3435,54 +3454,6 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
   return true;
 }
 
-/* For each SLP subgraph determine profitability and remove parts not so.
-   Returns true if any profitable to vectorize subgraph remains.  */
-
-static bool
-vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
-{
-  slp_instance instance;
-  unsigned i;
-
-  auto_vec<slp_instance> subgraphs (BB_VINFO_SLP_INSTANCES (bb_vinfo).length ());
-  FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
-    if (!instance->subgraph_entries.is_empty ())
-      subgraphs.quick_push (instance);
-  BB_VINFO_SLP_INSTANCES (bb_vinfo).truncate (0);
-  for (i = 0; i < subgraphs.length ();)
-    {
-      instance = subgraphs[i];
-      if (!vect_bb_vectorization_profitable_p (bb_vinfo,
-                                              instance->subgraph_entries))
-       {
-         /* ???  We need to think of providing better dump/opt-report
-            locations here.  */
-         if (dump_enabled_p ())
-           {
-             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                              "not vectorized: vectorization is not "
-                              "profitable.\n");
-           }
-         slp_instance entry;
-         unsigned j;
-         FOR_EACH_VEC_ELT (instance->subgraph_entries, j, entry)
-           if (entry != instance)
-             vect_free_slp_instance (entry, false);
-         vect_free_slp_instance (instance, false);
-         subgraphs.ordered_remove (i);
-       }
-      else
-       {
-         slp_instance entry;
-         unsigned j;
-         FOR_EACH_VEC_ELT (instance->subgraph_entries, j, entry)
-           BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (entry);
-         ++i;
-       }
-    }
-  return !BB_VINFO_SLP_INSTANCES (bb_vinfo).is_empty ();
-}
-
 /* Find any vectorizable constructors and add them to the grouped_store
    array.  */
 
@@ -3590,6 +3561,7 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
      dependence in the SLP instances.  */
   for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
     {
+      vect_location = instance->location ();
       if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
          || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
        {
@@ -3626,14 +3598,6 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
 
   vect_bb_partition_graph (bb_vinfo);
 
-  /* Cost model: check if the vectorization opportunities are worthwhile.  */
-  if (!unlimited_cost_model (NULL)
-      && !vect_bb_vectorization_profitable_p (bb_vinfo))
-    return false;
-
-  if (dump_enabled_p ())
-    dump_printf_loc (MSG_NOTE, vect_location,
-                    "Basic block will be vectorized using SLP\n");
   return true;
 }
 
@@ -3686,22 +3650,48 @@ vect_slp_region (gimple_stmt_iterator region_begin,
            }
 
          bb_vinfo->shared->check_datarefs ();
-         vect_schedule_slp (bb_vinfo);
 
-         unsigned HOST_WIDE_INT bytes;
-         if (dump_enabled_p ())
+         unsigned i;
+         slp_instance instance;
+         FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
            {
-             if (GET_MODE_SIZE (bb_vinfo->vector_mode).is_constant (&bytes))
-               dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
-                                "basic block part vectorized using %wu byte "
-                                "vectors\n", bytes);
-             else
-               dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
-                                "basic block part vectorized using variable "
-                                "length vectors\n");
-           }
+             if (instance->subgraph_entries.is_empty ())
+               continue;
 
-         vectorized = true;
+             vect_location = instance->location ();
+             if (!unlimited_cost_model (NULL)
+                 && !vect_bb_vectorization_profitable_p
+                       (bb_vinfo, instance->subgraph_entries))
+               {
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "not vectorized: vectorization is not "
+                                    "profitable.\n");
+                 continue;
+               }
+
+             if (!vectorized && dump_enabled_p ())
+               dump_printf_loc (MSG_NOTE, vect_location,
+                                "Basic block will be vectorized "
+                                "using SLP\n");
+             vectorized = true;
+
+             vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
+
+             unsigned HOST_WIDE_INT bytes;
+             if (dump_enabled_p ())
+               {
+                 if (GET_MODE_SIZE
+                       (bb_vinfo->vector_mode).is_constant (&bytes))
+                   dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
+                                    "basic block part vectorized using %wu "
+                                    "byte vectors\n", bytes);
+                 else
+                   dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
+                                    "basic block part vectorized using "
+                                    "variable length vectors\n");
+               }
+           }
        }
       else
        {
@@ -4828,16 +4818,14 @@ vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
     gsi_replace (&rgsi, rstmt, true);
 }
 
-/* Generate vector code for all SLP instances in the loop/basic block.  */
+/* Generate vector code for SLP_INSTANCES in the loop/basic block.  */
 
 void
-vect_schedule_slp (vec_info *vinfo)
+vect_schedule_slp (vec_info *vinfo, vec<slp_instance> slp_instances)
 {
-  vec<slp_instance> slp_instances;
   slp_instance instance;
   unsigned int i;
 
-  slp_instances = vinfo->slp_instances;
   FOR_EACH_VEC_ELT (slp_instances, i, instance)
     {
       slp_tree node = SLP_INSTANCE_TREE (instance);
index 3c60f30ed8fe6425b77e1e40deb51ec66c247e5f..bbe2de56365290849612fbabbe200cb89e2df24d 100644 (file)
@@ -603,9 +603,13 @@ vec_info::remove_stmt (stmt_vec_info stmt_info)
 {
   gcc_assert (!stmt_info->pattern_stmt_p);
   set_vinfo_for_stmt (stmt_info->stmt, NULL);
-  gimple_stmt_iterator si = gsi_for_stmt (stmt_info->stmt);
   unlink_stmt_vdef (stmt_info->stmt);
-  gsi_remove (&si, true);
+  gimple_stmt_iterator si = gsi_for_stmt (stmt_info->stmt);
+  gimple_stmt_iterator *psi = &si;
+  if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (this))
+    if (gsi_stmt (bb_vinfo->region_begin) == stmt_info->stmt)
+      psi = &bb_vinfo->region_begin;
+  gsi_remove (psi, true);
   release_defs (stmt_info->stmt);
   free_stmt_vec_info (stmt_info);
 }
index 8bf33137395db61f15dae24383e57b289abf7dff..6c29ee6cfed6beed5f5937d1e90dac1f25ca2d00 100644 (file)
@@ -190,6 +190,8 @@ public:
   /* If this instance is the main entry of a subgraph the set of
      entries into the same subgraph, including itself.  */
   vec<_slp_instance *> subgraph_entries;
+
+  dump_user_location_t location () const;
 } *slp_instance;
 
 
@@ -2027,7 +2029,7 @@ extern bool vect_transform_slp_perm_load (vec_info *, slp_tree, vec<tree>,
                                          gimple_stmt_iterator *, poly_uint64,
                                          bool, unsigned *);
 extern bool vect_slp_analyze_operations (vec_info *);
-extern void vect_schedule_slp (vec_info *);
+extern void vect_schedule_slp (vec_info *, vec<slp_instance>);
 extern opt_result vect_analyze_slp (vec_info *, unsigned);
 extern bool vect_make_slp_decision (loop_vec_info);
 extern void vect_detect_hybrid_slp (loop_vec_info);