tree-optimization/96043 - BB vectorization costing improvement
authorRichard Biener <rguenther@suse.de>
Tue, 8 Sep 2020 12:49:59 +0000 (14:49 +0200)
committerRichard Biener <rguenther@suse.de>
Thu, 10 Sep 2020 09:18:18 +0000 (11:18 +0200)
This makes the BB vectorizer cost independent SLP subgraphs
separately.  While on pristine trunk and for x86_64 I failed to
distill a testcase where the vectorizer would think _any_
basic-block vectorization opportunity is not profitable I do
have pending work that would make the cost savings of a
profitable opportunity make another independently not
profitable opportunity vectorized.

2020-09-08  Richard Biener  <rguenther@suse.de>

PR tree-optimization/96043
* tree-vectorizer.h (_slp_instance::cost_vec): New.
(_slp_instance::subgraph_entries): Likewise.
(BB_VINFO_TARGET_COST_DATA): Remove.
* tree-vect-slp.c (vect_free_slp_instance): Free
cost_vec and subgraph_entries.
(vect_analyze_slp_instance): Initialize them.
(vect_slp_analyze_operations): Defer passing costs to
the target, instead record them in the SLP graph entry.
(get_ultimate_leader): New helper for graph partitioning.
(vect_bb_partition_graph_r): Likewise.
(vect_bb_partition_graph): New function to partition the
SLP graph into independently costable parts.
(vect_bb_vectorization_profitable_p): Adjust to work on
a subgraph.
(vect_bb_vectorization_profitable_p): New wrapper,
discarding non-profitable vectorization of subgraphs.
(vect_slp_analyze_bb_1): Call vect_bb_partition_graph before
costing.

* gcc.dg/vect/costmodel/x86_64/costmodel-pr69297.c: Adjust.

gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr69297.c
gcc/tree-vect-slp.c
gcc/tree-vectorizer.h

index e65a30c06d6e7e8c6795907b4245e4b079fbfa62..ef74785f6a8cc3b44db5c794d349461f4152a8a5 100644 (file)
@@ -74,10 +74,28 @@ foo (int* diff)
     d[13] = m[12] - m[13];
     d[14] = m[14] + m[15];
     d[15] = m[15] - m[14];
+    /* The following obviously profitable part should not make
+       the former unprofitable one profitable.  */
+    diff[16 + 16] = diff[16];
+    diff[17 + 16] = diff[17];
+    diff[18 + 16] = diff[18];
+    diff[19 + 16] = diff[19];
+    diff[20 + 16] = diff[20];
+    diff[21 + 16] = diff[21];
+    diff[22 + 16] = diff[22];
+    diff[23 + 16] = diff[23];
+    diff[24 + 16] = diff[24];
+    diff[25 + 16] = diff[25];
+    diff[26 + 16] = diff[26];
+    diff[27 + 16] = diff[27];
+    diff[28 + 16] = diff[28];
+    diff[29 + 16] = diff[29];
+    diff[30 + 16] = diff[30];
+    diff[31 + 16] = diff[31];
     for (k=0; k<16; k++)
       satd += abs(d[k]);
   return satd;
 }
 
 /* { dg-final { scan-tree-dump "vectorization is not profitable" "slp1" } } */
-/* { dg-final { scan-tree-dump-not "basic block vectorized" "slp1" } } */
+/* { dg-final { scan-tree-dump-times "Vectorizing SLP tree" 1 "slp1" } } */
index 7860fe3f32bc6046144e622c9a3967aef785bd45..458a2c5bb301157f4f843776943435eba8c49d65 100644 (file)
@@ -126,6 +126,8 @@ vect_free_slp_instance (slp_instance instance, bool final_p)
 {
   vect_free_slp_tree (SLP_INSTANCE_TREE (instance), final_p);
   SLP_INSTANCE_LOADS (instance).release ();
+  instance->subgraph_entries.release ();
+  instance->cost_vec.release ();
   free (instance);
 }
 
@@ -2263,6 +2265,8 @@ vect_analyze_slp_instance (vec_info *vinfo,
          SLP_INSTANCE_LOADS (new_instance) = vNULL;
          SLP_INSTANCE_ROOT_STMT (new_instance) = constructor ? stmt_info : NULL;
          new_instance->reduc_phis = NULL;
+         new_instance->cost_vec = vNULL;
+         new_instance->subgraph_entries = vNULL;
 
          vect_gather_slp_loads (new_instance, node);
          if (dump_enabled_p ())
@@ -3133,8 +3137,8 @@ vect_slp_analyze_operations (vec_info *vinfo)
            visited.add (*x);
          i++;
 
-         add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
-         cost_vec.release ();
+         /* Remember the SLP graph entry cost for later.  */
+         instance->cost_vec = cost_vec;
        }
     }
 
@@ -3142,18 +3146,108 @@ vect_slp_analyze_operations (vec_info *vinfo)
   if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
     {
       hash_set<stmt_vec_info> svisited;
-      stmt_vector_for_cost cost_vec;
-      cost_vec.create (2);
       for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
        vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
-                                    instance, &cost_vec, svisited);
-      add_stmt_costs (vinfo, vinfo->target_cost_data, &cost_vec);
-      cost_vec.release ();
+                                    instance, &instance->cost_vec, svisited);
     }
 
   return !vinfo->slp_instances.is_empty ();
 }
 
+/* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
+   closing the eventual chain.  */
+
+static slp_instance
+get_ultimate_leader (slp_instance instance,
+                    hash_map<slp_instance, slp_instance> &instance_leader)
+{
+  auto_vec<slp_instance *, 8> chain;
+  slp_instance *tem;
+  while (*(tem = instance_leader.get (instance)) != instance)
+    {
+      chain.safe_push (tem);
+      instance = *tem;
+    }
+  while (!chain.is_empty ())
+    *chain.pop () = instance;
+  return instance;
+}
+
+/* Worker of vect_bb_partition_graph, recurse on NODE.  */
+
+static void
+vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
+                          slp_instance instance, slp_tree node,
+                          hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
+                          hash_map<slp_instance, slp_instance> &instance_leader)
+{
+  stmt_vec_info stmt_info;
+  unsigned i;
+  bool all = true;
+  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
+    {
+      bool existed_p;
+      slp_instance &stmt_instance
+       = stmt_to_instance.get_or_insert (stmt_info, &existed_p);
+      if (!existed_p)
+       all = false;
+      else if (stmt_instance != instance)
+       {
+         /* If we're running into a previously marked stmt make us the
+            leader of the current ultimate leader.  This keeps the
+            leader chain acyclic and works even when the current instance
+            connects two previously independent graph parts.  */
+         slp_instance stmt_leader
+           = get_ultimate_leader (stmt_instance, instance_leader);
+         if (stmt_leader != instance)
+           instance_leader.put (stmt_leader, instance);
+       }
+      stmt_instance = instance;
+    }
+  /* If not all stmts had been visited we have to recurse on children.  */
+  if (all)
+    return;
+
+  slp_tree child;
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+    if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
+      vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
+                                instance_leader);
+}
+
+/* Partition the SLP graph into pieces that can be costed independently.  */
+
+static void
+vect_bb_partition_graph (bb_vec_info bb_vinfo)
+{
+  DUMP_VECT_SCOPE ("vect_bb_partition_graph");
+
+  /* First walk the SLP graph assigning each involved scalar stmt a
+     corresponding SLP graph entry and upon visiting a previously
+     marked stmt, make the stmts leader the current SLP graph entry.  */
+  hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
+  hash_map<slp_instance, slp_instance> instance_leader;
+  slp_instance instance;
+  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
+    {
+      instance_leader.put (instance, instance);
+      vect_bb_partition_graph_r (bb_vinfo,
+                                instance, SLP_INSTANCE_TREE (instance),
+                                stmt_to_instance, instance_leader);
+    }
+
+  /* Then collect entries to each independent subgraph.  */
+  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
+    {
+      slp_instance leader = get_ultimate_leader (instance, instance_leader);
+      leader->subgraph_entries.safe_push (instance);
+      if (dump_enabled_p ()
+         && leader != instance)
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "instance %p is leader of %p\n",
+                        leader, instance);
+    }
+}
 
 /* Compute the scalar cost of the SLP node NODE and its children
    and return it.  Do not account defs that are marked in LIFE and
@@ -3261,18 +3355,22 @@ vect_bb_slp_scalar_cost (vec_info *vinfo,
     }
 }
 
-/* Check if vectorization of the basic block is profitable.  */
+/* Check if vectorization of the basic block is profitable for the
+   subgraph denoted by SLP_INSTANCES.  */
 
 static bool
-vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
+vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
+                                   vec<slp_instance> slp_instances)
 {
-  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
   slp_instance instance;
   int i;
   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
 
-  /* Calculate scalar cost.  */
+  void *vect_target_cost_data = init_cost (NULL);
+
+  /* Calculate scalar cost and sum the cost for the vector stmts
+     previously collected.  */
   stmt_vector_for_cost scalar_costs;
   scalar_costs.create (0);
   hash_set<slp_tree> visited;
@@ -3284,22 +3382,25 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
       vect_bb_slp_scalar_cost (bb_vinfo,
                               SLP_INSTANCE_TREE (instance),
                               &life, &scalar_costs, visited);
+      add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
+      instance->cost_vec.release ();
     }
   /* Unset visited flag.  */
   stmt_info_for_cost *si;
   FOR_EACH_VEC_ELT (scalar_costs, i, si)
     gimple_set_visited  (si->stmt_info->stmt, false);
 
-  void *target_cost_data = init_cost (NULL);
-  add_stmt_costs (bb_vinfo, target_cost_data, &scalar_costs);
+  void *scalar_target_cost_data = init_cost (NULL);
+  add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
   scalar_costs.release ();
   unsigned dummy;
-  finish_cost (target_cost_data, &dummy, &scalar_cost, &dummy);
-  destroy_cost_data (target_cost_data);
+  finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
+  destroy_cost_data (scalar_target_cost_data);
 
-  /* Complete the target-specific cost calculation.  */
-  finish_cost (BB_VINFO_TARGET_COST_DATA (bb_vinfo), &vec_prologue_cost,
+  /* Complete the target-specific vector cost calculation.  */
+  finish_cost (vect_target_cost_data, &vec_prologue_cost,
               &vec_inside_cost, &vec_epilogue_cost);
+  destroy_cost_data (vect_target_cost_data);
 
   vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
 
@@ -3324,6 +3425,54 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
   return true;
 }
 
+/* For each SLP subgraph determine profitability and remove parts not so.
+   Returns true if any profitable to vectorize subgraph remains.  */
+
+static bool
+vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
+{
+  slp_instance instance;
+  unsigned i;
+
+  auto_vec<slp_instance> subgraphs (BB_VINFO_SLP_INSTANCES (bb_vinfo).length ());
+  FOR_EACH_VEC_ELT (BB_VINFO_SLP_INSTANCES (bb_vinfo), i, instance)
+    if (!instance->subgraph_entries.is_empty ())
+      subgraphs.quick_push (instance);
+  BB_VINFO_SLP_INSTANCES (bb_vinfo).truncate (0);
+  for (i = 0; i < subgraphs.length ();)
+    {
+      instance = subgraphs[i];
+      if (!vect_bb_vectorization_profitable_p (bb_vinfo,
+                                              instance->subgraph_entries))
+       {
+         /* ???  We need to think of providing better dump/opt-report
+            locations here.  */
+         if (dump_enabled_p ())
+           {
+             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                              "not vectorized: vectorization is not "
+                              "profitable.\n");
+           }
+         slp_instance entry;
+         unsigned j;
+         FOR_EACH_VEC_ELT (instance->subgraph_entries, j, entry)
+           if (entry != instance)
+             vect_free_slp_instance (entry, false);
+         vect_free_slp_instance (instance, false);
+         subgraphs.ordered_remove (i);
+       }
+      else
+       {
+         slp_instance entry;
+         unsigned j;
+         FOR_EACH_VEC_ELT (instance->subgraph_entries, j, entry)
+           BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (entry);
+         ++i;
+       }
+    }
+  return !BB_VINFO_SLP_INSTANCES (bb_vinfo).is_empty ();
+}
+
 /* Find any vectorizable constructors and add them to the grouped_store
    array.  */
 
@@ -3465,6 +3614,8 @@ vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
       return false;
     }
 
+  vect_bb_partition_graph (bb_vinfo);
+
   /* Cost model: check if the vectorization is worthwhile.  */
   if (!unlimited_cost_model (NULL)
       && !vect_bb_vectorization_profitable_p (bb_vinfo))
index fb1111c8615e83ce926db15d9702b44023b5b799..8bf33137395db61f15dae24383e57b289abf7dff 100644 (file)
@@ -183,6 +183,13 @@ public:
 
   /* The SLP node containing the reduction PHIs.  */
   slp_tree reduc_phis;
+
+  /* Vector cost of this entry to the SLP graph.  */
+  stmt_vector_for_cost cost_vec;
+
+  /* If this instance is the main entry of a subgraph the set of
+     entries into the same subgraph, including itself.  */
+  vec<_slp_instance *> subgraph_entries;
 } *slp_instance;
 
 
@@ -913,7 +920,6 @@ public:
 #define BB_VINFO_SLP_INSTANCES(B)    (B)->slp_instances
 #define BB_VINFO_DATAREFS(B)         (B)->shared->datarefs
 #define BB_VINFO_DDRS(B)             (B)->shared->ddrs
-#define BB_VINFO_TARGET_COST_DATA(B) (B)->target_cost_data
 
 static inline bb_vec_info
 vec_info_for_bb (basic_block bb)