tree-optimization/98855 - redo BB vectorization costing

author Richard Biener <rguenther@suse.de>

Fri, 5 Feb 2021 08:54:00 +0000 (09:54 +0100)

committer Richard Biener <rguenther@suse.de>

Fri, 5 Feb 2021 13:03:00 +0000 (14:03 +0100)
author Richard Biener <rguenther@suse.de>
Fri, 5 Feb 2021 08:54:00 +0000 (09:54 +0100)
committer Richard Biener <rguenther@suse.de>
Fri, 5 Feb 2021 13:03:00 +0000 (14:03 +0100)
diff --git a/gcc/testsuite/g++.dg/vect/slp-pr98855.cc b/gcc/testsuite/g++.dg/vect/slp-pr98855.cc

new file mode 100644 (file)

index 0000000..0b4e479
--- /dev/null
+++ b/gcc/testsuite/g++.dg/vect/slp-pr98855.cc
@@ -0,0 +1,84 @@
+// { dg-do compile }
+// { dg-additional-options "-fvect-cost-model=cheap" }
+// { dg-additional-options "-mavx2" { target x86_64-*-* i?86-*-* } }
+
+#include <stdint.h>
+#include <stdlib.h>
+
+inline uint32_t make_uint32(uint8_t i0, uint8_t i1, uint8_t i2, uint8_t i3)
+{
+  return ((static_cast<uint32_t>(i0) << 24) |
+         (static_cast<uint32_t>(i1) << 16) |
+         (static_cast<uint32_t>(i2) <<  8) |
+         (static_cast<uint32_t>(i3)));
+}
+
+inline uint32_t load_be(const uint8_t in[], size_t off)
+{
+  in += off * sizeof(uint32_t);
+  return make_uint32(in[0], in[1], in[2], in[3]);
+}
+
+template<typename T>
+inline void load_be(const uint8_t in[],
+                   T& x0, T& x1, T& x2, T& x3,
+                   T& x4, T& x5, T& x6, T& x7)
+{
+  x0 = load_be(in, 0);
+  x1 = load_be(in, 1);
+  x2 = load_be(in, 2);
+  x3 = load_be(in, 3);
+  x4 = load_be(in, 4);
+  x5 = load_be(in, 5);
+  x6 = load_be(in, 6);
+  x7 = load_be(in, 7);
+}
+
+inline void store_be(uint32_t in, uint8_t out[4])
+{
+  uint32_t o = __builtin_bswap32 (in);
+  __builtin_memcpy (out, &o, sizeof (uint32_t));
+}
+
+template<typename T>
+inline void store_be(uint8_t out[], T x0, T x1, T x2, T x3,
+                    T x4, T x5, T x6, T x7)
+{
+  store_be(x0, out + (0 * sizeof(T)));
+  store_be(x1, out + (1 * sizeof(T)));
+  store_be(x2, out + (2 * sizeof(T)));
+  store_be(x3, out + (3 * sizeof(T)));
+  store_be(x4, out + (4 * sizeof(T)));
+  store_be(x5, out + (5 * sizeof(T)));
+  store_be(x6, out + (6 * sizeof(T)));
+  store_be(x7, out + (7 * sizeof(T)));
+}
+
+#define BLOCK_SIZE 8
+void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, uint32_t *EK)
+{
+  const size_t blocks4 = blocks / 4;
+
+  for (size_t i = 0; i < blocks4; i++)
+    {
+      uint32_t L0, R0, L1, R1, L2, R2, L3, R3;
+      load_be(in + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3);
+
+      for(size_t r = 0; r != 32; ++r)
+       {
+         L0 += (((R0 << 4) ^ (R0 >> 5)) + R0) ^ EK[2*r];
+         L1 += (((R1 << 4) ^ (R1 >> 5)) + R1) ^ EK[2*r];
+         L2 += (((R2 << 4) ^ (R2 >> 5)) + R2) ^ EK[2*r];
+         L3 += (((R3 << 4) ^ (R3 >> 5)) + R3) ^ EK[2*r];
+
+         R0 += (((L0 << 4) ^ (L0 >> 5)) + L0) ^ EK[2*r+1];
+         R1 += (((L1 << 4) ^ (L1 >> 5)) + L1) ^ EK[2*r+1];
+         R2 += (((L2 << 4) ^ (L2 >> 5)) + L2) ^ EK[2*r+1];
+         R3 += (((L3 << 4) ^ (L3 >> 5)) + L3) ^ EK[2*r+1];
+       }
+
+      store_be(out + 4*BLOCK_SIZE*i, L0, R0, L1, R1, L2, R2, L3, R3);
+    }
+}
+
+// { dg-final { scan-tree-dump-times "not vectorized: vectorization is not profitable" 2 "slp1" { target x86_64-*-* i?86-*-* } } }
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c

index 2305bbdec3ae8d69dea906c8b1f23e18620ac263..b9f12c30fb834afc6f00cc6ac050d40a1cbd88ed 100644 (file)
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -4340,6 +4340,20 @@ vect_bb_slp_scalar_cost (vec_info *vinfo,
      }
  }
  
+/* Comparator for the loop-index sorted cost vectors.  */
+
+static int
+li_cost_vec_cmp (const void *a_, const void *b_)
+{
+  auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
+  auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
+  if (a->first < b->first)
+    return -1;
+  else if (a->first == b->first)
+    return 0;
+  return 1;
+}
+
  /* Check if vectorization of the basic block is profitable for the
     subgraph denoted by SLP_INSTANCES.  */
  
@@ -4352,61 +4366,152 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
    unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
    unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
  
-  void *vect_target_cost_data = init_cost (NULL);
-
    /* Calculate scalar cost and sum the cost for the vector stmts
       previously collected.  */
-  stmt_vector_for_cost scalar_costs;
-  scalar_costs.create (0);
+  stmt_vector_for_cost scalar_costs = vNULL;
+  stmt_vector_for_cost vector_costs = vNULL;
    hash_set<slp_tree> visited;
    FOR_EACH_VEC_ELT (slp_instances, i, instance)
      {
        auto_vec<bool, 20> life;
        life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
                               true);
+      if (SLP_INSTANCE_ROOT_STMT (instance))
+       record_stmt_cost (&scalar_costs, 1, scalar_stmt,
+                         SLP_INSTANCE_ROOT_STMT (instance), 0, vect_body);
        vect_bb_slp_scalar_cost (bb_vinfo,
                                SLP_INSTANCE_TREE (instance),
                                &life, &scalar_costs, visited);
-      add_stmt_costs (bb_vinfo, vect_target_cost_data, &instance->cost_vec);
+      vector_costs.safe_splice (instance->cost_vec);
        instance->cost_vec.release ();
      }
    /* Unset visited flag.  */
-  stmt_info_for_cost *si;
-  FOR_EACH_VEC_ELT (scalar_costs, i, si)
-    gimple_set_visited  (si->stmt_info->stmt, false);
+  stmt_info_for_cost *cost;
+  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
+    gimple_set_visited  (cost->stmt_info->stmt, false);
  
-  void *scalar_target_cost_data = init_cost (NULL);
-  add_stmt_costs (bb_vinfo, scalar_target_cost_data, &scalar_costs);
-  scalar_costs.release ();
-  unsigned dummy;
-  finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
-  destroy_cost_data (scalar_target_cost_data);
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
+
+  /* When costing non-loop vectorization we need to consider each covered
+     loop independently and make sure vectorization is profitable.  For
+     now we assume a loop may be not entered or executed an arbitrary
+     number of iterations (???  static information can provide more
+     precise info here) which means we can simply cost each containing
+     loops stmts separately.  */
+
+  /* First produce cost vectors sorted by loop index.  */
+  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
+    li_scalar_costs (scalar_costs.length ());
+  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
+    li_vector_costs (vector_costs.length ());
+  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
+    {
+      unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
+      li_scalar_costs.quick_push (std::make_pair (l, cost));
+    }
+  /* Use a random used loop as fallback in case the first vector_costs
+     entry does not have a stmt_info associated with it.  */
+  unsigned l = li_scalar_costs[0].first;
+  FOR_EACH_VEC_ELT (vector_costs, i, cost)
+    {
+      /* We inherit from the previous COST, invariants, externals and
+        extracts immediately follow the cost for the related stmt.  */
+      if (cost->stmt_info)
+       l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
+      li_vector_costs.quick_push (std::make_pair (l, cost));
+    }
+  li_scalar_costs.qsort (li_cost_vec_cmp);
+  li_vector_costs.qsort (li_cost_vec_cmp);
+
+  /* Now cost the portions individually.  */
+  unsigned vi = 0;
+  unsigned si = 0;
+  do
+    {
+      unsigned sl = li_scalar_costs[si].first;
+      unsigned vl = li_vector_costs[vi].first;
+      if (sl != vl)
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "Scalar %d and vector %d loop part do not "
+                            "match up, skipping scalar part\n", sl, vl);
+         /* Skip the scalar part, assuming zero cost on the vector side.  */
+         do
+           {
+             si++;
+           }
+         while (si < li_scalar_costs.length ()
+                && li_scalar_costs[si].first == sl);
+         continue;
+       }
  
-  /* Complete the target-specific vector cost calculation.  */
-  finish_cost (vect_target_cost_data, &vec_prologue_cost,
-              &vec_inside_cost, &vec_epilogue_cost);
-  destroy_cost_data (vect_target_cost_data);
+      void *scalar_target_cost_data = init_cost (NULL);
+      do
+       {
+         add_stmt_cost (bb_vinfo, scalar_target_cost_data,
+                        li_scalar_costs[si].second);
+         si++;
+       }
+      while (si < li_scalar_costs.length ()
+            && li_scalar_costs[si].first == sl);
+      unsigned dummy;
+      finish_cost (scalar_target_cost_data, &dummy, &scalar_cost, &dummy);
+      destroy_cost_data (scalar_target_cost_data);
+
+      /* Complete the target-specific vector cost calculation.  */
+      void *vect_target_cost_data = init_cost (NULL);
+      do
+       {
+         add_stmt_cost (bb_vinfo, vect_target_cost_data,
+                        li_vector_costs[vi].second);
+         vi++;
+       }
+      while (vi < li_vector_costs.length ()
+            && li_vector_costs[vi].first == vl);
+      finish_cost (vect_target_cost_data, &vec_prologue_cost,
+                  &vec_inside_cost, &vec_epilogue_cost);
+      destroy_cost_data (vect_target_cost_data);
  
-  vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
+      vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
  
-  if (dump_enabled_p ())
+      if (dump_enabled_p ())
+       {
+         dump_printf_loc (MSG_NOTE, vect_location,
+                          "Cost model analysis for part in loop %d:\n", sl);
+         dump_printf (MSG_NOTE, "  Vector cost: %d\n",
+                      vec_inside_cost + vec_outside_cost);
+         dump_printf (MSG_NOTE, "  Scalar cost: %d\n", scalar_cost);
+       }
+
+      /* Vectorization is profitable if its cost is more than the cost of scalar
+        version.  Note that we err on the vector side for equal cost because
+        the cost estimate is otherwise quite pessimistic (constant uses are
+        free on the scalar side but cost a load on the vector side for
+        example).  */
+      if (vec_outside_cost + vec_inside_cost > scalar_cost)
+       {
+         scalar_costs.release ();
+         vector_costs.release ();
+         return false;
+       }
+    }
+  while (si < li_scalar_costs.length ()
+        && vi < li_vector_costs.length ());
+  if (vi < li_vector_costs.length ())
      {
-      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
-      dump_printf (MSG_NOTE, "  Vector inside of basic block cost: %d\n",
-                  vec_inside_cost);
-      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n", vec_prologue_cost);
-      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n", vec_epilogue_cost);
-      dump_printf (MSG_NOTE, "  Scalar cost of basic block: %d\n", scalar_cost);
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "Excess vector cost for part in loop %d:\n",
+                        li_vector_costs[vi].first);
+      scalar_costs.release ();
+      vector_costs.release ();
+      return false;
      }
  
-  /* Vectorization is profitable if its cost is more than the cost of scalar
-     version.  Note that we err on the vector side for equal cost because
-     the cost estimate is otherwise quite pessimistic (constant uses are
-     free on the scalar side but cost a load on the vector side for
-     example).  */
-  if (vec_outside_cost + vec_inside_cost > scalar_cost)
-    return false;
-
+  scalar_costs.release ();
+  vector_costs.release ();
    return true;
  }
  
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h

index e564fcf835a46a8c1aa6b5fb52f7ecd60bcb1bc9..b861c97ab3aef179ba9b2900701cf09e75a847a5 100644 (file)
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -1481,6 +1481,15 @@ add_stmt_cost (vec_info *vinfo, void *data, int count,
    return cost;
  }
  
+/* Alias targetm.vectorize.add_stmt_cost.  */
+
+static inline unsigned
+add_stmt_cost (vec_info *vinfo, void *data, stmt_info_for_cost *i)
+{
+  return add_stmt_cost (vinfo, data, i->count, i->kind, i->stmt_info,
+                       i->vectype, i->misalign, i->where);
+}
+
  /* Alias targetm.vectorize.finish_cost.  */
  
  static inline void
author	Richard Biener <rguenther@suse.de>
	Fri, 5 Feb 2021 08:54:00 +0000 (09:54 +0100)
committer	Richard Biener <rguenther@suse.de>
	Fri, 5 Feb 2021 13:03:00 +0000 (14:03 +0100)
gcc/testsuite/g++.dg/vect/slp-pr98855.cc	[new file with mode: 0644]	patch \| blob
gcc/tree-vect-slp.c		patch \| blob \| history
gcc/tree-vectorizer.h		patch \| blob \| history