vect: Fix load costs for SLP permutes
author Richard Sandiford <richard.sandiford@arm.com>
Thu, 29 Oct 2020 13:38:01 +0000 (13:38 +0000)
committer Richard Sandiford <richard.sandiford@arm.com>
Thu, 29 Oct 2020 13:38:01 +0000 (13:38 +0000)
For the following test case (compiled with load/store lanes
disabled locally):

  void
  f (uint32_t *restrict x, uint8_t *restrict y, int n)
  {
    for (int i = 0; i < n; ++i)
      {
        x[i * 2] = x[i * 2] + y[i * 2];
        x[i * 2 + 1] = x[i * 2 + 1] + y[i * 2];
      }
  }

we have a redundant no-op permute on the x[] load node:

   node 0x4472350 (max_nunits=8, refcnt=2)
          stmt 0 _5 = *_4;
          stmt 1 _13 = *_12;
          load permutation { 0 1 }

Then, when costing it, we pick a cost of 1, even though we need 4 copies
of the x[] load to match a single y[] load:

   ==> examining statement: _5 = *_4;
   Vectorizing an unaligned access.
   vect_model_load_cost: unaligned supported by hardware.
   vect_model_load_cost: inside_cost = 1, prologue_cost = 0 .

The problem is that the code only considers the permutation for
the first scalar iteration, rather than for all VF iterations.

This patch tries to fix that by making vect_transform_slp_perm_load
calculate the value instead.

gcc/
* tree-vectorizer.h (vect_transform_slp_perm_load): Take an
optional extra parameter.
* tree-vect-slp.c (vect_transform_slp_perm_load): Calculate
the number of loads as well as the number of permutes, taking
the counting loop from...
* tree-vect-stmts.c (vect_model_load_cost): ...here.  Use the
value computed by vect_transform_slp_perm_load for ncopies.

gcc/tree-vect-slp.c
gcc/tree-vect-stmts.c
gcc/tree-vectorizer.h

index 7a08908cde87db88f35e5d0f08063e0971e2c2a7..5d69a98c2a90be47ff852c90039b0ab2cd2e58ef 100644 (file)
@@ -4830,13 +4830,16 @@ vect_get_slp_defs (vec_info *,
 
 /* Generate vector permute statements from a list of loads in DR_CHAIN.
    If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
-   permute statements for the SLP node NODE.  */
+   permute statements for the SLP node NODE.  Store the number of vector
+   permute instructions in *N_PERMS and the number of vector load
+   instructions in *N_LOADS.  */
 
 bool
 vect_transform_slp_perm_load (vec_info *vinfo,
                              slp_tree node, vec<tree> dr_chain,
                              gimple_stmt_iterator *gsi, poly_uint64 vf,
-                             bool analyze_only, unsigned *n_perms)
+                             bool analyze_only, unsigned *n_perms,
+                             unsigned int *n_loads)
 {
   stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
   int vec_index = 0;
@@ -4888,6 +4891,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
   vec_perm_builder mask;
   unsigned int nelts_to_build;
   unsigned int nvectors_per_build;
+  unsigned int in_nlanes;
   bool repeating_p = (group_size == DR_GROUP_SIZE (stmt_info)
                      && multiple_p (nunits, group_size));
   if (repeating_p)
@@ -4898,6 +4902,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
       mask.new_vector (nunits, group_size, 3);
       nelts_to_build = mask.encoded_nelts ();
       nvectors_per_build = SLP_TREE_VEC_STMTS (node).length ();
+      in_nlanes = DR_GROUP_SIZE (stmt_info) * 3;
     }
   else
     {
@@ -4909,7 +4914,10 @@ vect_transform_slp_perm_load (vec_info *vinfo,
       mask.new_vector (const_nunits, const_nunits, 1);
       nelts_to_build = const_vf * group_size;
       nvectors_per_build = 1;
+      in_nlanes = const_vf * DR_GROUP_SIZE (stmt_info);
     }
+  auto_sbitmap used_in_lanes (in_nlanes);
+  bitmap_clear (used_in_lanes);
 
   unsigned int count = mask.encoded_nelts ();
   mask.quick_grow (count);
@@ -4921,6 +4929,7 @@ vect_transform_slp_perm_load (vec_info *vinfo,
       unsigned int stmt_num = j % group_size;
       unsigned int i = (iter_num * DR_GROUP_SIZE (stmt_info)
                        + SLP_TREE_LOAD_PERMUTATION (node)[stmt_num]);
+      bitmap_set_bit (used_in_lanes, i);
       if (repeating_p)
        {
          first_vec_index = 0;
@@ -5034,6 +5043,32 @@ vect_transform_slp_perm_load (vec_info *vinfo,
        }
     }
 
+  if (n_loads)
+    {
+      if (repeating_p)
+       *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
+      else
+       {
+         /* Enforced above when !repeating_p.  */
+         unsigned int const_nunits = nunits.to_constant ();
+         *n_loads = 0;
+         bool load_seen = false;
+         for (unsigned i = 0; i < in_nlanes; ++i)
+           {
+             if (i % const_nunits == 0)
+               {
+                 if (load_seen)
+                   *n_loads += 1;
+                 load_seen = false;
+               }
+             if (bitmap_bit_p (used_in_lanes, i))
+               load_seen = true;
+           }
+         if (load_seen)
+           *n_loads += 1;
+       }
+    }
+
   return true;
 }
 
index 7f0763f15c4a7cb63bb496d432368ab31122f1fb..1a0da0e84cc84b0219f7b8332e26ffaf67673df5 100644 (file)
@@ -1098,39 +1098,15 @@ vect_model_load_cost (vec_info *vinfo,
         the first group element not by the first scalar stmt DR.  */
       stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
       /* Record the cost for the permutation.  */
-      unsigned n_perms;
-      unsigned assumed_nunits
-       = vect_nunits_for_cost (STMT_VINFO_VECTYPE (first_stmt_info));
+      unsigned n_perms, n_loads;
       vect_transform_slp_perm_load (vinfo, slp_node, vNULL, NULL,
-                                   vf, true, &n_perms);
+                                   vf, true, &n_perms, &n_loads);
       inside_cost += record_stmt_cost (cost_vec, n_perms, vec_perm,
                                       first_stmt_info, 0, vect_body);
+
       /* And adjust the number of loads performed.  This handles
         redundancies as well as loads that are later dead.  */
-      auto_sbitmap perm (DR_GROUP_SIZE (first_stmt_info));
-      bitmap_clear (perm);
-      for (unsigned i = 0;
-          i < SLP_TREE_LOAD_PERMUTATION (slp_node).length (); ++i)
-       bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (slp_node)[i]);
-      ncopies = 0;
-      bool load_seen = false;
-      for (unsigned i = 0; i < DR_GROUP_SIZE (first_stmt_info); ++i)
-       {
-         if (i % assumed_nunits == 0)
-           {
-             if (load_seen)
-               ncopies++;
-             load_seen = false;
-           }
-         if (bitmap_bit_p (perm, i))
-           load_seen = true;
-       }
-      if (load_seen)
-       ncopies++;
-      gcc_assert (ncopies
-                 <= (DR_GROUP_SIZE (first_stmt_info)
-                     - DR_GROUP_GAP (first_stmt_info)
-                     + assumed_nunits - 1) / assumed_nunits);
+      ncopies = n_loads;
     }
 
   /* Grouped loads read all elements in the group at once,
index 13a02cd0d0cab06dd531c9e50b7c920ecf79b07d..fbf5291cf065f3944040937db92d3997acd45f23 100644 (file)
@@ -1952,7 +1952,8 @@ extern tree cse_and_gimplify_to_preheader (loop_vec_info, tree);
 extern void vect_free_slp_instance (slp_instance);
 extern bool vect_transform_slp_perm_load (vec_info *, slp_tree, vec<tree>,
                                          gimple_stmt_iterator *, poly_uint64,
-                                         bool, unsigned *);
+                                         bool, unsigned *,
+                                         unsigned * = nullptr);
 extern bool vect_slp_analyze_operations (vec_info *);
 extern void vect_schedule_slp (vec_info *, vec<slp_instance>);
 extern opt_result vect_analyze_slp (vec_info *, unsigned);