re PR middle-end/37150 (basic-block vectorization misses some unrolled loops)
author Richard Biener <rguenther@suse.de>
Mon, 7 Nov 2016 08:06:08 +0000 (08:06 +0000)
committer Richard Biener <rguenth@gcc.gnu.org>
Mon, 7 Nov 2016 08:06:08 +0000 (08:06 +0000)
2016-11-07  Richard Biener  <rguenther@suse.de>

PR tree-optimization/37150
* tree-vectorizer.h (vect_transform_slp_perm_load): Add n_perms
parameter.
* tree-vect-slp.c (vect_supported_load_permutation_p): Adjust.
(vect_analyze_slp_cost_1): Account for the real number of
permutations emitted and for dead loads.
(vect_transform_slp_perm_load): Add n_perms parameter counting
the number of emitted permutations.
* tree-vect-stmts.c (vectorizable_load): Adjust.

From-SVN: r241893

gcc/ChangeLog
gcc/tree-vect-slp.c
gcc/tree-vect-stmts.c
gcc/tree-vectorizer.h

index 960ea67b0bc3c3ece1c3910d2bdcb9e1bef2aa82..7e5c970380fa53f2b24b6445bae299574d01f216 100644 (file)
@@ -1,3 +1,15 @@
+2016-11-07  Richard Biener  <rguenther@suse.de>
+
+       PR tree-optimization/37150
+       * tree-vectorizer.h (vect_transform_slp_perm_load): Add n_perms
+       parameter.
+       * tree-vect-slp.c (vect_supported_load_permutation_p): Adjust.
+       (vect_analyze_slp_cost_1): Account for the real number of
+       permutations emitted and for dead loads.
+       (vect_transform_slp_perm_load): Add n_perms parameter counting
+       the number of emitted permutations.
+       * tree-vect-stmts.c (vectorizable_load): Adjust.
+
 2016-11-07  Richard Biener  <rguenther@suse.de>
 
        PR tree-optimization/78189
index 62f060c2c81fcef57324b6c02b5d352c7ba4c97c..6694164effb4383ed9dd8a68e960808886b65740 100644 (file)
@@ -1461,8 +1461,9 @@ vect_supported_load_permutation_p (slp_instance slp_instn)
            {
              /* Verify the permutation can be generated.  */
              vec<tree> tem;
+             unsigned n_perms;
              if (!vect_transform_slp_perm_load (node, tem, NULL,
-                                                1, slp_instn, true))
+                                                1, slp_instn, true, &n_perms))
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                   vect_location,
@@ -1475,11 +1476,13 @@ vect_supported_load_permutation_p (slp_instance slp_instn)
     }
 
   /* For loop vectorization verify we can generate the permutation.  */
+  unsigned n_perms;
   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
     if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
-              SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
+              SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true,
+              &n_perms))
       return false;
 
   return true;
@@ -1548,14 +1551,38 @@ vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
              stmt = GROUP_FIRST_ELEMENT (stmt_info);
              stmt_info = vinfo_for_stmt (stmt);
              /* Record the cost for the permutation.  */
-             record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
+             unsigned n_perms;
+             vect_transform_slp_perm_load (node, vNULL, NULL,
+                                           ncopies_for_cost, instance, true,
+                                           &n_perms);
+             record_stmt_cost (body_cost_vec, n_perms, vec_perm,
                                stmt_info, 0, vect_body);
-             /* And adjust the number of loads performed.  */
              unsigned nunits
                = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
-             ncopies_for_cost
-               = (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info)
-                  + nunits - 1) / nunits;
+             /* And adjust the number of loads performed.  This handles
+                redundancies as well as loads that are later dead.  */
+             auto_sbitmap perm (GROUP_SIZE (stmt_info));
+             bitmap_clear (perm);
+             for (i = 0; i < SLP_TREE_LOAD_PERMUTATION (node).length (); ++i)
+               bitmap_set_bit (perm, SLP_TREE_LOAD_PERMUTATION (node)[i]);
+             ncopies_for_cost = 0;
+             bool load_seen = false;
+             for (i = 0; i < GROUP_SIZE (stmt_info); ++i)
+               {
+                 if (i % nunits == 0)
+                   {
+                     if (load_seen)
+                       ncopies_for_cost++;
+                     load_seen = false;
+                   }
+                 if (bitmap_bit_p (perm, i))
+                   load_seen = true;
+               }
+             if (load_seen)
+               ncopies_for_cost++;
+             gcc_assert (ncopies_for_cost
+                         <= (GROUP_SIZE (stmt_info) - GROUP_GAP (stmt_info)
+                             + nunits - 1) / nunits);
              ncopies_for_cost *= SLP_INSTANCE_UNROLLING_FACTOR (instance);
            }
          /* Record the cost for the vector loads.  */
@@ -3402,7 +3429,8 @@ vect_create_mask_and_perm (gimple *stmt,
 bool
 vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                               gimple_stmt_iterator *gsi, int vf,
-                              slp_instance slp_node_instance, bool analyze_only)
+                              slp_instance slp_node_instance, bool analyze_only,
+                             unsigned *n_perms)
 {
   gimple *stmt = SLP_TREE_SCALAR_STMTS (node)[0];
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
@@ -3457,6 +3485,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
   int first_vec_index = -1;
   int second_vec_index = -1;
   bool noop_p = true;
+  *n_perms = 0;
 
   for (int j = 0; j < unroll_factor; j++)
     {
@@ -3513,6 +3542,9 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                  return false;
                }
 
+             if (! noop_p)
+               ++*n_perms;
+
              if (!analyze_only)
                {
                  tree mask_vec = NULL_TREE;
index 1d17156b0b613ecc194767575fbfe6441f3dedd0..ab01defbe55102a9d9cfc71b0afccbe0bf85084f 100644 (file)
@@ -6978,8 +6978,11 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
            }
        }
       if (slp_perm)
-       vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf,
-                                     slp_node_instance, false);
+       {
+         unsigned n_perms;
+         vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf,
+                                       slp_node_instance, false, &n_perms);
+       }
       return true;
     }
 
@@ -7497,8 +7500,10 @@ vectorizable_load (gimple *stmt, gimple_stmt_iterator *gsi, gimple **vec_stmt,
 
       if (slp_perm)
         {
+         unsigned n_perms;
           if (!vect_transform_slp_perm_load (slp_node, dr_chain, gsi, vf,
-                                             slp_node_instance, false))
+                                             slp_node_instance, false,
+                                            &n_perms))
             {
               dr_chain.release ();
               return false;
index 386654862b4a11df804c1d5636c15957b0e27ecd..2a7cdfe27a5d60007b654e93ce73d74849ae54c8 100644 (file)
@@ -1166,7 +1166,7 @@ extern int vect_get_known_peeling_cost (loop_vec_info, int, int *,
 extern void vect_free_slp_instance (slp_instance);
 extern bool vect_transform_slp_perm_load (slp_tree, vec<tree> ,
                                           gimple_stmt_iterator *, int,
-                                          slp_instance, bool);
+                                          slp_instance, bool, unsigned *);
 extern bool vect_slp_analyze_operations (vec<slp_instance> slp_instances,
                                         void *);
 extern bool vect_schedule_slp (vec_info *);