From: Richard Biener <rguenther@suse.de>
Date: Fri, 26 Oct 2018 07:38:59 +0000 (+0000)
Subject: re PR tree-optimization/87105 (Autovectorization [X86, SSE2, AVX2, DoublePrecision])
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=a1f072e2441c58f6a486f90bb9a32bd5f6c51cb4;p=gcc.git

re PR tree-optimization/87105 (Autovectorization [X86, SSE2, AVX2, DoublePrecision])

2018-10-26  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/87105
	* tree-vectorizer.h (_slp_tree::refcnt): New member.
	* tree-vect-slp.c (vect_free_slp_tree): Decrement and honor
	refcnt.
	(vect_create_new_slp_node): Initialize refcnt to one.
	(bst_traits): Move.
	(scalar_stmts_set_t, bst_fail): Remove.
	(vect_build_slp_tree_2): Add bst_map argument and adjust calls.
	(vect_build_slp_tree): Add bst_map argument and lookup
	already created SLP nodes.
	(vect_print_slp_tree): Handle a SLP graph, print SLP node
	addresses.
	(vect_slp_rearrange_stmts): Handle a SLP graph.
	(vect_analyze_slp_instance): Adjust and free SLP nodes from
	the CSE map.  Fix indenting.
	(vect_schedule_slp_instance): Add short-cut.

	* g++.dg/vect/slp-pr87105.cc: Adjust.
	* gcc.dg/torture/20181024-1.c: New testcase.
	* g++.dg/opt/20181025-1.C: Likewise.

From-SVN: r265522
---

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b60988c910e..d46a3b47a55 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,22 @@
+2018-10-26  Richard Biener  <rguenther@suse.de>
+
+	PR tree-optimization/87105
+	* tree-vectorizer.h (_slp_tree::refcnt): New member.
+	* tree-vect-slp.c (vect_free_slp_tree): Decrement and honor
+	refcnt.
+	(vect_create_new_slp_node): Initialize refcnt to one.
+	(bst_traits): Move.
+	(scalar_stmts_set_t, bst_fail): Remove.
+	(vect_build_slp_tree_2): Add bst_map argument and adjust calls.
+	(vect_build_slp_tree): Add bst_map argument and lookup
+	already created SLP nodes.
+	(vect_print_slp_tree): Handle a SLP graph, print SLP node
+	addresses.
+	(vect_slp_rearrange_stmts): Handle a SLP graph.
+	(vect_analyze_slp_instance): Adjust and free SLP nodes from
+	the CSE map.  Fix indenting.
+	(vect_schedule_slp_instance): Add short-cut.
+
 2018-10-26  Martin Liska  <mliska@suse.cz>
 
 	PR testsuite/86158
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 73739030993..36627afeaa3 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,10 @@
+2018-10-26  Richard Biener  <rguenther@suse.de>
+
+	PR tree-optimization/87105
+	* g++.dg/vect/slp-pr87105.cc: Adjust.
+	* gcc.dg/torture/20181024-1.c: New testcase.
+	* g++.dg/opt/20181025-1.C: Likewise.
+
 2018-10-26  Richard Biener  <rguenther@suse.de>
 
 	PR testsuite/87754
diff --git a/gcc/testsuite/g++.dg/opt/20181025-1.C b/gcc/testsuite/g++.dg/opt/20181025-1.C
new file mode 100644
index 00000000000..43d1614f023
--- /dev/null
+++ b/gcc/testsuite/g++.dg/opt/20181025-1.C
@@ -0,0 +1,31 @@
+// { dg-do compile }
+// { dg-options "-Ofast" }
+
+template <typename Number>
+class Vector {
+    typedef Number value_type;
+    typedef const value_type *const_iterator;
+    Number norm_sqr () const;
+    const_iterator begin () const;
+    unsigned int dim;
+};
+template <typename Number>
+static inline Number
+local_sqr (const Number x)
+{
+  return x*x;
+}
+template <typename Number>
+Number
+Vector<Number>::norm_sqr () const
+{
+  Number sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
+  const_iterator ptr = begin(), eptr = ptr + (dim/4)*4;
+  while (ptr!=eptr) 
+    {
+      sum0 += ::local_sqr(*ptr++);
+      sum1 += ::local_sqr(*ptr++);
+    }
+  return sum0+sum1+sum2+sum3;
+}
+template class Vector<double>;
diff --git a/gcc/testsuite/g++.dg/vect/slp-pr87105.cc b/gcc/testsuite/g++.dg/vect/slp-pr87105.cc
index 1023d915201..949b16c848f 100644
--- a/gcc/testsuite/g++.dg/vect/slp-pr87105.cc
+++ b/gcc/testsuite/g++.dg/vect/slp-pr87105.cc
@@ -2,7 +2,7 @@
 // { dg-require-effective-target c++11 }
 // { dg-require-effective-target vect_double }
 // For MIN/MAX recognition
-// { dg-additional-options "-ffast-math -fvect-cost-model" }
+// { dg-additional-options "-ffast-math" }
 
 #include <algorithm>
 #include <cmath>
@@ -99,6 +99,7 @@ void quadBoundingBoxA(const Point bez[3], Box& bBox) noexcept {
 
 // We should have if-converted everything down to straight-line code
 // { dg-final { scan-tree-dump-times "<bb \[0-9\]+>" 1 "slp2" } }
-// We fail to elide an earlier store which makes us not handle a later
-// duplicate one for vectorization.
-// { dg-final { scan-tree-dump-times "basic block part vectorized" 1 "slp2" { xfail *-*-* } } }
+// { dg-final { scan-tree-dump-times "basic block part vectorized" 1 "slp2" } }
+// It's a bit awkward to detect that all stores were vectorized but the
+// following more or less does the trick
+// { dg-final { scan-tree-dump "vect_iftmp\[^\r\n\]* = MIN" "slp2" } }
diff --git a/gcc/testsuite/gcc.dg/torture/20181024-1.c b/gcc/testsuite/gcc.dg/torture/20181024-1.c
new file mode 100644
index 00000000000..f2cfe7f6d67
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/torture/20181024-1.c
@@ -0,0 +1,41 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-march=core-avx2" { target { x86_64-*-* i?86-*-* } } } */
+
+typedef enum {
+ C = 0,               N, S, E, W, T, B,               NE, NW, SE, SW,               NT, NB, ST, SB,               ET, EB, WT, WB,               FLAGS, N_CELL_ENTRIES} CELL_ENTRIES;
+typedef double LBM_Grid[(130)*100*100*N_CELL_ENTRIES];
+void foo( LBM_Grid srcGrid )
+{
+  double ux , uy , uz , rho ,         ux1, uy1, uz1, rho1,         ux2, uy2, uz2, rho2,         u2, px, py;
+  int i;
+  for( i = 0;
+       i < (N_CELL_ENTRIES*( 100*100));
+       i += N_CELL_ENTRIES )
+    {
+      rho1 = + ((srcGrid)[((C)+N_CELL_ENTRIES*( 100*100))+(i)])
+	  + ((srcGrid)[((N)+N_CELL_ENTRIES*( 100*100))+(i)])
+	  + ((srcGrid)[((S)+N_CELL_ENTRIES*( 100*100))+(i)])
+	  + ((srcGrid)[((E)+N_CELL_ENTRIES*( 100*100))+(i)])
+	  + ((srcGrid)[((W)+N_CELL_ENTRIES*( 100*100))+(i)])
+	  + ((srcGrid)[((T)+N_CELL_ENTRIES*( 100*100))+(i)])
+	  + ((srcGrid)[((B)+N_CELL_ENTRIES*( 100*100))+(i)])
+	  + ((srcGrid)[((NE)+N_CELL_ENTRIES*( 100*100))+(i)]) 
+	  + ((srcGrid)[((NW)+N_CELL_ENTRIES*( 100*100))+(i)])
+	  + ((srcGrid)[((SE)+N_CELL_ENTRIES*( 100*100))+(i)])
+	  + ((srcGrid)[((SW)+N_CELL_ENTRIES*( 100*100))+(i)])
+	  + ((srcGrid)[((NT)+N_CELL_ENTRIES*( 100*100))+(i)])
+	  + ((srcGrid)[((NB)+N_CELL_ENTRIES*( 100*100))+(i)])
+	  + ((srcGrid)[((ST)+N_CELL_ENTRIES*( 100*100))+(i)])
+	  + ((srcGrid)[((SB)+N_CELL_ENTRIES*( 100*100))+(i)])
+	  + ((srcGrid)[((ET)+N_CELL_ENTRIES*( 100*100))+(i)])
+	  + ((srcGrid)[((EB)+N_CELL_ENTRIES*( 100*100))+(i)])
+	  + ((srcGrid)[((WT)+N_CELL_ENTRIES*( 100*100))+(i)])
+	  + ((srcGrid)[((WB)+N_CELL_ENTRIES*( 100*100))+(i)]);
+      rho = 2.0*rho1 - rho2;
+      px = (((i / N_CELL_ENTRIES) % 100) / (0.5*(100-1))) - 1.0;
+      uz = 0.01 * (1.0-px*px) * (1.0-py*py);
+      u2 = 1.5 * (ux*ux + uy*uy + uz*uz);
+      (((srcGrid)[((C))+(i)])) = (1.0/ 3.0)*rho*(1.0 - u2);
+      (((srcGrid)[((N))+(i)])) = (1.0/18.0)*rho*(1.0 + uy*(4.5*uy + 3.0) - u2);
+    }
+}
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 3aae1776ef9..ab8504a10bd 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -57,6 +57,9 @@ vect_free_slp_tree (slp_tree node, bool final_p)
   int i;
   slp_tree child;
 
+  if (--node->refcnt != 0)
+    return;
+
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
     vect_free_slp_tree (child, final_p);
 
@@ -82,7 +85,6 @@ vect_free_slp_tree (slp_tree node, bool final_p)
   free (node);
 }
 
-
 /* Free the memory allocated for the SLP instance.  FINAL_P is true if we
    have vectorized the instance or if we have made a final decision not
    to vectorize the statements in any way.  */
@@ -126,6 +128,7 @@ vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts)
   SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
   SLP_TREE_TWO_OPERATORS (node) = false;
   SLP_TREE_DEF_TYPE (node) = vect_internal_def;
+  node->refcnt = 1;
 
   unsigned i;
   FOR_EACH_VEC_ELT (scalar_stmts, i, stmt_info)
@@ -1021,9 +1024,6 @@ bst_traits::equal (value_type existing, value_type candidate)
   return true;
 }
 
-typedef hash_set <vec <gimple *>, bst_traits> scalar_stmts_set_t;
-static scalar_stmts_set_t *bst_fail;
-
 typedef hash_map <vec <gimple *>, slp_tree,
 		  simple_hashmap_traits <bst_traits, slp_tree> >
   scalar_stmts_to_slp_tree_map_t;
@@ -1034,30 +1034,33 @@ vect_build_slp_tree_2 (vec_info *vinfo,
 		       poly_uint64 *max_nunits,
 		       vec<slp_tree> *loads,
 		       bool *matches, unsigned *npermutes, unsigned *tree_size,
-		       unsigned max_tree_size);
+		       unsigned max_tree_size,
+		       scalar_stmts_to_slp_tree_map_t *bst_map);
 
 static slp_tree
 vect_build_slp_tree (vec_info *vinfo,
 		     vec<stmt_vec_info> stmts, unsigned int group_size,
 		     poly_uint64 *max_nunits, vec<slp_tree> *loads,
 		     bool *matches, unsigned *npermutes, unsigned *tree_size,
-		     unsigned max_tree_size)
+		     unsigned max_tree_size,
+		     scalar_stmts_to_slp_tree_map_t *bst_map)
 {
-  if (bst_fail->contains (stmts))
-    return NULL;
-  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
-					loads, matches, npermutes, tree_size,
-					max_tree_size);
-  /* When SLP build fails for stmts record this, otherwise SLP build
-     can be exponential in time when we allow to construct parts from
-     scalars, see PR81723.  */
-  if (! res)
+  if (slp_tree *leader = bst_map->get (stmts))
     {
-      vec <stmt_vec_info> x;
-      x.create (stmts.length ());
-      x.splice (stmts);
-      bst_fail->add (x);
+      if (dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
+			 *leader ? "" : "failed ", *leader);
+      if (*leader)
+	(*leader)->refcnt++;
+      return *leader;
     }
+  slp_tree res = vect_build_slp_tree_2 (vinfo, stmts, group_size, max_nunits,
+					loads, matches, npermutes, tree_size,
+					max_tree_size, bst_map);
+  /* Keep a reference for the bst_map use.  */
+  if (res)
+    res->refcnt++;
+  bst_map->put (stmts.copy (), res);
   return res;
 }
 
@@ -1074,7 +1077,8 @@ vect_build_slp_tree_2 (vec_info *vinfo,
 		       poly_uint64 *max_nunits,
 		       vec<slp_tree> *loads,
 		       bool *matches, unsigned *npermutes, unsigned *tree_size,
-		       unsigned max_tree_size)
+		       unsigned max_tree_size,
+		       scalar_stmts_to_slp_tree_map_t *bst_map)
 {
   unsigned nops, i, this_tree_size = 0;
   poly_uint64 this_max_nunits = *max_nunits;
@@ -1205,7 +1209,7 @@ vect_build_slp_tree_2 (vec_info *vinfo,
 					group_size, &this_max_nunits,
 					&this_loads, matches, npermutes,
 					&this_tree_size,
-					max_tree_size)) != NULL)
+					max_tree_size, bst_map)) != NULL)
 	{
 	  /* If we have all children of child built up from scalars then just
 	     throw that away and build it up this node from scalars.  */
@@ -1348,7 +1352,7 @@ vect_build_slp_tree_2 (vec_info *vinfo,
 					    group_size, &this_max_nunits,
 					    &this_loads, tem, npermutes,
 					    &this_tree_size,
-					    max_tree_size)) != NULL)
+					    max_tree_size, bst_map)) != NULL)
 	    {
 	      /* ... so if successful we can apply the operand swapping
 		 to the GIMPLE IL.  This is necessary because for example
@@ -1441,21 +1445,37 @@ fail:
 
 static void
 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
-		     slp_tree node)
+		     slp_tree node, hash_set<slp_tree> &visited)
 {
   int i;
   stmt_vec_info stmt_info;
   slp_tree child;
 
-  dump_printf_loc (dump_kind, loc, "node%s\n",
+  if (visited.add (node))
+    return;
+
+  dump_printf_loc (dump_kind, loc, "node%s %p\n",
 		   SLP_TREE_DEF_TYPE (node) != vect_internal_def
-		   ? " (external)" : "");
+		   ? " (external)" : "", node);
   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
     dump_printf_loc (dump_kind, loc, "\tstmt %d %G", i, stmt_info->stmt);
+  if (SLP_TREE_CHILDREN (node).is_empty ())
+    return;
+  dump_printf_loc (dump_kind, loc, "\tchildren");
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_print_slp_tree (dump_kind, loc, child);
+    dump_printf (dump_kind, " %p", (void *)child);
+  dump_printf (dump_kind, "\n");
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
+    vect_print_slp_tree (dump_kind, loc, child, visited);
 }
 
+static void
+vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
+		     slp_tree node)
+{
+  hash_set<slp_tree> visited;
+  vect_print_slp_tree (dump_kind, loc, node, visited);
+}
 
 /* Mark the tree rooted at NODE with MARK (PURE_SLP or HYBRID).
    If MARK is HYBRID, it refers to a specific stmt in NODE (the stmt at index
@@ -1509,15 +1529,19 @@ vect_mark_slp_stmts_relevant (slp_tree node)
 
 static void
 vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
-                          vec<unsigned> permutation)
+                          vec<unsigned> permutation,
+			  hash_set<slp_tree> &visited)
 {
   stmt_vec_info stmt_info;
   vec<stmt_vec_info> tmp_stmts;
   unsigned int i;
   slp_tree child;
 
+  if (visited.add (node))
+    return;
+
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_slp_rearrange_stmts (child, group_size, permutation);
+    vect_slp_rearrange_stmts (child, group_size, permutation, visited);
 
   gcc_assert (group_size == SLP_TREE_SCALAR_STMTS (node).length ());
   tmp_stmts.create (group_size);
@@ -1578,8 +1602,9 @@ vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
      statements in the nodes is not important unless they are memory
      accesses, we can rearrange the statements in all the nodes
      according to the order of the loads.  */
+  hash_set<slp_tree> visited;
   vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
-			    node->load_permutation);
+			    node->load_permutation, visited);
 
   /* We are done, no actual permutations need to be generated.  */
   poly_uint64 unrolling_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_instn);
@@ -1889,12 +1914,18 @@ vect_analyze_slp_instance (vec_info *vinfo,
   /* Build the tree for the SLP instance.  */
   bool *matches = XALLOCAVEC (bool, group_size);
   unsigned npermutes = 0;
-  bst_fail = new scalar_stmts_set_t ();
+  scalar_stmts_to_slp_tree_map_t *bst_map
+    = new scalar_stmts_to_slp_tree_map_t ();
   poly_uint64 max_nunits = nunits;
   node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
 			      &max_nunits, &loads, matches, &npermutes,
-			      NULL, max_tree_size);
-  delete bst_fail;
+			      NULL, max_tree_size, bst_map);
+  /* The map keeps a reference on SLP nodes built, release that.  */
+  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
+       it != bst_map->end (); ++it)
+    if ((*it).second)
+      vect_free_slp_tree ((*it).second, false);
+  delete bst_map;
   if (node != NULL)
     {
       /* Calculate the unrolling factor based on the smallest type.  */
@@ -1924,109 +1955,109 @@ vect_analyze_slp_instance (vec_info *vinfo,
 	}
       else
 	{
-      /* Create a new SLP instance.  */
-      new_instance = XNEW (struct _slp_instance);
-      SLP_INSTANCE_TREE (new_instance) = node;
-      SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
-      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
-      SLP_INSTANCE_LOADS (new_instance) = loads;
-
-      /* Compute the load permutation.  */
-      slp_tree load_node;
-      bool loads_permuted = false;
-      FOR_EACH_VEC_ELT (loads, i, load_node)
-	{
-	  vec<unsigned> load_permutation;
-	  int j;
-	  stmt_vec_info load_info;
-	  bool this_load_permuted = false;
-	  load_permutation.create (group_size);
-	  stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
-	    (SLP_TREE_SCALAR_STMTS (load_node)[0]);
-	  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
+	  /* Create a new SLP instance.  */
+	  new_instance = XNEW (struct _slp_instance);
+	  SLP_INSTANCE_TREE (new_instance) = node;
+	  SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
+	  SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
+	  SLP_INSTANCE_LOADS (new_instance) = loads;
+
+	  /* Compute the load permutation.  */
+	  slp_tree load_node;
+	  bool loads_permuted = false;
+	  FOR_EACH_VEC_ELT (loads, i, load_node)
 	    {
-	      int load_place = vect_get_place_in_interleaving_chain
-		(load_info, first_stmt_info);
-	      gcc_assert (load_place != -1);
-	      if (load_place != j)
-		this_load_permuted = true;
-	      load_permutation.safe_push (load_place);
+	      vec<unsigned> load_permutation;
+	      int j;
+	      stmt_vec_info load_info;
+	      bool this_load_permuted = false;
+	      load_permutation.create (group_size);
+	      stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT
+		  (SLP_TREE_SCALAR_STMTS (load_node)[0]);
+	      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
+		{
+		  int load_place = vect_get_place_in_interleaving_chain
+		      (load_info, first_stmt_info);
+		  gcc_assert (load_place != -1);
+		  if (load_place != j)
+		    this_load_permuted = true;
+		  load_permutation.safe_push (load_place);
+		}
+	      if (!this_load_permuted
+		  /* The load requires permutation when unrolling exposes
+		     a gap either because the group is larger than the SLP
+		     group-size or because there is a gap between the groups.  */
+		  && (known_eq (unrolling_factor, 1U)
+		      || (group_size == DR_GROUP_SIZE (first_stmt_info)
+			  && DR_GROUP_GAP (first_stmt_info) == 0)))
+		{
+		  load_permutation.release ();
+		  continue;
+		}
+	      SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
+	      loads_permuted = true;
 	    }
-	  if (!this_load_permuted
-	      /* The load requires permutation when unrolling exposes
-	         a gap either because the group is larger than the SLP
-		 group-size or because there is a gap between the groups.  */
-	      && (known_eq (unrolling_factor, 1U)
-		  || (group_size == DR_GROUP_SIZE (first_stmt_info)
-		      && DR_GROUP_GAP (first_stmt_info) == 0)))
+
+	  if (loads_permuted)
 	    {
-	      load_permutation.release ();
-	      continue;
+	      if (!vect_supported_load_permutation_p (new_instance))
+		{
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				     "Build SLP failed: unsupported load "
+				     "permutation %G", stmt_info->stmt);
+		  vect_free_slp_instance (new_instance, false);
+		  return false;
+		}
 	    }
-	  SLP_TREE_LOAD_PERMUTATION (load_node) = load_permutation;
-	  loads_permuted = true;
-	}
-
-      if (loads_permuted)
-        {
-          if (!vect_supported_load_permutation_p (new_instance))
-            {
-              if (dump_enabled_p ())
-		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				 "Build SLP failed: unsupported load "
-				 "permutation %G", stmt_info->stmt);
-	      vect_free_slp_instance (new_instance, false);
-              return false;
-            }
-        }
 
 	  /* If the loads and stores can be handled with load/store-lan
-	 instructions do not generate this SLP instance.  */
-      if (is_a <loop_vec_info> (vinfo)
-	  && loads_permuted
-	  && dr && vect_store_lanes_supported (vectype, group_size, false))
-	{
-	  slp_tree load_node;
-	  FOR_EACH_VEC_ELT (loads, i, load_node)
+	     instructions do not generate this SLP instance.  */
+	  if (is_a <loop_vec_info> (vinfo)
+	      && loads_permuted
+	      && dr && vect_store_lanes_supported (vectype, group_size, false))
 	    {
-	      stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
-		(SLP_TREE_SCALAR_STMTS (load_node)[0]);
-	      /* Use SLP for strided accesses (or if we can't load-lanes).  */
-	      if (STMT_VINFO_STRIDED_P (stmt_vinfo)
-		  || ! vect_load_lanes_supported
-			(STMT_VINFO_VECTYPE (stmt_vinfo),
-			 DR_GROUP_SIZE (stmt_vinfo), false))
-		break;
+	      slp_tree load_node;
+	      FOR_EACH_VEC_ELT (loads, i, load_node)
+		{
+		  stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
+		      (SLP_TREE_SCALAR_STMTS (load_node)[0]);
+		  /* Use SLP for strided accesses (or if we can't load-lanes).  */
+		  if (STMT_VINFO_STRIDED_P (stmt_vinfo)
+		      || ! vect_load_lanes_supported
+		      (STMT_VINFO_VECTYPE (stmt_vinfo),
+		       DR_GROUP_SIZE (stmt_vinfo), false))
+		    break;
+		}
+	      if (i == loads.length ())
+		{
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+				     "Built SLP cancelled: can use "
+				     "load/store-lanes\n");
+		  vect_free_slp_instance (new_instance, false);
+		  return false;
+		}
 	    }
-	  if (i == loads.length ())
+
+	  vinfo->slp_instances.safe_push (new_instance);
+
+	  if (dump_enabled_p ())
 	    {
-	      if (dump_enabled_p ())
-		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-				 "Built SLP cancelled: can use "
-				 "load/store-lanes\n");
-	      vect_free_slp_instance (new_instance, false);
-	      return false;
+	      dump_printf_loc (MSG_NOTE, vect_location,
+			       "Final SLP tree for instance:\n");
+	      vect_print_slp_tree (MSG_NOTE, vect_location, node);
 	    }
-	}
 
-      vinfo->slp_instances.safe_push (new_instance);
-
-      if (dump_enabled_p ())
-	{
-	  dump_printf_loc (MSG_NOTE, vect_location,
-			   "Final SLP tree for instance:\n");
-	  vect_print_slp_tree (MSG_NOTE, vect_location, node);
+	  return true;
 	}
-
-      return true;
-    }
     }
   else
     {
-  /* Failed to SLP.  */
-  /* Free the allocated memory.  */
-  scalar_stmts.release ();
-  loads.release ();
+      /* Failed to SLP.  */
+      /* Free the allocated memory.  */
+      scalar_stmts.release ();
+      loads.release ();
     }
 
   /* For basic block SLP, try to break the group up into multiples of the
@@ -3749,8 +3780,13 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
   if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
     return;
 
+  /* See if we have already vectorized the node in the graph of the
+     SLP instance.  */
+  if (SLP_TREE_VEC_STMTS (node).exists ())
+    return;
+
   /* See if we have already vectorized the same set of stmts and reuse their
-     vectorized stmts.  */
+     vectorized stmts across instances.  */
   if (slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node)))
     {
       SLP_TREE_VEC_STMTS (node).safe_splice (SLP_TREE_VEC_STMTS (*leader));
@@ -3778,8 +3814,7 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
   group_size = SLP_INSTANCE_GROUP_SIZE (instance);
 
   gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
-  if (!SLP_TREE_VEC_STMTS (node).exists ())
-    SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
+  SLP_TREE_VEC_STMTS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 08d696a2f7c..e1292aa6eb6 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -130,6 +130,8 @@ struct _slp_tree {
      scalar elements in one scalar iteration (GROUP_SIZE) multiplied by VF
      divided by vector size.  */
   unsigned int vec_stmts_size;
+  /* Reference count in the SLP graph.  */
+  unsigned int refcnt;
   /* Whether the scalar computations use two different operators.  */
   bool two_operators;
   /* The DEF type of this node.  */