re PR target/65697 (__atomic memory barriers not strong enough for __sync builtins)

[gcc.git] / gcc / tree-vect-slp.c
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c

index 9137144e67a8b86636484cc329f55a49055e708f..e85e80dbbd10bec8cc7966fc8b11476b73803cc9 100644 (file)
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -24,15 +24,8 @@ along with GCC; see the file COPYING3.  If not see
  #include "coretypes.h"
  #include "dumpfile.h"
  #include "tm.h"
-#include "hash-set.h"
-#include "machmode.h"
-#include "vec.h"
-#include "double-int.h"
-#include "input.h"
  #include "alias.h"
  #include "symtab.h"
-#include "wide-int.h"
-#include "inchash.h"
  #include "tree.h"
  #include "fold-const.h"
  #include "stor-layout.h"
@@ -45,7 +38,6 @@ along with GCC; see the file COPYING3.  If not see
  #include "tree-ssa-alias.h"
  #include "internal-fn.h"
  #include "gimple-expr.h"
-#include "is-a.h"
  #include "gimple.h"
  #include "gimple-iterator.h"
  #include "gimple-ssa.h"
@@ -55,12 +47,8 @@ along with GCC; see the file COPYING3.  If not see
  #include "tree-ssanames.h"
  #include "tree-pass.h"
  #include "cfgloop.h"
-#include "hashtab.h"
  #include "rtl.h"
  #include "flags.h"
-#include "statistics.h"
-#include "real.h"
-#include "fixed-value.h"
  #include "insn-config.h"
  #include "expmed.h"
  #include "dojump.h"
@@ -301,13 +289,12 @@ again:
        oprnd_info = (*oprnds_info)[i];
  
        if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
-                              &def, &dt)
-         || (!def_stmt && dt != vect_constant_def))
+                              &def, &dt))
         {
           if (dump_enabled_p ())
             {
               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                              "Build SLP failed: can't find def for ");
+                              "Build SLP failed: can't analyze def for ");
               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
                dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
             }
@@ -493,14 +480,13 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
    enum tree_code first_cond_code = ERROR_MARK;
    tree lhs;
    bool need_same_oprnds = false;
-  tree vectype, scalar_type, first_op1 = NULL_TREE;
+  tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
    optab optab;
    int icode;
    machine_mode optab_op2_mode;
    machine_mode vec_mode;
-  struct data_reference *first_dr;
    HOST_WIDE_INT dummy;
-  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
+  gimple first_load = NULL, prev_first_load = NULL;
    tree cond;
  
    /* For every stmt in NODE find its def stmt/s.  */
@@ -773,37 +759,6 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
           else
             {
               /* Load.  */
-             unsigned unrolling_factor
-               = least_common_multiple
-                   (*max_nunits, group_size) / group_size;
-              /* FORNOW: Check that there is no gap between the loads
-                and no gap between the groups when we need to load
-                multiple groups at once.
-                ???  We should enhance this to only disallow gaps
-                inside vectors.  */
-              if ((unrolling_factor > 1
-                  && ((GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
-                       && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
-                      /* If the group is split up then GROUP_GAP
-                         isn't correct here, nor is GROUP_FIRST_ELEMENT.  */
-                      || GROUP_SIZE (vinfo_for_stmt (stmt)) > group_size))
-                 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
-                     && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
-                {
-                  if (dump_enabled_p ())
-                    {
-                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                      "Build SLP failed: grouped "
-                                      "loads have gaps ");
-                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                                       stmt, 0);
-                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
-                    }
-                 /* Fatal mismatch.  */
-                 matches[0] = false;
-                  return false;
-                }
-
                /* Check that the size of interleaved loads group is not
                   greater than the SLP group size.  */
               unsigned ncopies
@@ -829,7 +784,6 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                    return false;
                  }
  
-             old_first_load = first_load;
                first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
                if (prev_first_load)
                  {
@@ -853,30 +807,6 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                  }
                else
                  prev_first_load = first_load;
-
-             /* In some cases a group of loads is just the same load
-                repeated N times.  Only analyze its cost once.  */
-              if (first_load == stmt && old_first_load != first_load)
-                {
-                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
-                  if (vect_supportable_dr_alignment (first_dr, false)
-                      == dr_unaligned_unsupported)
-                    {
-                      if (dump_enabled_p ())
-                        {
-                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-                                          vect_location, 
-                                          "Build SLP failed: unsupported "
-                                          "unaligned load ");
-                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                                           stmt, 0);
-                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
-                        }
-                     /* Fatal mismatch.  */
-                     matches[0] = false;
-                      return false;
-                    }
-                }
             }
          } /* Grouped access.  */
        else
@@ -1092,6 +1022,35 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                                vectorization_factor, matches,
                                npermutes, &this_tree_size, max_tree_size))
         {
+         /* If we have all children of child built up from scalars then just
+            throw that away and build it up this node from scalars.  */
+         if (!SLP_TREE_CHILDREN (child).is_empty ())
+           {
+             unsigned int j;
+             slp_tree grandchild;
+
+             FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
+               if (grandchild != NULL)
+                 break;
+             if (!grandchild)
+               {
+                 /* Roll back.  */
+                 *max_nunits = old_max_nunits;
+                 loads->truncate (old_nloads);
+                 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
+                     vect_free_slp_tree (grandchild);
+                 SLP_TREE_CHILDREN (child).truncate (0);
+
+                 dump_printf_loc (MSG_NOTE, vect_location,
+                                  "Building parent vector operands from "
+                                  "scalars instead\n");
+                 oprnd_info->def_stmts = vNULL;
+                 vect_free_slp_tree (child);
+                 SLP_TREE_CHILDREN (*node).quick_push (NULL);
+                 continue;
+               }
+           }
+
           oprnd_info->def_stmts = vNULL;
           SLP_TREE_CHILDREN (*node).quick_push (child);
           continue;
@@ -1165,9 +1124,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
           for (j = 0; j < group_size; ++j)
             if (!matches[j])
               {
-               gimple tem = oprnds_info[0]->def_stmts[j];
-               oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
-               oprnds_info[1]->def_stmts[j] = tem;
+               std::swap (oprnds_info[0]->def_stmts[j],
+                          oprnds_info[1]->def_stmts[j]);
                 dump_printf (MSG_NOTE, "%d ", j);
               }
           dump_printf (MSG_NOTE, "\n");
@@ -1312,6 +1270,67 @@ vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
  }
  
  
+/* Attempt to reorder stmts in a reduction chain so that we don't
+   require any load permutation.  Return true if that was possible,
+   otherwise return false.  */
+
+static bool
+vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
+{
+  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
+  unsigned int i, j;
+  sbitmap load_index;
+  unsigned int lidx;
+  slp_tree node, load;
+
+  /* Compare all the permutation sequences to the first one.  We know
+     that at least one load is permuted.  */
+  node = SLP_INSTANCE_LOADS (slp_instn)[0];
+  if (!node->load_permutation.exists ())
+    return false;
+  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
+    {
+      if (!load->load_permutation.exists ())
+       return false;
+      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
+       if (lidx != node->load_permutation[j])
+         return false;
+    }
+
+  /* Check that the loads in the first sequence are different and there
+     are no gaps between them.  */
+  load_index = sbitmap_alloc (group_size);
+  bitmap_clear (load_index);
+  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
+    {
+      if (bitmap_bit_p (load_index, lidx))
+       {
+         sbitmap_free (load_index);
+         return false;
+       }
+      bitmap_set_bit (load_index, lidx);
+    }
+  for (i = 0; i < group_size; i++)
+    if (!bitmap_bit_p (load_index, i))
+      {
+       sbitmap_free (load_index);
+       return false;
+      }
+  sbitmap_free (load_index);
+
+  /* This permutation is valid for reduction.  Since the order of the
+     statements in the nodes is not important unless they are memory
+     accesses, we can rearrange the statements in all the nodes
+     according to the order of the loads.  */
+  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
+                           node->load_permutation);
+
+  /* We are done, no actual permutations need to be generated.  */
+  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
+    SLP_TREE_LOAD_PERMUTATION (node).release ();
+  return true;
+}
+
  /* Check if the required load permutations in the SLP instance
     SLP_INSTN are supported.  */
  
@@ -1320,7 +1339,6 @@ vect_supported_load_permutation_p (slp_instance slp_instn)
  {
    unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
    unsigned int i, j, k, next;
-  sbitmap load_index;
    slp_tree node;
    gimple stmt, load, next_load, first_load;
    struct data_reference *dr;
@@ -1355,59 +1373,14 @@ vect_supported_load_permutation_p (slp_instance slp_instn)
    stmt = SLP_TREE_SCALAR_STMTS (node)[0];
  
    /* Reduction (there are no data-refs in the root).
-     In reduction chain the order of the loads is important.  */
+     In reduction chain the order of the loads is not important.  */
    if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
        && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
      {
-      slp_tree load;
-      unsigned int lidx;
+      if (vect_attempt_slp_rearrange_stmts (slp_instn))
+       return true;
  
-      /* Compare all the permutation sequences to the first one.  We know
-         that at least one load is permuted.  */
-      node = SLP_INSTANCE_LOADS (slp_instn)[0];
-      if (!node->load_permutation.exists ())
-       return false;
-      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
-       {
-         if (!load->load_permutation.exists ())
-           return false;
-         FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
-           if (lidx != node->load_permutation[j])
-             return false;
-       }
-
-      /* Check that the loads in the first sequence are different and there
-        are no gaps between them.  */
-      load_index = sbitmap_alloc (group_size);
-      bitmap_clear (load_index);
-      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
-       {
-         if (bitmap_bit_p (load_index, lidx))
-           {
-             sbitmap_free (load_index);
-             return false;
-           }
-         bitmap_set_bit (load_index, lidx);
-       }
-      for (i = 0; i < group_size; i++)
-       if (!bitmap_bit_p (load_index, i))
-         {
-           sbitmap_free (load_index);
-           return false;
-         }
-      sbitmap_free (load_index);
-
-      /* This permutation is valid for reduction.  Since the order of the
-        statements in the nodes is not important unless they are memory
-        accesses, we can rearrange the statements in all the nodes
-        according to the order of the loads.  */
-      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
-                               node->load_permutation);
-
-      /* We are done, no actual permutations need to be generated.  */
-      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-       SLP_TREE_LOAD_PERMUTATION (node).release ();
-      return true;
+      /* Fallthru to general load permutation handling.  */
      }
  
    /* In basic block vectorization we allow any subchain of an interleaving
@@ -1425,7 +1398,9 @@ vect_supported_load_permutation_p (slp_instance slp_instn)
            next_load = NULL;
            FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
              {
-              if (j != 0 && next_load != load)
+              if (j != 0
+                 && (next_load != load
+                     || GROUP_GAP (vinfo_for_stmt (load)) != 1))
                 {
                   subchain_p = false;
                   break;
@@ -1478,47 +1453,14 @@ vect_supported_load_permutation_p (slp_instance slp_instn)
        return true;
      }
  
-  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
-     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
-     well (unless it's reduction).  */
-  if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
-    return false;
-  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-    if (!node->load_permutation.exists ())
-      return false;
-
-  load_index = sbitmap_alloc (group_size);
-  bitmap_clear (load_index);
-  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-    {
-      unsigned int lidx = node->load_permutation[0];
-      if (bitmap_bit_p (load_index, lidx))
-       {
-         sbitmap_free (load_index);
-         return false;
-       }
-      bitmap_set_bit (load_index, lidx);
-      FOR_EACH_VEC_ELT (node->load_permutation, j, k)
-       if (k != lidx)
-         {
-           sbitmap_free (load_index);
-           return false;
-         }
-    }
-  for (i = 0; i < group_size; i++)
-    if (!bitmap_bit_p (load_index, i))
-      {
-       sbitmap_free (load_index);
-       return false;
-      }
-  sbitmap_free (load_index);
-
+  /* For loop vectorization verify we can generate the permutation.  */
    FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
      if (node->load_permutation.exists ()
         && !vect_transform_slp_perm_load
               (node, vNULL, NULL,
                SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
        return false;
+
    return true;
  }
  
@@ -1793,6 +1735,11 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
              scalar_stmts.safe_push (next);
            next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
          }
+      /* Mark the first element of the reduction chain as reduction to properly
+        transform the node.  In the reduction analysis phase only the last
+        element of the chain is marked as reduction.  */
+      if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
+       STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
      }
    else
      {
@@ -1858,7 +1805,13 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                 this_load_permuted = true;
               load_permutation.safe_push (load_place);
             }
-         if (!this_load_permuted)
+         if (!this_load_permuted
+             /* The load requires permutation when unrolling exposes
+                a gap either because the group is larger than the SLP
+                group-size or because there is a gap between the groups.  */
+             && (unrolling_factor == 1
+                 || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
+                     && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
             {
               load_permutation.release ();
               continue;
@@ -2026,24 +1979,39 @@ vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
      {
        /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
        gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
+      /* We always get the pattern stmt here, but for immediate
+        uses we have to use the LHS of the original stmt.  */
+      gcc_checking_assert (!STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
+      if (STMT_VINFO_RELATED_STMT (stmt_vinfo))
+       stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
        if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
         FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
-         if (gimple_bb (use_stmt)
-             && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
-             && (use_vinfo = vinfo_for_stmt (use_stmt))
-             && !STMT_SLP_TYPE (use_vinfo)
-             && (STMT_VINFO_RELEVANT (use_vinfo)
-                 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo))
-                 || (STMT_VINFO_IN_PATTERN_P (use_vinfo)
-                     && STMT_VINFO_RELATED_STMT (use_vinfo)
-                     && !STMT_SLP_TYPE (vinfo_for_stmt
-                           (STMT_VINFO_RELATED_STMT (use_vinfo)))))
-             && !(gimple_code (use_stmt) == GIMPLE_PHI
-                  && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
-           stype = hybrid;
+         {
+           if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
+             continue;
+           use_vinfo = vinfo_for_stmt (use_stmt);
+           if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
+               && STMT_VINFO_RELATED_STMT (use_vinfo))
+             use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
+           if (!STMT_SLP_TYPE (use_vinfo)
+               && (STMT_VINFO_RELEVANT (use_vinfo)
+                   || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
+               && !(gimple_code (use_stmt) == GIMPLE_PHI
+                    && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
+             {
+               if (dump_enabled_p ())
+                 {
+                   dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
+                                    "def in non-SLP stmt: ");
+                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
+                 }
+               stype = hybrid;
+             }
+         }
      }
  
-  if (stype == hybrid)
+  if (stype == hybrid
+      && !HYBRID_SLP_STMT (stmt_vinfo))
      {
        if (dump_enabled_p ())
         {
@@ -2738,7 +2706,7 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
       (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
       {s5, s6, s7, s8}.  */
  
-  number_of_copies = least_common_multiple (nunits, group_size) / group_size;
+  number_of_copies = nunits * number_of_vectors / group_size;
  
    number_of_places_left_in_vector = nunits;
    elts = XALLOCAVEC (tree, nunits);
@@ -3046,7 +3014,7 @@ vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
     the created stmts must be inserted.  */
  
  static inline void
-vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
+vect_create_mask_and_perm (gimple stmt,
                             tree mask, int first_vec_indx, int second_vec_indx,
                             gimple_stmt_iterator *gsi, slp_tree node,
                             tree vectype, vec<tree> dr_chain,
@@ -3054,7 +3022,6 @@ vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
  {
    tree perm_dest;
    gimple perm_stmt = NULL;
-  stmt_vec_info next_stmt_info;
    int i, stride;
    tree first_vec, second_vec, data_ref;
  
@@ -3085,10 +3052,6 @@ vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
        first_vec_indx += stride;
        second_vec_indx += stride;
      }
-
-  /* Mark the scalar stmt as vectorized.  */
-  next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
-  STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
  }
  
  
@@ -3201,9 +3164,8 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
    gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
    stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
    tree mask_element_type = NULL_TREE, mask_type;
-  int i, j, k, nunits, vec_index = 0, scalar_index;
+  int i, j, k, nunits, vec_index = 0;
    tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-  gimple next_scalar_stmt;
    int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
    int first_mask_element;
    int index, unroll_factor, current_mask_element, ncopies;
@@ -3215,6 +3177,11 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
    bool needs_first_vector = false;
    machine_mode mode;
  
+  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
+    return false;
+
+  stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
+
    mode = TYPE_MODE (vectype);
  
    if (!can_vec_perm_p (mode, false, NULL))
@@ -3240,8 +3207,10 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
  
    /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
       unrolling factor.  */
-  orig_vec_stmts_num = group_size *
-                SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
+  orig_vec_stmts_num
+    = (STMT_VINFO_GROUP_SIZE (stmt_info)
+       * SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance)
+       + nunits - 1) / nunits;
    if (orig_vec_stmts_num == 1)
      only_one_vec = true;
  
@@ -3249,9 +3218,6 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
       relatively to SLP_NODE_INSTANCE unrolling factor.  */
    ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
  
-  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
-    return false;
-
    /* Generate permutation masks for every NODE. Number of masks for each NODE
       is equal to GROUP_SIZE.
       E.g., we have a group of three nodes with three loads from the same
@@ -3271,7 +3237,6 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
       {c2,a3,b3,c3}.  */
  
    {
-      scalar_index = 0;
        index = 0;
        vect_stmts_counter = 0;
        vec_index = 0;
@@ -3286,7 +3251,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
            for (k = 0; k < group_size; k++)
              {
               i = SLP_TREE_LOAD_PERMUTATION (node)[k];
-              first_mask_element = i + j * group_size;
+              first_mask_element = i + j * STMT_VINFO_GROUP_SIZE (stmt_info);
                if (!vect_get_mask_element (stmt, first_mask_element, 0,
                                           nunits, only_one_vec, index,
                                           mask, &current_mask_element,
@@ -3332,10 +3297,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                            second_vec_index = vec_index;
                          }
  
-                      next_scalar_stmt
-                         = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
-
-                      vect_create_mask_and_perm (stmt, next_scalar_stmt,
+                      vect_create_mask_and_perm (stmt,
                                 mask_vec, first_vec_index, second_vec_index,
                                gsi, node, vectype, dr_chain,
                                ncopies, vect_stmts_counter++);
@@ -3383,8 +3345,14 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
       for the scalar stmts in each node of the SLP tree.  Number of vector
       elements in one vector iteration is the number of scalar elements in
       one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
-     size.  */
-  vec_stmts_size = (vectorization_factor * group_size) / nunits;
+     size.
+     Unless this is a SLP reduction in which case the number of vector
+     stmts is equal to the number of vector stmts of the children.  */
+  if (GROUP_FIRST_ELEMENT (stmt_info)
+      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
+    vec_stmts_size = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
+  else
+    vec_stmts_size = (vectorization_factor * group_size) / nunits;
  
    if (!SLP_TREE_VEC_STMTS (node).exists ())
      {