re PR target/65697 (__atomic memory barriers not strong enough for __sync builtins)
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index b3b3abec11005a49ce734e84dd40d63128203fc3..e85e80dbbd10bec8cc7966fc8b11476b73803cc9 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -1,5 +1,5 @@
 /* SLP - Basic Block Vectorization
-   Copyright (C) 2007-2013 Free Software Foundation, Inc.
+   Copyright (C) 2007-2015 Free Software Foundation, Inc.
    Contributed by Dorit Naishlos <dorit@il.ibm.com>
    and Ira Rosen <irar@il.ibm.com>
 
@@ -24,40 +24,67 @@ along with GCC; see the file COPYING3.  If not see
 #include "coretypes.h"
 #include "dumpfile.h"
 #include "tm.h"
-#include "ggc.h"
+#include "alias.h"
+#include "symtab.h"
 #include "tree.h"
+#include "fold-const.h"
+#include "stor-layout.h"
 #include "target.h"
+#include "predict.h"
+#include "hard-reg-set.h"
+#include "function.h"
 #include "basic-block.h"
 #include "gimple-pretty-print.h"
-#include "tree-ssa.h"
+#include "tree-ssa-alias.h"
+#include "internal-fn.h"
+#include "gimple-expr.h"
+#include "gimple.h"
+#include "gimple-iterator.h"
+#include "gimple-ssa.h"
+#include "tree-phinodes.h"
+#include "ssa-iterators.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
 #include "tree-pass.h"
 #include "cfgloop.h"
+#include "rtl.h"
+#include "flags.h"
+#include "insn-config.h"
+#include "expmed.h"
+#include "dojump.h"
+#include "explow.h"
+#include "calls.h"
+#include "emit-rtl.h"
+#include "varasm.h"
+#include "stmt.h"
 #include "expr.h"
 #include "recog.h"             /* FIXME: for insn_data */
+#include "insn-codes.h"
 #include "optabs.h"
 #include "tree-vectorizer.h"
 #include "langhooks.h"
+#include "gimple-walk.h"
 
 /* Extract the location of the basic block in the source code.
   Return the basic block location on success and UNKNOWN_LOCATION if not.  */
 
-LOC
+source_location
 find_bb_location (basic_block bb)
 {
   gimple stmt = NULL;
   gimple_stmt_iterator si;
 
   if (!bb)
-    return UNKNOWN_LOC;
+    return UNKNOWN_LOCATION;
 
   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
     {
       stmt = gsi_stmt (si);
-      if (gimple_location (stmt) != UNKNOWN_LOC)
+      if (gimple_location (stmt) != UNKNOWN_LOCATION)
         return gimple_location (stmt);
     }
 
-  return UNKNOWN_LOC;
+  return UNKNOWN_LOCATION;
 }
 
 
@@ -91,7 +118,6 @@ vect_free_slp_instance (slp_instance instance)
 {
   vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
   SLP_INSTANCE_LOADS (instance).release ();
-  SLP_INSTANCE_BODY_COST_VEC (instance).release ();
   free (instance);
 }
 
@@ -121,6 +147,7 @@ vect_create_new_slp_node (vec<gimple> scalar_stmts)
   SLP_TREE_VEC_STMTS (node).create (0);
   SLP_TREE_CHILDREN (node).create (nops);
   SLP_TREE_LOAD_PERMUTATION (node) = vNULL;
+  SLP_TREE_TWO_OPERATORS (node) = false;
 
   return node;
 }
@@ -143,6 +170,7 @@ vect_create_oprnd_info (int nops, int group_size)
       oprnd_info->first_dt = vect_uninitialized_def;
       oprnd_info->first_op_type = NULL_TREE;
       oprnd_info->first_pattern = false;
+      oprnd_info->second_pattern = false;
       oprnds_info.quick_push (oprnd_info);
     }
 
@@ -184,8 +212,9 @@ vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
     {
       if (next_stmt == stmt)
        return result;
-      result++;
       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
+      if (next_stmt)
+       result += GROUP_GAP (vinfo_for_stmt (next_stmt));
     }
   while (next_stmt);
 
@@ -195,11 +224,13 @@ vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
 
 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
    they are of a valid type and that they match the defs of the first stmt of
-   the SLP group (stored in OPRNDS_INFO).  */
+   the SLP group (stored in OPRNDS_INFO).  If there was a fatal error
+   return -1; if the error could be corrected by swapping operands of the
+   operation return 1; if everything is OK return 0.  */
 
-static bool
+static int 
 vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
-                             gimple stmt, bool first,
+                             gimple stmt, unsigned stmt_num,
                              vec<slp_oprnd_info> *oprnds_info)
 {
   tree oprnd;
@@ -210,8 +241,11 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
   struct loop *loop = NULL;
   bool pattern = false;
   slp_oprnd_info oprnd_info;
-  int op_idx = 1;
-  tree compare_rhs = NULL_TREE;
+  int first_op_idx = 1;
+  bool commutative = false;
+  bool first_op_cond = false;
+  bool first = stmt_num == 0;
+  bool second = stmt_num == 1;
 
   if (loop_vinfo)
     loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -219,48 +253,53 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
   if (is_gimple_call (stmt))
     {
       number_of_oprnds = gimple_call_num_args (stmt);
-      op_idx = 3;
+      first_op_idx = 3;
     }
   else if (is_gimple_assign (stmt))
     {
+      enum tree_code code = gimple_assign_rhs_code (stmt);
       number_of_oprnds = gimple_num_ops (stmt) - 1;
       if (gimple_assign_rhs_code (stmt) == COND_EXPR)
-        number_of_oprnds++;
+       {
+         first_op_cond = true;
+         commutative = true;
+         number_of_oprnds++;
+       }
+      else
+       commutative = commutative_tree_code (code);
     }
   else
-    return false;
+    return -1;
 
+  bool swapped = false;
   for (i = 0; i < number_of_oprnds; i++)
     {
-      if (compare_rhs)
+again:
+      if (first_op_cond)
        {
-         oprnd = compare_rhs;
-         compare_rhs = NULL_TREE;
+         if (i == 0 || i == 1)
+           oprnd = TREE_OPERAND (gimple_op (stmt, first_op_idx),
+                                 swapped ? !i : i);
+         else
+           oprnd = gimple_op (stmt, first_op_idx + i - 1);
        }
       else
-        oprnd = gimple_op (stmt, op_idx++);
+        oprnd = gimple_op (stmt, first_op_idx + (swapped ? !i : i));
 
       oprnd_info = (*oprnds_info)[i];
 
-      if (COMPARISON_CLASS_P (oprnd))
-        {
-          compare_rhs = TREE_OPERAND (oprnd, 1);
-          oprnd = TREE_OPERAND (oprnd, 0);
-       }
-
       if (!vect_is_simple_use (oprnd, NULL, loop_vinfo, bb_vinfo, &def_stmt,
-                              &def, &dt)
-         || (!def_stmt && dt != vect_constant_def))
+                              &def, &dt))
        {
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                              "Build SLP failed: can't find def for ");
+                              "Build SLP failed: can't analyze def for ");
              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, oprnd);
               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
 
-         return false;
+         return -1;
        }
 
       /* Check if DEF_STMT is a part of a pattern in LOOP and get the def stmt
@@ -276,8 +315,20 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
          && !STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt)))
         {
           pattern = true;
-          if (!first && !oprnd_info->first_pattern)
+          if (!first && !oprnd_info->first_pattern
+             /* Allow different pattern state for the defs of the
+                first stmt in reduction chains.  */
+             && (oprnd_info->first_dt != vect_reduction_def
+                 || (!second && !oprnd_info->second_pattern)))
            {
+             if (i == 0
+                 && !swapped
+                 && commutative)
+               {
+                 swapped = true;
+                 goto again;
+               }
+
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -287,7 +338,7 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
 
-             return false;
+             return 1;
             }
 
           def_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (def_stmt));
@@ -298,7 +349,7 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
               if (dump_enabled_p ())
                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Unsupported pattern.\n");
-              return false;
+              return -1;
             }
 
           switch (gimple_code (def_stmt))
@@ -315,10 +366,13 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                 if (dump_enabled_p ())
                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "unsupported defining stmt:\n");
-                return false;
+                return -1;
             }
         }
 
+      if (second)
+       oprnd_info->second_pattern = pattern;
+
       if (first)
        {
          oprnd_info->first_dt = dt;
@@ -342,11 +396,20 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                || !types_compatible_p (oprnd_info->first_op_type,
                                       TREE_TYPE (oprnd))))
            {
+             /* Try swapping operands if we got a mismatch.  */
+             if (i == 0
+                 && !swapped
+                 && commutative)
+               {
+                 swapped = true;
+                 goto again;
+               }
+
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");
 
-             return false;
+             return 1;
            }
        }
 
@@ -372,11 +435,26 @@ vect_get_and_check_slp_defs (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
 
-         return false;
+         return -1;
        }
     }
 
-  return true;
+  /* Swap operands.  */
+  if (swapped)
+    {
+      if (first_op_cond)
+       {
+         tree cond = gimple_assign_rhs1 (stmt);
+         swap_ssa_operands (stmt, &TREE_OPERAND (cond, 0),
+                            &TREE_OPERAND (cond, 1));
+         TREE_SET_CODE (cond, swap_tree_comparison (TREE_CODE (cond)));
+       }
+      else
+       swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
+                          gimple_assign_rhs2_ptr (stmt));
+    }
+
+  return 0;
 }
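
A minimal sketch (hypothetical input, not part of the patch) of what the new protocol and swapping enable: the two statements below present their external and loaded operands in opposite order, which previously failed SLP discovery outright.

    void
    foo (int *a, int *b, int x)
    {
      a[0] = x + b[0];  /* Operand 0 is external, operand 1 is a load.  */
      a[1] = b[1] + x;  /* Def types mismatch on operand 0; PLUS_EXPR is
                           commutative, so the check is retried with the
                           operands swapped and, on success, the swap is
                           applied to the stmt via swap_ssa_operands.  */
    }

A mismatch that swapping cannot fix now returns 1 rather than -1, letting the caller treat it as a per-stmt failure instead of a fatal one.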
 
 
@@ -391,22 +469,24 @@ static bool
 vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                       vec<gimple> stmts, unsigned int group_size,
                       unsigned nops, unsigned int *max_nunits,
-                      unsigned int vectorization_factor, bool *matches)
+                      unsigned int vectorization_factor, bool *matches,
+                      bool *two_operators)
 {
   unsigned int i;
-  gimple stmt = stmts[0];
-  enum tree_code first_stmt_code = ERROR_MARK, rhs_code = ERROR_MARK;
+  gimple first_stmt = stmts[0], stmt = stmts[0];
+  enum tree_code first_stmt_code = ERROR_MARK;
+  enum tree_code alt_stmt_code = ERROR_MARK;
+  enum tree_code rhs_code = ERROR_MARK;
   enum tree_code first_cond_code = ERROR_MARK;
   tree lhs;
   bool need_same_oprnds = false;
-  tree vectype, scalar_type, first_op1 = NULL_TREE;
+  tree vectype = NULL_TREE, scalar_type, first_op1 = NULL_TREE;
   optab optab;
   int icode;
-  enum machine_mode optab_op2_mode;
-  enum machine_mode vec_mode;
-  struct data_reference *first_dr;
+  machine_mode optab_op2_mode;
+  machine_mode vec_mode;
   HOST_WIDE_INT dummy;
-  gimple first_load = NULL, prev_first_load = NULL, old_first_load = NULL;
+  gimple first_load = NULL, prev_first_load = NULL;
   tree cond;
 
   /* For every stmt in NODE find its def stmt/s.  */
@@ -487,6 +567,19 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
           return false;
         }
 
+      /* If populating the vector type requires unrolling then fail
+         before adjusting *max_nunits for basic-block vectorization.  */
+      if (bb_vinfo
+         && TYPE_VECTOR_SUBPARTS (vectype) > group_size)
+       {
+         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
+                          "Build SLP failed: unrolling required "
+                          "in basic block SLP\n");
+         /* Fatal mismatch.  */
+         matches[0] = false;
+         return false;
+       }
+
       /* In case of multiple types we need to detect the smallest type.  */
       if (*max_nunits < TYPE_VECTOR_SUBPARTS (vectype))
         {
@@ -495,20 +588,21 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
             vectorization_factor = *max_nunits;
         }
 
-      if (is_gimple_call (stmt))
+      if (gcall *call_stmt = dyn_cast <gcall *> (stmt))
        {
          rhs_code = CALL_EXPR;
-         if (gimple_call_internal_p (stmt)
-             || gimple_call_tail_p (stmt)
-             || gimple_call_noreturn_p (stmt)
-             || !gimple_call_nothrow_p (stmt)
-             || gimple_call_chain (stmt))
+         if (gimple_call_internal_p (call_stmt)
+             || gimple_call_tail_p (call_stmt)
+             || gimple_call_noreturn_p (call_stmt)
+             || !gimple_call_nothrow_p (call_stmt)
+             || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
                                   "Build SLP failed: unsupported call type ");
-                 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+                 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                                   call_stmt, 0);
                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              /* Fatal mismatch.  */
@@ -579,11 +673,21 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
        }
       else
        {
+         if (first_stmt_code != rhs_code
+             && alt_stmt_code == ERROR_MARK)
+           alt_stmt_code = rhs_code;
          if (first_stmt_code != rhs_code
              && (first_stmt_code != IMAGPART_EXPR
                  || rhs_code != REALPART_EXPR)
              && (first_stmt_code != REALPART_EXPR
                  || rhs_code != IMAGPART_EXPR)
+             /* Handle mismatches in plus/minus by computing both
+                and merging the results.  */
+             && !((first_stmt_code == PLUS_EXPR
+                   || first_stmt_code == MINUS_EXPR)
+                  && (alt_stmt_code == PLUS_EXPR
+                      || alt_stmt_code == MINUS_EXPR)
+                  && rhs_code == alt_stmt_code)
               && !(STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
                    && (first_stmt_code == ARRAY_REF
                        || first_stmt_code == BIT_FIELD_REF
@@ -597,7 +701,10 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                                   "Build SLP failed: different operation "
                                   "in stmt ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
-                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  "original stmt ");
+                 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                                   first_stmt, 0);
                }
              /* Mismatch.  */
              continue;
@@ -652,34 +759,6 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
          else
            {
              /* Load.  */
-             unsigned unrolling_factor
-               = least_common_multiple
-                   (*max_nunits, group_size) / group_size;
-              /* FORNOW: Check that there is no gap between the loads
-                and no gap between the groups when we need to load
-                multiple groups at once.
-                ???  We should enhance this to only disallow gaps
-                inside vectors.  */
-              if ((unrolling_factor > 1
-                  && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt
-                  && GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
-                 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) != stmt
-                     && GROUP_GAP (vinfo_for_stmt (stmt)) != 1))
-                {
-                  if (dump_enabled_p ())
-                    {
-                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                      "Build SLP failed: grouped "
-                                      "loads have gaps ");
-                      dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                                       stmt, 0);
-                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
-                    }
-                 /* Fatal mismatch.  */
-                 matches[0] = false;
-                  return false;
-                }
-
               /* Check that the size of interleaved loads group is not
                  greater than the SLP group size.  */
              unsigned ncopies
@@ -705,7 +784,6 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                   return false;
                 }
 
-             old_first_load = first_load;
               first_load = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
               if (prev_first_load)
                 {
@@ -729,30 +807,6 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                 }
               else
                 prev_first_load = first_load;
-
-             /* In some cases a group of loads is just the same load
-                repeated N times.  Only analyze its cost once.  */
-              if (first_load == stmt && old_first_load != first_load)
-                {
-                  first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
-                  if (vect_supportable_dr_alignment (first_dr, false)
-                      == dr_unaligned_unsupported)
-                    {
-                      if (dump_enabled_p ())
-                        {
-                          dump_printf_loc (MSG_MISSED_OPTIMIZATION,
-                                          vect_location, 
-                                          "Build SLP failed: unsupported "
-                                          "unaligned load ");
-                          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                                           stmt, 0);
-                          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
-                        }
-                     /* Fatal mismatch.  */
-                     matches[0] = false;
-                      return false;
-                    }
-                }
            }
         } /* Grouped access.  */
       else
@@ -777,7 +831,7 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
          /* Not memory operation.  */
          if (TREE_CODE_CLASS (rhs_code) != tcc_binary
              && TREE_CODE_CLASS (rhs_code) != tcc_unary
-             && rhs_code != COND_EXPR
+             && TREE_CODE_CLASS (rhs_code) != tcc_expression
              && rhs_code != CALL_EXPR)
            {
              if (dump_enabled_p ())
@@ -823,6 +877,43 @@ vect_build_slp_tree_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
     if (!matches[i])
       return false;
 
+  /* If we allowed a two-operation SLP node, verify the target can cope
+     with the permute we are going to use.  */
+  if (alt_stmt_code != ERROR_MARK
+      && TREE_CODE_CLASS (alt_stmt_code) != tcc_reference)
+    {
+      unsigned char *sel
+       = XALLOCAVEC (unsigned char, TYPE_VECTOR_SUBPARTS (vectype));
+      for (i = 0; i < TYPE_VECTOR_SUBPARTS (vectype); ++i)
+       {
+         sel[i] = i;
+         if (gimple_assign_rhs_code (stmts[i % group_size]) == alt_stmt_code)
+           sel[i] += TYPE_VECTOR_SUBPARTS (vectype);
+       }
+      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+       {
+         for (i = 0; i < group_size; ++i)
+           if (gimple_assign_rhs_code (stmts[i]) == alt_stmt_code)
+             {
+               matches[i] = false;
+               if (dump_enabled_p ())
+                 {
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "Build SLP failed: different operation "
+                                    "in stmt ");
+                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                                     stmts[i], 0);
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "original stmt ");
+                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                                     first_stmt, 0);
+                 }
+             }
+         return false;
+       }
+      *two_operators = true;
+    }
+
   return true;
 }
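
A sketch of the two-operator groups this accepts (assuming a hypothetical V4SI target): mixed plus/minus statements are vectorized by computing both operations on full vectors and blending lanes, so for the group below the select vector built above is { 0, 5, 2, 7 } and can_vec_perm_p must confirm the target supports that permute (e.g. via an addsub-style instruction).

    void
    addsub (int *a, int *b, int *c)
    {
      a[0] = b[0] + c[0];  /* first_stmt_code = PLUS_EXPR.  */
      a[1] = b[1] - c[1];  /* alt_stmt_code = MINUS_EXPR.   */
      a[2] = b[2] + c[2];
      a[3] = b[3] - c[3];
      /* Conceptually vectorized as
           tem1 = vb + vc;
           tem2 = vb - vc;
           va = VEC_PERM_EXPR <tem1, tem2, { 0, 5, 2, 7 }>;  */
    }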
 
@@ -839,16 +930,12 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                      unsigned int *max_nunits,
                      vec<slp_tree> *loads,
                      unsigned int vectorization_factor,
-                    bool *matches, unsigned *npermutes)
+                    bool *matches, unsigned *npermutes, unsigned *tree_size,
+                    unsigned max_tree_size)
 {
-  unsigned nops, i, this_npermutes = 0;
+  unsigned nops, i, this_tree_size = 0;
   gimple stmt;
 
-  if (!matches)
-    matches = XALLOCAVEC (bool, group_size);
-  if (!npermutes)
-    npermutes = &this_npermutes;
-
   matches[0] = false;
 
   stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
@@ -863,10 +950,13 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
   else
     return false;
 
+  bool two_operators = false;
   if (!vect_build_slp_tree_1 (loop_vinfo, bb_vinfo,
                              SLP_TREE_SCALAR_STMTS (*node), group_size, nops,
-                             max_nunits, vectorization_factor, matches))
+                             max_nunits, vectorization_factor, matches,
+                             &two_operators))
     return false;
+  SLP_TREE_TWO_OPERATORS (*node) = two_operators;
 
   /* If the SLP node is a load, terminate the recursion.  */
   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt))
@@ -881,13 +971,26 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
   slp_oprnd_info oprnd_info;
   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (*node), i, stmt)
     {
-      if (!vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
-                                       stmt, (i == 0), &oprnds_info))
+      switch (vect_get_and_check_slp_defs (loop_vinfo, bb_vinfo,
+                                          stmt, i, &oprnds_info))
        {
+       case 0:
+         break;
+       case -1:
+         matches[0] = false;
          vect_free_oprnd_info (oprnds_info);
          return false;
+       case 1:
+         matches[i] = false;
+         break;
        }
     }
+  for (i = 0; i < group_size; ++i)
+    if (!matches[i])
+      {
+       vect_free_oprnd_info (oprnds_info);
+       return false;
+      }
 
   stmt = SLP_TREE_SCALAR_STMTS (*node)[0];
 
@@ -901,6 +1004,12 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
       if (oprnd_info->first_dt != vect_internal_def)
         continue;
 
+      if (++this_tree_size > max_tree_size)
+       {
+         vect_free_oprnd_info (oprnds_info);
+         return false;
+       }
+
       child = vect_create_new_slp_node (oprnd_info->def_stmts);
       if (!child)
        {
@@ -908,16 +1017,78 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
          return false;
        }
 
-      bool *matches = XALLOCAVEC (bool, group_size);
       if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                               group_size, max_nunits, loads,
-                              vectorization_factor, matches, npermutes))
+                              vectorization_factor, matches,
+                              npermutes, &this_tree_size, max_tree_size))
        {
+	  /* If we have all children of child built up from scalars then just
+	     throw that away and build this node up from scalars instead.  */
+         if (!SLP_TREE_CHILDREN (child).is_empty ())
+           {
+             unsigned int j;
+             slp_tree grandchild;
+
+             FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
+               if (grandchild != NULL)
+                 break;
+             if (!grandchild)
+               {
+                 /* Roll back.  */
+                 *max_nunits = old_max_nunits;
+                 loads->truncate (old_nloads);
+                 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
+                     vect_free_slp_tree (grandchild);
+                 SLP_TREE_CHILDREN (child).truncate (0);
+
+                 dump_printf_loc (MSG_NOTE, vect_location,
+                                  "Building parent vector operands from "
+                                  "scalars instead\n");
+                 oprnd_info->def_stmts = vNULL;
+                 vect_free_slp_tree (child);
+                 SLP_TREE_CHILDREN (*node).quick_push (NULL);
+                 continue;
+               }
+           }
+
          oprnd_info->def_stmts = vNULL;
          SLP_TREE_CHILDREN (*node).quick_push (child);
          continue;
        }
 
+      /* If the SLP build failed fatally and we analyze a basic-block,
+         simply treat nodes we fail to build as externally defined
+	 (and thus build vectors from the scalar defs).
+	 The cost model will reject expensive cases outright.
+	 ???  This doesn't treat cases where permutation ultimately
+	 fails (or we don't try permutation below).  Ideally we'd
+	 even compute a permutation that will end up with the maximum
+	 SLP tree size...  */
+      if (bb_vinfo
+         && !matches[0]
+         /* ???  Rejecting patterns this way doesn't work.  We'd have to
+            do extra work to cancel the pattern so the uses see the
+            scalar version.  */
+         && !is_pattern_stmt_p (vinfo_for_stmt (stmt)))
+       {
+         unsigned int j;
+         slp_tree grandchild;
+
+         /* Roll back.  */
+         *max_nunits = old_max_nunits;
+         loads->truncate (old_nloads);
+         FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
+           vect_free_slp_tree (grandchild);
+         SLP_TREE_CHILDREN (child).truncate (0);
+
+         dump_printf_loc (MSG_NOTE, vect_location,
+                          "Building vector operands from scalars\n");
+         oprnd_info->def_stmts = vNULL;
+         vect_free_slp_tree (child);
+         SLP_TREE_CHILDREN (*node).quick_push (NULL);
+         continue;
+       }
+
      /* If the SLP build for operand zero failed and operands zero
	 and one can be commuted, try that for the scalar stmts
	 that failed the match.  */
@@ -930,29 +1101,53 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
          && oprnds_info[1]->first_dt == vect_internal_def
          && is_gimple_assign (stmt)
          && commutative_tree_code (gimple_assign_rhs_code (stmt))
+         && !SLP_TREE_TWO_OPERATORS (*node)
	  /* Do so only if the number of unsuccessful permutes was not more
	     than a cut-off, as re-trying the recursive match on
             possibly each level of the tree would expose exponential
             behavior.  */
          && *npermutes < 4)
        {
+         unsigned int j;
+         slp_tree grandchild;
+
          /* Roll back.  */
          *max_nunits = old_max_nunits;
          loads->truncate (old_nloads);
+         FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (child), j, grandchild)
+           vect_free_slp_tree (grandchild);
+         SLP_TREE_CHILDREN (child).truncate (0);
+
          /* Swap mismatched definition stmts.  */
-         for (unsigned j = 0; j < group_size; ++j)
+         dump_printf_loc (MSG_NOTE, vect_location,
+                          "Re-trying with swapped operands of stmts ");
+         for (j = 0; j < group_size; ++j)
            if (!matches[j])
              {
-               gimple tem = oprnds_info[0]->def_stmts[j];
-               oprnds_info[0]->def_stmts[j] = oprnds_info[1]->def_stmts[j];
-               oprnds_info[1]->def_stmts[j] = tem;
+               std::swap (oprnds_info[0]->def_stmts[j],
+                          oprnds_info[1]->def_stmts[j]);
+               dump_printf (MSG_NOTE, "%d ", j);
              }
-         /* And try again ... */
+         dump_printf (MSG_NOTE, "\n");
+         /* And try again with scratch 'matches' ... */
+         bool *tem = XALLOCAVEC (bool, group_size);
          if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &child,
                                   group_size, max_nunits, loads,
                                   vectorization_factor,
-                                  matches, npermutes))
+                                  tem, npermutes, &this_tree_size,
+                                  max_tree_size))
            {
+             /* ... so if successful we can apply the operand swapping
+                to the GIMPLE IL.  This is necessary because for example
+                vect_get_slp_defs uses operand indexes and thus expects
+                canonical operand order.  */
+             for (j = 0; j < group_size; ++j)
+               if (!matches[j])
+                 {
+                   gimple stmt = SLP_TREE_SCALAR_STMTS (*node)[j];
+                   swap_ssa_operands (stmt, gimple_assign_rhs1_ptr (stmt),
+                                      gimple_assign_rhs2_ptr (stmt));
+                 }
              oprnd_info->def_stmts = vNULL;
              SLP_TREE_CHILDREN (*node).quick_push (child);
              continue;
@@ -967,6 +1162,9 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
       return false;
     }
 
+  if (tree_size)
+    *tree_size += this_tree_size;
+
   vect_free_oprnd_info (oprnds_info);
   return true;
 }
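
A hedged sketch of the basic-block fallback above (hypothetical input): the defs of x and y come from calls, so their SLP subtree fails fatally (matches[0] is false); the child is then replaced by NULL and the operand vector { x, y } is built from the scalar results, with the cost model left to judge profitability.

    extern int bar (void), baz (void);

    void
    foo (int *a, int *b)
    {
      int x = bar ();
      int y = baz ();
      a[0] = b[0] + x;  /* The { x, y } operand group cannot be built as
                           an SLP subtree (unsupported call defs), so it
                           is treated as externally defined instead of
                           failing the whole SLP instance.  */
      a[1] = b[1] + y;
    }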
@@ -1072,6 +1270,67 @@ vect_slp_rearrange_stmts (slp_tree node, unsigned int group_size,
 }
 
 
+/* Attempt to reorder stmts in a reduction chain so that we don't
+   require any load permutation.  Return true if that was possible,
+   otherwise return false.  */
+
+static bool
+vect_attempt_slp_rearrange_stmts (slp_instance slp_instn)
+{
+  unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
+  unsigned int i, j;
+  sbitmap load_index;
+  unsigned int lidx;
+  slp_tree node, load;
+
+  /* Compare all the permutation sequences to the first one.  We know
+     that at least one load is permuted.  */
+  node = SLP_INSTANCE_LOADS (slp_instn)[0];
+  if (!node->load_permutation.exists ())
+    return false;
+  for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
+    {
+      if (!load->load_permutation.exists ())
+       return false;
+      FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
+       if (lidx != node->load_permutation[j])
+         return false;
+    }
+
+  /* Check that the loads in the first sequence are different and there
+     are no gaps between them.  */
+  load_index = sbitmap_alloc (group_size);
+  bitmap_clear (load_index);
+  FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
+    {
+      if (bitmap_bit_p (load_index, lidx))
+       {
+         sbitmap_free (load_index);
+         return false;
+       }
+      bitmap_set_bit (load_index, lidx);
+    }
+  for (i = 0; i < group_size; i++)
+    if (!bitmap_bit_p (load_index, i))
+      {
+       sbitmap_free (load_index);
+       return false;
+      }
+  sbitmap_free (load_index);
+
+  /* This permutation is valid for reduction.  Since the order of the
+     statements in the nodes is not important unless they are memory
+     accesses, we can rearrange the statements in all the nodes
+     according to the order of the loads.  */
+  vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
+                           node->load_permutation);
+
+  /* We are done, no actual permutations need to be generated.  */
+  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
+    SLP_TREE_LOAD_PERMUTATION (node).release ();
+  return true;
+}
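
For example (a hypothetical reduction chain): every load node below carries the same permutation { 1, 0 }; since the order of a reduction is irrelevant, the scalar stmts are rearranged to match the load order and the permutation is released rather than generated.

    int
    foo (int *a, int n)
    {
      int sum = 0;
      for (int i = 0; i < n; i += 2)
        {
          sum += a[i + 1];  /* Loads appear in permuted order { 1, 0 },  */
          sum += a[i];      /* absorbed by rearranging the stmts.        */
        }
      return sum;
    }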
+
 /* Check if the required load permutations in the SLP instance
    SLP_INSTN are supported.  */
 
@@ -1080,7 +1339,6 @@ vect_supported_load_permutation_p (slp_instance slp_instn)
 {
   unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_instn);
   unsigned int i, j, k, next;
-  sbitmap load_index;
   slp_tree node;
   gimple stmt, load, next_load, first_load;
   struct data_reference *dr;
@@ -1093,8 +1351,8 @@ vect_supported_load_permutation_p (slp_instance slp_instn)
          FOR_EACH_VEC_ELT (node->load_permutation, j, next)
            dump_printf (MSG_NOTE, "%d ", next);
        else
-         for (i = 0; i < group_size; ++i)
-           dump_printf (MSG_NOTE, "%d ", i);
+         for (k = 0; k < group_size; ++k)
+           dump_printf (MSG_NOTE, "%d ", k);
       dump_printf (MSG_NOTE, "\n");
     }
 
@@ -1115,59 +1373,14 @@ vect_supported_load_permutation_p (slp_instance slp_instn)
   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
 
   /* Reduction (there are no data-refs in the root).
-     In reduction chain the order of the loads is important.  */
+     In a reduction chain the order of the loads is not important.  */
   if (!STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))
       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
     {
-      slp_tree load;
-      unsigned int lidx;
-
-      /* Compare all the permutation sequences to the first one.  We know
-         that at least one load is permuted.  */
-      node = SLP_INSTANCE_LOADS (slp_instn)[0];
-      if (!node->load_permutation.exists ())
-       return false;
-      for (i = 1; SLP_INSTANCE_LOADS (slp_instn).iterate (i, &load); ++i)
-       {
-         if (!load->load_permutation.exists ())
-           return false;
-         FOR_EACH_VEC_ELT (load->load_permutation, j, lidx)
-           if (lidx != node->load_permutation[j])
-             return false;
-       }
-
-      /* Check that the loads in the first sequence are different and there
-        are no gaps between them.  */
-      load_index = sbitmap_alloc (group_size);
-      bitmap_clear (load_index);
-      FOR_EACH_VEC_ELT (node->load_permutation, i, lidx)
-       {
-         if (bitmap_bit_p (load_index, lidx))
-           {
-             sbitmap_free (load_index);
-             return false;
-           }
-         bitmap_set_bit (load_index, lidx);
-       }
-      for (i = 0; i < group_size; i++)
-       if (!bitmap_bit_p (load_index, i))
-         {
-           sbitmap_free (load_index);
-           return false;
-         }
-      sbitmap_free (load_index);
-
-      /* This permutation is valid for reduction.  Since the order of the
-        statements in the nodes is not important unless they are memory
-        accesses, we can rearrange the statements in all the nodes
-        according to the order of the loads.  */
-      vect_slp_rearrange_stmts (SLP_INSTANCE_TREE (slp_instn), group_size,
-                               node->load_permutation);
+      if (vect_attempt_slp_rearrange_stmts (slp_instn))
+       return true;
 
-      /* We are done, no actual permutations need to be generated.  */
-      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-       SLP_TREE_LOAD_PERMUTATION (node).release ();
-      return true;
+      /* Fallthru to general load permutation handling.  */
     }
 
   /* In basic block vectorization we allow any subchain of an interleaving
@@ -1175,17 +1388,40 @@ vect_supported_load_permutation_p (slp_instance slp_instn)
     FORNOW: not supported in loop SLP because of realignment complications.  */
   if (STMT_VINFO_BB_VINFO (vinfo_for_stmt (stmt)))
     {
-      /* Check that for every node in the instance the loads
-        form a subchain.  */
+      /* Check whether the loads in an instance form a subchain and thus
+         no permutation is necessary.  */
       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
         {
+         if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
+           continue;
+         bool subchain_p = true;
           next_load = NULL;
           FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load)
             {
-              if (j != 0 && next_load != load)
-               return false;
+              if (j != 0
+                 && (next_load != load
+                     || GROUP_GAP (vinfo_for_stmt (load)) != 1))
+               {
+                 subchain_p = false;
+                 break;
+               }
               next_load = GROUP_NEXT_ELEMENT (vinfo_for_stmt (load));
             }
+         if (subchain_p)
+           SLP_TREE_LOAD_PERMUTATION (node).release ();
+         else
+           {
+             /* Verify the permutation can be generated.  */
+             vec<tree> tem;
+             if (!vect_transform_slp_perm_load (node, tem, NULL,
+                                                1, slp_instn, true))
+               {
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
+                                  vect_location,
+                                  "unsupported load permutation\n");
+                 return false;
+               }
+           }
         }
 
       /* Check that the alignment of the first load in every subchain, i.e.,
@@ -1214,105 +1450,48 @@ vect_supported_load_permutation_p (slp_instance slp_instn)
            }
        }
 
-      /* We are done, no actual permutations need to be generated.  */
-      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-       SLP_TREE_LOAD_PERMUTATION (node).release ();
       return true;
     }
 
-  /* FORNOW: the only supported permutation is 0..01..1.. of length equal to
-     GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as
-     well (unless it's reduction).  */
-  if (SLP_INSTANCE_LOADS (slp_instn).length () != group_size)
-    return false;
-  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-    if (!node->load_permutation.exists ())
-      return false;
-
-  load_index = sbitmap_alloc (group_size);
-  bitmap_clear (load_index);
-  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
-    {
-      unsigned int lidx = node->load_permutation[0];
-      if (bitmap_bit_p (load_index, lidx))
-       {
-         sbitmap_free (load_index);
-         return false;
-       }
-      bitmap_set_bit (load_index, lidx);
-      FOR_EACH_VEC_ELT (node->load_permutation, j, k)
-       if (k != lidx)
-         {
-           sbitmap_free (load_index);
-           return false;
-         }
-    }
-  for (i = 0; i < group_size; i++)
-    if (!bitmap_bit_p (load_index, i))
-      {
-       sbitmap_free (load_index);
-       return false;
-      }
-  sbitmap_free (load_index);
-
+  /* For loop vectorization verify we can generate the permutation.  */
   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (slp_instn), i, node)
     if (node->load_permutation.exists ()
        && !vect_transform_slp_perm_load
              (node, vNULL, NULL,
               SLP_INSTANCE_UNROLLING_FACTOR (slp_instn), slp_instn, true))
       return false;
-  return true;
-}
-
-
-/* Find the first load in the loop that belongs to INSTANCE.
-   When loads are in several SLP nodes, there can be a case in which the first
-   load does not appear in the first SLP node to be transformed, causing
-   incorrect order of statements.  Since we generate all the loads together,
-   they must be inserted before the first load of the SLP instance and not
-   before the first load of the first node of the instance.  */
-
-static gimple
-vect_find_first_load_in_slp_instance (slp_instance instance)
-{
-  int i, j;
-  slp_tree load_node;
-  gimple first_load = NULL, load;
 
-  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
-    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load)
-      first_load = get_earlier_stmt (load, first_load);
-
-  return first_load;
+  return true;
 }
 
 
 /* Find the last scalar stmt in the SLP tree NODE.  */
 
 static gimple
-vect_find_last_store_in_slp_instance (slp_instance instance)
+vect_find_last_scalar_stmt_in_slp (slp_tree node)
 {
-  int i;
-  slp_tree node;
-  gimple last_store = NULL, store;
+  gimple last = NULL, stmt;
 
-  node = SLP_INSTANCE_TREE (instance);
-  for (i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &store); i++)
-    last_store = get_later_stmt (store, last_store);
+  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt); i++)
+    {
+      stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
+      if (is_pattern_stmt_p (stmt_vinfo))
+       last = get_later_stmt (STMT_VINFO_RELATED_STMT (stmt_vinfo), last);
+      else
+       last = get_later_stmt (stmt, last);
+    }
 
-  return last_store;
+  return last;
 }
 
 /* Compute the cost for the SLP node NODE in the SLP instance INSTANCE.  */
 
 static void
-vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
-                        slp_instance instance, slp_tree node,
+vect_analyze_slp_cost_1 (slp_instance instance, slp_tree node,
                         stmt_vector_for_cost *prologue_cost_vec,
+                        stmt_vector_for_cost *body_cost_vec,
                         unsigned ncopies_for_cost)
 {
-  stmt_vector_for_cost *body_cost_vec = &SLP_INSTANCE_BODY_COST_VEC (instance);
-
   unsigned i;
   slp_tree child;
   gimple stmt, s;
@@ -1322,9 +1501,9 @@ vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 
   /* Recurse down the SLP tree.  */
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
-                            instance, child, prologue_cost_vec,
-                            ncopies_for_cost);
+    if (child)
+      vect_analyze_slp_cost_1 (instance, child, prologue_cost_vec,
+                              body_cost_vec, ncopies_for_cost);
 
   /* Look at the first scalar stmt to determine the cost.  */
   stmt = SLP_TREE_SCALAR_STMTS (node)[0];
@@ -1358,8 +1537,17 @@ vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
        }
     }
   else
-    record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
-                     stmt_info, 0, vect_body);
+    {
+      record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
+                       stmt_info, 0, vect_body);
+      if (SLP_TREE_TWO_OPERATORS (node))
+       {
+         record_stmt_cost (body_cost_vec, ncopies_for_cost, vector_stmt,
+                           stmt_info, 0, vect_body);
+         record_stmt_cost (body_cost_vec, ncopies_for_cost, vec_perm,
+                           stmt_info, 0, vect_body);
+       }
+    }
 
   /* Scan operands and account for prologue cost of constants/externals.
      ???  This over-estimates cost for multiple uses and should be
@@ -1372,19 +1560,28 @@ vect_analyze_slp_cost_1 (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
       enum vect_def_type dt;
       if (!op || op == lhs)
        continue;
-      if (vect_is_simple_use (op, NULL, loop_vinfo, bb_vinfo,
-                             &def_stmt, &def, &dt)
-         && (dt == vect_constant_def || dt == vect_external_def))
-       record_stmt_cost (prologue_cost_vec, 1, vector_stmt,
-                         stmt_info, 0, vect_prologue);
+      if (vect_is_simple_use (op, NULL, STMT_VINFO_LOOP_VINFO (stmt_info),
+                             STMT_VINFO_BB_VINFO (stmt_info),
+                             &def_stmt, &def, &dt))
+       {
+         /* Without looking at the actual initializer a vector of
+            constants can be implemented as load from the constant pool.
+            ???  We need to pass down stmt_info for a vector type
+            even if it points to the wrong stmt.  */
+         if (dt == vect_constant_def)
+           record_stmt_cost (prologue_cost_vec, 1, vector_load,
+                             stmt_info, 0, vect_prologue);
+         else if (dt == vect_external_def)
+           record_stmt_cost (prologue_cost_vec, 1, vec_construct,
+                             stmt_info, 0, vect_prologue);
+       }
     }
 }
 
 /* Compute the cost for the SLP instance INSTANCE.  */
 
 static void
-vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
-                      slp_instance instance, unsigned nunits)
+vect_analyze_slp_cost (slp_instance instance, void *data)
 {
   stmt_vector_for_cost body_cost_vec, prologue_cost_vec;
   unsigned ncopies_for_cost;
@@ -1395,20 +1592,38 @@ vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
      factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is
      GROUP_SIZE / NUNITS otherwise.  */
   unsigned group_size = SLP_INSTANCE_GROUP_SIZE (instance);
+  slp_tree node = SLP_INSTANCE_TREE (instance);
+  stmt_vec_info stmt_info = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
+  /* Adjust the group_size by the vectorization factor which is always one
+     for basic-block vectorization.  */
+  if (STMT_VINFO_LOOP_VINFO (stmt_info))
+    group_size *= LOOP_VINFO_VECT_FACTOR (STMT_VINFO_LOOP_VINFO (stmt_info));
+  unsigned nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
+  /* For reductions look at a reduction operand in case the reduction
+     operation is widening like DOT_PROD or SAD.  */
+  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
+    {
+      gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
+      switch (gimple_assign_rhs_code (stmt))
+       {
+       case DOT_PROD_EXPR:
+       case SAD_EXPR:
+         nunits = TYPE_VECTOR_SUBPARTS (get_vectype_for_scalar_type
+                               (TREE_TYPE (gimple_assign_rhs1 (stmt))));
+         break;
+       default:;
+       }
+    }
   ncopies_for_cost = least_common_multiple (nunits, group_size) / nunits;
 
   prologue_cost_vec.create (10);
   body_cost_vec.create (10);
-  SLP_INSTANCE_BODY_COST_VEC (instance) = body_cost_vec;
-  vect_analyze_slp_cost_1 (loop_vinfo, bb_vinfo,
-                          instance, SLP_INSTANCE_TREE (instance),
-                          &prologue_cost_vec, ncopies_for_cost);
+  vect_analyze_slp_cost_1 (instance, SLP_INSTANCE_TREE (instance),
+                          &prologue_cost_vec, &body_cost_vec,
+                          ncopies_for_cost);
 
   /* Record the prologue costs, which were delayed until we were
-     sure that SLP was successful.  Unlike the body costs, we know
-     the final values now regardless of the loop vectorization factor.  */
-  void *data = (loop_vinfo ? LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
-               : BB_VINFO_TARGET_COST_DATA (bb_vinfo));
+     sure that SLP was successful.  */
   FOR_EACH_VEC_ELT (prologue_cost_vec, i, si)
     {
       struct _stmt_vec_info *stmt_info
@@ -1417,7 +1632,17 @@ vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                            si->misalign, vect_prologue);
     }
 
+  /* Record the instance's instructions in the target cost model.  */
+  FOR_EACH_VEC_ELT (body_cost_vec, i, si)
+    {
+      struct _stmt_vec_info *stmt_info
+       = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
+      (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
+                           si->misalign, vect_body);
+    }
+
   prologue_cost_vec.release ();
+  body_cost_vec.release ();
 }
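
Illustrative numbers for the ncopies_for_cost computation (assuming 128-bit vectors): a group of 8 stmts with a V4SI vectype gives nunits = 4 and ncopies_for_cost = lcm (4, 8) / 4 = 2, while a DOT_PROD_EXPR reduction over signed chars takes nunits from the rhs1 vector type V16QI instead, giving lcm (16, 8) / 16 = 1 copy of the widening operation.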
 
 /* Analyze an SLP instance starting from a group of grouped stores.  Call
@@ -1426,7 +1651,7 @@ vect_analyze_slp_cost (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
 
 static bool
 vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
-                           gimple stmt)
+                           gimple stmt, unsigned max_tree_size)
 {
   slp_instance new_instance;
   slp_tree node;
@@ -1510,6 +1735,11 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
             scalar_stmts.safe_push (next);
           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
         }
+      /* Mark the first element of the reduction chain as reduction to properly
+        transform the node.  In the reduction analysis phase only the last
+        element of the chain is marked as reduction.  */
+      if (!STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (stmt)))
+       STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) = vect_reduction_def;
     }
   else
     {
@@ -1524,9 +1754,12 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
   loads.create (group_size);
 
   /* Build the tree for the SLP instance.  */
+  bool *matches = XALLOCAVEC (bool, group_size);
+  unsigned npermutes = 0;
   if (vect_build_slp_tree (loop_vinfo, bb_vinfo, &node, group_size,
                           &max_nunits, &loads,
-                          vectorization_factor, NULL, NULL))
+                          vectorization_factor, matches, &npermutes, NULL,
+                          max_tree_size))
     {
       /* Calculate the unrolling factor based on the smallest type.  */
       if (max_nunits > nunits)
@@ -1549,9 +1782,7 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
       SLP_INSTANCE_TREE (new_instance) = node;
       SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size;
       SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
-      SLP_INSTANCE_BODY_COST_VEC (new_instance) = vNULL;
       SLP_INSTANCE_LOADS (new_instance) = loads;
-      SLP_INSTANCE_FIRST_LOAD_STMT (new_instance) = NULL;
 
       /* Compute the load permutation.  */
       slp_tree load_node;
@@ -1574,7 +1805,13 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
                this_load_permuted = true;
              load_permutation.safe_push (load_place);
            }
-         if (!this_load_permuted)
+         if (!this_load_permuted
+             /* The load requires permutation when unrolling exposes
+                a gap either because the group is larger than the SLP
+                group-size or because there is a gap between the groups.  */
+             && (unrolling_factor == 1
+                 || (group_size == GROUP_SIZE (vinfo_for_stmt (first_stmt))
+                     && GROUP_GAP (vinfo_for_stmt (first_stmt)) == 0)))
            {
              load_permutation.release ();
              continue;
@@ -1598,17 +1835,11 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
               vect_free_slp_instance (new_instance);
               return false;
             }
-
-          SLP_INSTANCE_FIRST_LOAD_STMT (new_instance)
-           = vect_find_first_load_in_slp_instance (new_instance);
         }
 
-      /* Compute the costs of this SLP instance.  */
-      vect_analyze_slp_cost (loop_vinfo, bb_vinfo,
-                            new_instance, TYPE_VECTOR_SUBPARTS (vectype));
 
       if (loop_vinfo)
-        LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
+       LOOP_VINFO_SLP_INSTANCES (loop_vinfo).safe_push (new_instance);
       else
         BB_VINFO_SLP_INSTANCES (bb_vinfo).safe_push (new_instance);
 
@@ -1631,7 +1862,8 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
    trees of packed scalar stmts if SLP is possible.  */
 
 bool
-vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
+vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo,
+                 unsigned max_tree_size)
 {
   unsigned int i;
   vec<gimple> grouped_stores;
@@ -1654,24 +1886,16 @@ vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
 
   /* Find SLP sequences starting from groups of grouped stores.  */
   FOR_EACH_VEC_ELT (grouped_stores, i, first_element)
-    if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
+    if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
+                                  max_tree_size))
       ok = true;
 
-  if (bb_vinfo && !ok)
-    {
-      if (dump_enabled_p ())
-        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "Failed to SLP the basic block.\n");
-
-      return false;
-    }
-
-  if (loop_vinfo
-      && LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).length () > 0)
+  if (reduc_chains.length () > 0)
     {
       /* Find SLP sequences starting from reduction chains.  */
       FOR_EACH_VEC_ELT (reduc_chains, i, first_element)
-        if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element))
+        if (vect_analyze_slp_instance (loop_vinfo, bb_vinfo, first_element,
+                                      max_tree_size))
           ok = true;
         else
           return false;
@@ -1682,8 +1906,9 @@ vect_analyze_slp (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
     }
 
   /* Find SLP sequences starting from groups of reductions.  */
-  if (loop_vinfo && LOOP_VINFO_REDUCTIONS (loop_vinfo).length () > 1
-      && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0]))
+  if (reductions.length () > 1
+      && vect_analyze_slp_instance (loop_vinfo, bb_vinfo, reductions[0],
+                                   max_tree_size))
     ok = true;
 
   return true;
@@ -1734,48 +1959,113 @@ vect_make_slp_decision (loop_vec_info loop_vinfo)
    can't be SLPed) in the tree rooted at NODE.  Mark such stmts as HYBRID.  */
 
 static void
-vect_detect_hybrid_slp_stmts (slp_tree node)
+vect_detect_hybrid_slp_stmts (slp_tree node, unsigned i, slp_vect_type stype)
 {
-  int i;
-  vec<gimple> stmts = SLP_TREE_SCALAR_STMTS (node);
-  gimple stmt = stmts[0];
+  gimple stmt = SLP_TREE_SCALAR_STMTS (node)[i];
   imm_use_iterator imm_iter;
   gimple use_stmt;
-  stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
+  stmt_vec_info use_vinfo, stmt_vinfo = vinfo_for_stmt (stmt);
   slp_tree child;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
-  struct loop *loop = NULL;
-  bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_vinfo);
-  basic_block bb = NULL;
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  int j;
+
+  /* Propagate hybrid down the SLP tree.  */
+  if (stype == hybrid)
+    ;
+  else if (HYBRID_SLP_STMT (stmt_vinfo))
+    stype = hybrid;
+  else
+    {
+      /* Check if a pure SLP stmt has uses in non-SLP stmts.  */
+      gcc_checking_assert (PURE_SLP_STMT (stmt_vinfo));
+      /* We always get the pattern stmt here, but for immediate
+        uses we have to use the LHS of the original stmt.  */
+      gcc_checking_assert (!STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
+      if (STMT_VINFO_RELATED_STMT (stmt_vinfo))
+       stmt = STMT_VINFO_RELATED_STMT (stmt_vinfo);
+      if (TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
+       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
+         {
+           if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
+             continue;
+           use_vinfo = vinfo_for_stmt (use_stmt);
+           if (STMT_VINFO_IN_PATTERN_P (use_vinfo)
+               && STMT_VINFO_RELATED_STMT (use_vinfo))
+             use_vinfo = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (use_vinfo));
+           if (!STMT_SLP_TYPE (use_vinfo)
+               && (STMT_VINFO_RELEVANT (use_vinfo)
+                   || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (use_vinfo)))
+               && !(gimple_code (use_stmt) == GIMPLE_PHI
+                    && STMT_VINFO_DEF_TYPE (use_vinfo) == vect_reduction_def))
+             {
+               if (dump_enabled_p ())
+                 {
+                   dump_printf_loc (MSG_NOTE, vect_location, "use of SLP "
+                                    "def in non-SLP stmt: ");
+                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, use_stmt, 0);
+                 }
+               stype = hybrid;
+             }
+         }
+    }
 
-  if (!node)
-    return;
+  if (stype == hybrid
+      && !HYBRID_SLP_STMT (stmt_vinfo))
+    {
+      if (dump_enabled_p ())
+       {
+         dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
+         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+       }
+      STMT_SLP_TYPE (stmt_vinfo) = hybrid;
+    }
 
-  if (loop_vinfo)
-    loop = LOOP_VINFO_LOOP (loop_vinfo);
-  else
-    bb = BB_VINFO_BB (bb_vinfo);
+  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
+    if (child)
+      vect_detect_hybrid_slp_stmts (child, i, stype);
+}
 
-  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
-    if (PURE_SLP_STMT (vinfo_for_stmt (stmt))
-       && TREE_CODE (gimple_op (stmt, 0)) == SSA_NAME)
-      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, gimple_op (stmt, 0))
-       if (gimple_bb (use_stmt)
-            && ((loop && flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
-                || bb == gimple_bb (use_stmt))
-           && (stmt_vinfo = vinfo_for_stmt (use_stmt))
-           && !STMT_SLP_TYPE (stmt_vinfo)
-            && (STMT_VINFO_RELEVANT (stmt_vinfo)
-                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_vinfo)))
-           && !(gimple_code (use_stmt) == GIMPLE_PHI
-                 && STMT_VINFO_DEF_TYPE (stmt_vinfo)
-                  == vect_reduction_def))
-         vect_mark_slp_stmts (node, hybrid, i);
+/* Helpers for vect_detect_hybrid_slp walking pattern stmt uses.  */
 
-  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    vect_detect_hybrid_slp_stmts (child);
+static tree
+vect_detect_hybrid_slp_1 (tree *tp, int *, void *data)
+{
+  walk_stmt_info *wi = (walk_stmt_info *)data;
+  struct loop *loopp = (struct loop *)wi->info;
+
+  if (wi->is_lhs)
+    return NULL_TREE;
+
+  if (TREE_CODE (*tp) == SSA_NAME
+      && !SSA_NAME_IS_DEFAULT_DEF (*tp))
+    {
+      gimple def_stmt = SSA_NAME_DEF_STMT (*tp);
+      if (flow_bb_inside_loop_p (loopp, gimple_bb (def_stmt))
+         && PURE_SLP_STMT (vinfo_for_stmt (def_stmt)))
+       {
+         if (dump_enabled_p ())
+           {
+             dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: ");
+             dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
+           }
+         STMT_SLP_TYPE (vinfo_for_stmt (def_stmt)) = hybrid;
+       }
+    }
+
+  return NULL_TREE;
 }
 
+static tree
+vect_detect_hybrid_slp_2 (gimple_stmt_iterator *gsi, bool *handled,
+                         walk_stmt_info *)
+{
+  /* If the stmt is in an SLP instance then this isn't a reason
+     to mark use definitions in other SLP instances as hybrid.  */
+  if (STMT_SLP_TYPE (vinfo_for_stmt (gsi_stmt (*gsi))) != loop_vect)
+    *handled = true;
+  return NULL_TREE;
+}
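
As a side note on the walk above: a minimal standalone sketch of the
propagation idea, with node_t and its flags as hypothetical stand-ins
for slp_tree/stmt_vec_info state (not GCC API), would look like this;
once a stmt is known hybrid, the state is forced onto the whole subtree:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for an SLP tree node; "hybrid" mirrors
   STMT_SLP_TYPE == hybrid, "has_non_slp_use" mirrors a pure-SLP
   def being used by a relevant non-SLP stmt.  */
typedef struct node_t
{
  const char *name;
  bool hybrid;
  bool has_non_slp_use;
  struct node_t *children[2];
} node_t;

static void
detect_hybrid (node_t *node, bool stype_hybrid)
{
  if (!node)
    return;
  /* Turn hybrid on a non-SLP use, or inherit it from the parent.  */
  if (!stype_hybrid)
    stype_hybrid = node->hybrid || node->has_non_slp_use;
  if (stype_hybrid && !node->hybrid)
    {
      printf ("marking hybrid: %s\n", node->name);
      node->hybrid = true;
    }
  for (int j = 0; j < 2; ++j)
    detect_hybrid (node->children[j], stype_hybrid);
}

int
main (void)
{
  node_t load = { "load", false, false, { 0, 0 } };
  node_t mul = { "mul", false, true, { &load, 0 } };  /* def escapes SLP  */
  node_t store = { "store", false, false, { &mul, 0 } };
  detect_hybrid (&store, false);  /* marks mul, then load  */
  return 0;
}
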
 
 /* Find stmts that must be both vectorized and SLPed.  */
 
@@ -1790,8 +2080,41 @@ vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
     dump_printf_loc (MSG_NOTE, vect_location, "=== vect_detect_hybrid_slp ==="
                      "\n");
 
+  /* First walk all pattern stmts in the loop and mark the defs of
+     their uses as hybrid, because uses that appear inside pattern
+     stmts are not recorded in the immediate-use lists.  */
+  for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
+    {
+      basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
+      for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
+          gsi_next (&gsi))
+       {
+         gimple stmt = gsi_stmt (gsi);
+         stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+         if (STMT_VINFO_IN_PATTERN_P (stmt_info))
+           {
+             walk_stmt_info wi;
+             memset (&wi, 0, sizeof (wi));
+             wi.info = LOOP_VINFO_LOOP (loop_vinfo);
+             gimple_stmt_iterator gsi2
+               = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
+             walk_gimple_stmt (&gsi2, vect_detect_hybrid_slp_2,
+                               vect_detect_hybrid_slp_1, &wi);
+             walk_gimple_seq (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info),
+                              vect_detect_hybrid_slp_2,
+                              vect_detect_hybrid_slp_1, &wi);
+           }
+       }
+    }
+
+  /* Then walk the SLP instance trees, marking stmts with uses in
+     non-SLP stmts as hybrid and propagating hybrid down each
+     SLP tree, collecting the above info on the fly.  */
   FOR_EACH_VEC_ELT (slp_instances, i, instance)
-    vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance));
+    {
+      for (unsigned i = 0; i < SLP_INSTANCE_GROUP_SIZE (instance); ++i)
+       vect_detect_hybrid_slp_stmts (SLP_INSTANCE_TREE (instance),
+                                     i, pure_slp);
+    }
 }
 
 
@@ -1867,7 +2190,7 @@ destroy_bb_vec_info (bb_vec_info bb_vinfo)
    the subtree. Return TRUE if the operations are supported.  */
 
 static bool
-vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
+vect_slp_analyze_node_operations (slp_tree node)
 {
   bool dummy;
   int i;
@@ -1878,17 +2201,17 @@ vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
     return true;
 
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    if (!vect_slp_analyze_node_operations (bb_vinfo, child))
+    if (!vect_slp_analyze_node_operations (child))
       return false;
 
   FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
     {
       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
       gcc_assert (stmt_info);
-      gcc_assert (PURE_SLP_STMT (stmt_info));
+      gcc_assert (STMT_SLP_TYPE (stmt_info) != loop_vect);
 
       if (!vect_analyze_stmt (stmt, &dummy, node))
-        return false;
+       return false;
     }
 
   return true;
@@ -1898,23 +2221,34 @@ vect_slp_analyze_node_operations (bb_vec_info bb_vinfo, slp_tree node)
 /* Analyze statements in SLP instances of the basic block.  Return TRUE if the
    operations are supported. */
 
-static bool
-vect_slp_analyze_operations (bb_vec_info bb_vinfo)
+bool
+vect_slp_analyze_operations (vec<slp_instance> slp_instances, void *data)
 {
-  vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
   slp_instance instance;
   int i;
 
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "=== vect_slp_analyze_operations ===\n");
+
   for (i = 0; slp_instances.iterate (i, &instance); )
     {
-      if (!vect_slp_analyze_node_operations (bb_vinfo,
-                                             SLP_INSTANCE_TREE (instance)))
+      if (!vect_slp_analyze_node_operations (SLP_INSTANCE_TREE (instance)))
         {
-         vect_free_slp_instance (instance);
+         dump_printf_loc (MSG_NOTE, vect_location,
+                          "removing SLP instance operations starting from: ");
+         dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
+                           SLP_TREE_SCALAR_STMTS
+                             (SLP_INSTANCE_TREE (instance))[0], 0);
+         vect_free_slp_instance (instance);
           slp_instances.ordered_remove (i);
        }
       else
-        i++;
+       {
+         /* Compute the costs of the SLP instance.  */
+         vect_analyze_slp_cost (instance, data);
+         i++;
+       }
     }
 
   if (!slp_instances.length ())
@@ -1930,7 +2264,7 @@ vect_slp_analyze_operations (bb_vec_info bb_vinfo)
 
 static unsigned
 vect_bb_slp_scalar_cost (basic_block bb,
-                        slp_tree node, vec<bool, va_stack> life)
+                        slp_tree node, vec<bool, va_heap> *life)
 {
   unsigned scalar_cost = 0;
   unsigned i;
@@ -1944,7 +2278,7 @@ vect_bb_slp_scalar_cost (basic_block bb,
       def_operand_p def_p;
       stmt_vec_info stmt_info;
 
-      if (life[i])
+      if ((*life)[i])
        continue;
 
       /* If there is a non-vectorized use of the defs then the scalar
@@ -1957,15 +2291,16 @@ vect_bb_slp_scalar_cost (basic_block bb,
          imm_use_iterator use_iter;
          gimple use_stmt;
          FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
-           if (gimple_code (use_stmt) == GIMPLE_PHI
-               || gimple_bb (use_stmt) != bb
-               || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt)))
+           if (!is_gimple_debug (use_stmt)
+               && (gimple_code (use_stmt) == GIMPLE_PHI
+                   || gimple_bb (use_stmt) != bb
+                   || !STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (use_stmt))))
              {
-               life[i] = true;
+               (*life)[i] = true;
                BREAK_FROM_IMM_USE_STMT (use_iter);
              }
        }
-      if (life[i])
+      if ((*life)[i])
        continue;
 
       stmt_info = vinfo_for_stmt (stmt);
@@ -1983,7 +2318,8 @@ vect_bb_slp_scalar_cost (basic_block bb,
     }
 
   FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
-    scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
+    if (child)
+      scalar_cost += vect_bb_slp_scalar_cost (bb, child, life);
 
   return scalar_cost;
 }
@@ -1995,37 +2331,18 @@ vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo)
 {
   vec<slp_instance> slp_instances = BB_VINFO_SLP_INSTANCES (bb_vinfo);
   slp_instance instance;
-  int i, j;
+  int i;
   unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
   unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
-  void *target_cost_data = BB_VINFO_TARGET_COST_DATA (bb_vinfo);
-  stmt_vec_info stmt_info = NULL;
-  stmt_vector_for_cost body_cost_vec;
-  stmt_info_for_cost *ci;
-
-  /* Calculate vector costs.  */
-  FOR_EACH_VEC_ELT (slp_instances, i, instance)
-    {
-      body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
-
-      FOR_EACH_VEC_ELT (body_cost_vec, j, ci)
-       {
-         stmt_info = ci->stmt ? vinfo_for_stmt (ci->stmt) : NULL;
-         (void) add_stmt_cost (target_cost_data, ci->count, ci->kind,
-                               stmt_info, ci->misalign, vect_body);
-       }
-    }
 
   /* Calculate scalar cost.  */
   FOR_EACH_VEC_ELT (slp_instances, i, instance)
     {
-      vec<bool, va_stack> life;
-      vec_stack_alloc (bool, life, SLP_INSTANCE_GROUP_SIZE (instance));
-      life.quick_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
+      auto_vec<bool, 20> life;
+      life.safe_grow_cleared (SLP_INSTANCE_GROUP_SIZE (instance));
       scalar_cost += vect_bb_slp_scalar_cost (BB_VINFO_BB (bb_vinfo),
                                              SLP_INSTANCE_TREE (instance),
-                                             life);
-      life.release ();
+                                             &life);
     }
 
   /* Complete the target-specific cost calculation.  */
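
The "life" vector's effect on the sum is easy to check in isolation.  A
toy sketch with made-up per-stmt costs and escape flags (nothing here is
GCC's cost model): entries flagged true represent group members whose
scalar def escapes the block, so vectorizing saves nothing for them and
they are skipped.

#include <stdbool.h>
#include <stdio.h>

int
main (void)
{
  int cost[4] = { 1, 1, 4, 1 };                  /* scalar cost per stmt  */
  bool life[4] = { false, true, false, false };  /* stmt 1's def escapes  */
  unsigned scalar_cost = 0;
  for (int i = 0; i < 4; ++i)
    if (!life[i])
      scalar_cost += cost[i];
  printf ("saved scalar cost: %u\n", scalar_cost);  /* prints 6  */
  return 0;
}
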
@@ -2062,12 +2379,13 @@ vect_slp_analyze_bb_1 (basic_block bb)
   slp_instance instance;
   int i;
   int min_vf = 2;
+  unsigned n_stmts = 0;
 
   bb_vinfo = new_bb_vec_info (bb);
   if (!bb_vinfo)
     return NULL;
 
-  if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf))
+  if (!vect_analyze_data_refs (NULL, bb_vinfo, &min_vf, &n_stmts))
     {
       if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -2102,17 +2420,6 @@ vect_slp_analyze_bb_1 (basic_block bb)
 
   vect_pattern_recog (NULL, bb_vinfo);
 
-  if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
-     {
-       if (dump_enabled_p ())
-        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                         "not vectorized: unhandled data dependence "
-                         "in basic block.\n");
-
-       destroy_bb_vec_info (bb_vinfo);
-       return NULL;
-     }
-
   if (!vect_analyze_data_refs_alignment (NULL, bb_vinfo))
     {
       if (dump_enabled_p ())
@@ -2126,12 +2433,16 @@ vect_slp_analyze_bb_1 (basic_block bb)
 
   /* Check the SLP opportunities in the basic block, analyze and build SLP
      trees.  */
-  if (!vect_analyze_slp (NULL, bb_vinfo))
+  if (!vect_analyze_slp (NULL, bb_vinfo, n_stmts))
     {
       if (dump_enabled_p ())
-        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                        "not vectorized: failed to find SLP opportunities "
-                        "in basic block.\n");
+       {
+         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                          "Failed to SLP the basic block.\n");
+         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
+                          "not vectorized: failed to find SLP opportunities "
+                          "in basic block.\n");
+       }
 
       destroy_bb_vec_info (bb_vinfo);
       return NULL;
@@ -2147,6 +2458,29 @@ vect_slp_analyze_bb_1 (basic_block bb)
       vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
     }
 
+  /* Mark all the statements that we do not want to vectorize.  */
+  for (gimple_stmt_iterator gsi = gsi_start_bb (BB_VINFO_BB (bb_vinfo));
+       !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      stmt_vec_info vinfo = vinfo_for_stmt (gsi_stmt (gsi));
+      if (STMT_SLP_TYPE (vinfo) != pure_slp)
+       STMT_VINFO_VECTORIZABLE (vinfo) = false;
+    }
+
+  /* Analyze dependences.  At this point all stmts not participating in
+     vectorization have to be marked.  Dependence analysis assumes
+     that we either vectorize all SLP instances or none at all.  */
+  if (!vect_slp_analyze_data_ref_dependences (bb_vinfo))
+     {
+       if (dump_enabled_p ())
+        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                         "not vectorized: unhandled data dependence "
+                         "in basic block.\n");
+
+       destroy_bb_vec_info (bb_vinfo);
+       return NULL;
+     }
+
   if (!vect_verify_datarefs_alignment (NULL, bb_vinfo))
     {
       if (dump_enabled_p ())
@@ -2157,7 +2491,8 @@ vect_slp_analyze_bb_1 (basic_block bb)
       return NULL;
     }
 
-  if (!vect_slp_analyze_operations (bb_vinfo))
+  if (!vect_slp_analyze_operations (BB_VINFO_SLP_INSTANCES (bb_vinfo),
+                                   BB_VINFO_TARGET_COST_DATA (bb_vinfo)))
     {
       if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -2168,7 +2503,7 @@ vect_slp_analyze_bb_1 (basic_block bb)
     }
 
   /* Cost model: check if the vectorization is worthwhile.  */
-  if (!unlimited_cost_model ()
+  if (!unlimited_cost_model (NULL)
       && !vect_bb_vectorization_profitable_p (bb_vinfo))
     {
       if (dump_enabled_p ())
@@ -2245,45 +2580,6 @@ vect_slp_analyze_bb (basic_block bb)
 }
 
 
-/* SLP costs are calculated according to SLP instance unrolling factor (i.e.,
-   the number of created vector stmts depends on the unrolling factor).
-   However, the actual number of vector stmts for every SLP node depends on
-   VF which is set later in vect_analyze_operations ().  Hence, SLP costs
-   should be updated.  In this function we assume that the inside costs
-   calculated in vect_model_xxx_cost are linear in ncopies.  */
-
-void
-vect_update_slp_costs_according_to_vf (loop_vec_info loop_vinfo)
-{
-  unsigned int i, j, vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-  vec<slp_instance> slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
-  slp_instance instance;
-  stmt_vector_for_cost body_cost_vec;
-  stmt_info_for_cost *si;
-  void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
-
-  if (dump_enabled_p ())
-    dump_printf_loc (MSG_NOTE, vect_location,
-                    "=== vect_update_slp_costs_according_to_vf ===\n");
-
-  FOR_EACH_VEC_ELT (slp_instances, i, instance)
-    {
-      /* We assume that costs are linear in ncopies.  */
-      int ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (instance);
-
-      /* Record the instance's instructions in the target cost model.
-        This was delayed until here because the count of instructions
-        isn't known beforehand.  */
-      body_cost_vec = SLP_INSTANCE_BODY_COST_VEC (instance);
-
-      FOR_EACH_VEC_ELT (body_cost_vec, j, si)
-       (void) add_stmt_cost (data, si->count * ncopies, si->kind,
-                             vinfo_for_stmt (si->stmt), si->misalign,
-                             vect_body);
-    }
-}
-
-
 /* For constant and loop invariant defs of SLP_NODE this function returns
    (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts.
    OP_NUM determines if we gather defs for operand 0 or operand 1 of the RHS of
@@ -2318,11 +2614,14 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
   struct loop *loop;
   gimple_seq ctor_seq = NULL;
 
+  vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
+  nunits = TYPE_VECTOR_SUBPARTS (vector_type);
+
   if (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
       && reduc_index != -1)
     {
-      op_num = reduc_index - 1;
-      op = gimple_op (stmt, reduc_index);
+      op_num = reduc_index;
+      op = gimple_op (stmt, op_num + 1);
       /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
          we need either neutral operands or the original operands.  See
          get_initial_def_for_reduction() for details.  */
@@ -2330,6 +2629,7 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
         {
           case WIDEN_SUM_EXPR:
           case DOT_PROD_EXPR:
+         case SAD_EXPR:
           case PLUS_EXPR:
           case MINUS_EXPR:
           case BIT_IOR_EXPR:
@@ -2353,15 +2653,24 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
             neutral_op = build_int_cst (TREE_TYPE (op), -1);
             break;
 
-          case MAX_EXPR:
-          case MIN_EXPR:
-            def_stmt = SSA_NAME_DEF_STMT (op);
-            loop = (gimple_bb (stmt))->loop_father;
-            neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
-                                                loop_preheader_edge (loop));
-            break;
+         /* For MIN/MAX we don't have an easy neutral operand, but
+            the initial values can be used as-is here.  Only for
+            a reduction chain do we have to force a neutral element.  */
+         case MAX_EXPR:
+         case MIN_EXPR:
+           if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
+             neutral_op = NULL;
+           else
+             {
+               def_stmt = SSA_NAME_DEF_STMT (op);
+               loop = (gimple_bb (stmt))->loop_father;
+               neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
+                                                   loop_preheader_edge (loop));
+             }
+           break;
 
           default:
+           gcc_assert (!GROUP_FIRST_ELEMENT (stmt_vinfo));
             neutral_op = NULL;
         }
     }
@@ -2381,10 +2690,6 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
   else
     constant_p = false;
 
-  vector_type = get_vectype_for_scalar_type (TREE_TYPE (op));
-  gcc_assert (vector_type);
-  nunits = TYPE_VECTOR_SUBPARTS (vector_type);
-
   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
      created vectors. It is greater than 1 if unrolling is performed.
 
@@ -2401,10 +2706,11 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
      {s5, s6, s7, s8}.  */
 
-  number_of_copies = least_common_multiple (nunits, group_size) / group_size;
+  number_of_copies = nunits * number_of_vectors / group_size;
 
   number_of_places_left_in_vector = nunits;
   elts = XALLOCAVEC (tree, nunits);
+  bool place_after_defs = false;
   for (j = 0; j < number_of_copies; j++)
     {
       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
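
A quick standalone check of the NUMBER_OF_COPIES formula above, with
illustrative sizes only:

#include <stdio.h>

int
main (void)
{
  /* Eight scalars into two four-lane vectors: each scalar is used
     exactly once.  */
  int nunits = 4, number_of_vectors = 2, group_size = 8;
  printf ("%d\n", nunits * number_of_vectors / group_size);  /* 1  */

  /* Four scalars into two four-lane vectors (unrolling): each
     scalar is used twice.  */
  group_size = 4;
  printf ("%d\n", nunits * number_of_vectors / group_size);  /* 2  */
  return 0;
}
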
@@ -2475,6 +2781,7 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
 
           /* Create 'vect_ = {op0,op1,...,opn}'.  */
           number_of_places_left_in_vector--;
+         tree orig_op = op;
          if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
            {
              if (CONSTANT_CLASS_P (op))
@@ -2485,14 +2792,11 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
                }
              else
                {
-                 tree new_temp
-                   = make_ssa_name (TREE_TYPE (vector_type), NULL);
+                 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
                  gimple init_stmt;
-                 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
-                              op);               
+                 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type), op);
                  init_stmt
-                   = gimple_build_assign_with_ops (VIEW_CONVERT_EXPR,
-                                                   new_temp, op, NULL_TREE);
+                   = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR, op);
                  gimple_seq_add_stmt (&ctor_seq, init_stmt);
                  op = new_temp;
                }
@@ -2500,6 +2804,12 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
          elts[number_of_places_left_in_vector] = op;
          if (!CONSTANT_CLASS_P (op))
            constant_p = false;
+         if (TREE_CODE (orig_op) == SSA_NAME
+             && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
+             && STMT_VINFO_BB_VINFO (stmt_vinfo)
+             && (STMT_VINFO_BB_VINFO (stmt_vinfo)->bb
+                 == gimple_bb (SSA_NAME_DEF_STMT (orig_op))))
+           place_after_defs = true;
 
           if (number_of_places_left_in_vector == 0)
             {
@@ -2516,16 +2826,25 @@ vect_get_constant_vectors (tree op, slp_tree slp_node,
                    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
                  vec_cst = build_constructor (vector_type, v);
                }
-              voprnds.quick_push (vect_init_vector (stmt, vec_cst,
-                                                   vector_type, NULL));
+             tree init;
+             gimple_stmt_iterator gsi;
+             if (place_after_defs)
+               {
+                 gsi = gsi_for_stmt
+                         (vect_find_last_scalar_stmt_in_slp (slp_node));
+                 init = vect_init_vector (stmt, vec_cst, vector_type, &gsi);
+               }
+             else
+               init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
              if (ctor_seq != NULL)
                {
-                 gimple init_stmt = SSA_NAME_DEF_STMT (voprnds.last ());
-                 gimple_stmt_iterator gsi = gsi_for_stmt (init_stmt);
+                 gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
                  gsi_insert_seq_before_without_update (&gsi, ctor_seq,
                                                        GSI_SAME_STMT);
                  ctor_seq = NULL;
                }
+             voprnds.quick_push (init);
+             place_after_defs = false;
             }
         }
     }
@@ -2621,20 +2940,26 @@ vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
           child = SLP_TREE_CHILDREN (slp_node)[child_index];
 
          /* We have to check both pattern and original def, if available.  */
-         gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
-         gimple related = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
-
-         if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
-             || (related
-                 && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
+         if (child)
            {
-             /* The number of vector defs is determined by the number of
-                vector statements in the node from which we get those
-                statements.  */
-             number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
-             vectorized_defs = true;
-             child_index++;
+             gimple first_def = SLP_TREE_SCALAR_STMTS (child)[0];
+             gimple related
+               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first_def));
+
+             if (operand_equal_p (oprnd, gimple_get_lhs (first_def), 0)
+                 || (related
+                     && operand_equal_p (oprnd, gimple_get_lhs (related), 0)))
+               {
+                 /* The number of vector defs is determined by the number of
+                    vector statements in the node from which we get those
+                    statements.  */
+                 number_of_vects = SLP_TREE_NUMBER_OF_VEC_STMTS (child);
+                 vectorized_defs = true;
+                 child_index++;
+               }
            }
+         else
+           child_index++;
         }
 
       if (!vectorized_defs)
@@ -2689,7 +3014,7 @@ vect_get_slp_defs (vec<tree> ops, slp_tree slp_node,
    the created stmts must be inserted.  */
 
 static inline void
-vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
+vect_create_mask_and_perm (gimple stmt,
                            tree mask, int first_vec_indx, int second_vec_indx,
                            gimple_stmt_iterator *gsi, slp_tree node,
                            tree vectype, vec<tree> dr_chain,
@@ -2697,7 +3022,6 @@ vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
 {
   tree perm_dest;
   gimple perm_stmt = NULL;
-  stmt_vec_info next_stmt_info;
   int i, stride;
   tree first_vec, second_vec, data_ref;
 
@@ -2716,8 +3040,8 @@ vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
       second_vec = dr_chain[second_vec_indx];
 
       /* Generate the permute statement.  */
-      perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, perm_dest,
-                                               first_vec, second_vec, mask);
+      perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
+                                      first_vec, second_vec, mask);
       data_ref = make_ssa_name (perm_dest, perm_stmt);
       gimple_set_lhs (perm_stmt, data_ref);
       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
@@ -2728,10 +3052,6 @@ vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt,
       first_vec_indx += stride;
       second_vec_indx += stride;
     }
-
-  /* Mark the scalar stmt as vectorized.  */
-  next_stmt_info = vinfo_for_stmt (next_scalar_stmt);
-  STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt;
 }
 
 
@@ -2755,6 +3075,18 @@ vect_get_mask_element (gimple stmt, int first_mask_element, int m,
   /* Adjust the value in case it's a mask for second and third vectors.  */
   *current_mask_element -= mask_nunits * (*number_of_mask_fixes - 1);
 
+  if (*current_mask_element < 0)
+    {
+      if (dump_enabled_p ())
+       {
+         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                          "permutation requires past vector ");
+         dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+         dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
+       }
+      return false;
+    }
+
   if (*current_mask_element < mask_nunits)
     *needs_first_vector = true;
 
@@ -2774,7 +3106,7 @@ vect_get_mask_element (gimple stmt, int first_mask_element, int m,
     }
 
   /* The mask requires the next vector.  */
-  if (*current_mask_element >= mask_nunits * 2)
+  while (*current_mask_element >= mask_nunits * 2)
     {
       if (*needs_first_vector || *mask_fixed)
         {
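
Why the `if' above became a `while': an element index can lie more than
one full vector beyond the current two-vector window, so it has to be
reduced repeatedly.  A standalone sketch with illustrative numbers (the
needs_first_vector/mask_fixed bookkeeping is omitted):

#include <stdio.h>

int
main (void)
{
  int mask_nunits = 4;
  int current_mask_element = 13;  /* two vectors past the window  */
  int vec_index = 0;
  while (current_mask_element >= mask_nunits * 2)
    {
      current_mask_element -= mask_nunits;  /* slide the window  */
      vec_index++;
    }
  /* Ends with vec_index 2, in-window element 5.  */
  printf ("vec %d element %d\n", vec_index, current_mask_element);
  return 0;
}
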
@@ -2832,9 +3164,8 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
   gimple stmt = SLP_TREE_SCALAR_STMTS (node)[0];
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   tree mask_element_type = NULL_TREE, mask_type;
-  int i, j, k, nunits, vec_index = 0, scalar_index;
+  int i, j, k, nunits, vec_index = 0;
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-  gimple next_scalar_stmt;
   int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
   int first_mask_element;
   int index, unroll_factor, current_mask_element, ncopies;
@@ -2844,7 +3175,12 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
   int number_of_mask_fixes = 1;
   bool mask_fixed = false;
   bool needs_first_vector = false;
-  enum machine_mode mode;
+  machine_mode mode;
+
+  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
+    return false;
+
+  stmt_info = vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info));
 
   mode = TYPE_MODE (vectype);
 
@@ -2871,8 +3207,10 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
 
   /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE
      unrolling factor.  */
-  orig_vec_stmts_num = group_size *
-                SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits;
+  orig_vec_stmts_num
+    = (STMT_VINFO_GROUP_SIZE (stmt_info)
+       * SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance)
+       + nunits - 1) / nunits;
   if (orig_vec_stmts_num == 1)
     only_one_vec = true;
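
The new expression is plain integer ceiling division via the
(x + n - 1) / n idiom; a standalone check with made-up sizes:

#include <stdio.h>

int
main (void)
{
  int group_size = 3, unroll = 2, nunits = 4;
  int truncated = group_size * unroll / nunits;                  /* 1  */
  int rounded_up = (group_size * unroll + nunits - 1) / nunits;  /* 2  */
  printf ("%d %d\n", truncated, rounded_up);
  return 0;
}
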
 
@@ -2880,9 +3218,6 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
      relatively to SLP_NODE_INSTANCE unrolling factor.  */
   ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance);
 
-  if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
-    return false;
-
   /* Generate permutation masks for every NODE. Number of masks for each NODE
      is equal to GROUP_SIZE.
      E.g., we have a group of three nodes with three loads from the same
@@ -2901,8 +3236,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
      we need the second and the third vectors: {b1,c1,a2,b2} and
      {c2,a3,b3,c3}.  */
 
-    {
-      scalar_index = 0;
+  {
       index = 0;
       vect_stmts_counter = 0;
       vec_index = 0;
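
The mask arithmetic described in the comment above can be reproduced
standalone: for three interleaved streams a, b, c and four-lane vectors,
the flat element index for stream i in scalar iteration j is
i + j * group_size (illustrative sizes, not GCC code):

#include <stdio.h>

int
main (void)
{
  int group_size = 3, nunits = 4;
  for (int i = 0; i < group_size; ++i)
    {
      printf ("stream %c:", 'a' + i);
      for (int j = 0; j < nunits; ++j)
        printf (" %d", i + j * group_size);  /* 0 3 6 9 for 'a'  */
      printf ("\n");
    }
  return 0;
}
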
@@ -2917,7 +3251,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
           for (k = 0; k < group_size; k++)
             {
              i = SLP_TREE_LOAD_PERMUTATION (node)[k];
-              first_mask_element = i + j * group_size;
+              first_mask_element = i + j * STMT_VINFO_GROUP_SIZE (stmt_info);
               if (!vect_get_mask_element (stmt, first_mask_element, 0,
                                          nunits, only_one_vec, index,
                                          mask, &current_mask_element,
@@ -2925,6 +3259,8 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                                          &number_of_mask_fixes, &mask_fixed,
                                          &needs_first_vector))
                return false;
+             gcc_assert (current_mask_element >= 0
+                         && current_mask_element < 2 * nunits);
              mask[index++] = current_mask_element;
 
               if (index == nunits)
@@ -2961,10 +3297,7 @@ vect_transform_slp_perm_load (slp_tree node, vec<tree> dr_chain,
                           second_vec_index = vec_index;
                         }
 
-                      next_scalar_stmt
-                         = SLP_TREE_SCALAR_STMTS (node)[scalar_index++];
-
-                      vect_create_mask_and_perm (stmt, next_scalar_stmt,
+                      vect_create_mask_and_perm (stmt,
                                mask_vec, first_vec_index, second_vec_index,
                               gsi, node, vectype, dr_chain,
                               ncopies, vect_stmts_counter++);
@@ -3012,8 +3345,14 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
      for the scalar stmts in each node of the SLP tree.  Number of vector
      elements in one vector iteration is the number of scalar elements in
      one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector
-     size.  */
-  vec_stmts_size = (vectorization_factor * group_size) / nunits;
+     size.
+     Unless this is an SLP reduction, in which case the number of
+     vector stmts equals the number of vector stmts of the children.  */
+  if (GROUP_FIRST_ELEMENT (stmt_info)
+      && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
+    vec_stmts_size = SLP_TREE_NUMBER_OF_VEC_STMTS (SLP_TREE_CHILDREN (node)[0]);
+  else
+    vec_stmts_size = (vectorization_factor * group_size) / nunits;
 
   if (!SLP_TREE_VEC_STMTS (node).exists ())
     {
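
For the usual grouped-access case the count is simple arithmetic; a
standalone check with illustrative values:

#include <stdio.h>

int
main (void)
{
  /* 4 iterations x 2 scalars per group = 8 elements = 2 vectors
     of 4 lanes each.  */
  int vectorization_factor = 4, group_size = 2, nunits = 4;
  printf ("%d\n", vectorization_factor * group_size / nunits);
  return 0;
}
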
@@ -3029,26 +3368,9 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
       dump_printf (MSG_NOTE, "\n");
     }
 
-  /* Loads should be inserted before the first load.  */
-  if (SLP_INSTANCE_FIRST_LOAD_STMT (instance)
-      && STMT_VINFO_GROUPED_ACCESS (stmt_info)
-      && !REFERENCE_CLASS_P (gimple_get_lhs (stmt))
-      && SLP_TREE_LOAD_PERMUTATION (node).exists ())
-    si = gsi_for_stmt (SLP_INSTANCE_FIRST_LOAD_STMT (instance));
-  else if (is_pattern_stmt_p (stmt_info))
-    si = gsi_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
-  else
-    si = gsi_for_stmt (stmt);
-
-  /* Stores should be inserted just before the last store.  */
-  if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
-      && REFERENCE_CLASS_P (gimple_get_lhs (stmt)))
-    { 
-      gimple last_store = vect_find_last_store_in_slp_instance (instance);
-      if (is_pattern_stmt_p (vinfo_for_stmt (last_store)))
-       last_store = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (last_store));
-      si = gsi_for_stmt (last_store);
-    }
+  /* Vectorized stmts go before the last scalar stmt, which is where
+     all of their operands are ready.  */
+  si = gsi_for_stmt (vect_find_last_scalar_stmt_in_slp (node));
 
   /* Mark the first element of the reduction chain as reduction to properly
      transform the node.  In the analysis phase only the last element of the
@@ -3060,6 +3382,74 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance,
       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
     }
 
+  /* Handle two-operation SLP nodes by vectorizing the group with
+     both operations and then performing a merge.  */
+  if (SLP_TREE_TWO_OPERATORS (node))
+    {
+      enum tree_code code0 = gimple_assign_rhs_code (stmt);
+      enum tree_code ocode;
+      gimple ostmt;
+      unsigned char *mask = XALLOCAVEC (unsigned char, group_size);
+      bool allsame = true;
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, ostmt)
+       if (gimple_assign_rhs_code (ostmt) != code0)
+         {
+           mask[i] = 1;
+           allsame = false;
+           ocode = gimple_assign_rhs_code (ostmt);
+         }
+       else
+         mask[i] = 0;
+      if (!allsame)
+       {
+         vec<gimple> v0;
+         vec<gimple> v1;
+         unsigned j;
+         tree tmask = NULL_TREE;
+         vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
+         v0 = SLP_TREE_VEC_STMTS (node).copy ();
+         SLP_TREE_VEC_STMTS (node).truncate (0);
+         gimple_assign_set_rhs_code (stmt, ocode);
+         vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
+         gimple_assign_set_rhs_code (stmt, code0);
+         v1 = SLP_TREE_VEC_STMTS (node).copy ();
+         SLP_TREE_VEC_STMTS (node).truncate (0);
+         tree meltype = build_nonstandard_integer_type
+             (GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype))), 1);
+         tree mvectype = get_same_sized_vectype (meltype, vectype);
+         unsigned k = 0, l;
+         for (j = 0; j < v0.length (); ++j)
+           {
+             tree *melts = XALLOCAVEC (tree, TYPE_VECTOR_SUBPARTS (vectype));
+             for (l = 0; l < TYPE_VECTOR_SUBPARTS (vectype); ++l)
+               {
+                 if (k >= group_size)
+                   k = 0;
+                 melts[l] = build_int_cst
+                     (meltype, mask[k++] * TYPE_VECTOR_SUBPARTS (vectype) + l);
+               }
+             tmask = build_vector (mvectype, melts);
+
+             /* ???  Not all targets support a VEC_PERM_EXPR with a
+                constant mask that would translate to a vec_merge RTX
+                (with their vec_perm_const_ok).  We can either not
+                vectorize in that case or let veclower do its job.
+                Unfortunately that isn't too great and at least for
+                plus/minus we'd eventually like to match targets'
+                vector addsub instructions.  */
+             gimple vstmt;
+             vstmt = gimple_build_assign (make_ssa_name (vectype),
+                                          VEC_PERM_EXPR,
+                                          gimple_assign_lhs (v0[j]),
+                                          gimple_assign_lhs (v1[j]), tmask);
+             vect_finish_stmt_generation (stmt, vstmt, &si);
+             SLP_TREE_VEC_STMTS (node).quick_push (vstmt);
+           }
+         v0.release ();
+         v1.release ();
+         return false;
+       }
+    }
   is_store = vect_transform_stmt (stmt, &si, &grouped_store, node, instance);
   return is_store;
 }
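
The merge performed above can be mimicked outside the compiler: evaluate
both operations across all lanes, then select per lane with a 0/1 mask,
which is what the VEC_PERM_EXPR with mask element mask[k] * nunits + l
amounts to.  Plain C arrays stand in for vector types; values are
illustrative:

#include <stdio.h>

int
main (void)
{
  int a[4] = { 1, 2, 3, 4 }, b[4] = { 10, 20, 30, 40 };
  int v0[4], v1[4], r[4];
  unsigned char mask[4] = { 0, 1, 0, 1 };  /* 0 = code0, 1 = ocode  */
  for (int l = 0; l < 4; ++l)
    {
      v0[l] = a[l] + b[l];             /* group vectorized as code0  */
      v1[l] = a[l] - b[l];             /* group vectorized as ocode  */
      r[l] = mask[l] ? v1[l] : v0[l];  /* lane-wise merge            */
    }
  for (int l = 0; l < 4; ++l)
    printf ("%d ", r[l]);              /* 11 -18 33 -36  */
  printf ("\n");
  return 0;
}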