diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index c9b653491f73879e300526918a5533ee0e627c1d..72bbec4b45d225d73eddfaca69468cd23842a42a 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -161,7 +161,7 @@ static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
    may already be set for general statements (not just data refs).  */
 
 static opt_result
-vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
+vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
                              bool vectype_maybe_set_p,
                              poly_uint64 *vf)
 {
@@ -177,7 +177,8 @@ vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
     }
 
   tree stmt_vectype, nunits_vectype;
-  opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
+  opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
+                                                  &stmt_vectype,
                                                   &nunits_vectype);
   if (!res)
     return res;
@@ -207,13 +208,13 @@ vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
    or false if something prevented vectorization.  */
 
 static opt_result
-vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf)
+vect_determine_vf_for_stmt (vec_info *vinfo,
+                           stmt_vec_info stmt_info, poly_uint64 *vf)
 {
-  vec_info *vinfo = stmt_info->vinfo;
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
                     stmt_info->stmt);
-  opt_result res = vect_determine_vf_for_stmt_1 (stmt_info, false, vf);
+  opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
   if (!res)
     return res;
 
@@ -232,7 +233,7 @@ vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "==> examining pattern def stmt: %G",
                             def_stmt_info->stmt);
-         res = vect_determine_vf_for_stmt_1 (def_stmt_info, true, vf);
+         res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
          if (!res)
            return res;
        }
@@ -241,7 +242,7 @@ vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf)
        dump_printf_loc (MSG_NOTE, vect_location,
                         "==> examining pattern statement: %G",
                         stmt_info->stmt);
-      res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf);
+      res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
       if (!res)
        return res;
     }
@@ -341,9 +342,12 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
+         if (is_gimple_debug (gsi_stmt (si)))
+           continue;
          stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
          opt_result res
-           = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor);
+           = vect_determine_vf_for_stmt (loop_vinfo,
+                                         stmt_info, &vectorization_factor);
          if (!res)
            return res;
         }
@@ -440,9 +444,8 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
    this function would then return true for x_2.  */
 
 static bool
-vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
+vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 {
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   use_operand_p use_p;
   ssa_op_iter op_iter;
   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
@@ -505,7 +508,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
        }
 
       if (!access_fn
-         || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
+         || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
          || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
          || (LOOP_VINFO_LOOP (loop_vinfo) != loop
              && TREE_CODE (step) != INTEGER_CST))
@@ -663,27 +666,50 @@ vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
   unsigned i;
 
   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
-    if (STMT_VINFO_IN_PATTERN_P (first))
-      {
-       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
-       while (next)
-         {
-           if (! STMT_VINFO_IN_PATTERN_P (next)
-               || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
-             break;
-           next = REDUC_GROUP_NEXT_ELEMENT (next);
-         }
-       /* If not all stmt in the chain are patterns or if we failed
-          to update STMT_VINFO_REDUC_IDX try to handle the chain
-          without patterns.  */
-       if (! next
-           && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
-         {
-           vect_fixup_reduc_chain (first);
-           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
-             = STMT_VINFO_RELATED_STMT (first);
-         }
-      }
+    {
+      stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
+      while (next)
+       {
+         if ((STMT_VINFO_IN_PATTERN_P (next)
+              != STMT_VINFO_IN_PATTERN_P (first))
+             || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
+           break;
+         next = REDUC_GROUP_NEXT_ELEMENT (next);
+       }
+      /* If all reduction chain members are well-formed patterns, adjust
+        the group so that it groups the pattern stmts instead.  */
+      if (! next
+         && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
+       {
+         if (STMT_VINFO_IN_PATTERN_P (first))
+           {
+             vect_fixup_reduc_chain (first);
+             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
+               = STMT_VINFO_RELATED_STMT (first);
+           }
+       }
+      /* If not all stmts in the chain are patterns, or if we failed
+        to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
+        it as a regular reduction instead.  */
+      else
+       {
+         stmt_vec_info vinfo = first;
+         stmt_vec_info last = NULL;
+         while (vinfo)
+           {
+             next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
+             REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
+             REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
+             last = vinfo;
+             vinfo = next;
+           }
+         STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
+           = vect_internal_def;
+         loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
+         LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
+         --i;
+       }
+    }
 }
 
 /* Function vect_get_loop_niters.
@@ -799,7 +825,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     vectorization_factor (0),
     max_vectorization_factor (0),
     mask_skip_niters (NULL_TREE),
-    mask_compare_type (NULL_TREE),
+    rgroup_compare_type (NULL_TREE),
     simd_if_cond (NULL_TREE),
     unaligned_dr (NULL),
     peeling_for_alignment (0),
@@ -811,8 +837,9 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     vec_outside_cost (0),
     vec_inside_cost (0),
     vectorizable (false),
-    can_fully_mask_p (true),
-    fully_masked_p (false),
+    can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
+    using_partial_vectors_p (false),
+    epil_using_partial_vectors_p (false),
     peeling_for_gaps (false),
     peeling_for_niter (false),
     no_data_dependencies (false),
@@ -846,6 +873,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
        {
          gimple *stmt = gsi_stmt (si);
          gimple_set_uid (stmt, 0);
+         if (is_gimple_debug (stmt))
+           continue;
          add_stmt (stmt);
          /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
             third argument is the #pragma omp simd if (x) condition, when 0,
@@ -873,16 +902,16 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
   epilogue_vinfos.create (6);
 }
 
-/* Free all levels of MASKS.  */
+/* Free all levels of rgroup CONTROLS.  */
 
 void
-release_vec_loop_masks (vec_loop_masks *masks)
+release_vec_loop_controls (vec<rgroup_controls> *controls)
 {
-  rgroup_masks *rgm;
+  rgroup_controls *rgc;
   unsigned int i;
-  FOR_EACH_VEC_ELT (*masks, i, rgm)
-    rgm->masks.release ();
-  masks->release ();
+  FOR_EACH_VEC_ELT (*controls, i, rgc)
+    rgc->controls.release ();
+  controls->release ();
 }
 
 /* Free all memory used by the _loop_vec_info, as well as all the
@@ -892,7 +921,8 @@ _loop_vec_info::~_loop_vec_info ()
 {
   free (bbs);
 
-  release_vec_loop_masks (&masks);
+  release_vec_loop_controls (&masks);
+  release_vec_loop_controls (&lens);
   delete ivexpr_map;
   delete scan_map;
   epilogue_vinfos.release ();
@@ -933,12 +963,12 @@ cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
 static bool
 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 {
-  rgroup_masks *rgm;
+  rgroup_controls *rgm;
   unsigned int i;
   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
-    if (rgm->mask_type != NULL_TREE
+    if (rgm->type != NULL_TREE
        && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
-                                           cmp_type, rgm->mask_type,
+                                           cmp_type, rgm->type,
                                            OPTIMIZE_FOR_SPEED))
       return false;
   return true;
@@ -952,20 +982,90 @@ vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
 {
   unsigned int res = 1;
   unsigned int i;
-  rgroup_masks *rgm;
+  rgroup_controls *rgm;
   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
     res = MAX (res, rgm->max_nscalars_per_iter);
   return res;
 }
 
+/* Calculate the minimum precision necessary to represent:
+
+      MAX_NITERS * FACTOR
+
+   as an unsigned integer, where MAX_NITERS is the maximum number of
+   loop header iterations for the original scalar form of LOOP_VINFO.  */
+
+static unsigned
+vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
+{
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+
+  /* Get the maximum number of iterations that is representable
+     in the counter type.  */
+  tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
+  widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
+
+  /* Get a more refined estimate for the number of iterations.  */
+  widest_int max_back_edges;
+  if (max_loop_iterations (loop, &max_back_edges))
+    max_ni = wi::smin (max_ni, max_back_edges + 1);
+
+  /* Work out how many bits we need to represent the limit.  */
+  return wi::min_precision (max_ni * factor, UNSIGNED);
+}
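As a rough aside (not part of the patch): the new helper boils down to asking how many bits an unsigned counter needs in order to reach MAX_NITERS * FACTOR.  A minimal sketch with plain 64-bit arithmetic instead of widest_int, using a hypothetical name, might look like this:

#include <stdint.h>

/* Simplified model of the computation above: the number of bits needed to
   represent MAX_NITERS * FACTOR as an unsigned value.  For example,
   max_niters = 1000 and factor = 4 give a limit of 4000, which needs
   12 bits.  */
static unsigned
min_prec_example (uint64_t max_niters, uint64_t factor)
{
  uint64_t limit = max_niters * factor;
  unsigned prec = 0;
  while (limit)
    {
      prec++;
      limit >>= 1;
    }
  return prec;
}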
+
+/* True if the loop needs peeling or partial vectors when vectorized.  */
+
+static bool
+vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
+{
+  unsigned HOST_WIDE_INT const_vf;
+  HOST_WIDE_INT max_niter
+    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
+
+  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
+  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
+    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
+                                         (loop_vinfo));
+
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+    {
+      /* Work out the (constant) number of iterations that need to be
+        peeled for reasons other than niters.  */
+      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+       peel_niter += 1;
+      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
+                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
+       return true;
+    }
+  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+      /* ??? When peeling for gaps but not alignment, we could
+        try to check whether the (variable) niters is known to be
+        VF * N + 1.  That's something of a niche case though.  */
+      || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+      || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
+      || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
+          < (unsigned) exact_log2 (const_vf))
+         /* In case of versioning, check if the maximum number of
+            iterations is greater than th.  If they are identical,
+            the epilogue is unnecessary.  */
+         && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+             || ((unsigned HOST_WIDE_INT) max_niter
+                 > (th / const_vf) * const_vf))))
+    return true;
+
+  return false;
+}
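A hedged numeric reading of the constant-niters branch above: with VF = 8, three iterations peeled for alignment and peeling for gaps enabled, peel_niter is 4; a loop with 100 known iterations then has 96 left, a multiple of 8, so the function returns false, whereas 101 known iterations would leave 97 and make it return true.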
+
 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
    whether we can actually generate the masks required.  Return true if so,
-   storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
+   storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
 
 static bool
 vect_verify_full_masking (loop_vec_info loop_vinfo)
 {
-  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsigned int min_ni_width;
   unsigned int max_nscalars_per_iter
     = vect_get_max_nscalars_per_iter (loop_vinfo);
@@ -976,27 +1076,15 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
     return false;
 
-  /* Get the maximum number of iterations that is representable
-     in the counter type.  */
-  tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
-  widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
-
-  /* Get a more refined estimate for the number of iterations.  */
-  widest_int max_back_edges;
-  if (max_loop_iterations (loop, &max_back_edges))
-    max_ni = wi::smin (max_ni, max_back_edges + 1);
-
-  /* Account for rgroup masks, in which each bit is replicated N times.  */
-  max_ni *= max_nscalars_per_iter;
-
   /* Work out how many bits we need to represent the limit.  */
-  min_ni_width = wi::min_precision (max_ni, UNSIGNED);
+  min_ni_width
+    = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
 
   /* Find a scalar mode for which WHILE_ULT is supported.  */
   opt_scalar_int_mode cmp_mode_iter;
   tree cmp_type = NULL_TREE;
   tree iv_type = NULL_TREE;
-  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
+  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
   unsigned int iv_precision = UINT_MAX;
 
   if (iv_limit != -1)
@@ -1049,8 +1137,83 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
   if (!cmp_type)
     return false;
 
-  LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
-  LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
+  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
+  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
+  return true;
+}
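For orientation (an illustrative sketch, not the GCC implementation): the IFN_WHILE_ULT capability checked above produces a lane mask whose element I is active while the scalar IV plus I is still below the limit.  A scalar model, with the function name and fixed scalar types being assumptions of the example:

#include <stdbool.h>

/* Scalar model of a WHILE_ULT-style lane mask: lane I of an NLANES-wide
   mask is active iff INDEX + I < LIMIT.  */
static void
while_ult_example (unsigned long long index, unsigned long long limit,
                   bool *mask, unsigned nlanes)
{
  for (unsigned i = 0; i < nlanes; i++)
    mask[i] = index + i < limit;
}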
+
+/* Check whether we can use vector access with length based on precision
+   comparison.  So far, to keep it simple, we only allow the case in which
+   the precision of the target-supported length is larger than the precision
+   required by the loop niters.  */
+
+static bool
+vect_verify_loop_lens (loop_vec_info loop_vinfo)
+{
+  if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
+    return false;
+
+  unsigned int max_nitems_per_iter = 1;
+  unsigned int i;
+  rgroup_controls *rgl;
+  /* Find the maximum number of items per iteration for every rgroup.  */
+  FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
+    {
+      unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
+      max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
+    }
+
+  /* Work out how many bits we need to represent the length limit.  */
+  unsigned int min_ni_prec
+    = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
+
+  /* Now use the maximum of the precisions below for one suitable IV type:
+     - the IV's natural precision
+     - the precision needed to hold: the maximum number of scalar
+       iterations multiplied by the scale factor (min_ni_prec above)
+     - the Pmode precision
+
+     If min_ni_prec is less than the precision of the current niters,
+     we prefer to still use the niters type.  Prefer to use Pmode and
+     wider IVs to avoid narrow conversions.  */
+
+  unsigned int ni_prec
+    = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
+  min_ni_prec = MAX (min_ni_prec, ni_prec);
+  min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
+
+  tree iv_type = NULL_TREE;
+  opt_scalar_int_mode tmode_iter;
+  FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
+    {
+      scalar_mode tmode = tmode_iter.require ();
+      unsigned int tbits = GET_MODE_BITSIZE (tmode);
+
+      /* ??? Do we really want to construct one IV whose precision exceeds
+        BITS_PER_WORD?  */
+      if (tbits > BITS_PER_WORD)
+       break;
+
+      /* Find the first available standard integral type.  */
+      if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
+       {
+         iv_type = build_nonstandard_integer_type (tbits, true);
+         break;
+       }
+    }
+
+  if (!iv_type)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "can't vectorize with length-based partial vectors"
+                        " because there is no suitable iv type.\n");
+      return false;
+    }
+
+  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
+  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
+
   return true;
 }
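As an illustrative reading (assuming a typical 64-bit target where Pmode and BITS_PER_WORD are both 64 bits): min_ni_prec is raised to at least 64 by the MAX computations, so the mode walk stops at the first integer mode of at least 64 bits, and both LOOP_VINFO_RGROUP_COMPARE_TYPE and LOOP_VINFO_RGROUP_IV_TYPE end up as a 64-bit unsigned type.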
 
@@ -1122,9 +1285,9 @@ vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
   int j;
   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
                    j, si)
-    (void) add_stmt_cost (target_cost_data, si->count,
-                         si->kind, si->stmt_info, si->misalign,
-                         vect_body);
+    (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
+                         si->kind, si->stmt_info, si->vectype,
+                         si->misalign, vect_body);
   unsigned dummy, body_cost = 0;
   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
   destroy_cost_data (target_cost_data);
@@ -1392,6 +1555,8 @@ vect_update_vf_for_slp (loop_vec_info loop_vinfo)
       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
+         if (is_gimple_debug (gsi_stmt (si)))
+           continue;
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
          stmt_info = vect_stmt_to_vectorize (stmt_info);
          if ((STMT_VINFO_RELEVANT_P (stmt_info)
@@ -1529,7 +1694,8 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
                  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
                       || (STMT_VINFO_DEF_TYPE (stmt_info)
                           == vect_double_reduction_def))
-                     && !vectorizable_lc_phi (stmt_info, NULL, NULL))
+                     && !vectorizable_lc_phi (loop_vinfo,
+                                              stmt_info, NULL, NULL))
                    return opt_result::failure_at (phi, "unsupported phi\n");
                 }
 
@@ -1551,21 +1717,24 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
               need_to_vectorize = true;
               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
                  && ! PURE_SLP_STMT (stmt_info))
-               ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
+               ok = vectorizable_induction (loop_vinfo,
+                                            stmt_info, NULL, NULL,
                                             &cost_vec);
              else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
                        || (STMT_VINFO_DEF_TYPE (stmt_info)
                            == vect_double_reduction_def)
                        || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
                       && ! PURE_SLP_STMT (stmt_info))
-               ok = vectorizable_reduction (stmt_info, NULL, NULL, &cost_vec);
+               ok = vectorizable_reduction (loop_vinfo,
+                                            stmt_info, NULL, NULL, &cost_vec);
             }
 
          /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
          if (ok
              && STMT_VINFO_LIVE_P (stmt_info)
              && !PURE_SLP_STMT (stmt_info))
-           ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL,
+           ok = vectorizable_live_operation (loop_vinfo,
+                                             stmt_info, NULL, NULL, NULL,
                                              -1, false, &cost_vec);
 
           if (!ok)
@@ -1579,10 +1748,12 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
           gsi_next (&si))
         {
          gimple *stmt = gsi_stmt (si);
-         if (!gimple_clobber_p (stmt))
+         if (!gimple_clobber_p (stmt)
+             && !is_gimple_debug (stmt))
            {
              opt_result res
-               = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
+               = vect_analyze_stmt (loop_vinfo,
+                                    loop_vinfo->lookup_stmt (stmt),
                                     &need_to_vectorize,
                                     NULL, NULL, &cost_vec);
              if (!res)
@@ -1591,7 +1762,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
         }
     } /* bbs */
 
-  add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
+  add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
 
   /* All operations in the loop are either irrelevant (deal with loop
      control, or dead), or only used outside the loop and can be moved
@@ -1611,6 +1782,27 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
   return opt_result::success ();
 }
 
+/* Return true if we know that the iteration count is smaller than the
+   vectorization factor.  Return false if it isn't, or if we can't be sure
+   either way.  */
+
+static bool
+vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
+{
+  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
+
+  HOST_WIDE_INT max_niter;
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
+  else
+    max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
+
+  if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
+    return true;
+
+  return false;
+}
+
 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
    definitely no, or -1 if it's worth retrying.  */
@@ -1621,19 +1813,11 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
 
-  /* Only fully-masked loops can have iteration counts less than the
-     vectorization factor.  */
-  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  /* Only loops that can handle partially-populated vectors can have iteration
+     counts less than the vectorization factor.  */
+  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
     {
-      HOST_WIDE_INT max_niter;
-
-      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
-       max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
-      else
-       max_niter = max_stmt_executions_int (loop);
-
-      if (max_niter != -1
-         && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
+      if (vect_known_niters_smaller_than_vf (loop_vinfo))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -1643,6 +1827,19 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
        }
     }
 
+  /* If using the "very cheap" model, reject cases in which we'd keep
+     a copy of the scalar code (even if we might be able to vectorize it).  */
+  if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
+      && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+         || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+         || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "some scalar iterations would need to be peeled\n");
+      return 0;
+    }
+
   int min_profitable_iters, min_profitable_estimate;
   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
                                      &min_profitable_estimate);
@@ -1701,6 +1898,20 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
       min_profitable_estimate = min_profitable_iters;
     }
 
+  /* If the vector loop needs multiple iterations to be beneficial then
+     things are probably too close to call, and the conservative thing
+     would be to stick with the scalar code.  */
+  if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
+      && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "one iteration of the vector loop would be"
+                        " more expensive than the equivalent number of"
+                        " iterations of the scalar loop\n");
+      return 0;
+    }
+
   HOST_WIDE_INT estimated_niter;
 
   /* If we are vectorizing an epilogue then we know the maximum number of
@@ -1749,7 +1960,8 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
        if (is_gimple_debug (stmt))
          continue;
        ++(*n_stmts);
-       opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
+       opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
+                                                       NULL, 0);
        if (!res)
          {
            if (is_gimple_call (stmt) && loop->safelen)
@@ -1804,7 +2016,7 @@ vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
 
   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
 
-  vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
+  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
   FOR_EACH_VEC_ELT (datarefs, i, dr)
     {
       gcc_assert (DR_REF (dr));
@@ -1841,56 +2053,123 @@ vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
     }
 }
 
+/* Determine if operating on full vectors for LOOP_VINFO might leave
+   some scalar iterations still to do.  If so, decide how we should
+   handle those scalar iterations.  The possibilities are:
 
-/* Decides whether we need to create an epilogue loop to handle
-   remaining scalar iterations and sets PEELING_FOR_NITERS accordingly.  */
+   (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
+       In this case:
 
-void
-determine_peel_for_niter (loop_vec_info loop_vinfo)
+        LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
+        LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
+        LOOP_VINFO_PEELING_FOR_NITER == false
+
+   (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
+       to handle the remaining scalar iterations.  In this case:
+
+        LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
+        LOOP_VINFO_PEELING_FOR_NITER == true
+
+       There are two choices:
+
+       (2a) Consider vectorizing the epilogue loop at the same VF as the
+           main loop, but using partial vectors instead of full vectors.
+           In this case:
+
+             LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
+
+       (2b) Consider vectorizing the epilogue loop at lower VFs only.
+           In this case:
+
+             LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
+
+   When FOR_EPILOGUE_P is true, make this determination based on the
+   assumption that LOOP_VINFO is an epilogue loop, otherwise make it
+   based on the assumption that LOOP_VINFO is the main loop.  The caller
+   has made sure that the number of iterations is set appropriately for
+   this value of FOR_EPILOGUE_P.  */
+
+opt_result
+vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
+                                           bool for_epilogue_p)
 {
-  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
+  /* Determine whether there would be any scalar iterations left over.  */
+  bool need_peeling_or_partial_vectors_p
+    = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
+
+  /* Decide whether to vectorize the loop with partial vectors.  */
+  LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+  LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && need_peeling_or_partial_vectors_p)
+    {
+      /* For partial-vector-usage=1, try to push the handling of partial
+        vectors to the epilogue, with the main loop continuing to operate
+        on full vectors.
+
+        ??? We could then end up failing to use partial vectors if we
+        decide to peel iterations into a prologue, and if the main loop
+        then ends up processing fewer than VF iterations.  */
+      if (param_vect_partial_vector_usage == 1
+         && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+         && !vect_known_niters_smaller_than_vf (loop_vinfo))
+       LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+      else
+       LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+    }
 
-  unsigned HOST_WIDE_INT const_vf;
-  HOST_WIDE_INT max_niter
-    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
+  if (dump_enabled_p ())
+    {
+      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "operating on partial vectors%s.\n",
+                        for_epilogue_p ? " for epilogue loop" : "");
+      else
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "operating only on full vectors%s.\n",
+                        for_epilogue_p ? " for epilogue loop" : "");
+    }
 
-  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
-  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
-    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
-                                         (loop_vinfo));
+  if (for_epilogue_p)
+    {
+      loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
+      gcc_assert (orig_loop_vinfo);
+      if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+       gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+                             LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
+    }
 
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
-    /* The main loop handles all iterations.  */
-    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
-  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-          && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
     {
-      /* Work out the (constant) number of iterations that need to be
-        peeled for reasons other than niters.  */
-      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
-      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
-       peel_niter += 1;
-      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
-                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
-       LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
+      /* Check that the loop processes at least one full vector.  */
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
+      if (known_lt (wi::to_widest (scalar_niters), vf))
+       return opt_result::failure_at (vect_location,
+                                      "loop does not have enough iterations"
+                                      " to support vectorization.\n");
+
+      /* If we need to peel an extra epilogue iteration to handle data
+        accesses with gaps, check that there are enough scalar iterations
+        available.
+
+        The check above is redundant with this one when peeling for gaps,
+        but the distinction is useful for diagnostics.  */
+      tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+         && known_lt (wi::to_widest (scalar_nitersm1), vf))
+       return opt_result::failure_at (vect_location,
+                                      "loop does not have enough iterations"
+                                      " to support peeling for gaps.\n");
     }
-  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
-          /* ??? When peeling for gaps but not alignment, we could
-             try to check whether the (variable) niters is known to be
-             VF * N + 1.  That's something of a niche case though.  */
-          || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
-          || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
-          || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
-               < (unsigned) exact_log2 (const_vf))
-              /* In case of versioning, check if the maximum number of
-                 iterations is greater than th.  If they are identical,
-                 the epilogue is unnecessary.  */
-              && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
-                  || ((unsigned HOST_WIDE_INT) max_niter
-                      > (th / const_vf) * const_vf))))
-    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
-}
 
+  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
+    = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+       && need_peeling_or_partial_vectors_p);
+
+  return opt_result::success ();
+}
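To summarize the control flow above, here is a condensed, illustrative sketch only (the enum and function name are invented for the example), for a loop whose target-side checks for partial vectors succeeded and which would otherwise leave scalar iterations to handle:

#include <stdbool.h>

/* Simplified model of how --param vect-partial-vector-usage maps to the
   flags set by vect_determine_partial_vectors_and_peeling.  */
enum pv_outcome
{
  PV_FULL_VECTORS_AND_EPILOGUE,   /* usage 0: partial vectors disabled.  */
  PV_PARTIAL_VECTOR_EPILOGUE,     /* main loop full, epilogue may be partial.  */
  PV_PARTIAL_VECTOR_MAIN_LOOP     /* the loop itself uses partial vectors.  */
};

static enum pv_outcome
partial_vector_outcome_example (int param_usage, bool is_epilogue_loop,
                                bool niters_known_smaller_than_vf)
{
  if (param_usage == 0)
    return PV_FULL_VECTORS_AND_EPILOGUE;
  if (param_usage == 1 && !is_epilogue_loop && !niters_known_smaller_than_vf)
    return PV_PARTIAL_VECTOR_EPILOGUE;
  return PV_PARTIAL_VECTOR_MAIN_LOOP;
}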
 
 /* Function vect_analyze_loop_2.
 
@@ -1971,7 +2250,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
   /* Analyze the access patterns of the data-refs in the loop (consecutive,
      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
 
-  ok = vect_analyze_data_ref_accesses (loop_vinfo);
+  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
   if (!ok)
     {
       if (dump_enabled_p ())
@@ -2043,9 +2322,16 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
 
       /* Update the vectorization factor based on the SLP decision.  */
       vect_update_vf_for_slp (loop_vinfo);
+
+      /* Optimize the SLP graph with the vectorization factor fixed.  */
+      vect_optimize_slp (loop_vinfo);
+
+      /* Gather the loads reachable from the SLP graph entries.  */
+      vect_gather_slp_loads (loop_vinfo);
     }
 
-  bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
+  bool saved_can_use_partial_vectors_p
+    = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
 
   /* We don't expect to have to roll back to anything other than an empty
      set of rgroups.  */
@@ -2093,8 +2379,6 @@ start_over:
     /* This pass will decide on using loop versioning and/or loop peeling in
        order to enhance the alignment of data references in the loop.  */
     ok = vect_enhance_data_refs_alignment (loop_vinfo);
-  else
-    ok = vect_verify_datarefs_alignment (loop_vinfo);
   if (!ok)
     return ok;
 
@@ -2111,6 +2395,79 @@ start_over:
                                       "unsupported SLP instances\n");
          goto again;
        }
+
+      /* Check whether any load in ALL SLP instances is possibly permuted.  */
+      slp_tree load_node, slp_root;
+      unsigned i, x;
+      slp_instance instance;
+      bool can_use_lanes = true;
+      FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
+       {
+         slp_root = SLP_INSTANCE_TREE (instance);
+         int group_size = SLP_TREE_LANES (slp_root);
+         tree vectype = SLP_TREE_VECTYPE (slp_root);
+         bool loads_permuted = false;
+         FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
+           {
+             if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
+               continue;
+             unsigned j;
+             stmt_vec_info load_info;
+             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
+               if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
+                 {
+                   loads_permuted = true;
+                   break;
+                 }
+           }
+
+         /* If the loads and stores can be handled with load/store-lane
+            instructions, record it and move on to the next instance.  */
+         if (loads_permuted
+             && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
+             && vect_store_lanes_supported (vectype, group_size, false))
+           {
+             FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
+               {
+                 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
+                     (SLP_TREE_SCALAR_STMTS (load_node)[0]);
+                 /* Use SLP for strided accesses (or if we can't
+                    load-lanes).  */
+                 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
+                     || ! vect_load_lanes_supported
+                           (STMT_VINFO_VECTYPE (stmt_vinfo),
+                            DR_GROUP_SIZE (stmt_vinfo), false))
+                   break;
+               }
+
+             can_use_lanes
+               = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
+
+             if (can_use_lanes && dump_enabled_p ())
+               dump_printf_loc (MSG_NOTE, vect_location,
+                                "SLP instance %p can use load/store-lanes\n",
+                                instance);
+           }
+         else
+           {
+             can_use_lanes = false;
+             break;
+           }
+       }
+
+      /* If all SLP instances can use load/store-lanes, abort SLP and try again
+        with SLP disabled.  */
+      if (can_use_lanes)
+       {
+         ok = opt_result::failure_at (vect_location,
+                                      "Built SLP cancelled: can use "
+                                      "load/store-lanes\n");
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "Built SLP cancelled: all SLP instances support "
+                            "load/store-lanes\n");
+         goto again;
+       }
     }
 
   /* Dissolve SLP-only groups.  */
@@ -2127,47 +2484,47 @@ start_over:
       return ok;
     }
 
-  /* Decide whether to use a fully-masked loop for this vectorization
-     factor.  */
-  LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
-    = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
-       && vect_verify_full_masking (loop_vinfo));
-  if (dump_enabled_p ())
-    {
-      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "using a fully-masked loop.\n");
-      else
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "not using a fully-masked loop.\n");
-    }
-
-  /* If epilog loop is required because of data accesses with gaps,
-     one additional iteration needs to be peeled.  Check if there is
-     enough iterations for vectorization.  */
-  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
-      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  /* For now, we don't expect to mix both masking and length approaches for one
+     loop; disable partial vectors if both are recorded.  */
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
+      && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
     {
-      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-      tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
-
-      if (known_lt (wi::to_widest (scalar_niters), vf))
-       return opt_result::failure_at (vect_location,
-                                      "loop has no enough iterations to"
-                                      " support peeling for gaps.\n");
-    }
-
-  /* If we're vectorizing an epilogue loop, we either need a fully-masked
-     loop or a loop that has a lower VF than the main loop.  */
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "can't vectorize a loop with partial vectors"
+                        " because we don't expect to mix different"
+                        " approaches with partial vectors for the"
+                        " same loop.\n");
+      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+    }
+
+  /* If we still have the option of using partial vectors,
+     check whether we can generate the necessary loop controls.  */
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && !vect_verify_full_masking (loop_vinfo)
+      && !vect_verify_loop_lens (loop_vinfo))
+    LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+
+  /* If we're vectorizing an epilogue loop, the vectorized loop either needs
+     to be able to handle fewer than VF scalars, or needs to have a lower VF
+     than the main loop.  */
   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
-      && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+      && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
                   LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
     return opt_result::failure_at (vect_location,
                                   "Vectorization factor too high for"
                                   " epilogue loop.\n");
 
+  /* Decide whether this loop_vinfo should use partial vectors or peeling,
+     assuming that the loop will be used as a main loop.  We will redo
+     this analysis later if we instead decide to use the loop as an
+     epilogue loop.  */
+  ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
+  if (!ok)
+    return ok;
+
   /* Check the costings of the loop make vectorizing worthwhile.  */
   res = vect_analyze_loop_costing (loop_vinfo);
   if (res < 0)
@@ -2180,7 +2537,6 @@ start_over:
     return opt_result::failure_at (vect_location,
                                   "Loop costings not worthwhile.\n");
 
-  determine_peel_for_niter (loop_vinfo);
   /* If an epilogue loop is required make sure we can create one.  */
   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
@@ -2232,7 +2588,7 @@ start_over:
        }
 
       /* Niters for at least one iteration of vectorized loop.  */
-      if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+      if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
       /* One additional iteration because of peeling for gap.  */
       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
@@ -2310,7 +2666,7 @@ again:
   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
   /* Free the SLP instances.  */
   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
-    vect_free_slp_instance (instance, false);
+    vect_free_slp_instance (instance);
   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
   /* Reset SLP type to loop_vect on all stmts.  */
   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
@@ -2336,6 +2692,8 @@ again:
       for (gimple_stmt_iterator si = gsi_start_bb (bb);
           !gsi_end_p (si); gsi_next (&si))
        {
+         if (is_gimple_debug (gsi_stmt (si)))
+           continue;
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
          STMT_SLP_TYPE (stmt_info) = loop_vect;
          if (STMT_VINFO_IN_PATTERN_P (stmt_info))
@@ -2359,13 +2717,15 @@ again:
   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
   /* Reset accumulated rgroup information.  */
-  release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
+  release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
+  release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
   /* Reset assorted flags.  */
   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
-  LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
+  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+    = saved_can_use_partial_vectors_p;
 
   goto start_over;
 }
@@ -2414,7 +2774,36 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
   poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
                             * poly_widest_int (new_vf));
   if (maybe_lt (rel_old, rel_new))
-    return false;
+    {
+      /* When old_loop_vinfo uses a variable vectorization factor,
+        we know that it has a lower cost for at least one runtime VF.
+        However, we don't know how likely that VF is.
+
+        One option would be to compare the costs for the estimated VFs.
+        The problem is that that can put too much pressure on the cost
+        model.  E.g. if the estimated VF is also the lowest possible VF,
+        and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
+        for the estimated VF, we'd then choose new_loop_vinfo even
+        though (a) new_loop_vinfo might not actually be better than
+        old_loop_vinfo for that VF and (b) it would be significantly
+        worse at larger VFs.
+
+        Here we go for a hacky compromise: pick new_loop_vinfo if it is
+        no more expensive than old_loop_vinfo even after doubling the
+        estimated old_loop_vinfo VF.  For all but trivial loops, this
+        ensures that we only pick new_loop_vinfo if it is significantly
+        better than old_loop_vinfo at the estimated VF.  */
+      if (rel_new.is_constant ())
+       return false;
+
+      HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
+      HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
+      widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
+                                     * widest_int (old_estimated_vf));
+      widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
+                                     * widest_int (new_estimated_vf));
+      return estimated_rel_new * 2 <= estimated_rel_old;
+    }
   if (known_lt (rel_new, rel_old))
     return true;
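A hedged numeric illustration of the doubling rule added above (assuming old_loop_vinfo has a variable VF with an estimate of 8): if new_loop_vinfo has vec_inside_cost 3 and an estimated VF of 16, then estimated_rel_new = 3 * 8 = 24 and estimated_rel_old = 8 * 16 = 128, so 24 * 2 <= 128 holds and new_loop_vinfo is preferred; with vec_inside_cost 10 and an estimated VF of 4 instead, 10 * 8 * 2 = 160 exceeds 8 * 4 = 32 and old_loop_vinfo is kept.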
 
@@ -2616,7 +3005,10 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
                lowest_th = ordered_min (lowest_th, th);
            }
          else
-           delete loop_vinfo;
+           {
+             delete loop_vinfo;
+             loop_vinfo = opt_loop_vec_info::success (NULL);
+           }
 
          /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
             enabled, SIMDUID is not set, it is the innermost loop and we have
@@ -2641,6 +3033,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
       else
        {
          delete loop_vinfo;
+         loop_vinfo = opt_loop_vec_info::success (NULL);
          if (fatal)
            {
              gcc_checking_assert (first_loop_vinfo == NULL);
@@ -2648,6 +3041,23 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
            }
        }
 
+      /* Handle the case in which the original loop can use partial
+        vectorization, but we only want to adopt it for the epilogue.
+        The retry should be in the same mode as the original.  */
+      if (vect_epilogues
+         && loop_vinfo
+         && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
+       {
+         gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+                     && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "***** Re-trying analysis with same vector mode"
+                            " %s for epilogue with partial vectors.\n",
+                            GET_MODE_NAME (loop_vinfo->vector_mode));
+         continue;
+       }
+
       if (mode_i < vector_modes.length ()
          && VECTOR_MODE_P (autodetected_vector_mode)
          && (related_vector_mode (vector_modes[mode_i],
@@ -2946,14 +3356,17 @@ pop:
          fail = true;
          break;
        }
-      /* Check there's only a single stmt the op is used on inside
-         of the loop.  */
+      /* Check there's only a single stmt the op is used on.  For the
+        not value-changing tail and the last stmt allow out-of-loop uses.
+        ???  We could relax this and handle arbitrary live stmts by
+        forcing a scalar epilogue for example.  */
       imm_use_iterator imm_iter;
       gimple *op_use_stmt;
       unsigned cnt = 0;
       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
        if (!is_gimple_debug (op_use_stmt)
-           && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
+           && (*code != ERROR_MARK
+               || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
          {
            /* We want to allow x + x but not x < 1 ? x : 2.  */
            if (is_gimple_assign (op_use_stmt)
@@ -3264,42 +3677,58 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
   return NULL;
 }
 
-/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
-int
-vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
-                             int *peel_iters_epilogue,
-                             stmt_vector_for_cost *scalar_cost_vec,
-                            stmt_vector_for_cost *prologue_cost_vec,
-                            stmt_vector_for_cost *epilogue_cost_vec)
+/* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
+   PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
+   or -1 if not known.  */
+
+static int
+vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
 {
-  int retval = 0;
   int assumed_vf = vect_vf_for_cost (loop_vinfo);
-
-  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
     {
-      *peel_iters_epilogue = assumed_vf / 2;
       if (dump_enabled_p ())
-        dump_printf_loc (MSG_NOTE, vect_location,
+       dump_printf_loc (MSG_NOTE, vect_location,
                         "cost model: epilogue peel iters set to vf/2 "
                         "because loop iterations are unknown .\n");
-
-      /* If peeled iterations are known but number of scalar loop
-         iterations are unknown, count a taken branch per peeled loop.  */
-      retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
-                                NULL, 0, vect_prologue);
-      retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
-                                 NULL, 0, vect_epilogue);
+      return assumed_vf / 2;
     }
   else
     {
       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
-      peel_iters_prologue = niters < peel_iters_prologue ?
-                            niters : peel_iters_prologue;
-      *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
+      peel_iters_prologue = MIN (niters, peel_iters_prologue);
+      int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
       /* If we need to peel for gaps, but no peeling is required, we have to
         peel VF iterations.  */
-      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
-       *peel_iters_epilogue = assumed_vf;
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
+       peel_iters_epilogue = assumed_vf;
+      return peel_iters_epilogue;
+    }
+}
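As a worked (illustrative) instance of the known-niters branch: with 100 known scalar iterations, peel_iters_prologue = 3 and an assumed VF of 8, the epilogue gets (100 - 3) % 8 = 1 iteration; if that remainder were 0 but the loop peels for gaps, it is bumped to a full VF of 8.  With unknown niters (or peel_iters_prologue == -1), the estimate falls back to VF / 2 = 4.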
+
+/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
+int
+vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
+                            int *peel_iters_epilogue,
+                            stmt_vector_for_cost *scalar_cost_vec,
+                            stmt_vector_for_cost *prologue_cost_vec,
+                            stmt_vector_for_cost *epilogue_cost_vec)
+{
+  int retval = 0;
+
+  *peel_iters_epilogue
+    = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
+
+  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    {
+      /* If peeled iterations are known but number of scalar loop
+        iterations are unknown, count a taken branch per peeled loop.  */
+      if (peel_iters_prologue > 0)
+       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
+                                  NULL, NULL_TREE, 0, vect_prologue);
+      if (*peel_iters_epilogue > 0)
+       retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
+                                   NULL, NULL_TREE, 0, vect_epilogue);
     }
 
   stmt_info_for_cost *si;
@@ -3368,8 +3797,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
     {
       /*  FIXME: Make cost depend on complexity of individual check.  */
       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
-      (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
-                           vect_prologue);
+      (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
+                           NULL, NULL_TREE, 0, vect_prologue);
       if (dump_enabled_p ())
        dump_printf (MSG_NOTE,
                     "cost model: Adding cost of checks for loop "
@@ -3381,13 +3810,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
     {
       /*  FIXME: Make cost depend on complexity of individual check.  */
       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
-      (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
-                           vect_prologue);
+      (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
+                           NULL, NULL_TREE, 0, vect_prologue);
       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
       if (len)
        /* Count LEN - 1 ANDs and LEN comparisons.  */
-       (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
-                             NULL, 0, vect_prologue);
+       (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
+                             scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
       if (len)
        {
@@ -3397,8 +3826,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
          for (unsigned int i = 0; i < len; ++i)
            if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
              nstmts += 1;
-         (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
-                               NULL, 0, vect_prologue);
+         (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
+                               scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
        }
       if (dump_enabled_p ())
        dump_printf (MSG_NOTE,
@@ -3410,8 +3839,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
     {
       /*  FIXME: Make cost depend on complexity of individual check.  */
-      (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
-                           vect_prologue);
+      (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
+                           NULL, NULL_TREE, 0, vect_prologue);
       if (dump_enabled_p ())
        dump_printf (MSG_NOTE,
                     "cost model: Adding cost of checks for loop "
@@ -3419,8 +3848,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
     }
 
   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
-    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
-                         vect_prologue);
+    (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
+                         NULL, NULL_TREE, 0, vect_prologue);
 
   /* Count statements in scalar loop.  Using this as scalar cost for a single
      iteration for now.
@@ -3442,30 +3871,116 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
      TODO: Build an expression that represents peel_iters for prologue and
      epilogue to be used in a run-time test.  */
 
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  bool prologue_need_br_taken_cost = false;
+  bool prologue_need_br_not_taken_cost = false;
+
+  /* Calculate peel_iters_prologue.  */
+  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
+    peel_iters_prologue = 0;
+  else if (npeel < 0)
     {
-      peel_iters_prologue = 0;
-      peel_iters_epilogue = 0;
+      peel_iters_prologue = assumed_vf / 2;
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE, "cost model: "
+                    "prologue peel iters set to vf/2.\n");
 
-      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
-       {
-         /* We need to peel exactly one iteration.  */
-         peel_iters_epilogue += 1;
-         stmt_info_for_cost *si;
-         int j;
-         FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
-                           j, si)
-           (void) add_stmt_cost (target_cost_data, si->count,
-                                 si->kind, si->stmt_info, si->misalign,
-                                 vect_epilogue);
-       }
+      /* If the number of peeled iterations is unknown, count a taken and a
+         not-taken branch per peeled loop.  Even if the scalar iteration
+         count is known, the vector iteration count is not, since the peeled
+         prologue iteration count is not known.  Hence the guards remain.  */
+      prologue_need_br_taken_cost = true;
+      prologue_need_br_not_taken_cost = true;
+    }
+  else
+    {
+      peel_iters_prologue = npeel;
+      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
+        /* If the peeled iteration count is known but the scalar loop
+           iteration count is unknown, count a taken branch per peeled loop.  */
+       prologue_need_br_taken_cost = true;
+    }
+
+  bool epilogue_need_br_taken_cost = false;
+  bool epilogue_need_br_not_taken_cost = false;
+
+  /* Calculate peel_iters_epilogue.  */
+  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+    /* We need to peel exactly one iteration for gaps.  */
+    peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
+  else if (npeel < 0)
+    {
+      /* If the amount of peeling for alignment is unknown, the loop bound
+         of the main loop becomes unknown.  */
+      peel_iters_epilogue = assumed_vf / 2;
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE, "cost model: "
+                    "epilogue peel iters set to vf/2 because "
+                    "peeling for alignment is unknown.\n");
+
+      /* See the same reasoning above for the peel_iters_prologue case.  */
+      epilogue_need_br_taken_cost = true;
+      epilogue_need_br_not_taken_cost = true;
+    }
+  else
+    {
+      peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
+      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
+        /* If the peeled iteration count is known but the scalar loop
+           iteration count is unknown, count a taken branch per peeled loop.  */
+       epilogue_need_br_taken_cost = true;
+    }
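
As a quick reference for the peel-iteration logic above, here is a minimal standalone sketch (plain C, not GCC code): vf stands for assumed_vf, the booleans stand for vect_use_loop_mask_for_alignment_p, LOOP_VINFO_USING_PARTIAL_VECTORS_P, LOOP_VINFO_PEELING_FOR_GAPS and LOOP_VINFO_NITERS_KNOWN_P, and the call to vect_get_peel_iters_epilogue is replaced by a vf/2 placeholder.

#include <stdbool.h>

struct peel_estimate
{
  int prologue_iters, epilogue_iters;
  bool prologue_branch_cost, epilogue_branch_cost;
};

static struct peel_estimate
estimate_peel_iters (int vf, int npeel, bool mask_for_align,
                     bool partial_vectors, bool peel_for_gaps,
                     bool niters_known)
{
  struct peel_estimate e = { 0, 0, false, false };

  /* Prologue: nothing to peel when alignment is handled by masking,
     vf/2 when the peel amount is unknown, otherwise exactly npeel.  */
  if (mask_for_align)
    e.prologue_iters = 0;
  else if (npeel < 0)
    {
      e.prologue_iters = vf / 2;
      e.prologue_branch_cost = true;      /* guard branches remain */
    }
  else
    {
      e.prologue_iters = npeel;
      e.prologue_branch_cost = !niters_known && npeel > 0;
    }

  /* Epilogue: with partial vectors only the gap iteration remains,
     otherwise mirror the prologue logic (the real code calls
     vect_get_peel_iters_epilogue instead of using vf/2).  */
  if (partial_vectors)
    e.epilogue_iters = peel_for_gaps ? 1 : 0;
  else if (npeel < 0)
    {
      e.epilogue_iters = vf / 2;
      e.epilogue_branch_cost = true;
    }
  else
    {
      e.epilogue_iters = vf / 2;          /* placeholder */
      e.epilogue_branch_cost = !niters_known && e.epilogue_iters > 0;
    }
  return e;
}
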
+
+  stmt_info_for_cost *si;
+  int j;
+  /* Add costs associated with peel_iters_prologue.  */
+  if (peel_iters_prologue)
+    FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
+      {
+       (void) add_stmt_cost (loop_vinfo, target_cost_data,
+                             si->count * peel_iters_prologue, si->kind,
+                             si->stmt_info, si->vectype, si->misalign,
+                             vect_prologue);
+      }
+
+  /* Add costs associated with peel_iters_epilogue.  */
+  if (peel_iters_epilogue)
+    FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
+      {
+       (void) add_stmt_cost (loop_vinfo, target_cost_data,
+                             si->count * peel_iters_epilogue, si->kind,
+                             si->stmt_info, si->vectype, si->misalign,
+                             vect_epilogue);
+      }
+
+  /* Add possible cond_branch_taken/cond_branch_not_taken costs.  */
 
+  if (prologue_need_br_taken_cost)
+    (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
+                         NULL, NULL_TREE, 0, vect_prologue);
+
+  if (prologue_need_br_not_taken_cost)
+    (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
+                         cond_branch_not_taken, NULL, NULL_TREE, 0,
+                         vect_prologue);
+
+  if (epilogue_need_br_taken_cost)
+    (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
+                         NULL, NULL_TREE, 0, vect_epilogue);
+
+  if (epilogue_need_br_not_taken_cost)
+    (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
+                         cond_branch_not_taken, NULL, NULL_TREE, 0,
+                         vect_epilogue);
+
+  /* Take care of special costs for rgroup controls of partial vectors.  */
+  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    {
       /* Calculate how many masks we need to generate.  */
       unsigned int num_masks = 0;
-      rgroup_masks *rgm;
+      rgroup_controls *rgm;
       unsigned int num_vectors_m1;
       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
-       if (rgm->mask_type)
+       if (rgm->type)
          num_masks += num_vectors_m1 + 1;
       gcc_assert (num_masks > 0);
 
@@ -3481,80 +3996,62 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
         simpler and safer to use the worst-case cost; if this ends up
         being the tie-breaker between vectorizing or not, then it's
         probably better not to vectorize.  */
-      (void) add_stmt_cost (target_cost_data, num_masks, vector_stmt,
-                           NULL, 0, vect_prologue);
-      (void) add_stmt_cost (target_cost_data, num_masks - 1, vector_stmt,
-                           NULL, 0, vect_body);
+      (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
+                           vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
+      (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
+                           vector_stmt, NULL, NULL_TREE, 0, vect_body);
     }
-  else if (npeel < 0)
+  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
     {
-      peel_iters_prologue = assumed_vf / 2;
-      if (dump_enabled_p ())
-       dump_printf (MSG_NOTE, "cost model: "
-                    "prologue peel iters set to vf/2.\n");
+      /* Referring to the functions vect_set_loop_condition_partial_vectors
+         and vect_set_loop_controls_directly, we need to generate each
+         length in the prologue and, if required, in the loop body.
+         Although some optimizations are possible, we consider the worst
+         case here.  */
 
-      /* If peeling for alignment is unknown, loop bound of main loop becomes
-         unknown.  */
-      peel_iters_epilogue = assumed_vf / 2;
-      if (dump_enabled_p ())
-       dump_printf (MSG_NOTE, "cost model: "
-                    "epilogue peel iters set to vf/2 because "
-                    "peeling for alignment is unknown.\n");
+      bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
+      bool need_iterate_p
+       = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+          && !vect_known_niters_smaller_than_vf (loop_vinfo));
 
-      /* If peeled iterations are unknown, count a taken branch and a not taken
-         branch per peeled loop. Even if scalar loop iterations are known,
-         vector iterations are not known since peeled prologue iterations are
-         not known. Hence guards remain the same.  */
-      (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
-                           NULL, 0, vect_prologue);
-      (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
-                           NULL, 0, vect_prologue);
-      (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
-                           NULL, 0, vect_epilogue);
-      (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
-                           NULL, 0, vect_epilogue);
-      stmt_info_for_cost *si;
-      int j;
-      FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
-       {
-         (void) add_stmt_cost (target_cost_data,
-                               si->count * peel_iters_prologue,
-                               si->kind, si->stmt_info, si->misalign,
-                               vect_prologue);
-         (void) add_stmt_cost (target_cost_data,
-                               si->count * peel_iters_epilogue,
-                               si->kind, si->stmt_info, si->misalign,
-                               vect_epilogue);
-       }
-    }
-  else
-    {
-      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
-      stmt_info_for_cost *si;
-      int j;
-      void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
+      /* Calculate how many statements need to be added.  */
+      unsigned int prologue_stmts = 0;
+      unsigned int body_stmts = 0;
 
-      prologue_cost_vec.create (2);
-      epilogue_cost_vec.create (2);
-      peel_iters_prologue = npeel;
+      rgroup_controls *rgc;
+      unsigned int num_vectors_m1;
+      FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
+       if (rgc->type)
+         {
+           /* May need one SHIFT for nitems_total computation.  */
+           unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
+           if (nitems != 1 && !niters_known_p)
+             prologue_stmts += 1;
+
+           /* May need one MAX and one MINUS for wrap around.  */
+           if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
+             prologue_stmts += 2;
 
-      (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
-                                         &peel_iters_epilogue,
-                                         &LOOP_VINFO_SCALAR_ITERATION_COST
-                                           (loop_vinfo),
-                                         &prologue_cost_vec,
-                                         &epilogue_cost_vec);
+           /* Need one MAX and one MINUS for each batch limit except for
+              the first one.  */
+           prologue_stmts += num_vectors_m1 * 2;
 
-      FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
-       (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
-                             si->misalign, vect_prologue);
+           unsigned int num_vectors = num_vectors_m1 + 1;
 
-      FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
-       (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
-                             si->misalign, vect_epilogue);
+           /* Need to set up lengths in the prologue; only one MIN is
+              required for each since the start index is zero.  */
+           prologue_stmts += num_vectors;
+
+           /* Each may need two MINs and one MINUS to update the lengths
+              in the body for the next iteration.  */
+           if (need_iterate_p)
+             body_stmts += 3 * num_vectors;
+         }
 
-      prologue_cost_vec.release ();
-      epilogue_cost_vec.release ();
+      (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
+                           scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
+      (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
+                           scalar_stmt, NULL, NULL_TREE, 0, vect_body);
     }
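
To make the statement counting in the length-based branch above easier to follow, here is a standalone sketch (plain C, not GCC code) of the same bookkeeping; the struct fields mirror rgroup_controls, might_wrap stands for vect_rgroup_iv_might_wrap_p, and num_vectors corresponds to num_vectors_m1 + 1 in the loop above.

#include <stdbool.h>

struct rgroup
{
  unsigned max_nscalars_per_iter;
  unsigned factor;
  unsigned num_vectors;   /* num_vectors_m1 + 1 in the loop above */
  bool has_type;          /* rgc->type != NULL_TREE */
  bool might_wrap;        /* vect_rgroup_iv_might_wrap_p (...) */
};

static void
count_length_setup_stmts (const struct rgroup *rgcs, unsigned n_rgroups,
                          bool niters_known, bool need_iterate,
                          unsigned *prologue_stmts, unsigned *body_stmts)
{
  *prologue_stmts = 0;
  *body_stmts = 0;
  for (unsigned i = 0; i < n_rgroups; i++)
    {
      if (!rgcs[i].has_type)
        continue;
      unsigned nitems = rgcs[i].max_nscalars_per_iter * rgcs[i].factor;
      unsigned num_vectors = rgcs[i].num_vectors;
      if (nitems != 1 && !niters_known)
        *prologue_stmts += 1;                   /* SHIFT for nitems_total */
      if (rgcs[i].might_wrap)
        *prologue_stmts += 2;                   /* MAX + MINUS for wrap-around */
      *prologue_stmts += (num_vectors - 1) * 2; /* per extra batch limit */
      *prologue_stmts += num_vectors;           /* one MIN per initial length */
      if (need_iterate)
        *body_stmts += 3 * num_vectors;         /* two MINs + one MINUS each */
    }
}
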
 
   /* FORNOW: The scalar outside cost is incremented in one of the
@@ -3690,8 +4187,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
     }
 
   /* ??? The "if" arm is written to handle all cases; see below for what
-     we would do for !LOOP_VINFO_FULLY_MASKED_P.  */
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+     we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
+  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
     {
       /* Rewriting the condition above in terms of the number of
         vector iterations (vniters) rather than the number of
@@ -3718,7 +4215,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
        dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
                     min_vec_niters);
 
-      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        {
          /* Now that we know the minimum number of vector iterations,
             find the minimum niters for which the scalar cost is larger:
@@ -3769,10 +4266,14 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
                 "  Calculated minimum iters for profitability: %d\n",
                 min_profitable_iters);
 
-  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
     /* We want the vectorized loop to execute at least once.  */
     min_profitable_iters = assumed_vf + peel_iters_prologue;
+  else if (min_profitable_iters < peel_iters_prologue)
+    /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
+       vectorized loop executes at least once.  */
+    min_profitable_iters = peel_iters_prologue;
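
For orientation only, a rough standalone model (plain C, not the exact computation in this function, which also accounts for the scalar outside cost) of the break-even arithmetic behind min_profitable_iters: sic, vic and voc are stand-ins for the scalar per-iteration, vector per-iteration and vector outside costs.

#include <limits.h>

/* Smallest scalar iteration count for which the vector loop wins, under
   the simplifying assumption that the peeled iterations run scalarly
   and everything else is folded into voc.  */
static int
rough_min_profitable_niters (int sic, int vic, int voc, int vf, int peel)
{
  /* Want  sic * n > voc + sic * peel + vic * (n - peel) / vf.
     With m = (n - peel) / vf vector iterations this becomes
     m * (sic * vf - vic) > voc.  */
  int saving_per_viter = sic * vf - vic;
  if (saving_per_viter <= 0)
    return INT_MAX;                    /* never profitable */
  int min_vec_niters = voc / saving_per_viter + 1;
  return min_vec_niters * vf + peel;   /* back to scalar iterations */
}
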
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
@@ -3790,7 +4291,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 
   if (vec_outside_cost <= 0)
     min_profitable_estimate = 0;
-  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  /* ??? This "else if" arm is written to handle all cases; see below for
+     what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
+  else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
     {
       /* This is a repeat of the code above, but with + SOC rather
         than - SOC.  */
@@ -3802,7 +4305,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
       if (outside_overhead > 0)
        min_vec_niters = outside_overhead / saving_per_viter + 1;
 
-      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        {
          int threshold = (vec_inside_cost * min_vec_niters
                           + vec_outside_cost
@@ -3881,7 +4384,8 @@ have_whole_vector_shift (machine_mode mode)
    the loop, and the epilogue code that must be generated.  */
 
 static void
-vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
+vect_model_reduction_cost (loop_vec_info loop_vinfo,
+                          stmt_vec_info stmt_info, internal_fn reduc_fn,
                           vect_reduction_type reduction_type,
                           int ncopies, stmt_vector_for_cost *cost_vec)
 {
@@ -3890,7 +4394,6 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
   optab optab;
   tree vectype;
   machine_mode mode;
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   class loop *loop = NULL;
 
   if (loop_vinfo)
@@ -4046,34 +4549,6 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
 }
 
 
-/* Function vect_model_induction_cost.
-
-   Models cost for induction operations.  */
-
-static void
-vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
-                          stmt_vector_for_cost *cost_vec)
-{
-  unsigned inside_cost, prologue_cost;
-
-  if (PURE_SLP_STMT (stmt_info))
-    return;
-
-  /* loop cost for vec_loop.  */
-  inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
-                                 stmt_info, 0, vect_body);
-
-  /* prologue cost for vec_init and vec_step.  */
-  prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
-                                   stmt_info, 0, vect_prologue);
-
-  if (dump_enabled_p ())
-    dump_printf_loc (MSG_NOTE, vect_location,
-                     "vect_model_induction_cost: inside_cost = %d, "
-                     "prologue_cost = %d .\n", inside_cost, prologue_cost);
-}
-
-
 
 /* Function get_initial_def_for_reduction
 
@@ -4119,11 +4594,11 @@ vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
    A cost model should help decide between these two schemes.  */
 
 static tree
-get_initial_def_for_reduction (stmt_vec_info stmt_vinfo,
+get_initial_def_for_reduction (loop_vec_info loop_vinfo,
+                              stmt_vec_info stmt_vinfo,
                               enum tree_code code, tree init_val,
                                tree *adjustment_def)
 {
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   tree scalar_type = TREE_TYPE (init_val);
   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
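
As a scalar illustration (plain C, not GCC code) of the adjustment scheme that the adjustment_def output parameter models: the accumulator can start from the neutral element, and the original initial value is folded back in after the loop, in the epilogue.

/* Adjustment scheme for a PLUS_EXPR reduction with initial value INIT:
   start from the neutral element (0) and apply INIT once afterwards.  */
static int
sum_with_adjustment (const int *x, int n, int init)
{
  int acc = 0;                  /* neutral element for addition */
  for (int i = 0; i < n; i++)
    acc += x[i];
  return acc + init;            /* adjustment applied in the epilogue */
}
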
@@ -4223,14 +4698,14 @@ get_initial_def_for_reduction (stmt_vec_info stmt_vinfo,
    value will not change the result.  */
 
 static void
-get_initial_defs_for_reduction (slp_tree slp_node,
+get_initial_defs_for_reduction (vec_info *vinfo,
+                               slp_tree slp_node,
                                vec<tree> *vec_oprnds,
                                unsigned int number_of_vectors,
                                bool reduc_chain, tree neutral_op)
 {
   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
   stmt_vec_info stmt_vinfo = stmts[0];
-  vec_info *vinfo = stmt_vinfo->vinfo;
   unsigned HOST_WIDE_INT nunits;
   unsigned j, number_of_places_left_in_vector;
   tree vector_type;
@@ -4343,11 +4818,12 @@ get_initial_defs_for_reduction (slp_tree slp_node,
    the stmt_vec_info the meta information is stored on.  */
 
 stmt_vec_info
-info_for_reduction (stmt_vec_info stmt_info)
+info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
 {
   stmt_info = vect_orig_stmt (stmt_info);
   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
-  if (!is_a <gphi *> (stmt_info->stmt))
+  if (!is_a <gphi *> (stmt_info->stmt)
+      || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
   gphi *phi = as_a <gphi *> (stmt_info->stmt);
   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
@@ -4359,7 +4835,7 @@ info_for_reduction (stmt_vec_info stmt_info)
     {
       edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
       stmt_vec_info info
-         = stmt_info->vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
+         = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
        stmt_info = info;
     }
@@ -4414,13 +4890,13 @@ info_for_reduction (stmt_vec_info stmt_info)
 */
 
 static void
-vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
+vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
+                                 stmt_vec_info stmt_info,
                                  slp_tree slp_node,
                                  slp_instance slp_node_instance)
 {
-  stmt_vec_info reduc_info = info_for_reduction (stmt_info);
+  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
   gcc_assert (reduc_info->is_reduc_info);
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   /* For double reductions we need to get at the inner loop reduction
      stmt which has the meta info attached.  Our stmt_info is that of the
      loop-closed PHI of the inner loop which we remember as
@@ -4439,7 +4915,6 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
   enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
-  stmt_vec_info prev_phi_info;
   tree vectype;
   machine_mode mode;
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
@@ -4447,7 +4922,6 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
   tree scalar_dest;
   tree scalar_type;
   gimple *new_phi = NULL, *phi;
-  stmt_vec_info phi_info;
   gimple_stmt_iterator exit_gsi;
   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
   gimple *epilog_stmt = NULL;
@@ -4470,7 +4944,7 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
   tree induction_index = NULL_TREE;
 
   if (slp_node)
-    group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 
+    group_size = SLP_TREE_LANES (slp_node);
 
   if (nested_in_vect_loop_p (loop, stmt_info))
     {
@@ -4517,15 +4991,9 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
     }
   else
     {
+      stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
       vec_num = 1;
-      ncopies = 0;
-      phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
-      do
-       {
-         ncopies++;
-         phi_info = STMT_VINFO_RELATED_STMT (phi_info);
-       }
-      while (phi_info);
+      ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
     }
 
   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
@@ -4547,7 +5015,7 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
        {
          if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
            {
-             gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt;
+             gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
              gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
              ccompares.safe_push
                (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
@@ -4598,7 +5066,6 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
       /* Create a vector phi node.  */
       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
       new_phi = create_phi_node (new_phi_tree, loop->header);
-      loop_vinfo->add_stmt (new_phi);
       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
                   loop_preheader_edge (loop), UNKNOWN_LOCATION);
 
@@ -4625,9 +5092,6 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
                                         new_phi_tree, indx_before_incr);
        }
       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
-      stmt_vec_info index_vec_info
-       = loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (new_phi_tree));
-      STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
 
       /* Update the phi with the vec cond.  */
       induction_index = new_phi_tree;
@@ -4668,29 +5132,26 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
   if (double_reduc)
     loop = outer_loop;
   exit_bb = single_exit (loop)->dest;
-  prev_phi_info = NULL;
   new_phis.create (slp_node ? vec_num : ncopies);
   for (unsigned i = 0; i < vec_num; i++)
     {
       if (slp_node)
-       def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
+       def = vect_get_slp_vect_def (slp_node, i);
       else
-       def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
+       def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
       for (j = 0; j < ncopies; j++)
         {
          tree new_def = copy_ssa_name (def);
           phi = create_phi_node (new_def, exit_bb);
-         stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
           if (j == 0)
             new_phis.quick_push (phi);
           else
            {
-             def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
-             STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
+             def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
+             new_phis.quick_push (phi);
            }
 
           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
-         prev_phi_info = phi_info;
         }
     }
 
@@ -4761,15 +5222,12 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
   /* Likewise if we couldn't use a single defuse cycle.  */
   else if (ncopies > 1)
     {
-      gcc_assert (new_phis.length () == 1);
       gimple_seq stmts = NULL;
       tree first_vect = PHI_RESULT (new_phis[0]);
       first_vect = gimple_convert (&stmts, vectype, first_vect);
-      stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
       for (int k = 1; k < ncopies; ++k)
        {
-         next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
-         tree second_vect = PHI_RESULT (next_phi_info->stmt);
+         tree second_vect = PHI_RESULT (new_phis[k]);
          second_vect = gimple_convert (&stmts, vectype, second_vect);
          first_vect = gimple_build (&stmts, code, vectype,
                                     first_vect, second_vect);
@@ -5413,10 +5871,6 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
       if (nested_in_vect_loop)
         {
-         stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
-         STMT_VINFO_RELATED_STMT (epilog_stmt_info)
-           = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
-
           if (!double_reduc)
             scalar_results.quick_push (new_temp);
           else
@@ -5630,18 +6084,17 @@ get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
    that should be used to control the operation in a fully-masked loop.  */
 
 static bool
-vectorize_fold_left_reduction (stmt_vec_info stmt_info,
+vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
+                              stmt_vec_info stmt_info,
                               gimple_stmt_iterator *gsi,
-                              stmt_vec_info *vec_stmt, slp_tree slp_node,
+                              gimple **vec_stmt, slp_tree slp_node,
                               gimple *reduc_def_stmt,
                               tree_code code, internal_fn reduc_fn,
                               tree ops[3], tree vectype_in,
                               int reduc_index, vec_loop_masks *masks)
 {
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
-  stmt_vec_info new_stmt_info = NULL;
   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
 
   int ncopies;
@@ -5666,7 +6119,7 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info,
   if (slp_node)
     {
       auto_vec<vec<tree> > vec_defs (2);
-      vect_get_slp_defs (slp_node, &vec_defs);
+      vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
       vec_defs[0].release ();
       vec_defs[1].release ();
@@ -5675,9 +6128,8 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info,
     }
   else
     {
-      tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
-      vec_oprnds0.create (1);
-      vec_oprnds0.quick_push (loop_vec_def0);
+      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+                                    op0, &vec_oprnds0);
       scalar_dest_def_info = stmt_info;
     }
 
@@ -5753,20 +6205,24 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info,
       if (i == vec_num - 1)
        {
          gimple_set_lhs (new_stmt, scalar_dest);
-         new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
-                                                   new_stmt);
+         vect_finish_replace_stmt (loop_vinfo,
+                                   scalar_dest_def_info,
+                                   new_stmt);
        }
       else
-       new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
-                                                    new_stmt, gsi);
+       vect_finish_stmt_generation (loop_vinfo,
+                                    scalar_dest_def_info,
+                                    new_stmt, gsi);
 
       if (slp_node)
-       SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
+       SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
+      else
+       {
+         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+         *vec_stmt = new_stmt;
+       }
     }
 
-  if (!slp_node)
-    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
-
   return true;
 }
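
Since the function above implements FOLD_LEFT_REDUCTION, here is a short scalar illustration (plain C, not GCC code) of why the in-order scheme matters: for floating point, the two loops below can produce different results, so the vectorizer must keep the original association order unless reassociation is permitted.

/* (a) What the scalar loop computes: a strictly in-order sum.  */
static float
sum_in_order (const float *x, int n)
{
  float s = 0.0f;
  for (int i = 0; i < n; i++)
    s += x[i];
  return s;
}

/* (b) A reassociated sum with two partial accumulators, the shape a
   normal vector reduction would use.  */
static float
sum_reassociated (const float *x, int n)
{
  float s0 = 0.0f, s1 = 0.0f;
  for (int i = 0; i + 1 < n; i += 2)
    {
      s0 += x[i];
      s1 += x[i + 1];
    }
  if (n & 1)
    s0 += x[n - 1];
  return s0 + s1;               /* combined only at the end */
}
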
 
@@ -5924,13 +6380,13 @@ build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
    does *NOT* necessarily hold for reduction patterns.  */
 
 bool
-vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
+vectorizable_reduction (loop_vec_info loop_vinfo,
+                       stmt_vec_info stmt_info, slp_tree slp_node,
                        slp_instance slp_node_instance,
                        stmt_vector_for_cost *cost_vec)
 {
   tree scalar_dest;
   tree vectype_in = NULL_TREE;
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
   stmt_vec_info cond_stmt_vinfo = NULL;
@@ -5952,15 +6408,34 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
     return false;
 
   /* The stmt we store reduction analysis meta on.  */
-  stmt_vec_info reduc_info = info_for_reduction (stmt_info);
+  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
   reduc_info->is_reduc_info = true;
 
   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
     {
       if (is_a <gphi *> (stmt_info->stmt))
-       /* Analysis for double-reduction is done on the outer
-          loop PHI, nested cycles have no further restrictions.  */
-       STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
+       {
+         if (slp_node)
+           {
+             /* We eventually need to set a vector type on invariant
+                arguments.  */
+             unsigned j;
+             slp_tree child;
+             FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
+               if (!vect_maybe_update_slp_op_vectype
+                      (child, SLP_TREE_VECTYPE (slp_node)))
+                 {
+                   if (dump_enabled_p ())
+                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                      "incompatible vector types for "
+                                      "invariants\n");
+                   return false;
+                 }
+           }
+         /* Analysis for double-reduction is done on the outer
+            loop PHI, nested cycles have no further restrictions.  */
+         STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
+       }
       else
        STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
       return true;
@@ -6002,12 +6477,16 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
 
   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
-     and compute the reduction chain length.  */
-  tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
-                                         loop_latch_edge (loop));
+     and compute the reduction chain length.  Discover the real
+     reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
+  tree reduc_def
+    = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
+                            loop_latch_edge
+                              (gimple_bb (reduc_def_phi)->loop_father));
   unsigned reduc_chain_length = 0;
   bool only_slp_reduc_chain = true;
   stmt_info = NULL;
+  slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
   while (reduc_def != PHI_RESULT (reduc_def_phi))
     {
       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
@@ -6050,6 +6529,8 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
        stmt_info = vdef;
       reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
       reduc_chain_length++;
+      if (!stmt_info && slp_node)
+       slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
     }
   /* PHIs should not participate in patterns.  */
   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
@@ -6136,17 +6617,24 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
      The last use is the reduction variable.  In case of nested cycle this
      assumption is not true: we use reduc_index to record the index of the
      reduction variable.  */
-  reduc_def = PHI_RESULT (reduc_def_phi);
+  slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
+  /* We need to skip an extra operand for COND_EXPRs with embedded
+     comparison.  */
+  unsigned opno_adjust = 0;
+  if (code == COND_EXPR
+      && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
+    opno_adjust = 1;
   for (i = 0; i < op_type; i++)
     {
-      tree op = gimple_op (stmt, i + 1);
       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
       if (i == 0 && code == COND_EXPR)
         continue;
 
       stmt_vec_info def_stmt_info;
       enum vect_def_type dt;
-      if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem,
+      tree op;
+      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
+                              i + opno_adjust, &op, &slp_op[i], &dt, &tem,
                               &def_stmt_info))
        {
          if (dump_enabled_p ())
@@ -6527,7 +7015,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
         which each SLP statement has its own initial value and in which
         that value needs to be repeated for every instance of the
         statement within the initial vector.  */
-      unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
+      unsigned int group_size = SLP_TREE_LANES (slp_node);
       if (!neutral_op
          && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
                                              TREE_TYPE (vectype_out)))
@@ -6680,13 +7168,28 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
       return false;
     }
 
+  if (slp_node
+      && !(!single_defuse_cycle
+          && code != DOT_PROD_EXPR
+          && code != WIDEN_SUM_EXPR
+          && code != SAD_EXPR
+          && reduction_type != FOLD_LEFT_REDUCTION))
+    for (i = 0; i < op_type; i++)
+      if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "incompatible vector types for invariants\n");
+         return false;
+       }
+
   if (slp_node)
     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
   else
     vec_num = 1;
 
-  vect_model_reduction_cost (stmt_info, reduc_fn, reduction_type, ncopies,
-                            cost_vec);
+  vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
+                            reduction_type, ncopies, cost_vec);
   if (dump_enabled_p ()
       && reduction_type == FOLD_LEFT_REDUCTION)
     dump_printf_loc (MSG_NOTE, vect_location,
@@ -6710,7 +7213,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
     }
-  else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
+  else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
     {
       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
       internal_fn cond_fn = get_conditional_internal_fn (code);
@@ -6723,9 +7226,9 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "can't use a fully-masked loop because no"
-                            " conditional operation is available.\n");
-         LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+                            "can't operate on partial vectors because"
+                            " no conditional operation is available.\n");
+         LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
        }
       else if (reduction_type == FOLD_LEFT_REDUCTION
               && reduc_fn == IFN_LAST
@@ -6735,9 +7238,9 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "can't use a fully-masked loop because no"
-                            " conditional operation is available.\n");
-         LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+                            "can't operate on partial vectors because"
+                            " no conditional operation is available.\n");
+         LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
        }
       else
        vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
@@ -6750,18 +7253,17 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
    value.  */
 
 bool
-vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
-                         stmt_vec_info *vec_stmt, slp_tree slp_node)
+vect_transform_reduction (loop_vec_info loop_vinfo,
+                         stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
+                         gimple **vec_stmt, slp_tree slp_node)
 {
   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   int i;
   int ncopies;
-  int j;
   int vec_num;
 
-  stmt_vec_info reduc_info = info_for_reduction (stmt_info);
+  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
   gcc_assert (reduc_info->is_reduc_info);
 
   if (nested_in_vect_loop_p (loop, stmt_info))
@@ -6814,8 +7316,6 @@ vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
 
   /* Transform.  */
-  stmt_vec_info new_stmt_info = NULL;
-  stmt_vec_info prev_stmt_info;
   tree new_temp = NULL_TREE;
   auto_vec<tree> vec_oprnds0;
   auto_vec<tree> vec_oprnds1;
@@ -6836,7 +7336,7 @@ vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
     {
       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
       return vectorize_fold_left_reduction
-         (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
+         (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
           reduc_fn, ops, vectype_in, reduc_index, masks);
     }
 
@@ -6850,137 +7350,83 @@ vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   tree scalar_dest = gimple_assign_lhs (stmt);
   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
-  prev_stmt_info = NULL;
-  if (!slp_node)
+  vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
+                    single_defuse_cycle && reduc_index == 0
+                    ? NULL_TREE : ops[0], &vec_oprnds0,
+                    single_defuse_cycle && reduc_index == 1
+                    ? NULL_TREE : ops[1], &vec_oprnds1,
+                    op_type == ternary_op
+                    && !(single_defuse_cycle && reduc_index == 2)
+                    ? ops[2] : NULL_TREE, &vec_oprnds2);
+  if (single_defuse_cycle)
     {
-      vec_oprnds0.create (1);
-      vec_oprnds1.create (1);
-      if (op_type == ternary_op)
-        vec_oprnds2.create (1);
+      gcc_assert (!slp_node);
+      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+                                    ops[reduc_index],
+                                    reduc_index == 0 ? &vec_oprnds0
+                                    : (reduc_index == 1 ? &vec_oprnds1
+                                       : &vec_oprnds2));
     }
 
-  for (j = 0; j < ncopies; j++)
+  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
     {
-      /* Handle uses.  */
-      if (j == 0)
-        {
-         if (slp_node)
-           {
-             /* Get vec defs for all the operands except the reduction index,
-                ensuring the ordering of the ops in the vector is kept.  */
-             auto_vec<vec<tree>, 3> vec_defs;
-             vect_get_slp_defs (slp_node, &vec_defs);
-             vec_oprnds0.safe_splice (vec_defs[0]);
-             vec_defs[0].release ();
-             vec_oprnds1.safe_splice (vec_defs[1]);
-             vec_defs[1].release ();
-             if (op_type == ternary_op)
-               {
-                 vec_oprnds2.safe_splice (vec_defs[2]);
-                 vec_defs[2].release ();
-               }
-           }
-          else
+      gimple *new_stmt;
+      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
+      if (masked_loop_p && !mask_by_cond_expr)
+       {
+         /* Make sure that the reduction accumulator is vop[0].  */
+         if (reduc_index == 1)
            {
-              vec_oprnds0.quick_push
-               (vect_get_vec_def_for_operand (ops[0], stmt_info));
-              vec_oprnds1.quick_push
-               (vect_get_vec_def_for_operand (ops[1], stmt_info));
-              if (op_type == ternary_op)
-               vec_oprnds2.quick_push 
-                 (vect_get_vec_def_for_operand (ops[2], stmt_info));
+             gcc_assert (commutative_tree_code (code));
+             std::swap (vop[0], vop[1]);
            }
-        }
+         tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
+                                         vectype_in, i);
+         gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
+                                                   vop[0], vop[1], vop[0]);
+         new_temp = make_ssa_name (vec_dest, call);
+         gimple_call_set_lhs (call, new_temp);
+         gimple_call_set_nothrow (call, true);
+         vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
+         new_stmt = call;
+       }
       else
-        {
-          if (!slp_node)
-            {
-             gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
-
-             if (single_defuse_cycle && reduc_index == 0)
-               vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
-             else
-               vec_oprnds0[0]
-                 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
-                                                   vec_oprnds0[0]);
-             if (single_defuse_cycle && reduc_index == 1)
-               vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
-             else
-               vec_oprnds1[0]
-                 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
-                                                   vec_oprnds1[0]);
-             if (op_type == ternary_op)
-               {
-                 if (single_defuse_cycle && reduc_index == 2)
-                   vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
-                 else
-                   vec_oprnds2[0] 
-                     = vect_get_vec_def_for_stmt_copy (loop_vinfo,
-                                                       vec_oprnds2[0]);
-               }
-            }
-        }
+       {
+         if (op_type == ternary_op)
+           vop[2] = vec_oprnds2[i];
 
-      FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
-        {
-         tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
-         if (masked_loop_p && !mask_by_cond_expr)
+         if (masked_loop_p && mask_by_cond_expr)
            {
-             /* Make sure that the reduction accumulator is vop[0].  */
-             if (reduc_index == 1)
-               {
-                 gcc_assert (commutative_tree_code (code));
-                 std::swap (vop[0], vop[1]);
-               }
              tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
-                                             vectype_in, i * ncopies + j);
-             gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
-                                                       vop[0], vop[1],
-                                                       vop[0]);
-             new_temp = make_ssa_name (vec_dest, call);
-             gimple_call_set_lhs (call, new_temp);
-             gimple_call_set_nothrow (call, true);
-             new_stmt_info
-               = vect_finish_stmt_generation (stmt_info, call, gsi);
-           }
-         else
-           {
-             if (op_type == ternary_op)
-               vop[2] = vec_oprnds2[i];
-
-             if (masked_loop_p && mask_by_cond_expr)
-               {
-                 tree mask = vect_get_loop_mask (gsi, masks,
-                                                 vec_num * ncopies,
-                                                 vectype_in, i * ncopies + j);
-                 build_vect_cond_expr (code, vop, mask, gsi);
-               }
-
-             gassign *new_stmt = gimple_build_assign (vec_dest, code,
-                                                      vop[0], vop[1], vop[2]);
-             new_temp = make_ssa_name (vec_dest, new_stmt);
-             gimple_assign_set_lhs (new_stmt, new_temp);
-             new_stmt_info
-               = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
+                                             vectype_in, i);
+             build_vect_cond_expr (code, vop, mask, gsi);
            }
 
-          if (slp_node)
-           SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
-        }
-
-      if (slp_node || single_defuse_cycle)
-        continue;
+         new_stmt = gimple_build_assign (vec_dest, code,
+                                         vop[0], vop[1], vop[2]);
+         new_temp = make_ssa_name (vec_dest, new_stmt);
+         gimple_assign_set_lhs (new_stmt, new_temp);
+         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+       }
 
-      if (j == 0)
-       STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
+      if (slp_node)
+       SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
+      else if (single_defuse_cycle
+              && i < ncopies - 1)
+       {
+         if (reduc_index == 0)
+           vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
+         else if (reduc_index == 1)
+           vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
+         else if (reduc_index == 2)
+           vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
+       }
       else
-       STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
-
-      prev_stmt_info = new_stmt_info;
+       STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
     }
 
-  if (single_defuse_cycle && !slp_node)
-    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
+  if (!slp_node)
+    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
 
   return true;
 }
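
As a scalar model (plain C, not GCC code) of the masked update emitted above: the conditional internal-function call with arguments (mask, vop[0], vop[1], vop[0]) behaves per lane like "acc = mask ? acc OP val : acc", leaving inactive lanes unchanged.

#include <stdbool.h>

/* Per-element model of a masked sum reduction: the accumulator only
   changes where the mask is set.  */
static int
masked_sum (const int *x, const bool *mask, int n)
{
  int acc = 0;
  for (int i = 0; i < n; i++)
    acc = mask[i] ? acc + x[i] : acc;   /* cf. COND_ADD (mask, acc, x[i], acc) */
  return acc;
}
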
@@ -6988,15 +7434,14 @@ vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 /* Transform phase of a cycle PHI.  */
 
 bool
-vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
+vect_transform_cycle_phi (loop_vec_info loop_vinfo,
+                         stmt_vec_info stmt_info, gimple **vec_stmt,
                          slp_tree slp_node, slp_instance slp_node_instance)
 {
   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   int i;
   int ncopies;
-  stmt_vec_info prev_phi_info;
   int j;
   bool nested_cycle = false;
   int vec_num;
@@ -7009,7 +7454,7 @@ vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
 
   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
-  stmt_vec_info reduc_info = info_for_reduction (stmt_info);
+  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
   gcc_assert (reduc_info->is_reduc_info);
 
   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
@@ -7026,9 +7471,8 @@ vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
   if (slp_node)
     {
       /* The size vect_schedule_slp_instance computes is off for us.  */
-      vec_num = vect_get_num_vectors
-         (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
-          * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
+      vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+                                     * SLP_TREE_LANES (slp_node), vectype_in);
       ncopies = 1;
     }
   else
@@ -7053,15 +7497,24 @@ vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
   if (slp_node)
     {
       vec_initial_defs.reserve (vec_num);
-      gcc_assert (slp_node == slp_node_instance->reduc_phis);
-      stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
-      tree neutral_op
-       = neutral_op_for_slp_reduction (slp_node, vectype_out,
-                                       STMT_VINFO_REDUC_CODE (reduc_info),
-                                       first != NULL);
-      get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
-                                     &vec_initial_defs, vec_num,
-                                     first != NULL, neutral_op);
+      if (nested_cycle)
+       {
+         unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
+         vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
+                            &vec_initial_defs);
+       }
+      else
+       {
+         gcc_assert (slp_node == slp_node_instance->reduc_phis);
+         stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
+         tree neutral_op
+             = neutral_op_for_slp_reduction (slp_node, vectype_out,
+                                             STMT_VINFO_REDUC_CODE (reduc_info),
+                                             first != NULL);
+         get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
+                                         &vec_initial_defs, vec_num,
+                                         first != NULL, neutral_op);
+       }
     }
   else
     {
@@ -7088,13 +7541,17 @@ vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
              STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
            }
          vec_initial_def = build_vector_from_val (vectype_out, induc_val);
+         vec_initial_defs.create (ncopies);
+         for (i = 0; i < ncopies; ++i)
+           vec_initial_defs.quick_push (vec_initial_def);
        }
       else if (nested_cycle)
        {
          /* Do not use an adjustment def as that case is not supported
             correctly if ncopies is not one.  */
-         vec_initial_def = vect_get_vec_def_for_operand (initial_def,
-                                                         reduc_stmt_info);
+         vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
+                                        ncopies, initial_def,
+                                        &vec_initial_defs);
        }
       else
        {
@@ -7104,16 +7561,16 @@ vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
          if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
            adjustment_defp = NULL;
          vec_initial_def
-           = get_initial_def_for_reduction (reduc_stmt_info, code,
+           = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
                                             initial_def, adjustment_defp);
          STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
+         vec_initial_defs.create (ncopies);
+         for (i = 0; i < ncopies; ++i)
+           vec_initial_defs.quick_push (vec_initial_def);
        }
-      vec_initial_defs.create (1);
-      vec_initial_defs.quick_push (vec_initial_def);
     }
 
   /* Generate the reduction PHIs upfront.  */
-  prev_phi_info = NULL;
   for (i = 0; i < vec_num; i++)
     {
       tree vec_init_def = vec_initial_defs[i];
@@ -7122,26 +7579,22 @@ vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
          /* Create the reduction-phi that defines the reduction
             operand.  */
          gphi *new_phi = create_phi_node (vec_dest, loop->header);
-         stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
 
          /* Set the loop-entry arg of the reduction-phi.  */
          if (j != 0 && nested_cycle)
-           vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
-                                                          vec_init_def);
+           vec_init_def = vec_initial_defs[j];
          add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
                       UNKNOWN_LOCATION);
 
          /* The loop-latch arg is set in epilogue processing.  */
 
          if (slp_node)
-           SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
+           SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
          else
            {
              if (j == 0)
-               STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
-             else
-               STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
-             prev_phi_info = new_phi_info;
+               *vec_stmt = new_phi;
+             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
            }
        }
     }
@@ -7152,10 +7605,10 @@ vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
 /* Vectorizes LC PHIs.  */
 
 bool
-vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
+vectorizable_lc_phi (loop_vec_info loop_vinfo,
+                    stmt_vec_info stmt_info, gimple **vec_stmt,
                     slp_tree slp_node)
 {
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   if (!loop_vinfo
       || !is_a <gphi *> (stmt_info->stmt)
       || gimple_phi_num_args (stmt_info->stmt) != 1)
@@ -7167,6 +7620,17 @@ vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      /* Deal with copies from externs or constants that are disguised as
+         loop-closed PHI nodes (PR97886).  */
+      if (slp_node
+         && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
+                                               SLP_TREE_VECTYPE (slp_node)))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "incompatible vector types for invariants\n");
+         return false;
+       }
       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
       return true;
     }
@@ -7176,42 +7640,97 @@ vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
   basic_block bb = gimple_bb (stmt_info->stmt);
   edge e = single_pred_edge (bb);
   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
-  vec<tree> vec_oprnds = vNULL;
-  vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
-                    stmt_info, &vec_oprnds, NULL, slp_node);
-  if (slp_node)
+  auto_vec<tree> vec_oprnds;
+  vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
+                    !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
+                    gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
+  for (unsigned i = 0; i < vec_oprnds.length (); i++)
+    {
+      /* Create the vectorized LC PHI node.  */
+      gphi *new_phi = create_phi_node (vec_dest, bb);
+      add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
+      if (slp_node)
+       SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
+      else
+       STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
+    }
+  if (!slp_node)
+    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+
+  return true;
+}
+
+/* Vectorizes PHIs.  */
+
+bool
+vectorizable_phi (vec_info *,
+                 stmt_vec_info stmt_info, gimple **vec_stmt,
+                 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
+{
+  if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
+    return false;
+
+  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
+    return false;
+
+  tree vectype = SLP_TREE_VECTYPE (slp_node);
+
+  if (!vec_stmt) /* transformation not required.  */
     {
-      unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
-      gcc_assert (vec_oprnds.length () == vec_num);
-      for (unsigned i = 0; i < vec_num; i++)
-       {
-         /* Create the vectorized LC PHI node.  */
-         gphi *new_phi = create_phi_node (vec_dest, bb);
-         add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
-         stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
-         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
-       }
+      slp_tree child;
+      unsigned i;
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
+       if (!child)
+         {
+           if (dump_enabled_p ())
+             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                              "PHI node with unvectorized backedge def\n");
+           return false;
+         }
+       else if (!vect_maybe_update_slp_op_vectype (child, vectype))
+         {
+           if (dump_enabled_p ())
+             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                              "incompatible vector types for invariants\n");
+           return false;
+         }
+      record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
+                       vector_stmt, stmt_info, vectype, 0, vect_body);
+      STMT_VINFO_TYPE (stmt_info) = phi_info_type;
+      return true;
     }
-  else
+
+  tree scalar_dest = gimple_phi_result (stmt_info->stmt);
+  basic_block bb = gimple_bb (stmt_info->stmt);
+  tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
+  auto_vec<gphi *> new_phis;
+  for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
     {
-      unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
-      stmt_vec_info prev_phi_info = NULL;
-      for (unsigned i = 0; i < ncopies; i++)
-       {
-         if (i != 0)
-           vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
-         /* Create the vectorized LC PHI node.  */
-         gphi *new_phi = create_phi_node (vec_dest, bb);
-         add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
-         stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
-         if (i == 0)
-           STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
-         else
-           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
-         prev_phi_info = new_phi_info;
+      slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
+
+      /* Skip not yet vectorized defs.  */
+      if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
+         && SLP_TREE_VEC_STMTS (child).is_empty ())
+       continue;
+
+      auto_vec<tree> vec_oprnds;
+      vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
+      if (!new_phis.exists ())
+       {
+         new_phis.create (vec_oprnds.length ());
+         for (unsigned j = 0; j < vec_oprnds.length (); j++)
+           {
+             /* Create the vectorized LC PHI node.  */
+             new_phis.quick_push (create_phi_node (vec_dest, bb));
+             SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
+           }
        }
+      edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
+      for (unsigned j = 0; j < vec_oprnds.length (); j++)
+       add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
     }
-  vec_oprnds.release ();
+  /* We should have at least one already vectorized child.  */
+  gcc_assert (new_phis.exists ());
 
   return true;
 }
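
Editorial aside: the transform loop above fills vector PHI arguments edge by
edge, creating the vector PHI nodes lazily when the first child with
vectorized defs is seen and skipping backedge children that are not
vectorized yet.  A minimal standalone sketch of that fill-as-available
scheme (illustrative names only, no GCC types):

    #include <cstdio>
    #include <string>
    #include <vector>

    int
    main ()
    {
      const unsigned nvects = 2;
      /* Vector defs per incoming edge; the empty entry models a backedge
         def that is not vectorized yet and is skipped for now.  */
      std::vector<std::vector<std::string> > edge_defs
        = { { "va_0", "va_1" }, { } };
      std::vector<std::vector<std::string> > phi_args;
      for (const auto &defs : edge_defs)
        {
          if (defs.empty ())
            continue;                     /* not yet vectorized: skip */
          if (phi_args.empty ())
            phi_args.resize (nvects);     /* create the vector PHIs once */
          for (unsigned j = 0; j < nvects; ++j)
            phi_args[j].push_back (defs[j]);
        }
      for (unsigned j = 0; j < nvects; ++j)
        printf ("vector PHI %u has %zu argument(s)\n",
                j, phi_args[j].size ());
      return 0;
    }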
@@ -7265,12 +7784,11 @@ vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
    Return true if STMT_INFO is vectorizable in this way.  */
 
 bool
-vectorizable_induction (stmt_vec_info stmt_info,
-                       gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
-                       stmt_vec_info *vec_stmt, slp_tree slp_node,
+vectorizable_induction (loop_vec_info loop_vinfo,
+                       stmt_vec_info stmt_info,
+                       gimple **vec_stmt, slp_tree slp_node,
                        stmt_vector_for_cost *cost_vec)
 {
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsigned ncopies;
   bool nested_in_vect_loop = false;
@@ -7287,12 +7805,6 @@ vectorizable_induction (stmt_vec_info stmt_info,
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   unsigned i;
   tree expr;
-  gimple_seq stmts;
-  imm_use_iterator imm_iter;
-  use_operand_p use_p;
-  gimple *exit_phi;
-  edge latch_e;
-  tree loop_arg;
   gimple_stmt_iterator si;
 
   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
@@ -7332,10 +7844,6 @@ vectorizable_induction (stmt_vec_info stmt_info,
          return false;
        }
 
-      /* FORNOW: outer loop induction with SLP not supported.  */
-      if (STMT_SLP_TYPE (stmt_info))
-       return false;
-
       exit_phi = NULL;
       latch_e = loop_latch_edge (loop->inner);
       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
@@ -7374,7 +7882,7 @@ vectorizable_induction (stmt_vec_info stmt_info,
 
   if (slp_node && !nunits.is_constant ())
     {
-      /* The current SLP code creates the initial value element-by-element.  */
+      /* The current SLP code creates the step value element-by-element.  */
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "SLP induction not supported for variable-length"
@@ -7384,9 +7892,50 @@ vectorizable_induction (stmt_vec_info stmt_info,
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      unsigned inside_cost = 0, prologue_cost = 0;
+      if (slp_node)
+       {
+         /* We eventually need to set a vector type on invariant
+            arguments.  */
+         unsigned j;
+         slp_tree child;
+         FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
+           if (!vect_maybe_update_slp_op_vectype
+               (child, SLP_TREE_VECTYPE (slp_node)))
+             {
+               if (dump_enabled_p ())
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  "incompatible vector types for "
+                                  "invariants\n");
+               return false;
+             }
+         /* loop cost for vec_loop.  */
+         inside_cost
+           = record_stmt_cost (cost_vec,
+                               SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
+                               vector_stmt, stmt_info, 0, vect_body);
+         /* prologue cost for vec_init (if not nested) and step.  */
+         prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
+                                           scalar_to_vec,
+                                           stmt_info, 0, vect_prologue);
+       }
+      else /* if (!slp_node) */
+       {
+         /* loop cost for vec_loop.  */
+         inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
+                                         stmt_info, 0, vect_body);
+         /* prologue cost for vec_init and vec_step.  */
+         prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
+                                           stmt_info, 0, vect_prologue);
+       }
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "vect_model_induction_cost: inside_cost = %d, "
+                        "prologue_cost = %d .\n", inside_cost,
+                        prologue_cost);
+
       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
       DUMP_VECT_SCOPE ("vectorizable_induction");
-      vect_model_induction_cost (stmt_info, ncopies, cost_vec);
       return true;
     }
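
Editorial aside: the inlined costing above keeps the same accounting the
removed vect_model_induction_cost call performed: one vector_stmt per vector
copy in the loop body, plus scalar_to_vec prologue entries for broadcasting
the step and, when not nested, the initial value.  A minimal standalone
tally with unit weights (illustrative counts, not the target cost hooks):

    #include <cstdio>

    int
    main ()
    {
      bool slp = true, nested = false;
      unsigned nvects = 4, ncopies = 4;           /* made-up counts */
      unsigned inside = slp ? nvects : ncopies;   /* vector_stmt entries */
      unsigned prologue = slp ? 1 + !nested : 2;  /* scalar_to_vec entries */
      printf ("inside_cost = %u, prologue_cost = %u\n", inside, prologue);
      return 0;
    }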
 
@@ -7400,161 +7949,200 @@ vectorizable_induction (stmt_vec_info stmt_info,
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
 
-  latch_e = loop_latch_edge (iv_loop);
-  loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
-
   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
   gcc_assert (step_expr != NULL_TREE);
   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
 
   pe = loop_preheader_edge (iv_loop);
-  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
-                                    loop_preheader_edge (iv_loop));
-
-  stmts = NULL;
-  if (!nested_in_vect_loop)
-    {
-      /* Convert the initial value to the IV update type.  */
-      tree new_type = TREE_TYPE (step_expr);
-      init_expr = gimple_convert (&stmts, new_type, init_expr);
-
-      /* If we are using the loop mask to "peel" for alignment then we need
-        to adjust the start value here.  */
-      tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
-      if (skip_niters != NULL_TREE)
-       {
-         if (FLOAT_TYPE_P (vectype))
-           skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
-                                       skip_niters);
-         else
-           skip_niters = gimple_convert (&stmts, new_type, skip_niters);
-         tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
-                                        skip_niters, step_expr);
-         init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
-                                   init_expr, skip_step);
-       }
-    }
-
-  if (stmts)
-    {
-      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
-      gcc_assert (!new_bb);
-    }
-
   /* Find the first insertion point in the BB.  */
   basic_block bb = gimple_bb (phi);
   si = gsi_after_labels (bb);
 
   /* For SLP induction we have to generate several IVs as for example
-     with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
-     [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
-     [VF*S, VF*S, VF*S, VF*S] for all.  */
+     with group size 3 we need
+       [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
+       [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
   if (slp_node)
     {
       /* Enforced above.  */
       unsigned int const_nunits = nunits.to_constant ();
 
-      /* Generate [VF*S, VF*S, ... ].  */
-      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
+      /* The initial values are vectorized, but any lanes > group_size
+        need adjustment.  */
+      slp_tree init_node
+       = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
+
+      /* Gather steps.  Since we do not vectorize inductions as
+        cycles we have to reconstruct the step from SCEV data.  */
+      unsigned group_size = SLP_TREE_LANES (slp_node);
+      tree *steps = XALLOCAVEC (tree, group_size);
+      tree *inits = XALLOCAVEC (tree, group_size);
+      stmt_vec_info phi_info;
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
        {
-         expr = build_int_cst (integer_type_node, vf);
-         expr = fold_convert (TREE_TYPE (step_expr), expr);
+         steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
+         if (!init_node)
+           inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
+                                          pe->dest_idx);
        }
-      else
-       expr = build_int_cst (TREE_TYPE (step_expr), vf);
-      new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
-                             expr, step_expr);
-      if (! CONSTANT_CLASS_P (new_name))
-       new_name = vect_init_vector (stmt_info, new_name,
-                                    TREE_TYPE (step_expr), NULL);
-      new_vec = build_vector_from_val (step_vectype, new_name);
-      vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
 
       /* Now generate the IVs.  */
-      unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
-      unsigned elts = const_nunits * nvects;
-      unsigned nivs = least_common_multiple (group_size,
-                                            const_nunits) / const_nunits;
-      gcc_assert (elts % group_size == 0);
-      tree elt = init_expr;
+      gcc_assert ((const_nunits * nvects) % group_size == 0);
+      unsigned nivs;
+      if (nested_in_vect_loop)
+       nivs = nvects;
+      else
+       {
+         /* Compute the number of distinct IVs we need.  First reduce
+            group_size if it is a multiple of const_nunits so we get
+            one IV for a group_size of 4 but const_nunits 2.  */
+         unsigned group_sizep = group_size;
+         if (group_sizep % const_nunits == 0)
+           group_sizep = group_sizep / const_nunits;
+         nivs = least_common_multiple (group_sizep,
+                                       const_nunits) / const_nunits;
+       }
+      tree stept = TREE_TYPE (step_vectype);
+      tree lupdate_mul = NULL_TREE;
+      if (!nested_in_vect_loop)
+       {
+         /* The number of iterations covered in one vector iteration.  */
+         unsigned lup_mul = (nvects * const_nunits) / group_size;
+         lupdate_mul
+           = build_vector_from_val (step_vectype,
+                                    SCALAR_FLOAT_TYPE_P (stept)
+                                    ? build_real_from_wide (stept, lup_mul,
+                                                            UNSIGNED)
+                                    : build_int_cstu (stept, lup_mul));
+       }
+      tree peel_mul = NULL_TREE;
+      gimple_seq init_stmts = NULL;
+      if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
+       {
+         if (SCALAR_FLOAT_TYPE_P (stept))
+           peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
+                                    LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
+         else
+           peel_mul = gimple_convert (&init_stmts, stept,
+                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
+         peel_mul = gimple_build_vector_from_val (&init_stmts,
+                                                  step_vectype, peel_mul);
+       }
       unsigned ivn;
+      auto_vec<tree> vec_steps;
       for (ivn = 0; ivn < nivs; ++ivn)
        {
-         tree_vector_builder elts (step_vectype, const_nunits, 1);
-         stmts = NULL;
+         tree_vector_builder step_elts (step_vectype, const_nunits, 1);
+         tree_vector_builder init_elts (vectype, const_nunits, 1);
+         tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
          for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
            {
-             if (ivn*const_nunits + eltn >= group_size
-                 && (ivn * const_nunits + eltn) % group_size == 0)
-               elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
-                                   elt, step_expr);
-             elts.quick_push (elt);
-           }
-         vec_init = gimple_build_vector (&stmts, &elts);
-         vec_init = gimple_convert (&stmts, vectype, vec_init);
-         if (stmts)
-           {
-             new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
-             gcc_assert (!new_bb);
+             /* The scalar steps of the IVs.  */
+             tree elt = steps[(ivn*const_nunits + eltn) % group_size];
+             elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
+             step_elts.quick_push (elt);
+             if (!init_node)
+               {
+                 /* The scalar inits of the IVs if not vectorized.  */
+                 elt = inits[(ivn*const_nunits + eltn) % group_size];
+                 if (!useless_type_conversion_p (TREE_TYPE (vectype),
+                                                 TREE_TYPE (elt)))
+                   elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
+                                       TREE_TYPE (vectype), elt);
+                 init_elts.quick_push (elt);
+               }
+             /* The number of steps to add to the initial values.  */
+             unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
+             mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
+                                  ? build_real_from_wide (stept,
+                                                          mul_elt, UNSIGNED)
+                                  : build_int_cstu (stept, mul_elt));
            }
+         vec_step = gimple_build_vector (&init_stmts, &step_elts);
+         vec_steps.safe_push (vec_step);
+         tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
+         if (peel_mul)
+           step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
+                                    step_mul, peel_mul);
+         if (!init_node)
+           vec_init = gimple_build_vector (&init_stmts, &init_elts);
 
          /* Create the induction-phi that defines the induction-operand.  */
-         vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
+         vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
+                                           "vec_iv_");
          induction_phi = create_phi_node (vec_dest, iv_loop->header);
-         stmt_vec_info induction_phi_info
-           = loop_vinfo->add_stmt (induction_phi);
          induc_def = PHI_RESULT (induction_phi);
 
          /* Create the iv update inside the loop  */
+         tree up = vec_step;
+         if (lupdate_mul)
+           up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+                              vec_step, lupdate_mul);
          gimple_seq stmts = NULL;
          vec_def = gimple_convert (&stmts, step_vectype, induc_def);
          vec_def = gimple_build (&stmts,
-                                 PLUS_EXPR, step_vectype, vec_def, vec_step);
+                                 PLUS_EXPR, step_vectype, vec_def, up);
          vec_def = gimple_convert (&stmts, vectype, vec_def);
-         loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
          gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
+         add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
+                      UNKNOWN_LOCATION);
+
+         if (init_node)
+           vec_init = vect_get_slp_vect_def (init_node, ivn);
+         if (!nested_in_vect_loop
+             && !integer_zerop (step_mul))
+           {
+             vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
+             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+                                vec_step, step_mul);
+             vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
+                                     vec_def, up);
+             vec_init = gimple_convert (&init_stmts, vectype, vec_def);
+           }
 
          /* Set the arguments of the phi node:  */
          add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
-         add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
-                      UNKNOWN_LOCATION);
 
-         SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
+         SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
+       }
+      if (!nested_in_vect_loop)
+       {
+         /* Fill up to the number of vectors we need for the whole group.  */
+         nivs = least_common_multiple (group_size,
+                                       const_nunits) / const_nunits;
+         for (; ivn < nivs; ++ivn)
+           {
+             SLP_TREE_VEC_STMTS (slp_node)
+               .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
+             vec_steps.safe_push (vec_steps[0]);
+           }
        }
 
-      /* Re-use IVs when we can.  */
+      /* Re-use IVs when we can.  We are generating further vector
+        stmts by adding VF' * stride to the IVs generated above.  */
       if (ivn < nvects)
        {
          unsigned vfp
            = least_common_multiple (group_size, const_nunits) / group_size;
-         /* Generate [VF'*S, VF'*S, ... ].  */
-         if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
-           {
-             expr = build_int_cst (integer_type_node, vfp);
-             expr = fold_convert (TREE_TYPE (step_expr), expr);
-           }
-         else
-           expr = build_int_cst (TREE_TYPE (step_expr), vfp);
-         new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
-                                 expr, step_expr);
-         if (! CONSTANT_CLASS_P (new_name))
-           new_name = vect_init_vector (stmt_info, new_name,
-                                        TREE_TYPE (step_expr), NULL);
-         new_vec = build_vector_from_val (step_vectype, new_name);
-         vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
+         tree lupdate_mul
+           = build_vector_from_val (step_vectype,
+                                    SCALAR_FLOAT_TYPE_P (stept)
+                                    ? build_real_from_wide (stept,
+                                                            vfp, UNSIGNED)
+                                    : build_int_cstu (stept, vfp));
          for (; ivn < nvects; ++ivn)
            {
-             gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
-             tree def;
-             if (gimple_code (iv) == GIMPLE_PHI)
-               def = gimple_phi_result (iv);
-             else
-               def = gimple_assign_lhs (iv);
+             gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
+             tree def = gimple_get_lhs (iv);
+             if (ivn < 2*nivs)
+               vec_steps[ivn - nivs]
+                 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+                                 vec_steps[ivn - nivs], lupdate_mul);
              gimple_seq stmts = NULL;
              def = gimple_convert (&stmts, step_vectype, def);
-             def = gimple_build (&stmts,
-                                 PLUS_EXPR, step_vectype, def, vec_step);
+             def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
+                                 def, vec_steps[ivn % nivs]);
              def = gimple_convert (&stmts, vectype, def);
              if (gimple_code (iv) == GIMPLE_PHI)
                gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
@@ -7563,21 +8151,60 @@ vectorizable_induction (stmt_vec_info stmt_info,
                  gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
                  gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
                }
-             SLP_TREE_VEC_STMTS (slp_node).quick_push
-               (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
+             SLP_TREE_VEC_STMTS (slp_node)
+               .quick_push (SSA_NAME_DEF_STMT (def));
            }
        }
 
+      new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
+      gcc_assert (!new_bb);
+
       return true;
     }
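
Editorial aside: to make the lane layout from the group-size-3 comment above
concrete, lane L of the concatenated SLP IV vectors receives
init[L % group_size] + (L / group_size) * step[L % group_size].  A minimal
standalone sketch that prints the three 4-lane vectors (made-up init/step
values, no GCC code):

    #include <cstdio>

    int
    main ()
    {
      const unsigned group_size = 3, nunits = 4, nvects = 3;
      const int init[3] = { 100, 200, 300 };  /* i0, i1, i2 */
      const int step[3] = { 1, 10, 100 };     /* S0, S1, S2 */
      for (unsigned v = 0; v < nvects; ++v)
        {
          printf ("vector %u:", v);
          for (unsigned e = 0; e < nunits; ++e)
            {
              unsigned l = v * nunits + e;    /* global lane number */
              int val = init[l % group_size]
                        + (int) (l / group_size) * step[l % group_size];
              printf (" %d", val);
            }
          printf ("\n");
        }
      return 0;
    }

The first group_size lanes come out as the plain initial values, which is
why further vector stmts can re-use the IVs above by just adding VF' * step.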
 
+  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
+                                    loop_preheader_edge (iv_loop));
+
+  gimple_seq stmts = NULL;
+  if (!nested_in_vect_loop)
+    {
+      /* Convert the initial value to the IV update type.  */
+      tree new_type = TREE_TYPE (step_expr);
+      init_expr = gimple_convert (&stmts, new_type, init_expr);
+
+      /* If we are using the loop mask to "peel" for alignment then we need
+        to adjust the start value here.  */
+      tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+      if (skip_niters != NULL_TREE)
+       {
+         if (FLOAT_TYPE_P (vectype))
+           skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
+                                       skip_niters);
+         else
+           skip_niters = gimple_convert (&stmts, new_type, skip_niters);
+         tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
+                                        skip_niters, step_expr);
+         init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
+                                   init_expr, skip_step);
+       }
+    }
+
+  if (stmts)
+    {
+      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+      gcc_assert (!new_bb);
+    }
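
Editorial aside: the skip_niters adjustment above moves the IV start back by
the number of masked-off ("peeled") iterations, i.e. init - skip_niters *
step.  A minimal standalone check with made-up values:

    #include <cstdio>

    int
    main ()
    {
      long init = 0, step = 4, skip_niters = 3;
      long adjusted_init = init - skip_niters * step;
      printf ("adjusted init = %ld\n", adjusted_init);  /* prints -12 */
      return 0;
    }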
+
   /* Create the vector that holds the initial_value of the induction.  */
   if (nested_in_vect_loop)
     {
       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
         been created during vectorization of previous stmts.  We obtain it
         from the STMT_VINFO_VEC_STMT of the defining stmt.  */
-      vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
+      auto_vec<tree> vec_inits;
+      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+                                    init_expr, &vec_inits);
+      vec_init = vec_inits[0];
       /* If the initial value is not of proper type, convert it.  */
       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
        {
@@ -7592,7 +8219,6 @@ vectorizable_induction (stmt_vec_info stmt_info,
          new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
                                                 new_stmt);
          gcc_assert (!new_bb);
-         loop_vinfo->add_stmt (new_stmt);
        }
     }
   else
@@ -7680,7 +8306,8 @@ vectorizable_induction (stmt_vec_info stmt_info,
   gcc_assert (CONSTANT_CLASS_P (new_name)
              || TREE_CODE (new_name) == SSA_NAME);
   new_vec = build_vector_from_val (step_vectype, t);
-  vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
+  vec_step = vect_init_vector (loop_vinfo, stmt_info,
+                              new_vec, step_vectype, NULL);
 
 
   /* Create the following def-use cycle:
@@ -7697,7 +8324,6 @@ vectorizable_induction (stmt_vec_info stmt_info,
   /* Create the induction-phi that defines the induction-operand.  */
   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
   induction_phi = create_phi_node (vec_dest, iv_loop->header);
-  stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
   induc_def = PHI_RESULT (induction_phi);
 
   /* Create the iv update inside the loop  */
@@ -7707,14 +8333,14 @@ vectorizable_induction (stmt_vec_info stmt_info,
   vec_def = gimple_convert (&stmts, vectype, vec_def);
   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
   new_stmt = SSA_NAME_DEF_STMT (vec_def);
-  stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
 
   /* Set the arguments of the phi node:  */
   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
               UNKNOWN_LOCATION);
 
-  STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
+  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
+  *vec_stmt = induction_phi;
 
   /* In case that vectorization factor (VF) is bigger than the number
      of elements that we can fit in a vectype (nunits), we have to generate
@@ -7725,7 +8351,6 @@ vectorizable_induction (stmt_vec_info stmt_info,
   if (ncopies > 1)
     {
       gimple_seq seq = NULL;
-      stmt_vec_info prev_stmt_vinfo;
       /* FORNOW. This restriction should be relaxed.  */
       gcc_assert (!nested_in_vect_loop);
 
@@ -7749,10 +8374,10 @@ vectorizable_induction (stmt_vec_info stmt_info,
       gcc_assert (CONSTANT_CLASS_P (new_name)
                  || TREE_CODE (new_name) == SSA_NAME);
       new_vec = build_vector_from_val (step_vectype, t);
-      vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
+      vec_step = vect_init_vector (loop_vinfo, stmt_info,
+                                  new_vec, step_vectype, NULL);
 
       vec_def = induc_def;
-      prev_stmt_vinfo = induction_phi_info;
       for (i = 1; i < ncopies; i++)
        {
          /* vec_i = vec_prev + vec_step  */
@@ -7764,46 +8389,10 @@ vectorizable_induction (stmt_vec_info stmt_info,
  
          gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
          new_stmt = SSA_NAME_DEF_STMT (vec_def);
-         new_stmt_info = loop_vinfo->add_stmt (new_stmt);
-         STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
-         prev_stmt_vinfo = new_stmt_info;
+         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
        }
     }
 
-  if (nested_in_vect_loop)
-    {
-      /* Find the loop-closed exit-phi of the induction, and record
-         the final vector of induction results:  */
-      exit_phi = NULL;
-      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
-        {
-         gimple *use_stmt = USE_STMT (use_p);
-         if (is_gimple_debug (use_stmt))
-           continue;
-
-         if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
-           {
-             exit_phi = use_stmt;
-             break;
-           }
-        }
-      if (exit_phi)
-       {
-         stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
-         /* FORNOW. Currently not supporting the case that an inner-loop induction
-            is not used in the outer-loop (i.e. only outside the outer-loop).  */
-         gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
-                     && !STMT_VINFO_LIVE_P (stmt_vinfo));
-
-         STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_NOTE, vect_location,
-                            "vector of inductions after inner-loop:%G",
-                            new_stmt);
-       }
-    }
-
-
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
                     "transform induction: created def-use cycle: %G%G",
@@ -7818,17 +8407,19 @@ vectorizable_induction (stmt_vec_info stmt_info,
    it can be supported.  */
 
 bool
-vectorizable_live_operation (stmt_vec_info stmt_info,
+vectorizable_live_operation (vec_info *vinfo,
+                            stmt_vec_info stmt_info,
                             gimple_stmt_iterator *gsi,
                             slp_tree slp_node, slp_instance slp_node_instance,
                             int slp_index, bool vec_stmt_p,
-                            stmt_vector_for_cost *)
+                            stmt_vector_for_cost *cost_vec)
 {
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   imm_use_iterator imm_iter;
   tree lhs, lhs_type, bitsize, vec_bitsize;
-  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  tree vectype = (slp_node
+                 ? SLP_TREE_VECTYPE (slp_node)
+                 : STMT_VINFO_VECTYPE (stmt_info));
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   int ncopies;
   gimple *use_stmt;
@@ -7855,21 +8446,21 @@ vectorizable_live_operation (stmt_vec_info stmt_info,
             all involved stmts together.  */
          else if (slp_index != 0)
            return true;
+         else
+           /* For SLP reductions the meta-info is attached to
+              the representative.  */
+           stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
        }
-      stmt_vec_info reduc_info = info_for_reduction (stmt_info);
+      stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
       gcc_assert (reduc_info->is_reduc_info);
       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
          || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
        return true;
-      vect_create_epilog_for_reduction (stmt_info, slp_node,
+      vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
                                        slp_node_instance);
       return true;
     }
 
-  /* FORNOW.  CHECKME.  */
-  if (nested_in_vect_loop_p (loop, stmt_info))
-    return false;
-
   /* If STMT is not relevant and it is a simple assignment and its inputs are
      invariant then it can remain in place, unvectorized.  The original last
      scalar value that it computes will be used.  */
@@ -7892,12 +8483,11 @@ vectorizable_live_operation (stmt_vec_info stmt_info,
     {
       gcc_assert (slp_index >= 0);
 
-      int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
-      int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
-
       /* Get the last occurrence of the scalar index from the concatenation of
         all the slp vectors. Calculate which slp vector it is and the index
         within.  */
+      int num_scalar = SLP_TREE_LANES (slp_node);
+      int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
 
       /* Calculate which vector contains the result, and which lane of
@@ -7915,33 +8505,34 @@ vectorizable_live_operation (stmt_vec_info stmt_info,
   if (!vec_stmt_p)
     {
       /* No transformation required.  */
-      if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
+      if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
        {
          if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
                                               OPTIMIZE_FOR_SPEED))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                "can't use a fully-masked loop because "
-                                "the target doesn't support extract last "
-                                "reduction.\n");
-             LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+                                "can't operate on partial vectors "
+                                "because the target doesn't support extract "
+                                "last reduction.\n");
+             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
            }
          else if (slp_node)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                "can't use a fully-masked loop because an "
-                                "SLP statement is live after the loop.\n");
-             LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+                                "can't operate on partial vectors "
+                                "because an SLP statement is live after "
+                                "the loop.\n");
+             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
            }
          else if (ncopies > 1)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                "can't use a fully-masked loop because"
-                                " ncopies is greater than 1.\n");
-             LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+                                "can't operate on partial vectors "
+                                "because ncopies is greater than 1.\n");
+             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
            }
          else
            {
@@ -7951,33 +8542,35 @@ vectorizable_live_operation (stmt_vec_info stmt_info,
                                     1, vectype, NULL);
            }
        }
+      /* ???  Enable for loop costing as well.  */
+      if (!loop_vinfo)
+       record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
+                         0, vect_epilogue);
       return true;
     }
 
   /* Use the lhs of the original scalar statement.  */
   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
+                    "stmt %G", stmt);
 
-  lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
-       : gimple_get_lhs (stmt);
+  lhs = gimple_get_lhs (stmt);
   lhs_type = TREE_TYPE (lhs);
 
-  bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
-            ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
-            : TYPE_SIZE (TREE_TYPE (vectype)));
+  bitsize = vector_element_bits_tree (vectype);
   vec_bitsize = TYPE_SIZE (vectype);
 
   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
   tree vec_lhs, bitstart;
+  gimple *vec_stmt;
   if (slp_node)
     {
-      gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
 
       /* Get the correct slp vectorized stmt.  */
-      gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
-      if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
-       vec_lhs = gimple_phi_result (phi);
-      else
-       vec_lhs = gimple_get_lhs (vec_stmt);
+      vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
+      vec_lhs = gimple_get_lhs (vec_stmt);
 
       /* Get entry to use.  */
       bitstart = bitsize_int (vec_index);
@@ -7985,109 +8578,193 @@ vectorizable_live_operation (stmt_vec_info stmt_info,
     }
   else
     {
-      enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
-      vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
-      gcc_checking_assert (ncopies == 1
-                          || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
-
       /* For multiple copies, get the last copy.  */
-      for (int i = 1; i < ncopies; ++i)
-       vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
+      vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
+      vec_lhs = gimple_get_lhs (vec_stmt);
 
       /* Get the last lane in the vector.  */
       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
     }
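
Editorial aside: for the SLP case earlier in this function, the last
occurrence of scalar slp_index sits at concatenated position
num_vec * nunits - num_scalar + slp_index, which then splits into a vector
number and a lane within that vector.  A minimal standalone check with
made-up sizes (the exact GCC helpers are not used here):

    #include <cstdio>

    int
    main ()
    {
      unsigned num_scalar = 3, num_vec = 3, nunits = 4, slp_index = 1;
      unsigned pos = num_vec * nunits - num_scalar + slp_index;
      /* 3*4 - 3 + 1 = 10, i.e. vector 2, lane 2.  */
      printf ("vec_entry = %u, vec_index = %u\n", pos / nunits, pos % nunits);
      return 0;
    }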
 
-  /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
-     requirement, insert one phi node for it.  It looks like:
-        loop;
-       BB:
-        # lhs' = PHI <lhs>
-     ==>
-        loop;
-       BB:
-        # vec_lhs' = PHI <vec_lhs>
-        new_tree = lane_extract <vec_lhs', ...>;
-        lhs' = new_tree;  */
+  if (loop_vinfo)
+    {
+      /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
+        requirement, insert one phi node for it.  It looks like:
+          loop;
+        BB:
+          # lhs' = PHI <lhs>
+        ==>
+          loop;
+        BB:
+          # vec_lhs' = PHI <vec_lhs>
+          new_tree = lane_extract <vec_lhs', ...>;
+          lhs' = new_tree;  */
+
+      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+      basic_block exit_bb = single_exit (loop)->dest;
+      gcc_assert (single_pred_p (exit_bb));
+
+      tree vec_lhs_phi = copy_ssa_name (vec_lhs);
+      gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
+      SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
+
+      gimple_seq stmts = NULL;
+      tree new_tree;
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+       {
+         /* Emit:
 
-  basic_block exit_bb = single_exit (loop)->dest;
-  gcc_assert (single_pred_p (exit_bb));
+              SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
 
-  tree vec_lhs_phi = copy_ssa_name (vec_lhs);
-  gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
-  SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
+            where VEC_LHS is the vectorized live-out result and MASK is
+            the loop mask for the final iteration.  */
+         gcc_assert (ncopies == 1 && !slp_node);
+         tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
+         tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
+                                         1, vectype, 0);
+         tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
+                                         mask, vec_lhs_phi);
 
-  gimple_seq stmts = NULL;
-  tree new_tree;
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
-    {
-      /* Emit:
+         /* Convert the extracted vector element to the scalar type.  */
+         new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+       }
+      else
+       {
+         tree bftype = TREE_TYPE (vectype);
+         if (VECTOR_BOOLEAN_TYPE_P (vectype))
+           bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
+         new_tree = build3 (BIT_FIELD_REF, bftype,
+                            vec_lhs_phi, bitsize, bitstart);
+         new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
+                                          &stmts, true, NULL_TREE);
+       }
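
Editorial aside: the fully-masked path above relies on EXTRACT_LAST
returning the value of the last vector lane whose mask bit is set.  A
minimal standalone model of that semantics (illustrative only; the real
internal function is expanded by the target):

    #include <cstdio>

    static int
    extract_last (const bool *mask, const int *vec, int n)
    {
      int res = 0;
      for (int i = 0; i < n; ++i)
        if (mask[i])
          res = vec[i];   /* the last active lane wins */
      return res;
    }

    int
    main ()
    {
      bool mask[4] = { true, true, true, false };  /* final partial iteration */
      int vec[4] = { 10, 11, 12, 13 };
      printf ("%d\n", extract_last (mask, vec, 4));  /* prints 12 */
      return 0;
    }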
 
-          SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
+      if (stmts)
+       {
+         gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
+         gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
 
-        where VEC_LHS is the vectorized live-out result and MASK is
-        the loop mask for the final iteration.  */
-      gcc_assert (ncopies == 1 && !slp_node);
-      tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
-      tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
-                                     vectype, 0);
-      tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
-                                     mask, vec_lhs_phi);
+         /* Remove existing phi from lhs and create one copy from new_tree.  */
+         tree lhs_phi = NULL_TREE;
+         gimple_stmt_iterator gsi;
+         for (gsi = gsi_start_phis (exit_bb);
+              !gsi_end_p (gsi); gsi_next (&gsi))
+           {
+             gimple *phi = gsi_stmt (gsi);
+             if ((gimple_phi_arg_def (phi, 0) == lhs))
+               {
+                 remove_phi_node (&gsi, false);
+                 lhs_phi = gimple_phi_result (phi);
+                 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
+                 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
+                 break;
+               }
+           }
+       }
 
-      /* Convert the extracted vector element to the required scalar type.  */
-      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+      /* Replace use of lhs with newly computed result.  If the use stmt is a
+        single arg PHI, just replace all uses of PHI result.  It's necessary
+        because lcssa PHI defining lhs may be before newly inserted stmt.  */
+      use_operand_p use_p;
+      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
+           && !is_gimple_debug (use_stmt))
+         {
+           if (gimple_code (use_stmt) == GIMPLE_PHI
+               && gimple_phi_num_args (use_stmt) == 1)
+             {
+               replace_uses_by (gimple_phi_result (use_stmt), new_tree);
+             }
+           else
+             {
+               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+                   SET_USE (use_p, new_tree);
+             }
+           update_stmt (use_stmt);
+         }
     }
   else
     {
+      /* For basic-block vectorization simply insert the lane-extraction.  */
       tree bftype = TREE_TYPE (vectype);
       if (VECTOR_BOOLEAN_TYPE_P (vectype))
        bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
-      new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
+      tree new_tree = build3 (BIT_FIELD_REF, bftype,
+                             vec_lhs, bitsize, bitstart);
+      gimple_seq stmts = NULL;
       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
                                       &stmts, true, NULL_TREE);
-    }
-
-  if (stmts)
-    {
-      gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
-      gsi_insert_before (&exit_gsi, stmts, GSI_CONTINUE_LINKING);
-
-      /* Remove existing phi from lhs and create one copy from new_tree.  */
-      tree lhs_phi = NULL_TREE;
-      gimple_stmt_iterator gsi;
-      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
-       {
-         gimple *phi = gsi_stmt (gsi);
-         if ((gimple_phi_arg_def (phi, 0) == lhs))
-           {
-             remove_phi_node (&gsi, false);
-             lhs_phi = gimple_phi_result (phi);
-             gimple *copy = gimple_build_assign (lhs_phi, new_tree);
-             gsi_insert_after (&exit_gsi, copy, GSI_CONTINUE_LINKING);
-             break;
-           }
-       }
-    }
-
-  /* Replace use of lhs with newly computed result.  If the use stmt is a
-     single arg PHI, just replace all uses of PHI result.  It's necessary
-     because lcssa PHI defining lhs may be before newly inserted stmt.  */
-  use_operand_p use_p;
-  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
-    if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
-       && !is_gimple_debug (use_stmt))
-    {
-      if (gimple_code (use_stmt) == GIMPLE_PHI
-         && gimple_phi_num_args (use_stmt) == 1)
+      if (TREE_CODE (new_tree) == SSA_NAME
+         && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
+       SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
+      if (is_a <gphi *> (vec_stmt))
        {
-         replace_uses_by (gimple_phi_result (use_stmt), new_tree);
+         gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
+         gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
        }
       else
        {
-         FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
-           SET_USE (use_p, new_tree);
+         gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
+         gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
        }
-      update_stmt (use_stmt);
+
+      /* Replace use of lhs with newly computed result.  If the use stmt is a
+        single arg PHI, just replace all uses of PHI result.  It's necessary
+        because lcssa PHI defining lhs may be before newly inserted stmt.  */
+      use_operand_p use_p;
+      stmt_vec_info use_stmt_info;
+      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+       if (!is_gimple_debug (use_stmt)
+           && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
+               || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
+         {
+           /* ???  This can happen when the live lane ends up being
+              used in a vector construction code-generated by an
+              external SLP node (and code-generation for that already
+              happened).  See gcc.dg/vect/bb-slp-47.c.
+              Doing this is what would happen if that vector CTOR
+              were not code-generated yet so it is not too bad.
+              ???  In fact we'd likely want to avoid this situation
+              in the first place.  */
+           if (TREE_CODE (new_tree) == SSA_NAME
+               && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
+               && gimple_code (use_stmt) != GIMPLE_PHI
+               && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
+                                               use_stmt))
+             {
+               enum tree_code code = gimple_assign_rhs_code (use_stmt);
+               gcc_assert (code == CONSTRUCTOR
+                           || code == VIEW_CONVERT_EXPR
+                           || CONVERT_EXPR_CODE_P (code));
+               if (dump_enabled_p ())
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  "Using original scalar computation for "
+                                  "live lane because use precedes vector "
+                                  "def\n");
+               continue;
+             }
+           /* ???  It can also happen that we end up pulling a def into
+              a loop where replacing out-of-loop uses would require
+              a new LC SSA PHI node.  Retain the original scalar in
+              those cases as well.  PR98064.  */
+           if (TREE_CODE (new_tree) == SSA_NAME
+               && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
+               && (gimple_bb (use_stmt)->loop_father
+                   != gimple_bb (vec_stmt)->loop_father)
+               && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
+                                       gimple_bb (use_stmt)->loop_father))
+             {
+               if (dump_enabled_p ())
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  "Using original scalar computation for "
+                                  "live lane because there is an out-of-loop "
+                                  "definition for it\n");
+               continue;
+             }
+           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+             SET_USE (use_p, new_tree);
+           update_stmt (use_stmt);
+         }
     }
 
   return true;
@@ -8196,8 +8873,8 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
 {
   gcc_assert (nvectors != 0);
   if (masks->length () < nvectors)
-    masks->safe_grow_cleared (nvectors);
-  rgroup_masks *rgm = &(*masks)[nvectors - 1];
+    masks->safe_grow_cleared (nvectors, true);
+  rgroup_controls *rgm = &(*masks)[nvectors - 1];
   /* The number of scalars per iteration and the number of vectors are
      both compile-time constants.  */
   unsigned int nscalars_per_iter
@@ -8213,7 +8890,8 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
     {
       rgm->max_nscalars_per_iter = nscalars_per_iter;
-      rgm->mask_type = truth_type_for (vectype);
+      rgm->type = truth_type_for (vectype);
+      rgm->factor = 1;
     }
 }
 
@@ -8228,24 +8906,24 @@ tree
 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
                    unsigned int nvectors, tree vectype, unsigned int index)
 {
-  rgroup_masks *rgm = &(*masks)[nvectors - 1];
-  tree mask_type = rgm->mask_type;
+  rgroup_controls *rgm = &(*masks)[nvectors - 1];
+  tree mask_type = rgm->type;
 
   /* Populate the rgroup's mask array, if this is the first time we've
      used it.  */
-  if (rgm->masks.is_empty ())
+  if (rgm->controls.is_empty ())
     {
-      rgm->masks.safe_grow_cleared (nvectors);
+      rgm->controls.safe_grow_cleared (nvectors, true);
       for (unsigned int i = 0; i < nvectors; ++i)
        {
          tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
          /* Provide a dummy definition until the real one is available.  */
          SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
-         rgm->masks[i] = mask;
+         rgm->controls[i] = mask;
        }
     }
 
-  tree mask = rgm->masks[index];
+  tree mask = rgm->controls[index];
   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
                TYPE_VECTOR_SUBPARTS (vectype)))
     {
@@ -8266,6 +8944,69 @@ vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
   return mask;
 }
 
+/* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
+   lengths for controlling an operation on VECTYPE.  The operation splits
+   each element of VECTYPE into FACTOR separate subelements, measuring the
+   length as a number of these subelements.  */
+
+void
+vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
+                     unsigned int nvectors, tree vectype, unsigned int factor)
+{
+  gcc_assert (nvectors != 0);
+  if (lens->length () < nvectors)
+    lens->safe_grow_cleared (nvectors, true);
+  rgroup_controls *rgl = &(*lens)[nvectors - 1];
+
+  /* The number of scalars per iteration, the scalar occupied bytes and
+     the number of vectors are all compile-time constants.  */
+  unsigned int nscalars_per_iter
+    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
+                LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
+
+  if (rgl->max_nscalars_per_iter < nscalars_per_iter)
+    {
+      /* For now, we only support cases in which all loads and stores fall back
+        to VnQI or none do.  */
+      gcc_assert (!rgl->max_nscalars_per_iter
+                 || (rgl->factor == 1 && factor == 1)
+                 || (rgl->max_nscalars_per_iter * rgl->factor
+                     == nscalars_per_iter * factor));
+      rgl->max_nscalars_per_iter = nscalars_per_iter;
+      rgl->type = vectype;
+      rgl->factor = factor;
+    }
+}
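
Editorial aside: the nscalars_per_iter computation above says that an rgroup
of NVECTORS vectors with NUNITS lanes each covers NVECTORS * NUNITS / VF
scalars per scalar iteration, and a FACTOR > 1 counts each scalar as FACTOR
subelements (the VnQI fallback).  A minimal standalone check with made-up
numbers:

    #include <cassert>
    #include <cstdio>

    int
    main ()
    {
      unsigned vf = 16, nunits = 16, nvectors = 2, factor = 1;
      assert ((nvectors * nunits) % vf == 0);
      unsigned nscalars_per_iter = nvectors * nunits / vf;
      printf ("nscalars_per_iter = %u, length units per iteration = %u\n",
              nscalars_per_iter, nscalars_per_iter * factor);
      return 0;
    }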
+
+/* Given a complete set of length LENS, extract length number INDEX for an
+   rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
+
+tree
+vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
+                  unsigned int nvectors, unsigned int index)
+{
+  rgroup_controls *rgl = &(*lens)[nvectors - 1];
+
+  /* Populate the rgroup's len array, if this is the first time we've
+     used it.  */
+  if (rgl->controls.is_empty ())
+    {
+      rgl->controls.safe_grow_cleared (nvectors, true);
+      for (unsigned int i = 0; i < nvectors; ++i)
+       {
+         tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+         gcc_assert (len_type != NULL_TREE);
+         tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
+
+         /* Provide a dummy definition until the real one is available.  */
+         SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
+         rgl->controls[i] = len;
+       }
+    }
+
+  return rgl->controls[index];
+}
+
 /* Scale profiling counters by estimation for LOOP which is vectorized
    by factor VF.  */
 
@@ -8300,11 +9041,47 @@ scale_profile_for_vect_loop (class loop *loop, unsigned vf)
     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
 }
 
+/* For a vectorized stmt DEF_STMT_INFO, adjust all vectorized PHI
+   latch edge values originally defined by it.  */
+
+static void
+maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
+                                    stmt_vec_info def_stmt_info)
+{
+  tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
+  if (!def || TREE_CODE (def) != SSA_NAME)
+    return;
+  stmt_vec_info phi_info;
+  imm_use_iterator iter;
+  use_operand_p use_p;
+  FOR_EACH_IMM_USE_FAST (use_p, iter, def)
+    if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
+      if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
+         && (phi_info = loop_vinfo->lookup_stmt (phi))
+         && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
+         && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
+         && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
+       {
+         loop_p loop = gimple_bb (phi)->loop_father;
+         edge e = loop_latch_edge (loop);
+         if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
+           {
+             vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
+             vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
+             gcc_assert (phi_defs.length () == latch_defs.length ());
+             for (unsigned i = 0; i < phi_defs.length (); ++i)
+               add_phi_arg (as_a <gphi *> (phi_defs[i]),
+                            gimple_get_lhs (latch_defs[i]), e,
+                            gimple_phi_arg_location (phi, e->dest_idx));
+           }
+       }
+}
+
 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
    stmt_vec_info.  */
 
-static void
+static bool
 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
                          gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
 {
@@ -8320,7 +9097,7 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
 
   if (!STMT_VINFO_RELEVANT_P (stmt_info)
       && !STMT_VINFO_LIVE_P (stmt_info))
-    return;
+    return false;
 
   if (STMT_VINFO_VECTYPE (stmt_info))
     {
@@ -8337,13 +9114,15 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
   /* Pure SLP statements have already been vectorized.  We still need
      to apply loop vectorization to hybrid SLP statements.  */
   if (PURE_SLP_STMT (stmt_info))
-    return;
+    return false;
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
 
-  if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
+  if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
     *seen_store = stmt_info;
+
+  return true;
 }
 
 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
@@ -8392,6 +9171,7 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
   basic_block *epilogue_bbs = get_loop_body (epilogue);
   unsigned i;
 
+  free (LOOP_VINFO_BBS (epilogue_vinfo));
   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
 
   /* Advance data_reference's with the number of iterations of the previous
@@ -8429,6 +9209,8 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
           !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
        {
          new_stmt = gsi_stmt (epilogue_gsi);
+         if (is_gimple_debug (new_stmt))
+           continue;
 
          gcc_assert (gimple_uid (new_stmt) > 0);
          stmt_vinfo
@@ -8493,7 +9275,7 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
     }
 
   struct data_reference *dr;
-  vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
+  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
   FOR_EACH_VEC_ELT (datarefs, i, dr)
     {
       orig_stmt = DR_STMT (dr);
@@ -8625,7 +9407,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
   if (niters_vector == NULL_TREE)
     {
       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-         && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+         && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
          && known_eq (lowest_vf, vf))
        {
          niters_vector
@@ -8633,9 +9415,15 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
                             LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
          step_vector = build_one_cst (TREE_TYPE (niters));
        }
-      else
+      else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
        vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
                                     &step_vector, niters_no_overflow);
+      else
+       /* vect_do_peeling subtracted the number of peeled prologue
+          iterations from LOOP_VINFO_NITERS.  */
+       vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
+                                    &niters_vector, &step_vector,
+                                    niters_no_overflow);
     }
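For the first branch above (compile-time-known trip count, no partial vectors), the main vector loop executes niters / vf iterations with a step of one, and the left-over iterations are handled by the scalar epilogue. A standalone arithmetic sketch with made-up numbers (plain C, not GCC code):

/* Standalone sketch of the constant-trip-count arithmetic above, using
   hypothetical numbers: 103 scalar iterations and a vectorization
   factor of 8 give 12 vector iterations plus 7 epilogue iterations.  */
#include <stdio.h>

int
main (void)
{
  unsigned long niters = 103;  /* known scalar iteration count */
  unsigned long vf = 8;        /* vectorization factor */

  unsigned long niters_vector = niters / vf;   /* 12 */
  unsigned long step_vector = 1;               /* one step per vector iteration */
  unsigned long epilogue_iters = niters % vf;  /* 7, left to the epilogue loop */

  printf ("niters_vector=%lu step=%lu epilogue=%lu\n",
          niters_vector, step_vector, epilogue_iters);
  return 0;
}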
 
   /* 1) Make sure the loop header has exactly two entries
@@ -8645,8 +9433,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 
   split_edge (loop_preheader_edge (loop));
 
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
-      && vect_use_loop_mask_for_alignment_p (loop_vinfo))
+  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
     /* This will deal with any possible peeling.  */
     vect_prepare_for_masked_peels (loop_vinfo);
 
@@ -8655,7 +9442,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
   if (!loop_vinfo->slp_instances.is_empty ())
     {
       DUMP_VECT_SCOPE ("scheduling SLP instances");
-      vect_schedule_slp (loop_vinfo);
+      vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
     }
 
   /* FORNOW: the vectorizer supports only loops which body consist
@@ -8670,7 +9457,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 
       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
-        {
+       {
          gphi *phi = si.phi ();
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
@@ -8701,10 +9488,31 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
-             vect_transform_stmt (stmt_info, NULL, NULL, NULL);
+             vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
            }
        }
 
+      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
+          gsi_next (&si))
+       {
+         gphi *phi = si.phi ();
+         stmt_info = loop_vinfo->lookup_stmt (phi);
+         if (!stmt_info)
+           continue;
+
+         if (!STMT_VINFO_RELEVANT_P (stmt_info)
+             && !STMT_VINFO_LIVE_P (stmt_info))
+           continue;
+
+         if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
+              || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
+              || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
+              || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
+              || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
+             && ! PURE_SLP_STMT (stmt_info))
+           maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
+       }
+
       for (gimple_stmt_iterator si = gsi_start_bb (bb);
           !gsi_end_p (si);)
        {
@@ -8718,6 +9526,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
            }
          else
            {
+             /* Ignore vector stmts created in the outer loop.  */
              stmt_info = loop_vinfo->lookup_stmt (stmt);
 
              /* vector stmts created in the outer-loop during vectorization of
@@ -8739,11 +9548,18 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
                        }
                      stmt_vec_info pat_stmt_info
                        = STMT_VINFO_RELATED_STMT (stmt_info);
-                     vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
-                                               &seen_store);
+                     if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
+                                                   &si, &seen_store))
+                       maybe_set_vectorized_backedge_value (loop_vinfo,
+                                                            pat_stmt_info);
+                   }
+                 else
+                   {
+                     if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
+                                                   &seen_store))
+                       maybe_set_vectorized_backedge_value (loop_vinfo,
+                                                            stmt_info);
                    }
-                 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
-                                           &seen_store);
                }
              gsi_next (&si);
              if (seen_store)
@@ -8752,7 +9568,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
                    /* Interleaving.  If IS_STORE is TRUE, the
                       vectorization of the interleaving chain was
                       completed - free all the stores in the chain.  */
-                   vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
+                   vect_remove_stores (loop_vinfo,
+                                       DR_GROUP_FIRST_ELEMENT (seen_store));
                  else
                    /* Free the attached stmt_vec_info and remove the stmt.  */
                    loop_vinfo->remove_stmt (stmt_info);
@@ -8792,7 +9609,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 
   /* True if the final iteration might not handle a full vector's
      worth of scalar iterations.  */
-  bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+  bool final_iter_may_be_partial
+    = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
   /* The minimum number of iterations performed by the epilogue.  This
      is 1 when peeling for gaps because we always need a final scalar
      iteration.  */
@@ -8803,7 +9621,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
   int bias_for_lowest = 1 - min_epilogue_iters;
   int bias_for_assumed = bias_for_lowest;
   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
-  if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
     {
       /* When the amount of peeling is known at compile time, the first
         iteration will have exactly alignment_npeels active elements.
@@ -8866,7 +9684,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
      won't work.  */
   slp_instance instance;
   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
-    vect_free_slp_instance (instance, true);
+    vect_free_slp_instance (instance);
   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
   /* Clear-up safelen field since its value is invalid after vectorization
      since vectorized loop can have loop-carried dependencies.  */
@@ -9095,12 +9913,13 @@ optimize_mask_stores (class loop *loop)
 }
 
 /* Decide whether it is possible to use a zero-based induction variable
-   when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,
-   return the value that the induction variable must be able to hold
-   in order to ensure that the loop ends with an all-false mask.
+   when vectorizing LOOP_VINFO with partial vectors.  If it is, return
+   the value that the induction variable must be able to hold in order
+   to ensure that the rgroups eventually have no active vector elements.
    Return -1 otherwise.  */
+
 widest_int
-vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
+vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
 {
   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -9135,3 +9954,25 @@ vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
   return iv_limit;
 }
 
+/* For the given rgroup_controls RGC, check whether an induction variable
+   would ever hit a value that produces a set of all-false masks or zero
+   lengths before wrapping around.  Return true if the induction variable
+   might wrap around before hitting such a value, otherwise return false.  */
+
+bool
+vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
+{
+  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
+
+  if (iv_limit == -1)
+    return true;
+
+  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+  unsigned int compare_precision = TYPE_PRECISION (compare_type);
+  unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
+
+  if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
+    return true;
+
+  return false;
+}
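Reading the check above concretely: the rgroup's IV must be able to reach iv_limit * nitems (iv_limit scalar iterations from vect_iv_limit_for_partial_vectors, nitems items added per vector iteration), and if that product needs more bits than LOOP_VINFO_RGROUP_COMPARE_TYPE provides, the IV might wrap. Below is a standalone sketch using plain 64-bit arithmetic in place of wide ints; the helper names and the numbers are hypothetical, with min_unsigned_precision standing in for wi::min_precision on small values:

/* Standalone sketch (not GCC code) of the wrap-around check in
   vect_rgroup_iv_might_wrap_p, using 64-bit arithmetic instead of
   wide ints.  */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Minimum number of bits needed to represent VAL as an unsigned value;
   a small-value stand-in for wi::min_precision (..., UNSIGNED).  */
static unsigned
min_unsigned_precision (uint64_t val)
{
  unsigned bits = 0;
  while (val)
    {
      bits++;
      val >>= 1;
    }
  return bits;
}

/* Return true if an IV bounded by IV_LIMIT scalar iterations, advancing
   by NITEMS items per iteration, might wrap a COMPARE_PRECISION-bit
   counter.  */
static bool
rgroup_iv_might_wrap_p (uint64_t iv_limit, unsigned nitems,
                        unsigned compare_precision)
{
  return min_unsigned_precision (iv_limit * nitems) > compare_precision;
}

int
main (void)
{
  /* 65536 iterations, 4 items per iteration, 16-bit compare type:
     262144 needs 19 bits, so the IV might wrap.  */
  printf ("%d\n", rgroup_iv_might_wrap_p (65536, 4, 16));  /* 1 */
  /* The same rgroup with a 32-bit compare type cannot wrap.  */
  printf ("%d\n", rgroup_iv_might_wrap_p (65536, 4, 32));  /* 0 */
  return 0;
}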