diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index c9b653491f73879e300526918a5533ee0e627c1d..72bbec4b45d225d73eddfaca69468cd23842a42a 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -161,7 +161,7 @@ static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
    may already be set for general statements (not just data refs).  */
 
 static opt_result
-vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
+vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
                              bool vectype_maybe_set_p,
                              poly_uint64 *vf)
 {
@@ -177,7 +177,8 @@ vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
     }
 
   tree stmt_vectype, nunits_vectype;
-  opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
+  opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
+                                                  &stmt_vectype,
                                                   &nunits_vectype);
   if (!res)
     return res;
@@ -207,13 +208,13 @@ vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
    or false if something prevented vectorization.  */
 
 static opt_result
-vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf)
+vect_determine_vf_for_stmt (vec_info *vinfo,
+                           stmt_vec_info stmt_info, poly_uint64 *vf)
 {
-  vec_info *vinfo = stmt_info->vinfo;
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
                     stmt_info->stmt);
-  opt_result res = vect_determine_vf_for_stmt_1 (stmt_info, false, vf);
+  opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
   if (!res)
     return res;
 
@@ -232,7 +233,7 @@ vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf)
            dump_printf_loc (MSG_NOTE, vect_location,
                             "==> examining pattern def stmt: %G",
                             def_stmt_info->stmt);
-         res = vect_determine_vf_for_stmt_1 (def_stmt_info, true, vf);
+         res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
          if (!res)
            return res;
        }
@@ -241,7 +242,7 @@ vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf)
        dump_printf_loc (MSG_NOTE, vect_location,
                         "==> examining pattern statement: %G",
                         stmt_info->stmt);
-      res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf);
+      res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
       if (!res)
        return res;
     }
@@ -341,9 +342,12 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
+         if (is_gimple_debug (gsi_stmt (si)))
+           continue;
          stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
          opt_result res
-           = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor);
+           = vect_determine_vf_for_stmt (loop_vinfo,
+                                         stmt_info, &vectorization_factor);
          if (!res)
            return res;
         }
@@ -440,9 +444,8 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
    this function would then return true for x_2.  */
 
 static bool
-vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
+vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 {
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   use_operand_p use_p;
   ssa_op_iter op_iter;
   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
@@ -505,7 +508,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
        }
 
       if (!access_fn
-         || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
+         || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
          || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
          || (LOOP_VINFO_LOOP (loop_vinfo) != loop
              && TREE_CODE (step) != INTEGER_CST))
@@ -663,27 +666,50 @@ vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
   unsigned i;
 
   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
-    if (STMT_VINFO_IN_PATTERN_P (first))
-      {
-       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
-       while (next)
-         {
-           if (! STMT_VINFO_IN_PATTERN_P (next)
-               || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
-             break;
-           next = REDUC_GROUP_NEXT_ELEMENT (next);
-         }
-       /* If not all stmt in the chain are patterns or if we failed
-          to update STMT_VINFO_REDUC_IDX try to handle the chain
-          without patterns.  */
-       if (! next
-           && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
-         {
-           vect_fixup_reduc_chain (first);
-           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
-             = STMT_VINFO_RELATED_STMT (first);
-         }
-      }
+    {
+      stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
+      while (next)
+       {
+         if ((STMT_VINFO_IN_PATTERN_P (next)
+              != STMT_VINFO_IN_PATTERN_P (first))
+             || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
+           break;
+         next = REDUC_GROUP_NEXT_ELEMENT (next);
+       }
+      /* If all reduction chain members are well-formed patterns, adjust
+        the group so that it groups the pattern stmts instead.  */
+      if (! next
+         && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
+       {
+         if (STMT_VINFO_IN_PATTERN_P (first))
+           {
+             vect_fixup_reduc_chain (first);
+             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
+               = STMT_VINFO_RELATED_STMT (first);
+           }
+       }
+      /* If not all stmts in the chain are patterns, or if we failed
+        to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
+        it as a regular reduction instead.  */
+      else
+       {
+         stmt_vec_info vinfo = first;
+         stmt_vec_info last = NULL;
+         while (vinfo)
+           {
+             next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
+             REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
+             REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
+             last = vinfo;
+             vinfo = next;
+           }
+         STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
+           = vect_internal_def;
+         loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
+         LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
+         --i;
+       }
+    }
 }
 
 /* Function vect_get_loop_niters.
@@ -799,7 +825,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     vectorization_factor (0),
     max_vectorization_factor (0),
     mask_skip_niters (NULL_TREE),
-    mask_compare_type (NULL_TREE),
+    rgroup_compare_type (NULL_TREE),
     simd_if_cond (NULL_TREE),
     unaligned_dr (NULL),
     peeling_for_alignment (0),
@@ -811,8 +837,9 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     vec_outside_cost (0),
     vec_inside_cost (0),
     vectorizable (false),
-    can_fully_mask_p (true),
-    fully_masked_p (false),
+    can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
+    using_partial_vectors_p (false),
+    epil_using_partial_vectors_p (false),
     peeling_for_gaps (false),
     peeling_for_niter (false),
     no_data_dependencies (false),
@@ -846,6 +873,8 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
        {
          gimple *stmt = gsi_stmt (si);
          gimple_set_uid (stmt, 0);
+         if (is_gimple_debug (stmt))
+           continue;
          add_stmt (stmt);
          /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
             third argument is the #pragma omp simd if (x) condition, when 0,
@@ -873,16 +902,16 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
   epilogue_vinfos.create (6);
 }
 
-/* Free all levels of MASKS.  */
+/* Free all levels of rgroup CONTROLS.  */
 
 void
-release_vec_loop_masks (vec_loop_masks *masks)
+release_vec_loop_controls (vec<rgroup_controls> *controls)
 {
-  rgroup_masks *rgm;
+  rgroup_controls *rgc;
   unsigned int i;
-  FOR_EACH_VEC_ELT (*masks, i, rgm)
-    rgm->masks.release ();
-  masks->release ();
+  FOR_EACH_VEC_ELT (*controls, i, rgc)
+    rgc->controls.release ();
+  controls->release ();
 }
 
 /* Free all memory used by the _loop_vec_info, as well as all the
@@ -892,7 +921,8 @@ _loop_vec_info::~_loop_vec_info ()
 {
   free (bbs);
 
-  release_vec_loop_masks (&masks);
+  release_vec_loop_controls (&masks);
+  release_vec_loop_controls (&lens);
   delete ivexpr_map;
   delete scan_map;
   epilogue_vinfos.release ();
@@ -933,12 +963,12 @@ cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
 static bool
 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 {
-  rgroup_masks *rgm;
+  rgroup_controls *rgm;
   unsigned int i;
   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
-    if (rgm->mask_type != NULL_TREE
+    if (rgm->type != NULL_TREE
        && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
-                                           cmp_type, rgm->mask_type,
+                                           cmp_type, rgm->type,
                                            OPTIMIZE_FOR_SPEED))
       return false;
   return true;
@@ -952,20 +982,90 @@ vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
 {
   unsigned int res = 1;
   unsigned int i;
-  rgroup_masks *rgm;
+  rgroup_controls *rgm;
   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
     res = MAX (res, rgm->max_nscalars_per_iter);
   return res;
 }
 
+/* Calculate the minimum precision necessary to represent:
+
+      MAX_NITERS * FACTOR
+
+   as an unsigned integer, where MAX_NITERS is the maximum number of
+   loop header iterations for the original scalar form of LOOP_VINFO.  */
+
+static unsigned
+vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
+{
+  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+
+  /* Get the maximum number of iterations that is representable
+     in the counter type.  */
+  tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
+  widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
+
+  /* Get a more refined estimate for the number of iterations.  */
+  widest_int max_back_edges;
+  if (max_loop_iterations (loop, &max_back_edges))
+    max_ni = wi::smin (max_ni, max_back_edges + 1);
+
+  /* Work out how many bits we need to represent the limit.  */
+  return wi::min_precision (max_ni * factor, UNSIGNED);
+}
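As a rough aside (not part of the patch): the new helper boils down to asking how many bits an unsigned counter needs in order to reach MAX_NITERS * FACTOR.  A minimal sketch with plain 64-bit arithmetic instead of widest_int, using a hypothetical name, might look like this:

#include <stdint.h>

/* Simplified model of the computation above: the number of bits needed to
   represent MAX_NITERS * FACTOR as an unsigned value.  For example,
   max_niters = 1000 and factor = 4 give a limit of 4000, which needs
   12 bits.  */
static unsigned
min_prec_example (uint64_t max_niters, uint64_t factor)
{
  uint64_t limit = max_niters * factor;
  unsigned prec = 0;
  while (limit)
    {
      prec++;
      limit >>= 1;
    }
  return prec;
}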
+
+/* True if the loop needs peeling or partial vectors when vectorized.  */
+
+static bool
+vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
+{
+  unsigned HOST_WIDE_INT const_vf;
+  HOST_WIDE_INT max_niter
+    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
+
+  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
+  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
+    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
+                                         (loop_vinfo));
+
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+    {
+      /* Work out the (constant) number of iterations that need to be
+        peeled for reasons other than niters.  */
+      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+       peel_niter += 1;
+      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
+                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
+       return true;
+    }
+  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+      /* ??? When peeling for gaps but not alignment, we could
+        try to check whether the (variable) niters is known to be
+        VF * N + 1.  That's something of a niche case though.  */
+      || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+      || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
+      || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
+          < (unsigned) exact_log2 (const_vf))
+         /* In case of versioning, check if the maximum number of
+            iterations is greater than th.  If they are identical,
+            the epilogue is unnecessary.  */
+         && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+             || ((unsigned HOST_WIDE_INT) max_niter
+                 > (th / const_vf) * const_vf))))
+    return true;
+
+  return false;
+}
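A hedged numeric reading of the constant-niters branch above: with VF = 8, three iterations peeled for alignment and peeling for gaps enabled, peel_niter is 4; a loop with 100 known iterations then has 96 left, a multiple of 8, so the function returns false, whereas 101 known iterations would leave 97 and make it return true.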
+
 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
    whether we can actually generate the masks required.  Return true if so,
-   storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
+   storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
 
 static bool
 vect_verify_full_masking (loop_vec_info loop_vinfo)
 {
-  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsigned int min_ni_width;
   unsigned int max_nscalars_per_iter
     = vect_get_max_nscalars_per_iter (loop_vinfo);
@@ -976,27 +1076,15 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
     return false;
 
-  /* Get the maximum number of iterations that is representable
-     in the counter type.  */
-  tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
-  widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
-
-  /* Get a more refined estimate for the number of iterations.  */
-  widest_int max_back_edges;
-  if (max_loop_iterations (loop, &max_back_edges))
-    max_ni = wi::smin (max_ni, max_back_edges + 1);
-
-  /* Account for rgroup masks, in which each bit is replicated N times.  */
-  max_ni *= max_nscalars_per_iter;
-
   /* Work out how many bits we need to represent the limit.  */
-  min_ni_width = wi::min_precision (max_ni, UNSIGNED);
+  min_ni_width
+    = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
 
   /* Find a scalar mode for which WHILE_ULT is supported.  */
   opt_scalar_int_mode cmp_mode_iter;
   tree cmp_type = NULL_TREE;
   tree iv_type = NULL_TREE;
-  widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
+  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
   unsigned int iv_precision = UINT_MAX;
 
   if (iv_limit != -1)
@@ -1049,8 +1137,83 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
   if (!cmp_type)
     return false;
 
-  LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
-  LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
+  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
+  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
+  return true;
+}
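For orientation (an illustrative sketch, not the GCC implementation): the IFN_WHILE_ULT capability checked above produces a lane mask whose element I is active while the scalar IV plus I is still below the limit.  A scalar model, with the function name and fixed scalar types being assumptions of the example:

#include <stdbool.h>

/* Scalar model of a WHILE_ULT-style lane mask: lane I of an NLANES-wide
   mask is active iff INDEX + I < LIMIT.  */
static void
while_ult_example (unsigned long long index, unsigned long long limit,
                   bool *mask, unsigned nlanes)
{
  for (unsigned i = 0; i < nlanes; i++)
    mask[i] = index + i < limit;
}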
+
+/* Check whether we can use vector access with length based on precision
+   comparison.  So far, to keep it simple, we only allow the case in which
+   the precision of the target-supported length is larger than the precision
+   required by the loop niters.  */
+
+static bool
+vect_verify_loop_lens (loop_vec_info loop_vinfo)
+{
+  if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
+    return false;
+
+  unsigned int max_nitems_per_iter = 1;
+  unsigned int i;
+  rgroup_controls *rgl;
+  /* Find the maximum number of items per iteration for every rgroup.  */
+  FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
+    {
+      unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
+      max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
+    }
+
+  /* Work out how many bits we need to represent the length limit.  */
+  unsigned int min_ni_prec
+    = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
+
+  /* Now use the maximum of the precisions below for one suitable IV type:
+     - the IV's natural precision
+     - the precision needed to hold: the maximum number of scalar
+       iterations multiplied by the scale factor (min_ni_prec above)
+     - the Pmode precision
+
+     If min_ni_prec is less than the precision of the current niters,
+     we prefer to still use the niters type.  Prefer to use Pmode and
+     wider IVs to avoid narrow conversions.  */
+
+  unsigned int ni_prec
+    = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
+  min_ni_prec = MAX (min_ni_prec, ni_prec);
+  min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
+
+  tree iv_type = NULL_TREE;
+  opt_scalar_int_mode tmode_iter;
+  FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
+    {
+      scalar_mode tmode = tmode_iter.require ();
+      unsigned int tbits = GET_MODE_BITSIZE (tmode);
+
+      /* ??? Do we really want to construct one IV whose precision exceeds
+        BITS_PER_WORD?  */
+      if (tbits > BITS_PER_WORD)
+       break;
+
+      /* Find the first available standard integral type.  */
+      if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
+       {
+         iv_type = build_nonstandard_integer_type (tbits, true);
+         break;
+       }
+    }
+
+  if (!iv_type)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "can't vectorize with length-based partial vectors"
+                        " because there is no suitable iv type.\n");
+      return false;
+    }
+
+  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
+  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
+
   return true;
 }
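As an illustrative reading (assuming a typical 64-bit target where Pmode and BITS_PER_WORD are both 64 bits): min_ni_prec is raised to at least 64 by the MAX computations, so the mode walk stops at the first integer mode of at least 64 bits, and both LOOP_VINFO_RGROUP_COMPARE_TYPE and LOOP_VINFO_RGROUP_IV_TYPE end up as a 64-bit unsigned type.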
 
@@ -1122,9 +1285,9 @@ vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
   int j;
   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
                    j, si)
-    (void) add_stmt_cost (target_cost_data, si->count,
-                         si->kind, si->stmt_info, si->misalign,
-                         vect_body);
+    (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
+                         si->kind, si->stmt_info, si->vectype,
+                         si->misalign, vect_body);
   unsigned dummy, body_cost = 0;
   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
   destroy_cost_data (target_cost_data);
@@ -1392,6 +1555,8 @@ vect_update_vf_for_slp (loop_vec_info loop_vinfo)
       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
+         if (is_gimple_debug (gsi_stmt (si)))
+           continue;
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
          stmt_info = vect_stmt_to_vectorize (stmt_info);
          if ((STMT_VINFO_RELEVANT_P (stmt_info)
@@ -1529,7 +1694,8 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
                  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
                       || (STMT_VINFO_DEF_TYPE (stmt_info)
                           == vect_double_reduction_def))
-                     && !vectorizable_lc_phi (stmt_info, NULL, NULL))
+                     && !vectorizable_lc_phi (loop_vinfo,
+                                              stmt_info, NULL, NULL))
                    return opt_result::failure_at (phi, "unsupported phi\n");
                 }
 
@@ -1551,21 +1717,24 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
               need_to_vectorize = true;
               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
                  && ! PURE_SLP_STMT (stmt_info))
-               ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
+               ok = vectorizable_induction (loop_vinfo,
+                                            stmt_info, NULL, NULL,
                                             &cost_vec);
              else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
                        || (STMT_VINFO_DEF_TYPE (stmt_info)
                            == vect_double_reduction_def)
                        || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
                       && ! PURE_SLP_STMT (stmt_info))
-               ok = vectorizable_reduction (stmt_info, NULL, NULL, &cost_vec);
+               ok = vectorizable_reduction (loop_vinfo,
+                                            stmt_info, NULL, NULL, &cost_vec);
             }
 
          /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
          if (ok
              && STMT_VINFO_LIVE_P (stmt_info)
              && !PURE_SLP_STMT (stmt_info))
-           ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL,
+           ok = vectorizable_live_operation (loop_vinfo,
+                                             stmt_info, NULL, NULL, NULL,
                                              -1, false, &cost_vec);
 
           if (!ok)
@@ -1579,10 +1748,12 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
           gsi_next (&si))
         {
          gimple *stmt = gsi_stmt (si);
-         if (!gimple_clobber_p (stmt))
+         if (!gimple_clobber_p (stmt)
+             && !is_gimple_debug (stmt))
            {
              opt_result res
-               = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
+               = vect_analyze_stmt (loop_vinfo,
+                                    loop_vinfo->lookup_stmt (stmt),
                                     &need_to_vectorize,
                                     NULL, NULL, &cost_vec);
              if (!res)
@@ -1591,7 +1762,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
         }
     } /* bbs */
 
-  add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
+  add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
 
   /* All operations in the loop are either irrelevant (deal with loop
      control, or dead), or only used outside the loop and can be moved
@@ -1611,6 +1782,27 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo)
   return opt_result::success ();
 }
 
+/* Return true if we know that the iteration count is smaller than the
+   vectorization factor.  Return false if it isn't, or if we can't be sure
+   either way.  */
+
+static bool
+vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
+{
+  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
+
+  HOST_WIDE_INT max_niter;
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
+  else
+    max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
+
+  if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
+    return true;
+
+  return false;
+}
+
 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
    definitely no, or -1 if it's worth retrying.  */
@@ -1621,19 +1813,11 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
 
-  /* Only fully-masked loops can have iteration counts less than the
-     vectorization factor.  */
-  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  /* Only loops that can handle partially-populated vectors can have iteration
+     counts less than the vectorization factor.  */
+  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
     {
-      HOST_WIDE_INT max_niter;
-
-      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
-       max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
-      else
-       max_niter = max_stmt_executions_int (loop);
-
-      if (max_niter != -1
-         && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
+      if (vect_known_niters_smaller_than_vf (loop_vinfo))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
@@ -1643,6 +1827,19 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
        }
     }
 
+  /* If using the "very cheap" model, reject cases in which we'd keep
+     a copy of the scalar code (even if we might be able to vectorize it).  */
+  if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
+      && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+         || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+         || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "some scalar iterations would need to be peeled\n");
+      return 0;
+    }
+
   int min_profitable_iters, min_profitable_estimate;
   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
                                      &min_profitable_estimate);
@@ -1701,6 +1898,20 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
       min_profitable_estimate = min_profitable_iters;
     }
 
+  /* If the vector loop needs multiple iterations to be beneficial then
+     things are probably too close to call, and the conservative thing
+     would be to stick with the scalar code.  */
+  if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
+      && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "one iteration of the vector loop would be"
+                        " more expensive than the equivalent number of"
+                        " iterations of the scalar loop\n");
+      return 0;
+    }
+
   HOST_WIDE_INT estimated_niter;
 
   /* If we are vectorizing an epilogue then we know the maximum number of
@@ -1749,7 +1960,8 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
        if (is_gimple_debug (stmt))
          continue;
        ++(*n_stmts);
-       opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
+       opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
+                                                       NULL, 0);
        if (!res)
          {
            if (is_gimple_call (stmt) && loop->safelen)
@@ -1804,7 +2016,7 @@ vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
 
   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
 
-  vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
+  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
   FOR_EACH_VEC_ELT (datarefs, i, dr)
     {
       gcc_assert (DR_REF (dr));
@@ -1841,56 +2053,123 @@ vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
     }
 }
 
+/* Determine if operating on full vectors for LOOP_VINFO might leave
+   some scalar iterations still to do.  If so, decide how we should
+   handle those scalar iterations.  The possibilities are:
 
-/* Decides whether we need to create an epilogue loop to handle
-   remaining scalar iterations and sets PEELING_FOR_NITERS accordingly.  */
+   (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
+       In this case:
 
-void
-determine_peel_for_niter (loop_vec_info loop_vinfo)
+        LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
+        LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
+        LOOP_VINFO_PEELING_FOR_NITER == false
+
+   (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
+       to handle the remaining scalar iterations.  In this case:
+
+        LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
+        LOOP_VINFO_PEELING_FOR_NITER == true
+
+       There are two choices:
+
+       (2a) Consider vectorizing the epilogue loop at the same VF as the
+           main loop, but using partial vectors instead of full vectors.
+           In this case:
+
+             LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
+
+       (2b) Consider vectorizing the epilogue loop at lower VFs only.
+           In this case:
+
+             LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
+
+   When FOR_EPILOGUE_P is true, make this determination based on the
+   assumption that LOOP_VINFO is an epilogue loop, otherwise make it
+   based on the assumption that LOOP_VINFO is the main loop.  The caller
+   has made sure that the number of iterations is set appropriately for
+   this value of FOR_EPILOGUE_P.  */
+
+opt_result
+vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
+                                           bool for_epilogue_p)
 {
-  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
+  /* Determine whether there would be any scalar iterations left over.  */
+  bool need_peeling_or_partial_vectors_p
+    = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
+
+  /* Decide whether to vectorize the loop with partial vectors.  */
+  LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+  LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && need_peeling_or_partial_vectors_p)
+    {
+      /* For partial-vector-usage=1, try to push the handling of partial
+        vectors to the epilogue, with the main loop continuing to operate
+        on full vectors.
+
+        ??? We could then end up failing to use partial vectors if we
+        decide to peel iterations into a prologue, and if the main loop
+        then ends up processing fewer than VF iterations.  */
+      if (param_vect_partial_vector_usage == 1
+         && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+         && !vect_known_niters_smaller_than_vf (loop_vinfo))
+       LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+      else
+       LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+    }
 
-  unsigned HOST_WIDE_INT const_vf;
-  HOST_WIDE_INT max_niter
-    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
+  if (dump_enabled_p ())
+    {
+      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "operating on partial vectors%s.\n",
+                        for_epilogue_p ? " for epilogue loop" : "");
+      else
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "operating only on full vectors%s.\n",
+                        for_epilogue_p ? " for epilogue loop" : "");
+    }
 
-  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
-  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
-    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
-                                         (loop_vinfo));
+  if (for_epilogue_p)
+    {
+      loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
+      gcc_assert (orig_loop_vinfo);
+      if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+       gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+                             LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
+    }
 
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
-    /* The main loop handles all iterations.  */
-    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
-  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-          && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
     {
-      /* Work out the (constant) number of iterations that need to be
-        peeled for reasons other than niters.  */
-      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
-      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
-       peel_niter += 1;
-      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
-                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
-       LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
+      /* Check that the loop processes at least one full vector.  */
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
+      if (known_lt (wi::to_widest (scalar_niters), vf))
+       return opt_result::failure_at (vect_location,
+                                      "loop does not have enough iterations"
+                                      " to support vectorization.\n");
+
+      /* If we need to peel an extra epilogue iteration to handle data
+        accesses with gaps, check that there are enough scalar iterations
+        available.
+
+        The check above is redundant with this one when peeling for gaps,
+        but the distinction is useful for diagnostics.  */
+      tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+         && known_lt (wi::to_widest (scalar_nitersm1), vf))
+       return opt_result::failure_at (vect_location,
+                                      "loop does not have enough iterations"
+                                      " to support peeling for gaps.\n");
     }
-  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
-          /* ??? When peeling for gaps but not alignment, we could
-             try to check whether the (variable) niters is known to be
-             VF * N + 1.  That's something of a niche case though.  */
-          || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
-          || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
-          || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
-               < (unsigned) exact_log2 (const_vf))
-              /* In case of versioning, check if the maximum number of
-                 iterations is greater than th.  If they are identical,
-                 the epilogue is unnecessary.  */
-              && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
-                  || ((unsigned HOST_WIDE_INT) max_niter
-                      > (th / const_vf) * const_vf))))
-    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
-}
 
+  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
+    = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+       && need_peeling_or_partial_vectors_p);
+
+  return opt_result::success ();
+}
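To summarize the control flow above, here is a condensed, illustrative sketch only (the enum and function name are invented for the example), for a loop whose target-side checks for partial vectors succeeded and which would otherwise leave scalar iterations to handle:

#include <stdbool.h>

/* Simplified model of how --param vect-partial-vector-usage maps to the
   flags set by vect_determine_partial_vectors_and_peeling.  */
enum pv_outcome
{
  PV_FULL_VECTORS_AND_EPILOGUE,   /* usage 0: partial vectors disabled.  */
  PV_PARTIAL_VECTOR_EPILOGUE,     /* main loop full, epilogue may be partial.  */
  PV_PARTIAL_VECTOR_MAIN_LOOP     /* the loop itself uses partial vectors.  */
};

static enum pv_outcome
partial_vector_outcome_example (int param_usage, bool is_epilogue_loop,
                                bool niters_known_smaller_than_vf)
{
  if (param_usage == 0)
    return PV_FULL_VECTORS_AND_EPILOGUE;
  if (param_usage == 1 && !is_epilogue_loop && !niters_known_smaller_than_vf)
    return PV_PARTIAL_VECTOR_EPILOGUE;
  return PV_PARTIAL_VECTOR_MAIN_LOOP;
}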
 
 /* Function vect_analyze_loop_2.
 
@@ -1971,7 +2250,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
   /* Analyze the access patterns of the data-refs in the loop (consecutive,
      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
 
-  ok = vect_analyze_data_ref_accesses (loop_vinfo);
+  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
   if (!ok)
     {
       if (dump_enabled_p ())
@@ -2043,9 +2322,16 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
 
       /* Update the vectorization factor based on the SLP decision.  */
       vect_update_vf_for_slp (loop_vinfo);
+
+      /* Optimize the SLP graph with the vectorization factor fixed.  */
+      vect_optimize_slp (loop_vinfo);
+
+      /* Gather the loads reachable from the SLP graph entries.  */
+      vect_gather_slp_loads (loop_vinfo);
     }
 
-  bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
+  bool saved_can_use_partial_vectors_p
+    = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
 
   /* We don't expect to have to roll back to anything other than an empty
      set of rgroups.  */
@@ -2093,8 +2379,6 @@ start_over:
     /* This pass will decide on using loop versioning and/or loop peeling in
        order to enhance the alignment of data references in the loop.  */
     ok = vect_enhance_data_refs_alignment (loop_vinfo);
-  else
-    ok = vect_verify_datarefs_alignment (loop_vinfo);
   if (!ok)
     return ok;
 
@@ -2111,6 +2395,79 @@ start_over:
                                       "unsupported SLP instances\n");
          goto again;
        }
+
+      /* Check whether any load in ALL SLP instances is possibly permuted.  */
+      slp_tree load_node, slp_root;
+      unsigned i, x;
+      slp_instance instance;
+      bool can_use_lanes = true;
+      FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
+       {
+         slp_root = SLP_INSTANCE_TREE (instance);
+         int group_size = SLP_TREE_LANES (slp_root);
+         tree vectype = SLP_TREE_VECTYPE (slp_root);
+         bool loads_permuted = false;
+         FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
+           {
+             if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
+               continue;
+             unsigned j;
+             stmt_vec_info load_info;
+             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
+               if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
+                 {
+                   loads_permuted = true;
+                   break;
+                 }
+           }
+
+         /* If the loads and stores can be handled with load/store-lane
+            instructions, record it and move on to the next instance.  */
+         if (loads_permuted
+             && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
+             && vect_store_lanes_supported (vectype, group_size, false))
+           {
+             FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
+               {
+                 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
+                     (SLP_TREE_SCALAR_STMTS (load_node)[0]);
+                 /* Use SLP for strided accesses (or if we can't
+                    load-lanes).  */
+                 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
+                     || ! vect_load_lanes_supported
+                           (STMT_VINFO_VECTYPE (stmt_vinfo),
+                            DR_GROUP_SIZE (stmt_vinfo), false))
+                   break;
+               }
+
+             can_use_lanes
+               = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
+
+             if (can_use_lanes && dump_enabled_p ())
+               dump_printf_loc (MSG_NOTE, vect_location,
+                                "SLP instance %p can use load/store-lanes\n",
+                                instance);
+           }
+         else
+           {
+             can_use_lanes = false;
+             break;
+           }
+       }
+
+      /* If all SLP instances can use load/store-lanes, abort SLP and try again
+        with SLP disabled.  */
+      if (can_use_lanes)
+       {
+         ok = opt_result::failure_at (vect_location,
+                                      "Built SLP cancelled: can use "
+                                      "load/store-lanes\n");
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "Built SLP cancelled: all SLP instances support "
+                            "load/store-lanes\n");
+         goto again;
+       }
     }
 
   /* Dissolve SLP-only groups.  */
@@ -2127,47 +2484,47 @@ start_over:
       return ok;
     }
 
-  /* Decide whether to use a fully-masked loop for this vectorization
-     factor.  */
-  LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
-    = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
-       && vect_verify_full_masking (loop_vinfo));
-  if (dump_enabled_p ())
-    {
-      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "using a fully-masked loop.\n");
-      else
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "not using a fully-masked loop.\n");
-    }
-
-  /* If epilog loop is required because of data accesses with gaps,
-     one additional iteration needs to be peeled.  Check if there is
-     enough iterations for vectorization.  */
-  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
-      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  /* For now, we don't expect to mix both masking and length approaches for one
+     loop; disable partial vectors if both are recorded.  */
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
+      && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
     {
-      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-      tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
-
-      if (known_lt (wi::to_widest (scalar_niters), vf))
-       return opt_result::failure_at (vect_location,
-                                      "loop has no enough iterations to"
-                                      " support peeling for gaps.\n");
-    }
-
-  /* If we're vectorizing an epilogue loop, we either need a fully-masked
-     loop or a loop that has a lower VF than the main loop.  */
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "can't vectorize a loop with partial vectors"
+                        " because we don't expect to mix different"
+                        " approaches with partial vectors for the"
+                        " same loop.\n");
+      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+    }
+
+  /* If we still have the option of using partial vectors,
+     check whether we can generate the necessary loop controls.  */
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && !vect_verify_full_masking (loop_vinfo)
+      && !vect_verify_loop_lens (loop_vinfo))
+    LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
+
+  /* If we're vectorizing an epilogue loop, the vectorized loop either needs
+     to be able to handle fewer than VF scalars, or needs to have a lower VF
+     than the main loop.  */
   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
-      && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+      && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
                   LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
     return opt_result::failure_at (vect_location,
                                   "Vectorization factor too high for"
                                   " epilogue loop.\n");
 
+  /* Decide whether this loop_vinfo should use partial vectors or peeling,
+     assuming that the loop will be used as a main loop.  We will redo
+     this analysis later if we instead decide to use the loop as an
+     epilogue loop.  */
+  ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
+  if (!ok)
+    return ok;
+
   /* Check the costings of the loop make vectorizing worthwhile.  */
   res = vect_analyze_loop_costing (loop_vinfo);
   if (res < 0)
@@ -2180,7 +2537,6 @@ start_over:
     return opt_result::failure_at (vect_location,
                                   "Loop costings not worthwhile.\n");
 
-  determine_peel_for_niter (loop_vinfo);
   /* If an epilogue loop is required make sure we can create one.  */
   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
@@ -2232,7 +2588,7 @@ start_over:
        }
 
       /* Niters for at least one iteration of vectorized loop.  */
-      if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+      if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
       /* One additional iteration because of peeling for gap.  */
       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
@@ -2310,7 +2666,7 @@ again:
   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
   /* Free the SLP instances.  */
   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
-    vect_free_slp_instance (instance, false);
+    vect_free_slp_instance (instance);
   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
   /* Reset SLP type to loop_vect on all stmts.  */
   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
@@ -2336,6 +2692,8 @@ again:
       for (gimple_stmt_iterator si = gsi_start_bb (bb);
           !gsi_end_p (si); gsi_next (&si))
        {
+         if (is_gimple_debug (gsi_stmt (si)))
+           continue;
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
          STMT_SLP_TYPE (stmt_info) = loop_vect;
          if (STMT_VINFO_IN_PATTERN_P (stmt_info))
@@ -2359,13 +2717,15 @@ again:
   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
   /* Reset accumulated rgroup information.  */
-  release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
+  release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
+  release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
   /* Reset assorted flags.  */
   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
-  LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
+  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+    = saved_can_use_partial_vectors_p;
 
   goto start_over;
 }
@@ -2414,7 +2774,36 @@ vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
   poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
                             * poly_widest_int (new_vf));
   if (maybe_lt (rel_old, rel_new))
-    return false;
+    {
+      /* When old_loop_vinfo uses a variable vectorization factor,
+        we know that it has a lower cost for at least one runtime VF.
+        However, we don't know how likely that VF is.
+
+        One option would be to compare the costs for the estimated VFs.
+        The problem is that that can put too much pressure on the cost
+        model.  E.g. if the estimated VF is also the lowest possible VF,
+        and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
+        for the estimated VF, we'd then choose new_loop_vinfo even
+        though (a) new_loop_vinfo might not actually be better than
+        old_loop_vinfo for that VF and (b) it would be significantly
+        worse at larger VFs.
+
+        Here we go for a hacky compromise: pick new_loop_vinfo if it is
+        no more expensive than old_loop_vinfo even after doubling the
+        estimated old_loop_vinfo VF.  For all but trivial loops, this
+        ensures that we only pick new_loop_vinfo if it is significantly
+        better than old_loop_vinfo at the estimated VF.  */
+      if (rel_new.is_constant ())
+       return false;
+
+      HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
+      HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
+      widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
+                                     * widest_int (old_estimated_vf));
+      widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
+                                     * widest_int (new_estimated_vf));
+      return estimated_rel_new * 2 <= estimated_rel_old;
+    }
   if (known_lt (rel_new, rel_old))
     return true;
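A hedged numeric illustration of the doubling rule added above (assuming old_loop_vinfo has a variable VF with an estimate of 8): if new_loop_vinfo has vec_inside_cost 3 and an estimated VF of 16, then estimated_rel_new = 3 * 8 = 24 and estimated_rel_old = 8 * 16 = 128, so 24 * 2 <= 128 holds and new_loop_vinfo is preferred; with vec_inside_cost 10 and an estimated VF of 4 instead, 10 * 8 * 2 = 160 exceeds 8 * 4 = 32 and old_loop_vinfo is kept.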
 
@@ -2616,7 +3005,10 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
                lowest_th = ordered_min (lowest_th, th);
            }
          else
-           delete loop_vinfo;
+           {
+             delete loop_vinfo;
+             loop_vinfo = opt_loop_vec_info::success (NULL);
+           }
 
          /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
             enabled, SIMDUID is not set, it is the innermost loop and we have
@@ -2641,6 +3033,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
       else
        {
          delete loop_vinfo;
+         loop_vinfo = opt_loop_vec_info::success (NULL);
          if (fatal)
            {
              gcc_checking_assert (first_loop_vinfo == NULL);
@@ -2648,6 +3041,23 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
            }
        }
 
+      /* Handle the case in which the original loop can use partial
+        vectorization, but we only want to adopt it for the epilogue.
+        The retry should be in the same mode as the original.  */
+      if (vect_epilogues
+         && loop_vinfo
+         && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
+       {
+         gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+                     && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "***** Re-trying analysis with same vector mode"
+                            " %s for epilogue with partial vectors.\n",
+                            GET_MODE_NAME (loop_vinfo->vector_mode));
+         continue;
+       }
+
       if (mode_i < vector_modes.length ()
          && VECTOR_MODE_P (autodetected_vector_mode)
          && (related_vector_mode (vector_modes[mode_i],
@@ -2946,14 +3356,17 @@ pop:
          fail = true;
          break;
        }
-      /* Check there's only a single stmt the op is used on inside
-         of the loop.  */
+      /* Check there's only a single stmt the op is used on.  For the
+        not value-changing tail and the last stmt allow out-of-loop uses.
+        ???  We could relax this and handle arbitrary live stmts by
+        forcing a scalar epilogue for example.  */
       imm_use_iterator imm_iter;
       gimple *op_use_stmt;
       unsigned cnt = 0;
       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
        if (!is_gimple_debug (op_use_stmt)
-           && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
+           && (*code != ERROR_MARK
+               || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
          {
            /* We want to allow x + x but not x < 1 ? x : 2.  */
            if (is_gimple_assign (op_use_stmt)
@@ -3264,42 +3677,58 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
   return NULL;
 }
 
-/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
-int
-vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
-                             int *peel_iters_epilogue,
-                             stmt_vector_for_cost *scalar_cost_vec,
-                            stmt_vector_for_cost *prologue_cost_vec,
-                            stmt_vector_for_cost *epilogue_cost_vec)
+/* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
+   PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
+   or -1 if not known.  */
+
+static int
+vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
 {
-  int retval = 0;
   int assumed_vf = vect_vf_for_cost (loop_vinfo);
-
-  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
     {
-      *peel_iters_epilogue = assumed_vf / 2;
       if (dump_enabled_p ())
-        dump_printf_loc (MSG_NOTE, vect_location,
+       dump_printf_loc (MSG_NOTE, vect_location,
                         "cost model: epilogue peel iters set to vf/2 "
                         "because loop iterations are unknown .\n");
-
-      /* If peeled iterations are known but number of scalar loop
-         iterations are unknown, count a taken branch per peeled loop.  */
-      retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
-                                NULL, 0, vect_prologue);
-      retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
-                                 NULL, 0, vect_epilogue);
+      return assumed_vf / 2;
     }
   else
     {
       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
-      peel_iters_prologue = niters < peel_iters_prologue ?
-                            niters : peel_iters_prologue;
-      *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
+      peel_iters_prologue = MIN (niters, peel_iters_prologue);
+      int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
       /* If we need to peel for gaps, but no peeling is required, we have to
         peel VF iterations.  */
-      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
-       *peel_iters_epilogue = assumed_vf;
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
+       peel_iters_epilogue = assumed_vf;
+      return peel_iters_epilogue;
+    }
+}
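As a worked (illustrative) instance of the known-niters branch: with 100 known scalar iterations, peel_iters_prologue = 3 and an assumed VF of 8, the epilogue gets (100 - 3) % 8 = 1 iteration; if that remainder were 0 but the loop peels for gaps, it is bumped to a full VF of 8.  With unknown niters (or peel_iters_prologue == -1), the estimate falls back to VF / 2 = 4.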
+
+/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
+int
+vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
+                            int *peel_iters_epilogue,
+                            stmt_vector_for_cost *scalar_cost_vec,
+                            stmt_vector_for_cost *prologue_cost_vec,
+                            stmt_vector_for_cost *epilogue_cost_vec)
+{
+  int retval = 0;
+
+  *peel_iters_epilogue
+    = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
+
+  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    {
+      /* If peeled iterations are known but number of scalar loop
+        iterations are unknown, count a taken branch per peeled loop.  */
+      if (peel_iters_prologue > 0)
+       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
+                                  NULL, NULL_TREE, 0, vect_prologue);
+      if (*peel_iters_epilogue > 0)
+       retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
+                                   NULL, NULL_TREE, 0, vect_epilogue);
     }
 
   stmt_info_for_cost *si;
@@ -3368,8 +3797,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
     {
       /*  FIXME: Make cost depend on complexity of individual check.  */
       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
-      (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
-                           vect_prologue);
+      (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
+                           NULL, NULL_TREE, 0, vect_prologue);
       if (dump_enabled_p ())
        dump_printf (MSG_NOTE,
                     "cost model: Adding cost of checks for loop "
@@ -3381,13 +3810,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
     {
       /*  FIXME: Make cost depend on complexity of individual check.  */
       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
-      (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
-                           vect_prologue);
+      (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
+                           NULL, NULL_TREE, 0, vect_prologue);
       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
       if (len)
        /* Count LEN - 1 ANDs and LEN comparisons.  */
-       (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
-                             NULL, 0, vect_prologue);
+       (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
+                             scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
       if (len)
        {
@@ -3397,8 +3826,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
          for (unsigned int i = 0; i < len; ++i)
            if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
              nstmts += 1;
-         (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
-                               NULL, 0, vect_prologue);
+         (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
+                               scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
        }
       if (dump_enabled_p ())
        dump_printf (MSG_NOTE,
@@ -3410,8 +3839,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
     {
       /*  FIXME: Make cost depend on complexity of individual check.  */
-      (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
-                           vect_prologue);
+      (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
+                           NULL, NULL_TREE, 0, vect_prologue);
       if (dump_enabled_p ())
        dump_printf (MSG_NOTE,
                     "cost model: Adding cost of checks for loop "
@@ -3419,8 +3848,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
     }
 
   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
-    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
-                         vect_prologue);
+    (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
+                         NULL, NULL_TREE, 0, vect_prologue);
 
   /* Count statements in scalar loop.  Using this as scalar cost for a single
      iteration for now.
@@ -3442,30 +3871,116 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
      TODO: Build an expression that represents peel_iters for prologue and
      epilogue to be used in a run-time test.  */
 
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  bool prologue_need_br_taken_cost = false;
+  bool prologue_need_br_not_taken_cost = false;
+
+  /* Calculate peel_iters_prologue.  */
+  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
+    peel_iters_prologue = 0;
+  else if (npeel < 0)
     {
-      peel_iters_prologue = 0;
-      peel_iters_epilogue = 0;
+      peel_iters_prologue = assumed_vf / 2;
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE, "cost model: "
+                    "prologue peel iters set to vf/2.\n");
 
-      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
-       {
-         /* We need to peel exactly one iteration.  */
-         peel_iters_epilogue += 1;
-         stmt_info_for_cost *si;
-         int j;
-         FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
-                           j, si)
-           (void) add_stmt_cost (target_cost_data, si->count,
-                                 si->kind, si->stmt_info, si->misalign,
-                                 vect_epilogue);
-       }
+      /* If the number of peeled iterations is unknown, count a taken and a
+         not-taken branch per peeled loop.  Even if the scalar iteration
+         count is known, the vector iteration count is not, since the peeled
+         prologue iteration count is not known.  Hence the guards remain.  */
+      prologue_need_br_taken_cost = true;
+      prologue_need_br_not_taken_cost = true;
+    }
+  else
+    {
+      peel_iters_prologue = npeel;
+      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
+        /* If the peeled iteration count is known but the scalar loop
+           iteration count is unknown, count a taken branch per peeled loop.  */
+       prologue_need_br_taken_cost = true;
+    }
+
+  bool epilogue_need_br_taken_cost = false;
+  bool epilogue_need_br_not_taken_cost = false;
+
+  /* Calculate peel_iters_epilogue.  */
+  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+    /* We need to peel exactly one iteration for gaps.  */
+    peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
+  else if (npeel < 0)
+    {
+      /* If the amount of peeling for alignment is unknown, the loop bound
+         of the main loop becomes unknown.  */
+      peel_iters_epilogue = assumed_vf / 2;
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE, "cost model: "
+                    "epilogue peel iters set to vf/2 because "
+                    "peeling for alignment is unknown.\n");
+
+      /* See the same reasoning above for the peel_iters_prologue case.  */
+      epilogue_need_br_taken_cost = true;
+      epilogue_need_br_not_taken_cost = true;
+    }
+  else
+    {
+      peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
+      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
+        /* If the peeled iteration count is known but the scalar loop
+           iteration count is unknown, count a taken branch per peeled loop.  */
+       epilogue_need_br_taken_cost = true;
+    }
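
As a quick reference for the peel-iteration logic above, here is a minimal standalone sketch (plain C, not GCC code): vf stands for assumed_vf, the booleans stand for vect_use_loop_mask_for_alignment_p, LOOP_VINFO_USING_PARTIAL_VECTORS_P, LOOP_VINFO_PEELING_FOR_GAPS and LOOP_VINFO_NITERS_KNOWN_P, and the call to vect_get_peel_iters_epilogue is replaced by a vf/2 placeholder.

#include <stdbool.h>

struct peel_estimate
{
  int prologue_iters, epilogue_iters;
  bool prologue_branch_cost, epilogue_branch_cost;
};

static struct peel_estimate
estimate_peel_iters (int vf, int npeel, bool mask_for_align,
                     bool partial_vectors, bool peel_for_gaps,
                     bool niters_known)
{
  struct peel_estimate e = { 0, 0, false, false };

  /* Prologue: nothing to peel when alignment is handled by masking,
     vf/2 when the peel amount is unknown, otherwise exactly npeel.  */
  if (mask_for_align)
    e.prologue_iters = 0;
  else if (npeel < 0)
    {
      e.prologue_iters = vf / 2;
      e.prologue_branch_cost = true;      /* guard branches remain */
    }
  else
    {
      e.prologue_iters = npeel;
      e.prologue_branch_cost = !niters_known && npeel > 0;
    }

  /* Epilogue: with partial vectors only the gap iteration remains,
     otherwise mirror the prologue logic (the real code calls
     vect_get_peel_iters_epilogue instead of using vf/2).  */
  if (partial_vectors)
    e.epilogue_iters = peel_for_gaps ? 1 : 0;
  else if (npeel < 0)
    {
      e.epilogue_iters = vf / 2;
      e.epilogue_branch_cost = true;
    }
  else
    {
      e.epilogue_iters = vf / 2;          /* placeholder */
      e.epilogue_branch_cost = !niters_known && e.epilogue_iters > 0;
    }
  return e;
}
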
+
+  stmt_info_for_cost *si;
+  int j;
+  /* Add costs associated with peel_iters_prologue.  */
+  if (peel_iters_prologue)
+    FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
+      {
+       (void) add_stmt_cost (loop_vinfo, target_cost_data,
+                             si->count * peel_iters_prologue, si->kind,
+                             si->stmt_info, si->vectype, si->misalign,
+                             vect_prologue);
+      }
+
+  /* Add costs associated with peel_iters_epilogue.  */
+  if (peel_iters_epilogue)
+    FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
+      {
+       (void) add_stmt_cost (loop_vinfo, target_cost_data,
+                             si->count * peel_iters_epilogue, si->kind,
+                             si->stmt_info, si->vectype, si->misalign,
+                             vect_epilogue);
+      }
+
+  /* Add possible cond_branch_taken/cond_branch_not_taken costs.  */
 
+  if (prologue_need_br_taken_cost)
+    (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
+                         NULL, NULL_TREE, 0, vect_prologue);
+
+  if (prologue_need_br_not_taken_cost)
+    (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
+                         cond_branch_not_taken, NULL, NULL_TREE, 0,
+                         vect_prologue);
+
+  if (epilogue_need_br_taken_cost)
+    (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
+                         NULL, NULL_TREE, 0, vect_epilogue);
+
+  if (epilogue_need_br_not_taken_cost)
+    (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
+                         cond_branch_not_taken, NULL, NULL_TREE, 0,
+                         vect_epilogue);
+
+  /* Take care of special costs for rgroup controls of partial vectors.  */
+  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    {
       /* Calculate how many masks we need to generate.  */
       unsigned int num_masks = 0;
-      rgroup_masks *rgm;
+      rgroup_controls *rgm;
       unsigned int num_vectors_m1;
       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
-       if (rgm->mask_type)
+       if (rgm->type)
          num_masks += num_vectors_m1 + 1;
       gcc_assert (num_masks > 0);
 
@@ -3481,80 +3996,62 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
         simpler and safer to use the worst-case cost; if this ends up
         being the tie-breaker between vectorizing or not, then it's
         probably better not to vectorize.  */
-      (void) add_stmt_cost (target_cost_data, num_masks, vector_stmt,
-                           NULL, 0, vect_prologue);
-      (void) add_stmt_cost (target_cost_data, num_masks - 1, vector_stmt,
-                           NULL, 0, vect_body);
+      (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
+                           vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
+      (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
+                           vector_stmt, NULL, NULL_TREE, 0, vect_body);
     }
-  else if (npeel < 0)
+  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
     {
-      peel_iters_prologue = assumed_vf / 2;
-      if (dump_enabled_p ())
-       dump_printf (MSG_NOTE, "cost model: "
-                    "prologue peel iters set to vf/2.\n");
+      /* Referring to the functions vect_set_loop_condition_partial_vectors
+         and vect_set_loop_controls_directly, we need to generate each
+         length in the prologue and, if required, in the loop body.
+         Although some optimizations are possible, we consider the worst
+         case here.  */
 
-      /* If peeling for alignment is unknown, loop bound of main loop becomes
-         unknown.  */
-      peel_iters_epilogue = assumed_vf / 2;
-      if (dump_enabled_p ())
-       dump_printf (MSG_NOTE, "cost model: "
-                    "epilogue peel iters set to vf/2 because "
-                    "peeling for alignment is unknown.\n");
+      bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
+      bool need_iterate_p
+       = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+          && !vect_known_niters_smaller_than_vf (loop_vinfo));
 
-      /* If peeled iterations are unknown, count a taken branch and a not taken
-         branch per peeled loop. Even if scalar loop iterations are known,
-         vector iterations are not known since peeled prologue iterations are
-         not known. Hence guards remain the same.  */
-      (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
-                           NULL, 0, vect_prologue);
-      (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
-                           NULL, 0, vect_prologue);
-      (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
-                           NULL, 0, vect_epilogue);
-      (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
-                           NULL, 0, vect_epilogue);
-      stmt_info_for_cost *si;
-      int j;
-      FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
-       {
-         (void) add_stmt_cost (target_cost_data,
-                               si->count * peel_iters_prologue,
-                               si->kind, si->stmt_info, si->misalign,
-                               vect_prologue);
-         (void) add_stmt_cost (target_cost_data,
-                               si->count * peel_iters_epilogue,
-                               si->kind, si->stmt_info, si->misalign,
-                               vect_epilogue);
-       }
-    }
-  else
-    {
-      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
-      stmt_info_for_cost *si;
-      int j;
-      void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
+      /* Calculate how many statements need to be added.  */
+      unsigned int prologue_stmts = 0;
+      unsigned int body_stmts = 0;
 
-      prologue_cost_vec.create (2);
-      epilogue_cost_vec.create (2);
-      peel_iters_prologue = npeel;
+      rgroup_controls *rgc;
+      unsigned int num_vectors_m1;
+      FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
+       if (rgc->type)
+         {
+           /* May need one SHIFT for nitems_total computation.  */
+           unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
+           if (nitems != 1 && !niters_known_p)
+             prologue_stmts += 1;
+
+           /* May need one MAX and one MINUS for wrap around.  */
+           if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
+             prologue_stmts += 2;
 
-      (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
-                                         &peel_iters_epilogue,
-                                         &LOOP_VINFO_SCALAR_ITERATION_COST
-                                           (loop_vinfo),
-                                         &prologue_cost_vec,
-                                         &epilogue_cost_vec);
+           /* Need one MAX and one MINUS for each batch limit except for
+              the first one.  */
+           prologue_stmts += num_vectors_m1 * 2;
 
-      FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
-       (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
-                             si->misalign, vect_prologue);
+           unsigned int num_vectors = num_vectors_m1 + 1;
 
-      FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
-       (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
-                             si->misalign, vect_epilogue);
+           /* Need to set up lengths in the prologue; only one MIN is
+              required for each since the start index is zero.  */
+           prologue_stmts += num_vectors;
+
+           /* Each may need two MINs and one MINUS to update the lengths
+              in the body for the next iteration.  */
+           if (need_iterate_p)
+             body_stmts += 3 * num_vectors;
+         }
 
-      prologue_cost_vec.release ();
-      epilogue_cost_vec.release ();
+      (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
+                           scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
+      (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
+                           scalar_stmt, NULL, NULL_TREE, 0, vect_body);
     }
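
To make the statement counting in the length-based branch above easier to follow, here is a standalone sketch (plain C, not GCC code) of the same bookkeeping; the struct fields mirror rgroup_controls, might_wrap stands for vect_rgroup_iv_might_wrap_p, and num_vectors corresponds to num_vectors_m1 + 1 in the loop above.

#include <stdbool.h>

struct rgroup
{
  unsigned max_nscalars_per_iter;
  unsigned factor;
  unsigned num_vectors;   /* num_vectors_m1 + 1 in the loop above */
  bool has_type;          /* rgc->type != NULL_TREE */
  bool might_wrap;        /* vect_rgroup_iv_might_wrap_p (...) */
};

static void
count_length_setup_stmts (const struct rgroup *rgcs, unsigned n_rgroups,
                          bool niters_known, bool need_iterate,
                          unsigned *prologue_stmts, unsigned *body_stmts)
{
  *prologue_stmts = 0;
  *body_stmts = 0;
  for (unsigned i = 0; i < n_rgroups; i++)
    {
      if (!rgcs[i].has_type)
        continue;
      unsigned nitems = rgcs[i].max_nscalars_per_iter * rgcs[i].factor;
      unsigned num_vectors = rgcs[i].num_vectors;
      if (nitems != 1 && !niters_known)
        *prologue_stmts += 1;                   /* SHIFT for nitems_total */
      if (rgcs[i].might_wrap)
        *prologue_stmts += 2;                   /* MAX + MINUS for wrap-around */
      *prologue_stmts += (num_vectors - 1) * 2; /* per extra batch limit */
      *prologue_stmts += num_vectors;           /* one MIN per initial length */
      if (need_iterate)
        *body_stmts += 3 * num_vectors;         /* two MINs + one MINUS each */
    }
}
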
 
   /* FORNOW: The scalar outside cost is incremented in one of the
@@ -3690,8 +4187,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
     }
 
   /* ??? The "if" arm is written to handle all cases; see below for what
-     we would do for !LOOP_VINFO_FULLY_MASKED_P.  */
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+     we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
+  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
     {
       /* Rewriting the condition above in terms of the number of
         vector iterations (vniters) rather than the number of
@@ -3718,7 +4215,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
        dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
                     min_vec_niters);
 
-      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        {
          /* Now that we know the minimum number of vector iterations,
             find the minimum niters for which the scalar cost is larger:
@@ -3769,10 +4266,14 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
                 "  Calculated minimum iters for profitability: %d\n",
                 min_profitable_iters);
 
-  if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
     /* We want the vectorized loop to execute at least once.  */
     min_profitable_iters = assumed_vf + peel_iters_prologue;
+  else if (min_profitable_iters < peel_iters_prologue)
+    /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
+       vectorized loop executes at least once.  */
+    min_profitable_iters = peel_iters_prologue;
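
For orientation only, a rough standalone model (plain C, not the exact computation in this function, which also accounts for the scalar outside cost) of the break-even arithmetic behind min_profitable_iters: sic, vic and voc are stand-ins for the scalar per-iteration, vector per-iteration and vector outside costs.

#include <limits.h>

/* Smallest scalar iteration count for which the vector loop wins, under
   the simplifying assumption that the peeled iterations run scalarly
   and everything else is folded into voc.  */
static int
rough_min_profitable_niters (int sic, int vic, int voc, int vf, int peel)
{
  /* Want  sic * n > voc + sic * peel + vic * (n - peel) / vf.
     With m = (n - peel) / vf vector iterations this becomes
     m * (sic * vf - vic) > voc.  */
  int saving_per_viter = sic * vf - vic;
  if (saving_per_viter <= 0)
    return INT_MAX;                    /* never profitable */
  int min_vec_niters = voc / saving_per_viter + 1;
  return min_vec_niters * vf + peel;   /* back to scalar iterations */
}
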
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
@@ -3790,7 +4291,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 
   if (vec_outside_cost <= 0)
     min_profitable_estimate = 0;
-  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  /* ??? This "else if" arm is written to handle all cases; see below for
+     what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
+  else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
     {
       /* This is a repeat of the code above, but with + SOC rather
         than - SOC.  */
@@ -3802,7 +4305,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
       if (outside_overhead > 0)
        min_vec_niters = outside_overhead / saving_per_viter + 1;
 
-      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        {
          int threshold = (vec_inside_cost * min_vec_niters
                           + vec_outside_cost
@@ -3881,7 +4384,8 @@ have_whole_vector_shift (machine_mode mode)
    the loop, and the epilogue code that must be generated.  */
 
 static void
-vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
+vect_model_reduction_cost (loop_vec_info loop_vinfo,
+                          stmt_vec_info stmt_info, internal_fn reduc_fn,
                           vect_reduction_type reduction_type,
                           int ncopies, stmt_vector_for_cost *cost_vec)
 {
@@ -3890,7 +4394,6 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
   optab optab;
   tree vectype;
   machine_mode mode;
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   class loop *loop = NULL;
 
   if (loop_vinfo)
@@ -4046,34 +4549,6 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
 }
 
 
-/* Function vect_model_induction_cost.
-
-   Models cost for induction operations.  */
-
-static void
-vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
-                          stmt_vector_for_cost *cost_vec)
-{
-  unsigned inside_cost, prologue_cost;
-
-  if (PURE_SLP_STMT (stmt_info))
-    return;
-
-  /* loop cost for vec_loop.  */
-  inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
-                                 stmt_info, 0, vect_body);
-
-  /* prologue cost for vec_init and vec_step.  */
-  prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
-                                   stmt_info, 0, vect_prologue);
-
-  if (dump_enabled_p ())
-    dump_printf_loc (MSG_NOTE, vect_location,
-                     "vect_model_induction_cost: inside_cost = %d, "
-                     "prologue_cost = %d .\n", inside_cost, prologue_cost);
-}
-
-
 
 /* Function get_initial_def_for_reduction
 
@@ -4119,11 +4594,11 @@ vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
    A cost model should help decide between these two schemes.  */
 
 static tree
-get_initial_def_for_reduction (stmt_vec_info stmt_vinfo,
+get_initial_def_for_reduction (loop_vec_info loop_vinfo,
+                              stmt_vec_info stmt_vinfo,
                               enum tree_code code, tree init_val,
                                tree *adjustment_def)
 {
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   tree scalar_type = TREE_TYPE (init_val);
   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
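
As a scalar illustration (plain C, not GCC code) of the adjustment scheme that the adjustment_def output parameter models: the accumulator can start from the neutral element, and the original initial value is folded back in after the loop, in the epilogue.

/* Adjustment scheme for a PLUS_EXPR reduction with initial value INIT:
   start from the neutral element (0) and apply INIT once afterwards.  */
static int
sum_with_adjustment (const int *x, int n, int init)
{
  int acc = 0;                  /* neutral element for addition */
  for (int i = 0; i < n; i++)
    acc += x[i];
  return acc + init;            /* adjustment applied in the epilogue */
}
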
@@ -4223,14 +4698,14 @@ get_initial_def_for_reduction (stmt_vec_info stmt_vinfo,
    value will not change the result.  */
 
 static void
-get_initial_defs_for_reduction (slp_tree slp_node,
+get_initial_defs_for_reduction (vec_info *vinfo,
+                               slp_tree slp_node,
                                vec<tree> *vec_oprnds,
                                unsigned int number_of_vectors,
                                bool reduc_chain, tree neutral_op)
 {
   vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
   stmt_vec_info stmt_vinfo = stmts[0];
-  vec_info *vinfo = stmt_vinfo->vinfo;
   unsigned HOST_WIDE_INT nunits;
   unsigned j, number_of_places_left_in_vector;
   tree vector_type;
@@ -4343,11 +4818,12 @@ get_initial_defs_for_reduction (slp_tree slp_node,
    the stmt_vec_info the meta information is stored on.  */
 
 stmt_vec_info
-info_for_reduction (stmt_vec_info stmt_info)
+info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
 {
   stmt_info = vect_orig_stmt (stmt_info);
   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
-  if (!is_a <gphi *> (stmt_info->stmt))
+  if (!is_a <gphi *> (stmt_info->stmt)
+      || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
   gphi *phi = as_a <gphi *> (stmt_info->stmt);
   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
@@ -4359,7 +4835,7 @@ info_for_reduction (stmt_vec_info stmt_info)
     {
       edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
       stmt_vec_info info
-         = stmt_info->vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
+         = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
        stmt_info = info;
     }
@@ -4414,13 +4890,13 @@ info_for_reduction (stmt_vec_info stmt_info)
 */
 
 static void
-vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
+vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
+                                 stmt_vec_info stmt_info,
                                  slp_tree slp_node,
                                  slp_instance slp_node_instance)
 {
-  stmt_vec_info reduc_info = info_for_reduction (stmt_info);
+  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
   gcc_assert (reduc_info->is_reduc_info);
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   /* For double reductions we need to get at the inner loop reduction
      stmt which has the meta info attached.  Our stmt_info is that of the
      loop-closed PHI of the inner loop which we remember as
@@ -4439,7 +4915,6 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
     = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
   enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
-  stmt_vec_info prev_phi_info;
   tree vectype;
   machine_mode mode;
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
@@ -4447,7 +4922,6 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
   tree scalar_dest;
   tree scalar_type;
   gimple *new_phi = NULL, *phi;
-  stmt_vec_info phi_info;
   gimple_stmt_iterator exit_gsi;
   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
   gimple *epilog_stmt = NULL;
@@ -4470,7 +4944,7 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
   tree induction_index = NULL_TREE;
 
   if (slp_node)
-    group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 
+    group_size = SLP_TREE_LANES (slp_node);
 
   if (nested_in_vect_loop_p (loop, stmt_info))
     {
@@ -4517,15 +4991,9 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
     }
   else
     {
+      stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
       vec_num = 1;
-      ncopies = 0;
-      phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
-      do
-       {
-         ncopies++;
-         phi_info = STMT_VINFO_RELATED_STMT (phi_info);
-       }
-      while (phi_info);
+      ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
     }
 
   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
@@ -4547,7 +5015,7 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
        {
          if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
            {
-             gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt;
+             gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
              gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
              ccompares.safe_push
                (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
@@ -4598,7 +5066,6 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
       /* Create a vector phi node.  */
       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
       new_phi = create_phi_node (new_phi_tree, loop->header);
-      loop_vinfo->add_stmt (new_phi);
       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
                   loop_preheader_edge (loop), UNKNOWN_LOCATION);
 
@@ -4625,9 +5092,6 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
                                         new_phi_tree, indx_before_incr);
        }
       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
-      stmt_vec_info index_vec_info
-       = loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (new_phi_tree));
-      STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
 
       /* Update the phi with the vec cond.  */
       induction_index = new_phi_tree;
@@ -4668,29 +5132,26 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
   if (double_reduc)
     loop = outer_loop;
   exit_bb = single_exit (loop)->dest;
-  prev_phi_info = NULL;
   new_phis.create (slp_node ? vec_num : ncopies);
   for (unsigned i = 0; i < vec_num; i++)
     {
       if (slp_node)
-       def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
+       def = vect_get_slp_vect_def (slp_node, i);
       else
-       def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
+       def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
       for (j = 0; j < ncopies; j++)
         {
          tree new_def = copy_ssa_name (def);
           phi = create_phi_node (new_def, exit_bb);
-         stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
           if (j == 0)
             new_phis.quick_push (phi);
           else
            {
-             def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
-             STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
+             def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
+             new_phis.quick_push (phi);
            }
 
           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
-         prev_phi_info = phi_info;
         }
     }
 
@@ -4761,15 +5222,12 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
   /* Likewise if we couldn't use a single defuse cycle.  */
   else if (ncopies > 1)
     {
-      gcc_assert (new_phis.length () == 1);
       gimple_seq stmts = NULL;
       tree first_vect = PHI_RESULT (new_phis[0]);
       first_vect = gimple_convert (&stmts, vectype, first_vect);
-      stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
       for (int k = 1; k < ncopies; ++k)
        {
-         next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
-         tree second_vect = PHI_RESULT (next_phi_info->stmt);
+         tree second_vect = PHI_RESULT (new_phis[k]);
          second_vect = gimple_convert (&stmts, vectype, second_vect);
          first_vect = gimple_build (&stmts, code, vectype,
                                     first_vect, second_vect);
@@ -5413,10 +5871,6 @@ vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
       if (nested_in_vect_loop)
         {
-         stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
-         STMT_VINFO_RELATED_STMT (epilog_stmt_info)
-           = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
-
           if (!double_reduc)
             scalar_results.quick_push (new_temp);
           else
@@ -5630,18 +6084,17 @@ get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
    that should be used to control the operation in a fully-masked loop.  */
 
 static bool
-vectorize_fold_left_reduction (stmt_vec_info stmt_info,
+vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
+                              stmt_vec_info stmt_info,
                               gimple_stmt_iterator *gsi,
-                              stmt_vec_info *vec_stmt, slp_tree slp_node,
+                              gimple **vec_stmt, slp_tree slp_node,
                               gimple *reduc_def_stmt,
                               tree_code code, internal_fn reduc_fn,
                               tree ops[3], tree vectype_in,
                               int reduc_index, vec_loop_masks *masks)
 {
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
-  stmt_vec_info new_stmt_info = NULL;
   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
 
   int ncopies;
@@ -5666,7 +6119,7 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info,
   if (slp_node)
     {
       auto_vec<vec<tree> > vec_defs (2);
-      vect_get_slp_defs (slp_node, &vec_defs);
+      vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
       vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
       vec_defs[0].release ();
       vec_defs[1].release ();
@@ -5675,9 +6128,8 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info,
     }
   else
     {
-      tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
-      vec_oprnds0.create (1);
-      vec_oprnds0.quick_push (loop_vec_def0);
+      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+                                    op0, &vec_oprnds0);
       scalar_dest_def_info = stmt_info;
     }
 
@@ -5753,20 +6205,24 @@ vectorize_fold_left_reduction (stmt_vec_info stmt_info,
       if (i == vec_num - 1)
        {
          gimple_set_lhs (new_stmt, scalar_dest);
-         new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
-                                                   new_stmt);
+         vect_finish_replace_stmt (loop_vinfo,
+                                   scalar_dest_def_info,
+                                   new_stmt);
        }
       else
-       new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
-                                                    new_stmt, gsi);
+       vect_finish_stmt_generation (loop_vinfo,
+                                    scalar_dest_def_info,
+                                    new_stmt, gsi);
 
       if (slp_node)
-       SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
+       SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
+      else
+       {
+         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+         *vec_stmt = new_stmt;
+       }
     }
 
-  if (!slp_node)
-    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
-
   return true;
 }
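
Since the function above implements FOLD_LEFT_REDUCTION, here is a short scalar illustration (plain C, not GCC code) of why the in-order scheme matters: for floating point, the two loops below can produce different results, so the vectorizer must keep the original association order unless reassociation is permitted.

/* (a) What the scalar loop computes: a strictly in-order sum.  */
static float
sum_in_order (const float *x, int n)
{
  float s = 0.0f;
  for (int i = 0; i < n; i++)
    s += x[i];
  return s;
}

/* (b) A reassociated sum with two partial accumulators, the shape a
   normal vector reduction would use.  */
static float
sum_reassociated (const float *x, int n)
{
  float s0 = 0.0f, s1 = 0.0f;
  for (int i = 0; i + 1 < n; i += 2)
    {
      s0 += x[i];
      s1 += x[i + 1];
    }
  if (n & 1)
    s0 += x[n - 1];
  return s0 + s1;               /* combined only at the end */
}
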
 
@@ -5924,13 +6380,13 @@ build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
    does *NOT* necessarily hold for reduction patterns.  */
 
 bool
-vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
+vectorizable_reduction (loop_vec_info loop_vinfo,
+                       stmt_vec_info stmt_info, slp_tree slp_node,
                        slp_instance slp_node_instance,
                        stmt_vector_for_cost *cost_vec)
 {
   tree scalar_dest;
   tree vectype_in = NULL_TREE;
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
   stmt_vec_info cond_stmt_vinfo = NULL;
@@ -5952,15 +6408,34 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
     return false;
 
   /* The stmt we store reduction analysis meta on.  */
-  stmt_vec_info reduc_info = info_for_reduction (stmt_info);
+  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
   reduc_info->is_reduc_info = true;
 
   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
     {
       if (is_a <gphi *> (stmt_info->stmt))
-       /* Analysis for double-reduction is done on the outer
-          loop PHI, nested cycles have no further restrictions.  */
-       STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
+       {
+         if (slp_node)
+           {
+             /* We eventually need to set a vector type on invariant
+                arguments.  */
+             unsigned j;
+             slp_tree child;
+             FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
+               if (!vect_maybe_update_slp_op_vectype
+                      (child, SLP_TREE_VECTYPE (slp_node)))
+                 {
+                   if (dump_enabled_p ())
+                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                      "incompatible vector types for "
+                                      "invariants\n");
+                   return false;
+                 }
+           }
+         /* Analysis for double-reduction is done on the outer
+            loop PHI, nested cycles have no further restrictions.  */
+         STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
+       }
       else
        STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
       return true;
@@ -6002,12 +6477,16 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
 
   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
-     and compute the reduction chain length.  */
-  tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
-                                         loop_latch_edge (loop));
+     and compute the reduction chain length.  Discover the real
+     reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
+  tree reduc_def
+    = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
+                            loop_latch_edge
+                              (gimple_bb (reduc_def_phi)->loop_father));
   unsigned reduc_chain_length = 0;
   bool only_slp_reduc_chain = true;
   stmt_info = NULL;
+  slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
   while (reduc_def != PHI_RESULT (reduc_def_phi))
     {
       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
@@ -6050,6 +6529,8 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
        stmt_info = vdef;
       reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
       reduc_chain_length++;
+      if (!stmt_info && slp_node)
+       slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
     }
   /* PHIs should not participate in patterns.  */
   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
@@ -6136,17 +6617,24 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
      The last use is the reduction variable.  In case of nested cycle this
      assumption is not true: we use reduc_index to record the index of the
      reduction variable.  */
-  reduc_def = PHI_RESULT (reduc_def_phi);
+  slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
+  /* We need to skip an extra operand for COND_EXPRs with embedded
+     comparison.  */
+  unsigned opno_adjust = 0;
+  if (code == COND_EXPR
+      && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
+    opno_adjust = 1;
   for (i = 0; i < op_type; i++)
     {
-      tree op = gimple_op (stmt, i + 1);
       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
       if (i == 0 && code == COND_EXPR)
         continue;
 
       stmt_vec_info def_stmt_info;
       enum vect_def_type dt;
-      if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem,
+      tree op;
+      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
+                              i + opno_adjust, &op, &slp_op[i], &dt, &tem,
                               &def_stmt_info))
        {
          if (dump_enabled_p ())
@@ -6527,7 +7015,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
         which each SLP statement has its own initial value and in which
         that value needs to be repeated for every instance of the
         statement within the initial vector.  */
-      unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
+      unsigned int group_size = SLP_TREE_LANES (slp_node);
       if (!neutral_op
          && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
                                              TREE_TYPE (vectype_out)))
@@ -6680,13 +7168,28 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
       return false;
     }
 
+  if (slp_node
+      && !(!single_defuse_cycle
+          && code != DOT_PROD_EXPR
+          && code != WIDEN_SUM_EXPR
+          && code != SAD_EXPR
+          && reduction_type != FOLD_LEFT_REDUCTION))
+    for (i = 0; i < op_type; i++)
+      if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "incompatible vector types for invariants\n");
+         return false;
+       }
+
   if (slp_node)
     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
   else
     vec_num = 1;
 
-  vect_model_reduction_cost (stmt_info, reduc_fn, reduction_type, ncopies,
-                            cost_vec);
+  vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
+                            reduction_type, ncopies, cost_vec);
   if (dump_enabled_p ()
       && reduction_type == FOLD_LEFT_REDUCTION)
     dump_printf_loc (MSG_NOTE, vect_location,
@@ -6710,7 +7213,7 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
     }
-  else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
+  else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
     {
       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
       internal_fn cond_fn = get_conditional_internal_fn (code);
@@ -6723,9 +7226,9 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "can't use a fully-masked loop because no"
-                            " conditional operation is available.\n");
-         LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+                            "can't operate on partial vectors because"
+                            " no conditional operation is available.\n");
+         LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
        }
       else if (reduction_type == FOLD_LEFT_REDUCTION
               && reduc_fn == IFN_LAST
@@ -6735,9 +7238,9 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "can't use a fully-masked loop because no"
-                            " conditional operation is available.\n");
-         LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+                            "can't operate on partial vectors because"
+                            " no conditional operation is available.\n");
+         LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
        }
       else
        vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
@@ -6750,18 +7253,17 @@ vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
    value.  */
 
 bool
-vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
-                         stmt_vec_info *vec_stmt, slp_tree slp_node)
+vect_transform_reduction (loop_vec_info loop_vinfo,
+                         stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
+                         gimple **vec_stmt, slp_tree slp_node)
 {
   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   int i;
   int ncopies;
-  int j;
   int vec_num;
 
-  stmt_vec_info reduc_info = info_for_reduction (stmt_info);
+  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
   gcc_assert (reduc_info->is_reduc_info);
 
   if (nested_in_vect_loop_p (loop, stmt_info))
@@ -6814,8 +7316,6 @@ vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
 
   /* Transform.  */
-  stmt_vec_info new_stmt_info = NULL;
-  stmt_vec_info prev_stmt_info;
   tree new_temp = NULL_TREE;
   auto_vec<tree> vec_oprnds0;
   auto_vec<tree> vec_oprnds1;
@@ -6836,7 +7336,7 @@ vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
     {
       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
       return vectorize_fold_left_reduction
-         (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
+         (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
           reduc_fn, ops, vectype_in, reduc_index, masks);
     }
 
@@ -6850,137 +7350,83 @@ vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
   tree scalar_dest = gimple_assign_lhs (stmt);
   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
 
-  prev_stmt_info = NULL;
-  if (!slp_node)
+  vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
+                    single_defuse_cycle && reduc_index == 0
+                    ? NULL_TREE : ops[0], &vec_oprnds0,
+                    single_defuse_cycle && reduc_index == 1
+                    ? NULL_TREE : ops[1], &vec_oprnds1,
+                    op_type == ternary_op
+                    && !(single_defuse_cycle && reduc_index == 2)
+                    ? ops[2] : NULL_TREE, &vec_oprnds2);
+  if (single_defuse_cycle)
     {
-      vec_oprnds0.create (1);
-      vec_oprnds1.create (1);
-      if (op_type == ternary_op)
-        vec_oprnds2.create (1);
+      gcc_assert (!slp_node);
+      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+                                    ops[reduc_index],
+                                    reduc_index == 0 ? &vec_oprnds0
+                                    : (reduc_index == 1 ? &vec_oprnds1
+                                       : &vec_oprnds2));
     }
 
-  for (j = 0; j < ncopies; j++)
+  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
     {
-      /* Handle uses.  */
-      if (j == 0)
-        {
-         if (slp_node)
-           {
-             /* Get vec defs for all the operands except the reduction index,
-                ensuring the ordering of the ops in the vector is kept.  */
-             auto_vec<vec<tree>, 3> vec_defs;
-             vect_get_slp_defs (slp_node, &vec_defs);
-             vec_oprnds0.safe_splice (vec_defs[0]);
-             vec_defs[0].release ();
-             vec_oprnds1.safe_splice (vec_defs[1]);
-             vec_defs[1].release ();
-             if (op_type == ternary_op)
-               {
-                 vec_oprnds2.safe_splice (vec_defs[2]);
-                 vec_defs[2].release ();
-               }
-           }
-          else
+      gimple *new_stmt;
+      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
+      if (masked_loop_p && !mask_by_cond_expr)
+       {
+         /* Make sure that the reduction accumulator is vop[0].  */
+         if (reduc_index == 1)
            {
-              vec_oprnds0.quick_push
-               (vect_get_vec_def_for_operand (ops[0], stmt_info));
-              vec_oprnds1.quick_push
-               (vect_get_vec_def_for_operand (ops[1], stmt_info));
-              if (op_type == ternary_op)
-               vec_oprnds2.quick_push 
-                 (vect_get_vec_def_for_operand (ops[2], stmt_info));
+             gcc_assert (commutative_tree_code (code));
+             std::swap (vop[0], vop[1]);
            }
-        }
+         tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
+                                         vectype_in, i);
+         gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
+                                                   vop[0], vop[1], vop[0]);
+         new_temp = make_ssa_name (vec_dest, call);
+         gimple_call_set_lhs (call, new_temp);
+         gimple_call_set_nothrow (call, true);
+         vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
+         new_stmt = call;
+       }
       else
-        {
-          if (!slp_node)
-            {
-             gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
-
-             if (single_defuse_cycle && reduc_index == 0)
-               vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
-             else
-               vec_oprnds0[0]
-                 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
-                                                   vec_oprnds0[0]);
-             if (single_defuse_cycle && reduc_index == 1)
-               vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
-             else
-               vec_oprnds1[0]
-                 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
-                                                   vec_oprnds1[0]);
-             if (op_type == ternary_op)
-               {
-                 if (single_defuse_cycle && reduc_index == 2)
-                   vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
-                 else
-                   vec_oprnds2[0] 
-                     = vect_get_vec_def_for_stmt_copy (loop_vinfo,
-                                                       vec_oprnds2[0]);
-               }
-            }
-        }
+       {
+         if (op_type == ternary_op)
+           vop[2] = vec_oprnds2[i];
 
-      FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
-        {
-         tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
-         if (masked_loop_p && !mask_by_cond_expr)
+         if (masked_loop_p && mask_by_cond_expr)
            {
-             /* Make sure that the reduction accumulator is vop[0].  */
-             if (reduc_index == 1)
-               {
-                 gcc_assert (commutative_tree_code (code));
-                 std::swap (vop[0], vop[1]);
-               }
              tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
-                                             vectype_in, i * ncopies + j);
-             gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
-                                                       vop[0], vop[1],
-                                                       vop[0]);
-             new_temp = make_ssa_name (vec_dest, call);
-             gimple_call_set_lhs (call, new_temp);
-             gimple_call_set_nothrow (call, true);
-             new_stmt_info
-               = vect_finish_stmt_generation (stmt_info, call, gsi);
-           }
-         else
-           {
-             if (op_type == ternary_op)
-               vop[2] = vec_oprnds2[i];
-
-             if (masked_loop_p && mask_by_cond_expr)
-               {
-                 tree mask = vect_get_loop_mask (gsi, masks,
-                                                 vec_num * ncopies,
-                                                 vectype_in, i * ncopies + j);
-                 build_vect_cond_expr (code, vop, mask, gsi);
-               }
-
-             gassign *new_stmt = gimple_build_assign (vec_dest, code,
-                                                      vop[0], vop[1], vop[2]);
-             new_temp = make_ssa_name (vec_dest, new_stmt);
-             gimple_assign_set_lhs (new_stmt, new_temp);
-             new_stmt_info
-               = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
+                                             vectype_in, i);
+             build_vect_cond_expr (code, vop, mask, gsi);
            }
 
-          if (slp_node)
-           SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
-        }
-
-      if (slp_node || single_defuse_cycle)
-        continue;
+         new_stmt = gimple_build_assign (vec_dest, code,
+                                         vop[0], vop[1], vop[2]);
+         new_temp = make_ssa_name (vec_dest, new_stmt);
+         gimple_assign_set_lhs (new_stmt, new_temp);
+         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
+       }
 
-      if (j == 0)
-       STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
+      if (slp_node)
+       SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
+      else if (single_defuse_cycle
+              && i < ncopies - 1)
+       {
+         if (reduc_index == 0)
+           vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
+         else if (reduc_index == 1)
+           vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
+         else if (reduc_index == 2)
+           vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
+       }
       else
-       STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
-
-      prev_stmt_info = new_stmt_info;
+       STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
     }
 
-  if (single_defuse_cycle && !slp_node)
-    STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
+  if (!slp_node)
+    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
 
   return true;
 }
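
As a scalar model (plain C, not GCC code) of the masked update emitted above: the conditional internal-function call with arguments (mask, vop[0], vop[1], vop[0]) behaves per lane like "acc = mask ? acc OP val : acc", leaving inactive lanes unchanged.

#include <stdbool.h>

/* Per-element model of a masked sum reduction: the accumulator only
   changes where the mask is set.  */
static int
masked_sum (const int *x, const bool *mask, int n)
{
  int acc = 0;
  for (int i = 0; i < n; i++)
    acc = mask[i] ? acc + x[i] : acc;   /* cf. COND_ADD (mask, acc, x[i], acc) */
  return acc;
}
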
@@ -6988,15 +7434,14 @@ vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
 /* Transform phase of a cycle PHI.  */
 
 bool
-vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
+vect_transform_cycle_phi (loop_vec_info loop_vinfo,
+                         stmt_vec_info stmt_info, gimple **vec_stmt,
                          slp_tree slp_node, slp_instance slp_node_instance)
 {
   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   int i;
   int ncopies;
-  stmt_vec_info prev_phi_info;
   int j;
   bool nested_cycle = false;
   int vec_num;
@@ -7009,7 +7454,7 @@ vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
 
   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
-  stmt_vec_info reduc_info = info_for_reduction (stmt_info);
+  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
   gcc_assert (reduc_info->is_reduc_info);
 
   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
@@ -7026,9 +7471,8 @@ vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
   if (slp_node)
     {
       /* The size vect_schedule_slp_instance computes is off for us.  */
-      vec_num = vect_get_num_vectors
-         (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
-          * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
+      vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
+                                     * SLP_TREE_LANES (slp_node), vectype_in);
       ncopies = 1;
     }
   else
@@ -7053,15 +7497,24 @@ vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
   if (slp_node)
     {
       vec_initial_defs.reserve (vec_num);
-      gcc_assert (slp_node == slp_node_instance->reduc_phis);
-      stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
-      tree neutral_op
-       = neutral_op_for_slp_reduction (slp_node, vectype_out,
-                                       STMT_VINFO_REDUC_CODE (reduc_info),
-                                       first != NULL);
-      get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
-                                     &vec_initial_defs, vec_num,
-                                     first != NULL, neutral_op);
+      if (nested_cycle)
+       {
+         unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
+         vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
+                            &vec_initial_defs);
+       }
+      else
+       {
+         gcc_assert (slp_node == slp_node_instance->reduc_phis);
+         stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
+         tree neutral_op
+             = neutral_op_for_slp_reduction (slp_node, vectype_out,
+                                             STMT_VINFO_REDUC_CODE (reduc_info),
+                                             first != NULL);
+         get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
+                                         &vec_initial_defs, vec_num,
+                                         first != NULL, neutral_op);
+       }
     }
   else
     {
@@ -7088,13 +7541,17 @@ vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
              STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
            }
          vec_initial_def = build_vector_from_val (vectype_out, induc_val);
+         vec_initial_defs.create (ncopies);
+         for (i = 0; i < ncopies; ++i)
+           vec_initial_defs.quick_push (vec_initial_def);
        }
       else if (nested_cycle)
        {
          /* Do not use an adjustment def as that case is not supported
             correctly if ncopies is not one.  */
-         vec_initial_def = vect_get_vec_def_for_operand (initial_def,
-                                                         reduc_stmt_info);
+         vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
+                                        ncopies, initial_def,
+                                        &vec_initial_defs);
        }
       else
        {
@@ -7104,16 +7561,16 @@ vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
          if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
            adjustment_defp = NULL;
          vec_initial_def
-           = get_initial_def_for_reduction (reduc_stmt_info, code,
+           = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
                                             initial_def, adjustment_defp);
          STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
+         vec_initial_defs.create (ncopies);
+         for (i = 0; i < ncopies; ++i)
+           vec_initial_defs.quick_push (vec_initial_def);
        }
-      vec_initial_defs.create (1);
-      vec_initial_defs.quick_push (vec_initial_def);
     }
 
   /* Generate the reduction PHIs upfront.  */
-  prev_phi_info = NULL;
   for (i = 0; i < vec_num; i++)
     {
       tree vec_init_def = vec_initial_defs[i];
@@ -7122,26 +7579,22 @@ vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
          /* Create the reduction-phi that defines the reduction
             operand.  */
          gphi *new_phi = create_phi_node (vec_dest, loop->header);
-         stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
 
          /* Set the loop-entry arg of the reduction-phi.  */
          if (j != 0 && nested_cycle)
-           vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
-                                                          vec_init_def);
+           vec_init_def = vec_initial_defs[j];
          add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
                       UNKNOWN_LOCATION);
 
          /* The loop-latch arg is set in epilogue processing.  */
 
          if (slp_node)
-           SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
+           SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
          else
            {
              if (j == 0)
-               STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
-             else
-               STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
-             prev_phi_info = new_phi_info;
+               *vec_stmt = new_phi;
+             STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
            }
        }
     }
@@ -7152,10 +7605,10 @@ vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
 /* Vectorizes LC PHIs.  */
 
 bool
-vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
+vectorizable_lc_phi (loop_vec_info loop_vinfo,
+                    stmt_vec_info stmt_info, gimple **vec_stmt,
                     slp_tree slp_node)
 {
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   if (!loop_vinfo
       || !is_a <gphi *> (stmt_info->stmt)
       || gimple_phi_num_args (stmt_info->stmt) != 1)
@@ -7167,6 +7620,17 @@ vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      /* Deal with copies from externs or constants that are disguised as
+         loop-closed PHI nodes (PR97886).  */
+      if (slp_node
+         && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
+                                               SLP_TREE_VECTYPE (slp_node)))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "incompatible vector types for invariants\n");
+         return false;
+       }
       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
       return true;
     }
@@ -7176,42 +7640,97 @@ vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
   basic_block bb = gimple_bb (stmt_info->stmt);
   edge e = single_pred_edge (bb);
   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
-  vec<tree> vec_oprnds = vNULL;
-  vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
-                    stmt_info, &vec_oprnds, NULL, slp_node);
-  if (slp_node)
+  auto_vec<tree> vec_oprnds;
+  vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
+                    !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
+                    gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
+  for (unsigned i = 0; i < vec_oprnds.length (); i++)
+    {
+      /* Create the vectorized LC PHI node.  */
+      gphi *new_phi = create_phi_node (vec_dest, bb);
+      add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
+      if (slp_node)
+       SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
+      else
+       STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
+    }
+  if (!slp_node)
+    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+
+  return true;
+}
+
+/* Vectorizes PHIs.  */
+
+bool
+vectorizable_phi (vec_info *,
+                 stmt_vec_info stmt_info, gimple **vec_stmt,
+                 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
+{
+  if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
+    return false;
+
+  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
+    return false;
+
+  tree vectype = SLP_TREE_VECTYPE (slp_node);
+
+  if (!vec_stmt) /* transformation not required.  */
     {
-      unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
-      gcc_assert (vec_oprnds.length () == vec_num);
-      for (unsigned i = 0; i < vec_num; i++)
-       {
-         /* Create the vectorized LC PHI node.  */
-         gphi *new_phi = create_phi_node (vec_dest, bb);
-         add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
-         stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
-         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
-       }
+      slp_tree child;
+      unsigned i;
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
+       if (!child)
+         {
+           if (dump_enabled_p ())
+             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                              "PHI node with unvectorized backedge def\n");
+           return false;
+         }
+       else if (!vect_maybe_update_slp_op_vectype (child, vectype))
+         {
+           if (dump_enabled_p ())
+             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                              "incompatible vector types for invariants\n");
+           return false;
+         }
+      record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
+                       vector_stmt, stmt_info, vectype, 0, vect_body);
+      STMT_VINFO_TYPE (stmt_info) = phi_info_type;
+      return true;
     }
-  else
+
+  tree scalar_dest = gimple_phi_result (stmt_info->stmt);
+  basic_block bb = gimple_bb (stmt_info->stmt);
+  tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
+  auto_vec<gphi *> new_phis;
+  for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
     {
-      unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
-      stmt_vec_info prev_phi_info = NULL;
-      for (unsigned i = 0; i < ncopies; i++)
-       {
-         if (i != 0)
-           vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
-         /* Create the vectorized LC PHI node.  */
-         gphi *new_phi = create_phi_node (vec_dest, bb);
-         add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
-         stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
-         if (i == 0)
-           STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
-         else
-           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
-         prev_phi_info = new_phi_info;
+      slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
+
+      /* Skip not yet vectorized defs.  */
+      if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
+         && SLP_TREE_VEC_STMTS (child).is_empty ())
+       continue;
+
+      auto_vec<tree> vec_oprnds;
+      vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
+      if (!new_phis.exists ())
+       {
+         new_phis.create (vec_oprnds.length ());
+         for (unsigned j = 0; j < vec_oprnds.length (); j++)
+           {
+             /* Create the vectorized LC PHI node.  */
+             new_phis.quick_push (create_phi_node (vec_dest, bb));
+             SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
+           }
        }
+      edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
+      for (unsigned j = 0; j < vec_oprnds.length (); j++)
+       add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
     }
-  vec_oprnds.release ();
+  /* We should have at least one already vectorized child.  */
+  gcc_assert (new_phis.exists ());
 
   return true;
 }
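
Editorial aside: the transform loop above fills vector PHI arguments edge by
edge, creating the vector PHI nodes lazily when the first child with
vectorized defs is seen and skipping backedge children that are not
vectorized yet.  A minimal standalone sketch of that fill-as-available
scheme (illustrative names only, no GCC types):

    #include <cstdio>
    #include <string>
    #include <vector>

    int
    main ()
    {
      const unsigned nvects = 2;
      /* Vector defs per incoming edge; the empty entry models a backedge
         def that is not vectorized yet and is skipped for now.  */
      std::vector<std::vector<std::string> > edge_defs
        = { { "va_0", "va_1" }, { } };
      std::vector<std::vector<std::string> > phi_args;
      for (const auto &defs : edge_defs)
        {
          if (defs.empty ())
            continue;                     /* not yet vectorized: skip */
          if (phi_args.empty ())
            phi_args.resize (nvects);     /* create the vector PHIs once */
          for (unsigned j = 0; j < nvects; ++j)
            phi_args[j].push_back (defs[j]);
        }
      for (unsigned j = 0; j < nvects; ++j)
        printf ("vector PHI %u has %zu argument(s)\n",
                j, phi_args[j].size ());
      return 0;
    }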
@@ -7265,12 +7784,11 @@ vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
    Return true if STMT_INFO is vectorizable in this way.  */
 
 bool
-vectorizable_induction (stmt_vec_info stmt_info,
-                       gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
-                       stmt_vec_info *vec_stmt, slp_tree slp_node,
+vectorizable_induction (loop_vec_info loop_vinfo,
+                       stmt_vec_info stmt_info,
+                       gimple **vec_stmt, slp_tree slp_node,
                        stmt_vector_for_cost *cost_vec)
 {
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   unsigned ncopies;
   bool nested_in_vect_loop = false;
@@ -7287,12 +7805,6 @@ vectorizable_induction (stmt_vec_info stmt_info,
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   unsigned i;
   tree expr;
-  gimple_seq stmts;
-  imm_use_iterator imm_iter;
-  use_operand_p use_p;
-  gimple *exit_phi;
-  edge latch_e;
-  tree loop_arg;
   gimple_stmt_iterator si;
 
   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
@@ -7332,10 +7844,6 @@ vectorizable_induction (stmt_vec_info stmt_info,
          return false;
        }
 
-      /* FORNOW: outer loop induction with SLP not supported.  */
-      if (STMT_SLP_TYPE (stmt_info))
-       return false;
-
       exit_phi = NULL;
       latch_e = loop_latch_edge (loop->inner);
       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
@@ -7374,7 +7882,7 @@ vectorizable_induction (stmt_vec_info stmt_info,
 
   if (slp_node && !nunits.is_constant ())
     {
-      /* The current SLP code creates the initial value element-by-element.  */
+      /* The current SLP code creates the step value element-by-element.  */
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "SLP induction not supported for variable-length"
@@ -7384,9 +7892,50 @@ vectorizable_induction (stmt_vec_info stmt_info,
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      unsigned inside_cost = 0, prologue_cost = 0;
+      if (slp_node)
+       {
+         /* We eventually need to set a vector type on invariant
+            arguments.  */
+         unsigned j;
+         slp_tree child;
+         FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
+           if (!vect_maybe_update_slp_op_vectype
+               (child, SLP_TREE_VECTYPE (slp_node)))
+             {
+               if (dump_enabled_p ())
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  "incompatible vector types for "
+                                  "invariants\n");
+               return false;
+             }
+         /* loop cost for vec_loop.  */
+         inside_cost
+           = record_stmt_cost (cost_vec,
+                               SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
+                               vector_stmt, stmt_info, 0, vect_body);
+         /* prologue cost for vec_init (if not nested) and step.  */
+         prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
+                                           scalar_to_vec,
+                                           stmt_info, 0, vect_prologue);
+       }
+      else /* if (!slp_node) */
+       {
+         /* loop cost for vec_loop.  */
+         inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
+                                         stmt_info, 0, vect_body);
+         /* prologue cost for vec_init and vec_step.  */
+         prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
+                                           stmt_info, 0, vect_prologue);
+       }
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "vect_model_induction_cost: inside_cost = %d, "
+                        "prologue_cost = %d .\n", inside_cost,
+                        prologue_cost);
+
       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
       DUMP_VECT_SCOPE ("vectorizable_induction");
-      vect_model_induction_cost (stmt_info, ncopies, cost_vec);
       return true;
     }
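
Editorial aside: the inlined costing above keeps the same accounting the
removed vect_model_induction_cost call performed: one vector_stmt per vector
copy in the loop body, plus scalar_to_vec prologue entries for broadcasting
the step and, when not nested, the initial value.  A minimal standalone
tally with unit weights (illustrative counts, not the target cost hooks):

    #include <cstdio>

    int
    main ()
    {
      bool slp = true, nested = false;
      unsigned nvects = 4, ncopies = 4;           /* made-up counts */
      unsigned inside = slp ? nvects : ncopies;   /* vector_stmt entries */
      unsigned prologue = slp ? 1 + !nested : 2;  /* scalar_to_vec entries */
      printf ("inside_cost = %u, prologue_cost = %u\n", inside, prologue);
      return 0;
    }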
 
@@ -7400,161 +7949,200 @@ vectorizable_induction (stmt_vec_info stmt_info,
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
 
-  latch_e = loop_latch_edge (iv_loop);
-  loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
-
   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
   gcc_assert (step_expr != NULL_TREE);
   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
 
   pe = loop_preheader_edge (iv_loop);
-  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
-                                    loop_preheader_edge (iv_loop));
-
-  stmts = NULL;
-  if (!nested_in_vect_loop)
-    {
-      /* Convert the initial value to the IV update type.  */
-      tree new_type = TREE_TYPE (step_expr);
-      init_expr = gimple_convert (&stmts, new_type, init_expr);
-
-      /* If we are using the loop mask to "peel" for alignment then we need
-        to adjust the start value here.  */
-      tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
-      if (skip_niters != NULL_TREE)
-       {
-         if (FLOAT_TYPE_P (vectype))
-           skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
-                                       skip_niters);
-         else
-           skip_niters = gimple_convert (&stmts, new_type, skip_niters);
-         tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
-                                        skip_niters, step_expr);
-         init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
-                                   init_expr, skip_step);
-       }
-    }
-
-  if (stmts)
-    {
-      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
-      gcc_assert (!new_bb);
-    }
-
   /* Find the first insertion point in the BB.  */
   basic_block bb = gimple_bb (phi);
   si = gsi_after_labels (bb);
 
   /* For SLP induction we have to generate several IVs as for example
-     with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
-     [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
-     [VF*S, VF*S, VF*S, VF*S] for all.  */
+     with group size 3 we need
+       [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
+       [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
   if (slp_node)
     {
       /* Enforced above.  */
       unsigned int const_nunits = nunits.to_constant ();
 
-      /* Generate [VF*S, VF*S, ... ].  */
-      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
+      /* The initial values are vectorized, but any lanes > group_size
+        need adjustment.  */
+      slp_tree init_node
+       = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
+
+      /* Gather steps.  Since we do not vectorize inductions as
+        cycles we have to reconstruct the step from SCEV data.  */
+      unsigned group_size = SLP_TREE_LANES (slp_node);
+      tree *steps = XALLOCAVEC (tree, group_size);
+      tree *inits = XALLOCAVEC (tree, group_size);
+      stmt_vec_info phi_info;
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
        {
-         expr = build_int_cst (integer_type_node, vf);
-         expr = fold_convert (TREE_TYPE (step_expr), expr);
+         steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
+         if (!init_node)
+           inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
+                                          pe->dest_idx);
        }
-      else
-       expr = build_int_cst (TREE_TYPE (step_expr), vf);
-      new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
-                             expr, step_expr);
-      if (! CONSTANT_CLASS_P (new_name))
-       new_name = vect_init_vector (stmt_info, new_name,
-                                    TREE_TYPE (step_expr), NULL);
-      new_vec = build_vector_from_val (step_vectype, new_name);
-      vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
 
       /* Now generate the IVs.  */
-      unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
-      unsigned elts = const_nunits * nvects;
-      unsigned nivs = least_common_multiple (group_size,
-                                            const_nunits) / const_nunits;
-      gcc_assert (elts % group_size == 0);
-      tree elt = init_expr;
+      gcc_assert ((const_nunits * nvects) % group_size == 0);
+      unsigned nivs;
+      if (nested_in_vect_loop)
+       nivs = nvects;
+      else
+       {
+         /* Compute the number of distinct IVs we need.  First reduce
+            group_size if it is a multiple of const_nunits so we get
+            one IV for a group_size of 4 but const_nunits 2.  */
+         unsigned group_sizep = group_size;
+         if (group_sizep % const_nunits == 0)
+           group_sizep = group_sizep / const_nunits;
+         nivs = least_common_multiple (group_sizep,
+                                       const_nunits) / const_nunits;
+       }
+      tree stept = TREE_TYPE (step_vectype);
+      tree lupdate_mul = NULL_TREE;
+      if (!nested_in_vect_loop)
+       {
+         /* The number of iterations covered in one vector iteration.  */
+         unsigned lup_mul = (nvects * const_nunits) / group_size;
+         lupdate_mul
+           = build_vector_from_val (step_vectype,
+                                    SCALAR_FLOAT_TYPE_P (stept)
+                                    ? build_real_from_wide (stept, lup_mul,
+                                                            UNSIGNED)
+                                    : build_int_cstu (stept, lup_mul));
+       }
+      tree peel_mul = NULL_TREE;
+      gimple_seq init_stmts = NULL;
+      if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
+       {
+         if (SCALAR_FLOAT_TYPE_P (stept))
+           peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
+                                    LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
+         else
+           peel_mul = gimple_convert (&init_stmts, stept,
+                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
+         peel_mul = gimple_build_vector_from_val (&init_stmts,
+                                                  step_vectype, peel_mul);
+       }
       unsigned ivn;
+      auto_vec<tree> vec_steps;
       for (ivn = 0; ivn < nivs; ++ivn)
        {
-         tree_vector_builder elts (step_vectype, const_nunits, 1);
-         stmts = NULL;
+         tree_vector_builder step_elts (step_vectype, const_nunits, 1);
+         tree_vector_builder init_elts (vectype, const_nunits, 1);
+         tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
          for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
            {
-             if (ivn*const_nunits + eltn >= group_size
-                 && (ivn * const_nunits + eltn) % group_size == 0)
-               elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
-                                   elt, step_expr);
-             elts.quick_push (elt);
-           }
-         vec_init = gimple_build_vector (&stmts, &elts);
-         vec_init = gimple_convert (&stmts, vectype, vec_init);
-         if (stmts)
-           {
-             new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
-             gcc_assert (!new_bb);
+             /* The scalar steps of the IVs.  */
+             tree elt = steps[(ivn*const_nunits + eltn) % group_size];
+             elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
+             step_elts.quick_push (elt);
+             if (!init_node)
+               {
+                 /* The scalar inits of the IVs if not vectorized.  */
+                 elt = inits[(ivn*const_nunits + eltn) % group_size];
+                 if (!useless_type_conversion_p (TREE_TYPE (vectype),
+                                                 TREE_TYPE (elt)))
+                   elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
+                                       TREE_TYPE (vectype), elt);
+                 init_elts.quick_push (elt);
+               }
+             /* The number of steps to add to the initial values.  */
+             unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
+             mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
+                                  ? build_real_from_wide (stept,
+                                                          mul_elt, UNSIGNED)
+                                  : build_int_cstu (stept, mul_elt));
            }
+         vec_step = gimple_build_vector (&init_stmts, &step_elts);
+         vec_steps.safe_push (vec_step);
+         tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
+         if (peel_mul)
+           step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
+                                    step_mul, peel_mul);
+         if (!init_node)
+           vec_init = gimple_build_vector (&init_stmts, &init_elts);
 
          /* Create the induction-phi that defines the induction-operand.  */
-         vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
+         vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
+                                           "vec_iv_");
          induction_phi = create_phi_node (vec_dest, iv_loop->header);
-         stmt_vec_info induction_phi_info
-           = loop_vinfo->add_stmt (induction_phi);
          induc_def = PHI_RESULT (induction_phi);
 
          /* Create the iv update inside the loop  */
+         tree up = vec_step;
+         if (lupdate_mul)
+           up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+                              vec_step, lupdate_mul);
          gimple_seq stmts = NULL;
          vec_def = gimple_convert (&stmts, step_vectype, induc_def);
          vec_def = gimple_build (&stmts,
-                                 PLUS_EXPR, step_vectype, vec_def, vec_step);
+                                 PLUS_EXPR, step_vectype, vec_def, up);
          vec_def = gimple_convert (&stmts, vectype, vec_def);
-         loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
          gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
+         add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
+                      UNKNOWN_LOCATION);
+
+         if (init_node)
+           vec_init = vect_get_slp_vect_def (init_node, ivn);
+         if (!nested_in_vect_loop
+             && !integer_zerop (step_mul))
+           {
+             vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
+             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+                                vec_step, step_mul);
+             vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
+                                     vec_def, up);
+             vec_init = gimple_convert (&init_stmts, vectype, vec_def);
+           }
 
          /* Set the arguments of the phi node:  */
          add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
-         add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
-                      UNKNOWN_LOCATION);
 
-         SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
+         SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
+       }
+      if (!nested_in_vect_loop)
+       {
+         /* Fill up to the number of vectors we need for the whole group.  */
+         nivs = least_common_multiple (group_size,
+                                       const_nunits) / const_nunits;
+         for (; ivn < nivs; ++ivn)
+           {
+             SLP_TREE_VEC_STMTS (slp_node)
+               .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
+             vec_steps.safe_push (vec_steps[0]);
+           }
        }
 
-      /* Re-use IVs when we can.  */
+      /* Re-use IVs when we can.  We are generating further vector
+        stmts by adding VF' * stride to the IVs generated above.  */
       if (ivn < nvects)
        {
          unsigned vfp
            = least_common_multiple (group_size, const_nunits) / group_size;
-         /* Generate [VF'*S, VF'*S, ... ].  */
-         if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
-           {
-             expr = build_int_cst (integer_type_node, vfp);
-             expr = fold_convert (TREE_TYPE (step_expr), expr);
-           }
-         else
-           expr = build_int_cst (TREE_TYPE (step_expr), vfp);
-         new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
-                                 expr, step_expr);
-         if (! CONSTANT_CLASS_P (new_name))
-           new_name = vect_init_vector (stmt_info, new_name,
-                                        TREE_TYPE (step_expr), NULL);
-         new_vec = build_vector_from_val (step_vectype, new_name);
-         vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
+         tree lupdate_mul
+           = build_vector_from_val (step_vectype,
+                                    SCALAR_FLOAT_TYPE_P (stept)
+                                    ? build_real_from_wide (stept,
+                                                            vfp, UNSIGNED)
+                                    : build_int_cstu (stept, vfp));
          for (; ivn < nvects; ++ivn)
            {
-             gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
-             tree def;
-             if (gimple_code (iv) == GIMPLE_PHI)
-               def = gimple_phi_result (iv);
-             else
-               def = gimple_assign_lhs (iv);
+             gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
+             tree def = gimple_get_lhs (iv);
+             if (ivn < 2*nivs)
+               vec_steps[ivn - nivs]
+                 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+                                 vec_steps[ivn - nivs], lupdate_mul);
              gimple_seq stmts = NULL;
              def = gimple_convert (&stmts, step_vectype, def);
-             def = gimple_build (&stmts,
-                                 PLUS_EXPR, step_vectype, def, vec_step);
+             def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
+                                 def, vec_steps[ivn % nivs]);
              def = gimple_convert (&stmts, vectype, def);
              if (gimple_code (iv) == GIMPLE_PHI)
                gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
@@ -7563,21 +8151,60 @@ vectorizable_induction (stmt_vec_info stmt_info,
                  gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
                  gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
                }
-             SLP_TREE_VEC_STMTS (slp_node).quick_push
-               (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
+             SLP_TREE_VEC_STMTS (slp_node)
+               .quick_push (SSA_NAME_DEF_STMT (def));
            }
        }
 
+      new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
+      gcc_assert (!new_bb);
+
       return true;
     }
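
Editorial aside: to make the lane layout from the group-size-3 comment above
concrete, lane L of the concatenated SLP IV vectors receives
init[L % group_size] + (L / group_size) * step[L % group_size].  A minimal
standalone sketch that prints the three 4-lane vectors (made-up init/step
values, no GCC code):

    #include <cstdio>

    int
    main ()
    {
      const unsigned group_size = 3, nunits = 4, nvects = 3;
      const int init[3] = { 100, 200, 300 };  /* i0, i1, i2 */
      const int step[3] = { 1, 10, 100 };     /* S0, S1, S2 */
      for (unsigned v = 0; v < nvects; ++v)
        {
          printf ("vector %u:", v);
          for (unsigned e = 0; e < nunits; ++e)
            {
              unsigned l = v * nunits + e;    /* global lane number */
              int val = init[l % group_size]
                        + (int) (l / group_size) * step[l % group_size];
              printf (" %d", val);
            }
          printf ("\n");
        }
      return 0;
    }

The first group_size lanes come out as the plain initial values, which is
why further vector stmts can re-use the IVs above by just adding VF' * step.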
 
+  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
+                                    loop_preheader_edge (iv_loop));
+
+  gimple_seq stmts = NULL;
+  if (!nested_in_vect_loop)
+    {
+      /* Convert the initial value to the IV update type.  */
+      tree new_type = TREE_TYPE (step_expr);
+      init_expr = gimple_convert (&stmts, new_type, init_expr);
+
+      /* If we are using the loop mask to "peel" for alignment then we need
+        to adjust the start value here.  */
+      tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+      if (skip_niters != NULL_TREE)
+       {
+         if (FLOAT_TYPE_P (vectype))
+           skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
+                                       skip_niters);
+         else
+           skip_niters = gimple_convert (&stmts, new_type, skip_niters);
+         tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
+                                        skip_niters, step_expr);
+         init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
+                                   init_expr, skip_step);
+       }
+    }
+
+  if (stmts)
+    {
+      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+      gcc_assert (!new_bb);
+    }
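
Editorial aside: the skip_niters adjustment above moves the IV start back by
the number of masked-off ("peeled") iterations, i.e. init - skip_niters *
step.  A minimal standalone check with made-up values:

    #include <cstdio>

    int
    main ()
    {
      long init = 0, step = 4, skip_niters = 3;
      long adjusted_init = init - skip_niters * step;
      printf ("adjusted init = %ld\n", adjusted_init);  /* prints -12 */
      return 0;
    }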
+
   /* Create the vector that holds the initial_value of the induction.  */
   if (nested_in_vect_loop)
     {
       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
         been created during vectorization of previous stmts.  We obtain it
         from the STMT_VINFO_VEC_STMT of the defining stmt.  */
-      vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
+      auto_vec<tree> vec_inits;
+      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
+                                    init_expr, &vec_inits);
+      vec_init = vec_inits[0];
       /* If the initial value is not of proper type, convert it.  */
       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
        {
@@ -7592,7 +8219,6 @@ vectorizable_induction (stmt_vec_info stmt_info,
          new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
                                                 new_stmt);
          gcc_assert (!new_bb);
-         loop_vinfo->add_stmt (new_stmt);
        }
     }
   else
@@ -7680,7 +8306,8 @@ vectorizable_induction (stmt_vec_info stmt_info,
   gcc_assert (CONSTANT_CLASS_P (new_name)
              || TREE_CODE (new_name) == SSA_NAME);
   new_vec = build_vector_from_val (step_vectype, t);
-  vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
+  vec_step = vect_init_vector (loop_vinfo, stmt_info,
+                              new_vec, step_vectype, NULL);
 
 
   /* Create the following def-use cycle:
@@ -7697,7 +8324,6 @@ vectorizable_induction (stmt_vec_info stmt_info,
   /* Create the induction-phi that defines the induction-operand.  */
   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
   induction_phi = create_phi_node (vec_dest, iv_loop->header);
-  stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
   induc_def = PHI_RESULT (induction_phi);
 
   /* Create the iv update inside the loop  */
@@ -7707,14 +8333,14 @@ vectorizable_induction (stmt_vec_info stmt_info,
   vec_def = gimple_convert (&stmts, vectype, vec_def);
   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
   new_stmt = SSA_NAME_DEF_STMT (vec_def);
-  stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
 
   /* Set the arguments of the phi node:  */
   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
               UNKNOWN_LOCATION);
 
-  STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
+  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
+  *vec_stmt = induction_phi;
 
   /* In case that vectorization factor (VF) is bigger than the number
      of elements that we can fit in a vectype (nunits), we have to generate
@@ -7725,7 +8351,6 @@ vectorizable_induction (stmt_vec_info stmt_info,
   if (ncopies > 1)
     {
       gimple_seq seq = NULL;
-      stmt_vec_info prev_stmt_vinfo;
       /* FORNOW. This restriction should be relaxed.  */
       gcc_assert (!nested_in_vect_loop);
 
@@ -7749,10 +8374,10 @@ vectorizable_induction (stmt_vec_info stmt_info,
       gcc_assert (CONSTANT_CLASS_P (new_name)
                  || TREE_CODE (new_name) == SSA_NAME);
       new_vec = build_vector_from_val (step_vectype, t);
-      vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
+      vec_step = vect_init_vector (loop_vinfo, stmt_info,
+                                  new_vec, step_vectype, NULL);
 
       vec_def = induc_def;
-      prev_stmt_vinfo = induction_phi_info;
       for (i = 1; i < ncopies; i++)
        {
          /* vec_i = vec_prev + vec_step  */
@@ -7764,46 +8389,10 @@ vectorizable_induction (stmt_vec_info stmt_info,
  
          gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
          new_stmt = SSA_NAME_DEF_STMT (vec_def);
-         new_stmt_info = loop_vinfo->add_stmt (new_stmt);
-         STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
-         prev_stmt_vinfo = new_stmt_info;
+         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
        }
     }
 
-  if (nested_in_vect_loop)
-    {
-      /* Find the loop-closed exit-phi of the induction, and record
-         the final vector of induction results:  */
-      exit_phi = NULL;
-      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
-        {
-         gimple *use_stmt = USE_STMT (use_p);
-         if (is_gimple_debug (use_stmt))
-           continue;
-
-         if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
-           {
-             exit_phi = use_stmt;
-             break;
-           }
-        }
-      if (exit_phi)
-       {
-         stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
-         /* FORNOW. Currently not supporting the case that an inner-loop induction
-            is not used in the outer-loop (i.e. only outside the outer-loop).  */
-         gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
-                     && !STMT_VINFO_LIVE_P (stmt_vinfo));
-
-         STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_NOTE, vect_location,
-                            "vector of inductions after inner-loop:%G",
-                            new_stmt);
-       }
-    }
-
-
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
                     "transform induction: created def-use cycle: %G%G",
@@ -7818,17 +8407,19 @@ vectorizable_induction (stmt_vec_info stmt_info,
    it can be supported.  */
 
 bool
-vectorizable_live_operation (stmt_vec_info stmt_info,
+vectorizable_live_operation (vec_info *vinfo,
+                            stmt_vec_info stmt_info,
                             gimple_stmt_iterator *gsi,
                             slp_tree slp_node, slp_instance slp_node_instance,
                             int slp_index, bool vec_stmt_p,
-                            stmt_vector_for_cost *)
+                            stmt_vector_for_cost *cost_vec)
 {
-  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
-  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   imm_use_iterator imm_iter;
   tree lhs, lhs_type, bitsize, vec_bitsize;
-  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  tree vectype = (slp_node
+                 ? SLP_TREE_VECTYPE (slp_node)
+                 : STMT_VINFO_VECTYPE (stmt_info));
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   int ncopies;
   gimple *use_stmt;
@@ -7855,21 +8446,21 @@ vectorizable_live_operation (stmt_vec_info stmt_info,
             all involved stmts together.  */
          else if (slp_index != 0)
            return true;
+         else
+           /* For SLP reductions the meta-info is attached to
+              the representative.  */
+           stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
        }
-      stmt_vec_info reduc_info = info_for_reduction (stmt_info);
+      stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
       gcc_assert (reduc_info->is_reduc_info);
       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
          || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
        return true;
-      vect_create_epilog_for_reduction (stmt_info, slp_node,
+      vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
                                        slp_node_instance);
       return true;
     }
 
-  /* FORNOW.  CHECKME.  */
-  if (nested_in_vect_loop_p (loop, stmt_info))
-    return false;
-
   /* If STMT is not relevant and it is a simple assignment and its inputs are
      invariant then it can remain in place, unvectorized.  The original last
      scalar value that it computes will be used.  */
@@ -7892,12 +8483,11 @@ vectorizable_live_operation (stmt_vec_info stmt_info,
     {
       gcc_assert (slp_index >= 0);
 
-      int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
-      int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
-
       /* Get the last occurrence of the scalar index from the concatenation of
         all the slp vectors. Calculate which slp vector it is and the index
         within.  */
+      int num_scalar = SLP_TREE_LANES (slp_node);
+      int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
 
       /* Calculate which vector contains the result, and which lane of
@@ -7915,33 +8505,34 @@ vectorizable_live_operation (stmt_vec_info stmt_info,
   if (!vec_stmt_p)
     {
       /* No transformation required.  */
-      if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
+      if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
        {
          if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
                                               OPTIMIZE_FOR_SPEED))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                "can't use a fully-masked loop because "
-                                "the target doesn't support extract last "
-                                "reduction.\n");
-             LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+                                "can't operate on partial vectors "
+                                "because the target doesn't support extract "
+                                "last reduction.\n");
+             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
            }
          else if (slp_node)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                "can't use a fully-masked loop because an "
-                                "SLP statement is live after the loop.\n");
-             LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+                                "can't operate on partial vectors "
+                                "because an SLP statement is live after "
+                                "the loop.\n");
+             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
            }
          else if (ncopies > 1)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                "can't use a fully-masked loop because"
-                                " ncopies is greater than 1.\n");
-             LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
+                                "can't operate on partial vectors "
+                                "because ncopies is greater than 1.\n");
+             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
            }
          else
            {
@@ -7951,33 +8542,35 @@ vectorizable_live_operation (stmt_vec_info stmt_info,
                                     1, vectype, NULL);
            }
        }
+      /* ???  Enable for loop costing as well.  */
+      if (!loop_vinfo)
+       record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
+                         0, vect_epilogue);
       return true;
     }
 
   /* Use the lhs of the original scalar statement.  */
   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
+                    "stmt %G", stmt);
 
-  lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
-       : gimple_get_lhs (stmt);
+  lhs = gimple_get_lhs (stmt);
   lhs_type = TREE_TYPE (lhs);
 
-  bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
-            ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
-            : TYPE_SIZE (TREE_TYPE (vectype)));
+  bitsize = vector_element_bits_tree (vectype);
   vec_bitsize = TYPE_SIZE (vectype);
 
   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
   tree vec_lhs, bitstart;
+  gimple *vec_stmt;
   if (slp_node)
     {
-      gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
 
       /* Get the correct slp vectorized stmt.  */
-      gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
-      if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
-       vec_lhs = gimple_phi_result (phi);
-      else
-       vec_lhs = gimple_get_lhs (vec_stmt);
+      vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
+      vec_lhs = gimple_get_lhs (vec_stmt);
 
       /* Get entry to use.  */
       bitstart = bitsize_int (vec_index);
@@ -7985,109 +8578,193 @@ vectorizable_live_operation (stmt_vec_info stmt_info,
     }
   else
     {
-      enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
-      vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
-      gcc_checking_assert (ncopies == 1
-                          || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
-
       /* For multiple copies, get the last copy.  */
-      for (int i = 1; i < ncopies; ++i)
-       vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
+      vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
+      vec_lhs = gimple_get_lhs (vec_stmt);
 
       /* Get the last lane in the vector.  */
       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
     }
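
Editorial aside: for the SLP case earlier in this function, the last
occurrence of scalar slp_index sits at concatenated position
num_vec * nunits - num_scalar + slp_index, which then splits into a vector
number and a lane within that vector.  A minimal standalone check with
made-up sizes (the exact GCC helpers are not used here):

    #include <cstdio>

    int
    main ()
    {
      unsigned num_scalar = 3, num_vec = 3, nunits = 4, slp_index = 1;
      unsigned pos = num_vec * nunits - num_scalar + slp_index;
      /* 3*4 - 3 + 1 = 10, i.e. vector 2, lane 2.  */
      printf ("vec_entry = %u, vec_index = %u\n", pos / nunits, pos % nunits);
      return 0;
    }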
 
-  /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
-     requirement, insert one phi node for it.  It looks like:
-        loop;
-       BB:
-        # lhs' = PHI <lhs>
-     ==>
-        loop;
-       BB:
-        # vec_lhs' = PHI <vec_lhs>
-        new_tree = lane_extract <vec_lhs', ...>;
-        lhs' = new_tree;  */
+  if (loop_vinfo)
+    {
+      /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
+        requirement, insert one phi node for it.  It looks like:
+          loop;
+        BB:
+          # lhs' = PHI <lhs>
+        ==>
+          loop;
+        BB:
+          # vec_lhs' = PHI <vec_lhs>
+          new_tree = lane_extract <vec_lhs', ...>;
+          lhs' = new_tree;  */
+
+      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+      basic_block exit_bb = single_exit (loop)->dest;
+      gcc_assert (single_pred_p (exit_bb));
+
+      tree vec_lhs_phi = copy_ssa_name (vec_lhs);
+      gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
+      SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
+
+      gimple_seq stmts = NULL;
+      tree new_tree;
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+       {
+         /* Emit:
 
-  basic_block exit_bb = single_exit (loop)->dest;
-  gcc_assert (single_pred_p (exit_bb));
+              SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
 
-  tree vec_lhs_phi = copy_ssa_name (vec_lhs);
-  gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
-  SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
+            where VEC_LHS is the vectorized live-out result and MASK is
+            the loop mask for the final iteration.  */
+         gcc_assert (ncopies == 1 && !slp_node);
+         tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
+         tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
+                                         1, vectype, 0);
+         tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
+                                         mask, vec_lhs_phi);
 
-  gimple_seq stmts = NULL;
-  tree new_tree;
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
-    {
-      /* Emit:
+         /* Convert the extracted vector element to the scalar type.  */
+         new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+       }
+      else
+       {
+         tree bftype = TREE_TYPE (vectype);
+         if (VECTOR_BOOLEAN_TYPE_P (vectype))
+           bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
+         new_tree = build3 (BIT_FIELD_REF, bftype,
+                            vec_lhs_phi, bitsize, bitstart);
+         new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
+                                          &stmts, true, NULL_TREE);
+       }
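
Editorial aside: the fully-masked path above relies on EXTRACT_LAST
returning the value of the last vector lane whose mask bit is set.  A
minimal standalone model of that semantics (illustrative only; the real
internal function is expanded by the target):

    #include <cstdio>

    static int
    extract_last (const bool *mask, const int *vec, int n)
    {
      int res = 0;
      for (int i = 0; i < n; ++i)
        if (mask[i])
          res = vec[i];   /* the last active lane wins */
      return res;
    }

    int
    main ()
    {
      bool mask[4] = { true, true, true, false };  /* final partial iteration */
      int vec[4] = { 10, 11, 12, 13 };
      printf ("%d\n", extract_last (mask, vec, 4));  /* prints 12 */
      return 0;
    }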
 
-          SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
+      if (stmts)
+       {
+         gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
+         gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
 
-        where VEC_LHS is the vectorized live-out result and MASK is
-        the loop mask for the final iteration.  */
-      gcc_assert (ncopies == 1 && !slp_node);
-      tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
-      tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
-                                     vectype, 0);
-      tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
-                                     mask, vec_lhs_phi);
+         /* Remove existing phi from lhs and create one copy from new_tree.  */
+         tree lhs_phi = NULL_TREE;
+         gimple_stmt_iterator gsi;
+         for (gsi = gsi_start_phis (exit_bb);
+              !gsi_end_p (gsi); gsi_next (&gsi))
+           {
+             gimple *phi = gsi_stmt (gsi);
+             if ((gimple_phi_arg_def (phi, 0) == lhs))
+               {
+                 remove_phi_node (&gsi, false);
+                 lhs_phi = gimple_phi_result (phi);
+                 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
+                 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
+                 break;
+               }
+           }
+       }
 
-      /* Convert the extracted vector element to the required scalar type.  */
-      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+      /* Replace use of lhs with newly computed result.  If the use stmt is a
+        single arg PHI, just replace all uses of PHI result.  It's necessary
+        because lcssa PHI defining lhs may be before newly inserted stmt.  */
+      use_operand_p use_p;
+      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
+           && !is_gimple_debug (use_stmt))
+         {
+           if (gimple_code (use_stmt) == GIMPLE_PHI
+               && gimple_phi_num_args (use_stmt) == 1)
+             {
+               replace_uses_by (gimple_phi_result (use_stmt), new_tree);
+             }
+           else
+             {
+               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+                   SET_USE (use_p, new_tree);
+             }
+           update_stmt (use_stmt);
+         }
     }
   else
     {
+      /* For basic-block vectorization simply insert the lane-extraction.  */
       tree bftype = TREE_TYPE (vectype);
       if (VECTOR_BOOLEAN_TYPE_P (vectype))
        bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
-      new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
+      tree new_tree = build3 (BIT_FIELD_REF, bftype,
+                             vec_lhs, bitsize, bitstart);
+      gimple_seq stmts = NULL;
       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
                                       &stmts, true, NULL_TREE);
-    }
-
-  if (stmts)
-    {
-      gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
-      gsi_insert_before (&exit_gsi, stmts, GSI_CONTINUE_LINKING);
-
-      /* Remove existing phi from lhs and create one copy from new_tree.  */
-      tree lhs_phi = NULL_TREE;
-      gimple_stmt_iterator gsi;
-      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
-       {
-         gimple *phi = gsi_stmt (gsi);
-         if ((gimple_phi_arg_def (phi, 0) == lhs))
-           {
-             remove_phi_node (&gsi, false);
-             lhs_phi = gimple_phi_result (phi);
-             gimple *copy = gimple_build_assign (lhs_phi, new_tree);
-             gsi_insert_after (&exit_gsi, copy, GSI_CONTINUE_LINKING);
-             break;
-           }
-       }
-    }
-
-  /* Replace use of lhs with newly computed result.  If the use stmt is a
-     single arg PHI, just replace all uses of PHI result.  It's necessary
-     because lcssa PHI defining lhs may be before newly inserted stmt.  */
-  use_operand_p use_p;
-  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
-    if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
-       && !is_gimple_debug (use_stmt))
-    {
-      if (gimple_code (use_stmt) == GIMPLE_PHI
-         && gimple_phi_num_args (use_stmt) == 1)
+      if (TREE_CODE (new_tree) == SSA_NAME
+         && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
+       SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
+      if (is_a <gphi *> (vec_stmt))
        {
-         replace_uses_by (gimple_phi_result (use_stmt), new_tree);
+         gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
+         gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
        }
       else
        {
-         FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
-           SET_USE (use_p, new_tree);
+         gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
+         gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
        }
-      update_stmt (use_stmt);
+
+      /* Replace use of lhs with newly computed result.  If the use stmt is a
+        single arg PHI, just replace all uses of PHI result.  It's necessary
+        because lcssa PHI defining lhs may be before newly inserted stmt.  */
+      use_operand_p use_p;
+      stmt_vec_info use_stmt_info;
+      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+       if (!is_gimple_debug (use_stmt)
+           && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
+               || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
+         {
+           /* ???  This can happen when the live lane ends up being
+              used in a vector construction code-generated by an
+              external SLP node (and code-generation for that already
+              happened).  See gcc.dg/vect/bb-slp-47.c.
+              Doing this is what would happen if that vector CTOR
+              were not code-generated yet so it is not too bad.
+              ???  In fact we'd likely want to avoid this situation
+              in the first place.  */
+           if (TREE_CODE (new_tree) == SSA_NAME
+               && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
+               && gimple_code (use_stmt) != GIMPLE_PHI
+               && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
+                                               use_stmt))
+             {
+               enum tree_code code = gimple_assign_rhs_code (use_stmt);
+               gcc_assert (code == CONSTRUCTOR
+                           || code == VIEW_CONVERT_EXPR
+                           || CONVERT_EXPR_CODE_P (code));
+               if (dump_enabled_p ())
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  "Using original scalar computation for "
+                                  "live lane because use precedes vector "
+                                  "def\n");
+               continue;
+             }
+           /* ???  It can also happen that we end up pulling a def into
+              a loop where replacing out-of-loop uses would require
+              a new LC SSA PHI node.  Retain the original scalar in
+              those cases as well.  PR98064.  */
+           if (TREE_CODE (new_tree) == SSA_NAME
+               && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
+               && (gimple_bb (use_stmt)->loop_father
+                   != gimple_bb (vec_stmt)->loop_father)
+               && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
+                                       gimple_bb (use_stmt)->loop_father))
+             {
+               if (dump_enabled_p ())
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  "Using original scalar computation for "
+                                  "live lane because there is an out-of-loop "
+                                  "definition for it\n");
+               continue;
+             }
+           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+             SET_USE (use_p, new_tree);
+           update_stmt (use_stmt);
+         }
     }
 
   return true;
@@ -8196,8 +8873,8 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
 {
   gcc_assert (nvectors != 0);
   if (masks->length () < nvectors)
-    masks->safe_grow_cleared (nvectors);
-  rgroup_masks *rgm = &(*masks)[nvectors - 1];
+    masks->safe_grow_cleared (nvectors, true);
+  rgroup_controls *rgm = &(*masks)[nvectors - 1];
   /* The number of scalars per iteration and the number of vectors are
      both compile-time constants.  */
   unsigned int nscalars_per_iter
@@ -8213,7 +8890,8 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
     {
       rgm->max_nscalars_per_iter = nscalars_per_iter;
-      rgm->mask_type = truth_type_for (vectype);
+      rgm->type = truth_type_for (vectype);
+      rgm->factor = 1;
     }
 }
 
@@ -8228,24 +8906,24 @@ tree
 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
                    unsigned int nvectors, tree vectype, unsigned int index)
 {
-  rgroup_masks *rgm = &(*masks)[nvectors - 1];
-  tree mask_type = rgm->mask_type;
+  rgroup_controls *rgm = &(*masks)[nvectors - 1];
+  tree mask_type = rgm->type;
 
   /* Populate the rgroup's mask array, if this is the first time we've
      used it.  */
-  if (rgm->masks.is_empty ())
+  if (rgm->controls.is_empty ())
     {
-      rgm->masks.safe_grow_cleared (nvectors);
+      rgm->controls.safe_grow_cleared (nvectors, true);
       for (unsigned int i = 0; i < nvectors; ++i)
        {
          tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
          /* Provide a dummy definition until the real one is available.  */
          SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
-         rgm->masks[i] = mask;
+         rgm->controls[i] = mask;
        }
     }
 
-  tree mask = rgm->masks[index];
+  tree mask = rgm->controls[index];
   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
                TYPE_VECTOR_SUBPARTS (vectype)))
     {
@@ -8266,6 +8944,69 @@ vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
   return mask;
 }
 
+/* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
+   lengths for controlling an operation on VECTYPE.  The operation splits
+   each element of VECTYPE into FACTOR separate subelements, measuring the
+   length as a number of these subelements.  */
+
+void
+vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
+                     unsigned int nvectors, tree vectype, unsigned int factor)
+{
+  gcc_assert (nvectors != 0);
+  if (lens->length () < nvectors)
+    lens->safe_grow_cleared (nvectors, true);
+  rgroup_controls *rgl = &(*lens)[nvectors - 1];
+
+  /* The number of scalars per iteration, the scalar occupied bytes and
+     the number of vectors are all compile-time constants.  */
+  unsigned int nscalars_per_iter
+    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
+                LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
+
+  if (rgl->max_nscalars_per_iter < nscalars_per_iter)
+    {
+      /* For now, we only support cases in which all loads and stores fall back
+        to VnQI or none do.  */
+      gcc_assert (!rgl->max_nscalars_per_iter
+                 || (rgl->factor == 1 && factor == 1)
+                 || (rgl->max_nscalars_per_iter * rgl->factor
+                     == nscalars_per_iter * factor));
+      rgl->max_nscalars_per_iter = nscalars_per_iter;
+      rgl->type = vectype;
+      rgl->factor = factor;
+    }
+}
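
Editorial aside: the nscalars_per_iter computation above says that an rgroup
of NVECTORS vectors with NUNITS lanes each covers NVECTORS * NUNITS / VF
scalars per scalar iteration, and a FACTOR > 1 counts each scalar as FACTOR
subelements (the VnQI fallback).  A minimal standalone check with made-up
numbers:

    #include <cassert>
    #include <cstdio>

    int
    main ()
    {
      unsigned vf = 16, nunits = 16, nvectors = 2, factor = 1;
      assert ((nvectors * nunits) % vf == 0);
      unsigned nscalars_per_iter = nvectors * nunits / vf;
      printf ("nscalars_per_iter = %u, length units per iteration = %u\n",
              nscalars_per_iter, nscalars_per_iter * factor);
      return 0;
    }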
+
+/* Given a complete set of length LENS, extract length number INDEX for an
+   rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
+
+tree
+vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
+                  unsigned int nvectors, unsigned int index)
+{
+  rgroup_controls *rgl = &(*lens)[nvectors - 1];
+
+  /* Populate the rgroup's len array, if this is the first time we've
+     used it.  */
+  if (rgl->controls.is_empty ())
+    {
+      rgl->controls.safe_grow_cleared (nvectors, true);
+      for (unsigned int i = 0; i < nvectors; ++i)
+       {
+         tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+         gcc_assert (len_type != NULL_TREE);
+         tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
+
+         /* Provide a dummy definition until the real one is available.  */
+         SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
+         rgl->controls[i] = len;
+       }
+    }
+
+  return rgl->controls[index];
+}
+
 /* Scale profiling counters by estimation for LOOP which is vectorized
    by factor VF.  */
 
@@ -8300,11 +9041,47 @@ scale_profile_for_vect_loop (class loop *loop, unsigned vf)
     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
 }
 
+/* For a vectorized stmt DEF_STMT_INFO, adjust all vectorized PHI
+   latch edge values originally defined by it.  */
+
+static void
+maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
+                                    stmt_vec_info def_stmt_info)
+{
+  tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
+  if (!def || TREE_CODE (def) != SSA_NAME)
+    return;
+  stmt_vec_info phi_info;
+  imm_use_iterator iter;
+  use_operand_p use_p;
+  FOR_EACH_IMM_USE_FAST (use_p, iter, def)
+    if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
+      if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
+         && (phi_info = loop_vinfo->lookup_stmt (phi))
+         && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
+         && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
+         && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
+       {
+         loop_p loop = gimple_bb (phi)->loop_father;
+         edge e = loop_latch_edge (loop);
+         if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
+           {
+             vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
+             vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
+             gcc_assert (phi_defs.length () == latch_defs.length ());
+             for (unsigned i = 0; i < phi_defs.length (); ++i)
+               add_phi_arg (as_a <gphi *> (phi_defs[i]),
+                            gimple_get_lhs (latch_defs[i]), e,
+                            gimple_phi_arg_location (phi, e->dest_idx));
+           }
+       }
+}
+
 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
    stmt_vec_info.  */
 
-static void
+static bool
 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
                          gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
 {
@@ -8320,7 +9097,7 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
 
   if (!STMT_VINFO_RELEVANT_P (stmt_info)
       && !STMT_VINFO_LIVE_P (stmt_info))
-    return;
+    return false;
 
   if (STMT_VINFO_VECTYPE (stmt_info))
     {
@@ -8337,13 +9114,15 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
   /* Pure SLP statements have already been vectorized.  We still need
      to apply loop vectorization to hybrid SLP statements.  */
   if (PURE_SLP_STMT (stmt_info))
-    return;
+    return false;
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
 
-  if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
+  if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
     *seen_store = stmt_info;
+
+  return true;
 }
 
 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
@@ -8392,6 +9171,7 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
   basic_block *epilogue_bbs = get_loop_body (epilogue);
   unsigned i;
 
+  free (LOOP_VINFO_BBS (epilogue_vinfo));
   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
 
   /* Advance data_reference's with the number of iterations of the previous
@@ -8429,6 +9209,8 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
           !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
        {
          new_stmt = gsi_stmt (epilogue_gsi);
+         if (is_gimple_debug (new_stmt))
+           continue;
 
          gcc_assert (gimple_uid (new_stmt) > 0);
          stmt_vinfo
@@ -8493,7 +9275,7 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
     }
 
   struct data_reference *dr;
-  vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
+  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
   FOR_EACH_VEC_ELT (datarefs, i, dr)
     {
       orig_stmt = DR_STMT (dr);
@@ -8625,7 +9407,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
   if (niters_vector == NULL_TREE)
     {
       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-         && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
+         && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
          && known_eq (lowest_vf, vf))
        {
          niters_vector
@@ -8633,9 +9415,15 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
                             LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
          step_vector = build_one_cst (TREE_TYPE (niters));
        }
-      else
+      else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
        vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
                                     &step_vector, niters_no_overflow);
+      else
+       /* vect_do_peeling subtracted the number of peeled prologue
+          iterations from LOOP_VINFO_NITERS.  */
+       vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
+                                    &niters_vector, &step_vector,
+                                    niters_no_overflow);
     }
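For the first branch above (compile-time-known trip count, no partial vectors), the main vector loop executes niters / vf iterations with a step of one, and the left-over iterations are handled by the scalar epilogue. A standalone arithmetic sketch with made-up numbers (plain C, not GCC code):

/* Standalone sketch of the constant-trip-count arithmetic above, using
   hypothetical numbers: 103 scalar iterations and a vectorization
   factor of 8 give 12 vector iterations plus 7 epilogue iterations.  */
#include <stdio.h>

int
main (void)
{
  unsigned long niters = 103;  /* known scalar iteration count */
  unsigned long vf = 8;        /* vectorization factor */

  unsigned long niters_vector = niters / vf;   /* 12 */
  unsigned long step_vector = 1;               /* one step per vector iteration */
  unsigned long epilogue_iters = niters % vf;  /* 7, left to the epilogue loop */

  printf ("niters_vector=%lu step=%lu epilogue=%lu\n",
          niters_vector, step_vector, epilogue_iters);
  return 0;
}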
 
   /* 1) Make sure the loop header has exactly two entries
@@ -8645,8 +9433,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 
   split_edge (loop_preheader_edge (loop));
 
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
-      && vect_use_loop_mask_for_alignment_p (loop_vinfo))
+  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
     /* This will deal with any possible peeling.  */
     vect_prepare_for_masked_peels (loop_vinfo);
 
@@ -8655,7 +9442,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
   if (!loop_vinfo->slp_instances.is_empty ())
     {
       DUMP_VECT_SCOPE ("scheduling SLP instances");
-      vect_schedule_slp (loop_vinfo);
+      vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
     }
 
   /* FORNOW: the vectorizer supports only loops which body consist
@@ -8670,7 +9457,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 
       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
-        {
+       {
          gphi *phi = si.phi ();
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
@@ -8701,10 +9488,31 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
-             vect_transform_stmt (stmt_info, NULL, NULL, NULL);
+             vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
            }
        }
 
+      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
+          gsi_next (&si))
+       {
+         gphi *phi = si.phi ();
+         stmt_info = loop_vinfo->lookup_stmt (phi);
+         if (!stmt_info)
+           continue;
+
+         if (!STMT_VINFO_RELEVANT_P (stmt_info)
+             && !STMT_VINFO_LIVE_P (stmt_info))
+           continue;
+
+         if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
+              || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
+              || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
+              || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
+              || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
+             && ! PURE_SLP_STMT (stmt_info))
+           maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
+       }
+
       for (gimple_stmt_iterator si = gsi_start_bb (bb);
           !gsi_end_p (si);)
        {
@@ -8718,6 +9526,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
            }
          else
            {
+             /* Ignore vector stmts created in the outer loop.  */
              stmt_info = loop_vinfo->lookup_stmt (stmt);
 
              /* vector stmts created in the outer-loop during vectorization of
@@ -8739,11 +9548,18 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
                        }
                      stmt_vec_info pat_stmt_info
                        = STMT_VINFO_RELATED_STMT (stmt_info);
-                     vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
-                                               &seen_store);
+                     if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
+                                                   &si, &seen_store))
+                       maybe_set_vectorized_backedge_value (loop_vinfo,
+                                                            pat_stmt_info);
+                   }
+                 else
+                   {
+                     if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
+                                                   &seen_store))
+                       maybe_set_vectorized_backedge_value (loop_vinfo,
+                                                            stmt_info);
                    }
-                 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
-                                           &seen_store);
                }
              gsi_next (&si);
              if (seen_store)
@@ -8752,7 +9568,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
                    /* Interleaving.  If IS_STORE is TRUE, the
                       vectorization of the interleaving chain was
                       completed - free all the stores in the chain.  */
-                   vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
+                   vect_remove_stores (loop_vinfo,
+                                       DR_GROUP_FIRST_ELEMENT (seen_store));
                  else
                    /* Free the attached stmt_vec_info and remove the stmt.  */
                    loop_vinfo->remove_stmt (stmt_info);
@@ -8792,7 +9609,8 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 
   /* True if the final iteration might not handle a full vector's
      worth of scalar iterations.  */
-  bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
+  bool final_iter_may_be_partial
+    = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
   /* The minimum number of iterations performed by the epilogue.  This
      is 1 when peeling for gaps because we always need a final scalar
      iteration.  */
@@ -8803,7 +9621,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
   int bias_for_lowest = 1 - min_epilogue_iters;
   int bias_for_assumed = bias_for_lowest;
   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
-  if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
     {
       /* When the amount of peeling is known at compile time, the first
         iteration will have exactly alignment_npeels active elements.
@@ -8866,7 +9684,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
      won't work.  */
   slp_instance instance;
   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
-    vect_free_slp_instance (instance, true);
+    vect_free_slp_instance (instance);
   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
   /* Clear-up safelen field since its value is invalid after vectorization
      since vectorized loop can have loop-carried dependencies.  */
@@ -9095,12 +9913,13 @@ optimize_mask_stores (class loop *loop)
 }
 
 /* Decide whether it is possible to use a zero-based induction variable
-   when vectorizing LOOP_VINFO with a fully-masked loop.  If it is,
-   return the value that the induction variable must be able to hold
-   in order to ensure that the loop ends with an all-false mask.
+   when vectorizing LOOP_VINFO with partial vectors.  If it is, return
+   the value that the induction variable must be able to hold in order
+   to ensure that the rgroups eventually have no active vector elements.
    Return -1 otherwise.  */
+
 widest_int
-vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
+vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
 {
   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -9135,3 +9954,25 @@ vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
   return iv_limit;
 }
 
+/* For the given rgroup_controls RGC, check whether an induction variable
+   would ever hit a value that produces a set of all-false masks or zero
+   lengths before wrapping around.  Return true if the induction variable
+   might wrap around before hitting such a value, otherwise return false.  */
+
+bool
+vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
+{
+  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
+
+  if (iv_limit == -1)
+    return true;
+
+  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+  unsigned int compare_precision = TYPE_PRECISION (compare_type);
+  unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
+
+  if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
+    return true;
+
+  return false;
+}
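Reading the check above concretely: the rgroup's IV must be able to reach iv_limit * nitems (iv_limit scalar iterations from vect_iv_limit_for_partial_vectors, nitems items added per vector iteration), and if that product needs more bits than LOOP_VINFO_RGROUP_COMPARE_TYPE provides, the IV might wrap. Below is a standalone sketch using plain 64-bit arithmetic in place of wide ints; the helper names and the numbers are hypothetical, with min_unsigned_precision standing in for wi::min_precision on small values:

/* Standalone sketch (not GCC code) of the wrap-around check in
   vect_rgroup_iv_might_wrap_p, using 64-bit arithmetic instead of
   wide ints.  */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Minimum number of bits needed to represent VAL as an unsigned value;
   a small-value stand-in for wi::min_precision (..., UNSIGNED).  */
static unsigned
min_unsigned_precision (uint64_t val)
{
  unsigned bits = 0;
  while (val)
    {
      bits++;
      val >>= 1;
    }
  return bits;
}

/* Return true if an IV bounded by IV_LIMIT scalar iterations, advancing
   by NITEMS items per iteration, might wrap a COMPARE_PRECISION-bit
   counter.  */
static bool
rgroup_iv_might_wrap_p (uint64_t iv_limit, unsigned nitems,
                        unsigned compare_precision)
{
  return min_unsigned_precision (iv_limit * nitems) > compare_precision;
}

int
main (void)
{
  /* 65536 iterations, 4 items per iteration, 16-bit compare type:
     262144 needs 19 bits, so the IV might wrap.  */
  printf ("%d\n", rgroup_iv_might_wrap_p (65536, 4, 16));  /* 1 */
  /* The same rgroup with a 32-bit compare type cannot wrap.  */
  printf ("%d\n", rgroup_iv_might_wrap_p (65536, 4, 32));  /* 0 */
  return 0;
}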