diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index fb56ca48b68c0b83a8e45c7b5e7692b7547bd4bc..72bbec4b45d225d73eddfaca69468cd23842a42a 100644
@@ -666,27 +666,50 @@ vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
   unsigned i;
 
   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
-    if (STMT_VINFO_IN_PATTERN_P (first))
-      {
-       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
-       while (next)
-         {
-           if (! STMT_VINFO_IN_PATTERN_P (next)
-               || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
-             break;
-           next = REDUC_GROUP_NEXT_ELEMENT (next);
-         }
-       /* If not all stmt in the chain are patterns or if we failed
-          to update STMT_VINFO_REDUC_IDX try to handle the chain
-          without patterns.  */
-       if (! next
-           && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
-         {
-           vect_fixup_reduc_chain (first);
-           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
-             = STMT_VINFO_RELATED_STMT (first);
-         }
-      }
+    {
+      stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
+      while (next)
+       {
+         if ((STMT_VINFO_IN_PATTERN_P (next)
+              != STMT_VINFO_IN_PATTERN_P (first))
+             || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
+           break;
+         next = REDUC_GROUP_NEXT_ELEMENT (next);
+       }
+      /* If all reduction chain members are well-formed patterns, adjust
+        the group to group the pattern stmts instead.  */
+      if (! next
+         && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
+       {
+         if (STMT_VINFO_IN_PATTERN_P (first))
+           {
+             vect_fixup_reduc_chain (first);
+             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
+               = STMT_VINFO_RELATED_STMT (first);
+           }
+       }
+      /* If not all stmts in the chain are patterns, or if we failed
+        to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
+        it as a regular reduction instead.  */
+      else
+       {
+         stmt_vec_info vinfo = first;
+         stmt_vec_info last = NULL;
+         while (vinfo)
+           {
+             next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
+             REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
+             REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
+             last = vinfo;
+             vinfo = next;
+           }
+         STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
+           = vect_internal_def;
+         loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
+         LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
+         --i;
+       }
+    }
 }
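
For orientation, a reduction chain here is a group of statements in one loop iteration that all accumulate into the same scalar.  A hypothetical source-level example (not part of this patch) with a three-element chain:

  int
  sum_chain (const int *a, const int *b, const int *c, int n)
  {
    int sum = 0;
    for (int i = 0; i < n; i++)
      {
        sum += a[i];   /* chain element 1 */
        sum += b[i];   /* chain element 2 */
        sum += c[i];   /* chain element 3 */
      }
    return sum;
  }

If pattern recognition rewrote only some of the chain members, or STMT_VINFO_REDUC_IDX could not be updated for them, the new code unlinks the group and records the last chain element as a regular reduction instead of leaving the chain half-converted.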
 
 /* Function vect_get_loop_niters.
@@ -814,8 +837,9 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     vec_outside_cost (0),
     vec_inside_cost (0),
     vectorizable (false),
-    can_use_partial_vectors_p (true),
+    can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
     using_partial_vectors_p (false),
+    epil_using_partial_vectors_p (false),
     peeling_for_gaps (false),
     peeling_for_niter (false),
     no_data_dependencies (false),
@@ -898,6 +922,7 @@ _loop_vec_info::~_loop_vec_info ()
   free (bbs);
 
   release_vec_loop_controls (&masks);
+  release_vec_loop_controls (&lens);
   delete ivexpr_map;
   delete scan_map;
   epilogue_vinfos.release ();
@@ -989,6 +1014,51 @@ vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
   return wi::min_precision (max_ni * factor, UNSIGNED);
 }
 
+/* True if the loop needs peeling or partial vectors when vectorized.  */
+
+static bool
+vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
+{
+  unsigned HOST_WIDE_INT const_vf;
+  HOST_WIDE_INT max_niter
+    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
+
+  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
+  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
+    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
+                                         (loop_vinfo));
+
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+    {
+      /* Work out the (constant) number of iterations that need to be
+        peeled for reasons other than niters.  */
+      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+       peel_niter += 1;
+      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
+                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
+       return true;
+    }
+  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+      /* ??? When peeling for gaps but not alignment, we could
+        try to check whether the (variable) niters is known to be
+        VF * N + 1.  That's something of a niche case though.  */
+      || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+      || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
+      || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
+          < (unsigned) exact_log2 (const_vf))
+         /* In case of versioning, check if the maximum number of
+            iterations is greater than th.  If they are identical,
+            the epilogue is unnecessary.  */
+         && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+             || ((unsigned HOST_WIDE_INT) max_niter
+                 > (th / const_vf) * const_vf))))
+    return true;
+
+  return false;
+}
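
For the constant-niters branch above, the test reduces to a divisibility check: after subtracting the prologue iterations peeled for alignment and the single iteration peeled for gaps, the remainder must be a multiple of the vectorization factor.  A standalone sketch with made-up numbers:

  #include <stdio.h>

  int
  main (void)
  {
    unsigned niters = 100;           /* known scalar iteration count */
    unsigned peel_for_alignment = 3; /* prologue iterations for alignment */
    unsigned peel_for_gaps = 1;      /* one extra iteration peeled for gaps */
    unsigned vf = 8;                 /* vectorization factor */

    unsigned peel_niter = peel_for_alignment + peel_for_gaps;
    int need_more = (niters - peel_niter) % vf != 0;
    printf ("%u iterations remain; epilogue or partial vectors needed: %s\n",
            niters - peel_niter, need_more ? "yes" : "no");
    return 0;
  }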
+
 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
    whether we can actually generate the masks required.  Return true if so,
    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
@@ -1072,6 +1142,81 @@ vect_verify_full_masking (loop_vec_info loop_vinfo)
   return true;
 }
 
+/* Check whether we can use vector access with length based on precision
+   comparison.  So far, to keep it simple, we only allow the case in which
+   the precision of the target-supported length is larger than the precision
+   required by loop niters.  */
+
+static bool
+vect_verify_loop_lens (loop_vec_info loop_vinfo)
+{
+  if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
+    return false;
+
+  unsigned int max_nitems_per_iter = 1;
+  unsigned int i;
+  rgroup_controls *rgl;
+  /* Find the maximum number of items per iteration for every rgroup.  */
+  FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
+    {
+      unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
+      max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
+    }
+
+  /* Work out how many bits we need to represent the length limit.  */
+  unsigned int min_ni_prec
+    = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
+
+  /* Now use the maximum of the precisions below for one suitable IV type:
+     - the IV's natural precision
+     - the precision needed to hold the maximum number of scalar
+       iterations multiplied by the scale factor (min_ni_prec above)
+     - the Pmode precision
+
+     If min_ni_prec is less than the precision of the current niters,
+     we prefer to still use the niters type.  Prefer Pmode and wider IVs
+     to avoid narrow conversions.  */
+
+  unsigned int ni_prec
+    = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
+  min_ni_prec = MAX (min_ni_prec, ni_prec);
+  min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
+
+  tree iv_type = NULL_TREE;
+  opt_scalar_int_mode tmode_iter;
+  FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
+    {
+      scalar_mode tmode = tmode_iter.require ();
+      unsigned int tbits = GET_MODE_BITSIZE (tmode);
+
+      /* ??? Do we really want to construct one IV whose precision exceeds
+        BITS_PER_WORD?  */
+      if (tbits > BITS_PER_WORD)
+       break;
+
+      /* Find the first available standard integral type.  */
+      if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
+       {
+         iv_type = build_nonstandard_integer_type (tbits, true);
+         break;
+       }
+    }
+
+  if (!iv_type)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "can't vectorize with length-based partial vectors"
+                        " because there is no suitable iv type.\n");
+      return false;
+    }
+
+  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
+  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
+
+  return true;
+}
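
The IV type chosen above is, in effect, the narrowest standard integer width that is at least as wide as min_ni_prec, the niters type and Pmode, without exceeding the word size.  A standalone sketch of that selection with made-up widths:

  #include <stdio.h>

  int
  main (void)
  {
    unsigned min_ni_prec = 20;   /* bits for max scalar iters * factor */
    unsigned ni_prec = 32;       /* precision of the niters type */
    unsigned pmode_prec = 64;    /* GET_MODE_BITSIZE (Pmode) */
    unsigned bits_per_word = 64; /* BITS_PER_WORD */

    unsigned need = min_ni_prec;
    if (ni_prec > need)
      need = ni_prec;
    if (pmode_prec > need)
      need = pmode_prec;

    /* Walk the standard integer widths, smallest first.  */
    static const unsigned widths[] = { 8, 16, 32, 64, 128 };
    unsigned chosen = 0;
    for (unsigned i = 0; i < sizeof (widths) / sizeof (widths[0]); i++)
      {
        if (widths[i] > bits_per_word)
          break;
        if (widths[i] >= need)
          {
            chosen = widths[i];
            break;
          }
      }

    if (chosen)
      printf ("use a %u-bit unsigned IV for the lengths\n", chosen);
    else
      printf ("no suitable IV type; reject length-based partial vectors\n");
    return 0;
  }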
+
 /* Calculate the cost of one scalar iteration of the loop.  */
 static void
 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
@@ -1682,6 +1827,19 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
        }
     }
 
+  /* If using the "very cheap" model, reject cases in which we'd keep
+     a copy of the scalar code (even if we might be able to vectorize it).  */
+  if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
+      && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+         || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+         || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "some scalar iterations would need to be peeled\n");
+      return 0;
+    }
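
A hypothetical loop that this check would reject under the very-cheap model: with an unknown trip count and no usable partial vectors, the leftover iterations need a scalar epilogue (PEELING_FOR_NITER), so a copy of the scalar loop would be kept alongside the vector one.

  void
  scale (float *x, float k, int n)
  {
    for (int i = 0; i < n; i++)
      x[i] *= k;
  }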
+
   int min_profitable_iters, min_profitable_estimate;
   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
                                      &min_profitable_estimate);
@@ -1740,6 +1898,20 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
       min_profitable_estimate = min_profitable_iters;
     }
 
+  /* If the vector loop needs multiple iterations to be beneficial then
+     things are probably too close to call, and the conservative thing
+     would be to stick with the scalar code.  */
+  if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
+      && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "one iteration of the vector loop would be"
+                        " more expensive than the equivalent number of"
+                        " iterations of the scalar loop\n");
+      return 0;
+    }
+
   HOST_WIDE_INT estimated_niter;
 
   /* If we are vectorizing an epilogue then we know the maximum number of
@@ -1788,7 +1960,8 @@ vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
        if (is_gimple_debug (stmt))
          continue;
        ++(*n_stmts);
-       opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
+       opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
+                                                       NULL, 0);
        if (!res)
          {
            if (is_gimple_call (stmt) && loop->safelen)
@@ -1880,56 +2053,123 @@ vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
     }
 }
 
+/* Determine if operating on full vectors for LOOP_VINFO might leave
+   some scalar iterations still to do.  If so, decide how we should
+   handle those scalar iterations.  The possibilities are:
 
-/* Decides whether we need to create an epilogue loop to handle
-   remaining scalar iterations and sets PEELING_FOR_NITERS accordingly.  */
+   (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
+       In this case:
 
-void
-determine_peel_for_niter (loop_vec_info loop_vinfo)
+        LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
+        LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
+        LOOP_VINFO_PEELING_FOR_NITER == false
+
+   (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
+       to handle the remaining scalar iterations.  In this case:
+
+        LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
+        LOOP_VINFO_PEELING_FOR_NITER == true
+
+       There are two choices:
+
+       (2a) Consider vectorizing the epilogue loop at the same VF as the
+           main loop, but using partial vectors instead of full vectors.
+           In this case:
+
+             LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
+
+       (2b) Consider vectorizing the epilogue loop at lower VFs only.
+           In this case:
+
+             LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
+
+   When FOR_EPILOGUE_P is true, make this determination based on the
+   assumption that LOOP_VINFO is an epilogue loop, otherwise make it
+   based on the assumption that LOOP_VINFO is the main loop.  The caller
+   has made sure that the number of iterations is set appropriately for
+   this value of FOR_EPILOGUE_P.  */
+
+opt_result
+vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
+                                           bool for_epilogue_p)
 {
-  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
+  /* Determine whether there would be any scalar iterations left over.  */
+  bool need_peeling_or_partial_vectors_p
+    = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
+
+  /* Decide whether to vectorize the loop with partial vectors.  */
+  LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+  LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && need_peeling_or_partial_vectors_p)
+    {
+      /* For partial-vector-usage=1, try to push the handling of partial
+        vectors to the epilogue, with the main loop continuing to operate
+        on full vectors.
+
+        ??? We could then end up failing to use partial vectors if we
+        decide to peel iterations into a prologue, and if the main loop
+        then ends up processing fewer than VF iterations.  */
+      if (param_vect_partial_vector_usage == 1
+         && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+         && !vect_known_niters_smaller_than_vf (loop_vinfo))
+       LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+      else
+       LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+    }
 
-  unsigned HOST_WIDE_INT const_vf;
-  HOST_WIDE_INT max_niter
-    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
+  if (dump_enabled_p ())
+    {
+      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "operating on partial vectors%s.\n",
+                        for_epilogue_p ? " for epilogue loop" : "");
+      else
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "operating only on full vectors%s.\n",
+                        for_epilogue_p ? " for epilogue loop" : "");
+    }
 
-  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
-  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
-    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
-                                         (loop_vinfo));
+  if (for_epilogue_p)
+    {
+      loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
+      gcc_assert (orig_loop_vinfo);
+      if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+       gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+                             LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
+    }
 
-  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
-    /* The main loop handles all iterations.  */
-    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
-  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-          && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
     {
-      /* Work out the (constant) number of iterations that need to be
-        peeled for reasons other than niters.  */
-      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
-      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
-       peel_niter += 1;
-      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
-                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
-       LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
+      /* Check that the loop processes at least one full vector.  */
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
+      if (known_lt (wi::to_widest (scalar_niters), vf))
+       return opt_result::failure_at (vect_location,
+                                      "loop does not have enough iterations"
+                                      " to support vectorization.\n");
+
+      /* If we need to peel an extra epilogue iteration to handle data
+        accesses with gaps, check that there are enough scalar iterations
+        available.
+
+        The check above is redundant with this one when peeling for gaps,
+        but the distinction is useful for diagnostics.  */
+      tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+         && known_lt (wi::to_widest (scalar_nitersm1), vf))
+       return opt_result::failure_at (vect_location,
+                                      "loop does not have enough iterations"
+                                      " to support peeling for gaps.\n");
     }
-  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
-          /* ??? When peeling for gaps but not alignment, we could
-             try to check whether the (variable) niters is known to be
-             VF * N + 1.  That's something of a niche case though.  */
-          || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
-          || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
-          || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
-               < (unsigned) exact_log2 (const_vf))
-              /* In case of versioning, check if the maximum number of
-                 iterations is greater than th.  If they are identical,
-                 the epilogue is unnecessary.  */
-              && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
-                  || ((unsigned HOST_WIDE_INT) max_niter
-                      > (th / const_vf) * const_vf))))
-    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
-}
 
+  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
+    = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+       && need_peeling_or_partial_vectors_p);
+
+  return opt_result::success ();
+}
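
The decision above interacts with the vect-partial-vector-usage --param: the can_use_partial_vectors_p field is initialized from that param being nonzero, and a value of 1 steers partial vectors towards the epilogue while keeping the main loop on full vectors.  A standalone sketch of the same control flow, with names that only mirror the LOOP_VINFO_* flags:

  #include <stdbool.h>
  #include <stdio.h>

  struct decision { bool use_pv, epil_use_pv, peel_for_niter; };

  static struct decision
  decide (bool can_use_pv, bool need_peel_or_pv, int usage,
          bool is_epilogue, bool niters_known_lt_vf)
  {
    struct decision d = { false, false, false };
    if (can_use_pv && need_peel_or_pv)
      {
        if (usage == 1 && !is_epilogue && !niters_known_lt_vf)
          d.epil_use_pv = true;  /* main loop stays on full vectors */
        else
          d.use_pv = true;       /* the loop itself uses partial vectors */
      }
    d.peel_for_niter = !d.use_pv && need_peel_or_pv;
    return d;
  }

  int
  main (void)
  {
    struct decision d = decide (true, true, /*usage=*/1, false, false);
    printf ("partial=%d epil-partial=%d peel-for-niter=%d\n",
            d.use_pv, d.epil_use_pv, d.peel_for_niter);
    return 0;
  }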
 
 /* Function vect_analyze_loop_2.
 
@@ -2010,7 +2250,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
   /* Analyze the access patterns of the data-refs in the loop (consecutive,
      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
 
-  ok = vect_analyze_data_ref_accesses (loop_vinfo);
+  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
   if (!ok)
     {
       if (dump_enabled_p ())
@@ -2085,6 +2325,9 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
 
       /* Optimize the SLP graph with the vectorization factor fixed.  */
       vect_optimize_slp (loop_vinfo);
+
+      /* Gather the loads reachable from the SLP graph entries.  */
+      vect_gather_slp_loads (loop_vinfo);
     }
 
   bool saved_can_use_partial_vectors_p
@@ -2136,8 +2379,6 @@ start_over:
     /* This pass will decide on using loop versioning and/or loop peeling in
        order to enhance the alignment of data references in the loop.  */
     ok = vect_enhance_data_refs_alignment (loop_vinfo);
-  else
-    ok = vect_verify_datarefs_alignment (loop_vinfo);
   if (!ok)
     return ok;
 
@@ -2154,6 +2395,79 @@ start_over:
                                       "unsupported SLP instances\n");
          goto again;
        }
+
+      /* Check whether any load in ALL SLP instances is possibly permuted.  */
+      slp_tree load_node, slp_root;
+      unsigned i, x;
+      slp_instance instance;
+      bool can_use_lanes = true;
+      FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
+       {
+         slp_root = SLP_INSTANCE_TREE (instance);
+         int group_size = SLP_TREE_LANES (slp_root);
+         tree vectype = SLP_TREE_VECTYPE (slp_root);
+         bool loads_permuted = false;
+         FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
+           {
+             if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
+               continue;
+             unsigned j;
+             stmt_vec_info load_info;
+             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
+               if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
+                 {
+                   loads_permuted = true;
+                   break;
+                 }
+           }
+
+         /* If the loads and stores can be handled with load/store-lane
+            instructions, record it and move on to the next instance.  */
+         if (loads_permuted
+             && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
+             && vect_store_lanes_supported (vectype, group_size, false))
+           {
+             FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
+               {
+                 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
+                     (SLP_TREE_SCALAR_STMTS (load_node)[0]);
+                 /* Use SLP for strided accesses (or if we can't use
+                    load-lanes).  */
+                 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
+                     || ! vect_load_lanes_supported
+                           (STMT_VINFO_VECTYPE (stmt_vinfo),
+                            DR_GROUP_SIZE (stmt_vinfo), false))
+                   break;
+               }
+
+             can_use_lanes
+               = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
+
+             if (can_use_lanes && dump_enabled_p ())
+               dump_printf_loc (MSG_NOTE, vect_location,
+                                "SLP instance %p can use load/store-lanes\n",
+                                instance);
+           }
+         else
+           {
+             can_use_lanes = false;
+             break;
+           }
+       }
+
+      /* If all SLP instances can use load/store-lanes, abort SLP and try
+        again with SLP disabled.  */
+      if (can_use_lanes)
+       {
+         ok = opt_result::failure_at (vect_location,
+                                      "Built SLP cancelled: can use "
+                                      "load/store-lanes\n");
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "Built SLP cancelled: all SLP instances support "
+                            "load/store-lanes\n");
+         goto again;
+       }
     }
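
A hypothetical kernel of the kind the cancellation above targets: the loads feeding an interleaved store are cross-lane permuted under SLP, but a target with two-lane load/store-lanes instructions can vectorize the same accesses without any permutation, so dropping SLP and retrying is expected to be at least as good.

  void
  cross_add (float *out, const float *a, const float *b, int n)
  {
    for (int i = 0; i < n; i++)
      {
        out[2*i]     = a[2*i]     + b[2*i + 1];
        out[2*i + 1] = a[2*i + 1] + b[2*i];
      }
  }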
 
   /* Dissolve SLP-only groups.  */
@@ -2170,48 +2484,47 @@ start_over:
       return ok;
     }
 
-  /* Decide whether to use a fully-masked loop for this vectorization
-     factor.  */
-  LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
-    = (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
-       && vect_verify_full_masking (loop_vinfo));
-  if (dump_enabled_p ())
+  /* For now, we don't expect to mix both masking and length approaches for
+     one loop, so disable the use of partial vectors if both are recorded.  */
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
+      && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
     {
-      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "operating on partial vectors.\n");
-      else
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "operating only on full vectors.\n");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "can't vectorize a loop with partial vectors"
+                        " because we don't expect to mix different"
+                        " approaches with partial vectors for the"
+                        " same loop.\n");
+      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
     }
 
-  /* If epilog loop is required because of data accesses with gaps,
-     one additional iteration needs to be peeled.  Check if there is
-     enough iterations for vectorization.  */
-  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
-      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
-    {
-      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-      tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
-
-      if (known_lt (wi::to_widest (scalar_niters), vf))
-       return opt_result::failure_at (vect_location,
-                                      "loop has no enough iterations to"
-                                      " support peeling for gaps.\n");
-    }
+  /* If we still have the option of using partial vectors,
+     check whether we can generate the necessary loop controls.  */
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && !vect_verify_full_masking (loop_vinfo)
+      && !vect_verify_loop_lens (loop_vinfo))
+    LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
 
   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
      to be able to handle fewer than VF scalars, or needs to have a lower VF
      than the main loop.  */
   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
-      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+      && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
                   LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
     return opt_result::failure_at (vect_location,
                                   "Vectorization factor too high for"
                                   " epilogue loop.\n");
 
+  /* Decide whether this loop_vinfo should use partial vectors or peeling,
+     assuming that the loop will be used as a main loop.  We will redo
+     this analysis later if we instead decide to use the loop as an
+     epilogue loop.  */
+  ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
+  if (!ok)
+    return ok;
+
   /* Check the costings of the loop make vectorizing worthwhile.  */
   res = vect_analyze_loop_costing (loop_vinfo);
   if (res < 0)
@@ -2224,7 +2537,6 @@ start_over:
     return opt_result::failure_at (vect_location,
                                   "Loop costings not worthwhile.\n");
 
-  determine_peel_for_niter (loop_vinfo);
   /* If an epilogue loop is required make sure we can create one.  */
   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
@@ -2354,7 +2666,7 @@ again:
   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
   /* Free the SLP instances.  */
   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
-    vect_free_slp_instance (instance, false);
+    vect_free_slp_instance (instance);
   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
   /* Reset SLP type to loop_vect on all stmts.  */
   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
@@ -2406,6 +2718,7 @@ again:
     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
   /* Reset accumulated rgroup information.  */
   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
+  release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
   /* Reset assorted flags.  */
   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
@@ -2692,7 +3005,10 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
                lowest_th = ordered_min (lowest_th, th);
            }
          else
-           delete loop_vinfo;
+           {
+             delete loop_vinfo;
+             loop_vinfo = opt_loop_vec_info::success (NULL);
+           }
 
          /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
             enabled, SIMDUID is not set, it is the innermost loop and we have
@@ -2717,6 +3033,7 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
       else
        {
          delete loop_vinfo;
+         loop_vinfo = opt_loop_vec_info::success (NULL);
          if (fatal)
            {
              gcc_checking_assert (first_loop_vinfo == NULL);
@@ -2724,6 +3041,23 @@ vect_analyze_loop (class loop *loop, vec_info_shared *shared)
            }
        }
 
+      /* Handle the case in which the original loop can use partial
+        vectorization, but we only want to adopt it for the epilogue.
+        The retry should use the same vector mode as the original.  */
+      if (vect_epilogues
+         && loop_vinfo
+         && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
+       {
+         gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+                     && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "***** Re-trying analysis with same vector mode"
+                            " %s for epilogue with partial vectors.\n",
+                            GET_MODE_NAME (loop_vinfo->vector_mode));
+         continue;
+       }
+
       if (mode_i < vector_modes.length ()
          && VECTOR_MODE_P (autodetected_vector_mode)
          && (related_vector_mode (vector_modes[mode_i],
@@ -3022,14 +3356,17 @@ pop:
          fail = true;
          break;
        }
-      /* Check there's only a single stmt the op is used on inside
-         of the loop.  */
+      /* Check that the op is used in only a single stmt.  For the
+        non-value-changing tail and the last stmt, allow out-of-loop uses.
+        ???  We could relax this and handle arbitrary live stmts by
+        forcing a scalar epilogue for example.  */
       imm_use_iterator imm_iter;
       gimple *op_use_stmt;
       unsigned cnt = 0;
       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
        if (!is_gimple_debug (op_use_stmt)
-           && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
+           && (*code != ERROR_MARK
+               || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
          {
            /* We want to allow x + x but not x < 1 ? x : 2.  */
            if (is_gimple_assign (op_use_stmt)
@@ -3340,42 +3677,58 @@ vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
   return NULL;
 }
 
-/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
-int
-vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
-                             int *peel_iters_epilogue,
-                             stmt_vector_for_cost *scalar_cost_vec,
-                            stmt_vector_for_cost *prologue_cost_vec,
-                            stmt_vector_for_cost *epilogue_cost_vec)
+/* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
+   PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
+   or -1 if not known.  */
+
+static int
+vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
 {
-  int retval = 0;
   int assumed_vf = vect_vf_for_cost (loop_vinfo);
-
-  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
     {
-      *peel_iters_epilogue = assumed_vf / 2;
       if (dump_enabled_p ())
-        dump_printf_loc (MSG_NOTE, vect_location,
+       dump_printf_loc (MSG_NOTE, vect_location,
                         "cost model: epilogue peel iters set to vf/2 "
                         "because loop iterations are unknown .\n");
-
-      /* If peeled iterations are known but number of scalar loop
-         iterations are unknown, count a taken branch per peeled loop.  */
-      retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
-                                NULL, NULL_TREE, 0, vect_prologue);
-      retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
-                                 NULL, NULL_TREE, 0, vect_epilogue);
+      return assumed_vf / 2;
     }
   else
     {
       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
-      peel_iters_prologue = niters < peel_iters_prologue ?
-                            niters : peel_iters_prologue;
-      *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
+      peel_iters_prologue = MIN (niters, peel_iters_prologue);
+      int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
       /* If we need to peel for gaps, but no peeling is required, we have to
         peel VF iterations.  */
-      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
-       *peel_iters_epilogue = assumed_vf;
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
+       peel_iters_epilogue = assumed_vf;
+      return peel_iters_epilogue;
+    }
+}
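
A standalone sketch of the known-niters branch above with made-up values: the epilogue runs whatever is left after the prologue peel, modulo the cost-model VF, and peeling for gaps forces a full VF's worth when that remainder would otherwise be zero.

  #include <stdio.h>

  int
  main (void)
  {
    int niters = 103;          /* LOOP_VINFO_INT_NITERS */
    int peel_prologue = 3;     /* prologue iterations */
    int assumed_vf = 8;        /* vect_vf_for_cost () */
    int peeling_for_gaps = 1;

    if (peel_prologue > niters)
      peel_prologue = niters;
    int peel_epilogue = (niters - peel_prologue) % assumed_vf;
    if (peeling_for_gaps && peel_epilogue == 0)
      peel_epilogue = assumed_vf;
    printf ("prologue peel %d, epilogue peel %d\n",
            peel_prologue, peel_epilogue);
    return 0;
  }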
+
+/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
+int
+vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
+                            int *peel_iters_epilogue,
+                            stmt_vector_for_cost *scalar_cost_vec,
+                            stmt_vector_for_cost *prologue_cost_vec,
+                            stmt_vector_for_cost *epilogue_cost_vec)
+{
+  int retval = 0;
+
+  *peel_iters_epilogue
+    = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
+
+  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    {
+      /* If peeled iterations are known but number of scalar loop
+        iterations are unknown, count a taken branch per peeled loop.  */
+      if (peel_iters_prologue > 0)
+       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
+                                  NULL, NULL_TREE, 0, vect_prologue);
+      if (*peel_iters_epilogue > 0)
+       retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
+                                   NULL, NULL_TREE, 0, vect_epilogue);
     }
 
   stmt_info_for_cost *si;
@@ -3518,35 +3871,121 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
      TODO: Build an expression that represents peel_iters for prologue and
      epilogue to be used in a run-time test.  */
 
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  bool prologue_need_br_taken_cost = false;
+  bool prologue_need_br_not_taken_cost = false;
+
+  /* Calculate peel_iters_prologue.  */
+  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
+    peel_iters_prologue = 0;
+  else if (npeel < 0)
     {
-      peel_iters_prologue = 0;
-      peel_iters_epilogue = 0;
+      peel_iters_prologue = assumed_vf / 2;
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE, "cost model: "
+                    "prologue peel iters set to vf/2.\n");
 
-      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
-       {
-         /* We need to peel exactly one iteration.  */
-         peel_iters_epilogue += 1;
-         stmt_info_for_cost *si;
-         int j;
-         FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
-                           j, si)
-           (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
-                                 si->kind, si->stmt_info, si->vectype,
-                                 si->misalign, vect_epilogue);
-       }
+      /* If peeled iterations are unknown, count a taken branch and a not taken
+        branch per peeled loop.  Even if scalar loop iterations are known,
+        vector iterations are not known since peeled prologue iterations are
+        not known.  Hence guards remain the same.  */
+      prologue_need_br_taken_cost = true;
+      prologue_need_br_not_taken_cost = true;
+    }
+  else
+    {
+      peel_iters_prologue = npeel;
+      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
+       /* If peeled iterations are known but number of scalar loop
+          iterations are unknown, count a taken branch per peeled loop.  */
+       prologue_need_br_taken_cost = true;
+    }
 
-      /* Calculate how many masks we need to generate.  */
-      unsigned int num_masks = 0;
-      rgroup_controls *rgm;
-      unsigned int num_vectors_m1;
-      FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
-       if (rgm->type)
-         num_masks += num_vectors_m1 + 1;
-      gcc_assert (num_masks > 0);
+  bool epilogue_need_br_taken_cost = false;
+  bool epilogue_need_br_not_taken_cost = false;
 
-      /* In the worst case, we need to generate each mask in the prologue
-        and in the loop body.  One of the loop body mask instructions
+  /* Calculate peel_iters_epilogue.  */
+  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+    /* We need to peel exactly one iteration for gaps.  */
+    peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
+  else if (npeel < 0)
+    {
+      /* If peeling for alignment is unknown, the loop bound of the main
+        loop becomes unknown.  */
+      peel_iters_epilogue = assumed_vf / 2;
+      if (dump_enabled_p ())
+       dump_printf (MSG_NOTE, "cost model: "
+                    "epilogue peel iters set to vf/2 because "
+                    "peeling for alignment is unknown.\n");
+
+      /* See the same reason above in peel_iters_prologue calculation.  */
+      epilogue_need_br_taken_cost = true;
+      epilogue_need_br_not_taken_cost = true;
+    }
+  else
+    {
+      peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
+      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
+       /* If peeled iterations are known but number of scalar loop
+          iterations are unknown, count a taken branch per peeled loop.  */
+       epilogue_need_br_taken_cost = true;
+    }
+
+  stmt_info_for_cost *si;
+  int j;
+  /* Add costs associated with peel_iters_prologue.  */
+  if (peel_iters_prologue)
+    FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
+      {
+       (void) add_stmt_cost (loop_vinfo, target_cost_data,
+                             si->count * peel_iters_prologue, si->kind,
+                             si->stmt_info, si->vectype, si->misalign,
+                             vect_prologue);
+      }
+
+  /* Add costs associated with peel_iters_epilogue.  */
+  if (peel_iters_epilogue)
+    FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
+      {
+       (void) add_stmt_cost (loop_vinfo, target_cost_data,
+                             si->count * peel_iters_epilogue, si->kind,
+                             si->stmt_info, si->vectype, si->misalign,
+                             vect_epilogue);
+      }
+
+  /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
+
+  if (prologue_need_br_taken_cost)
+    (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
+                         NULL, NULL_TREE, 0, vect_prologue);
+
+  if (prologue_need_br_not_taken_cost)
+    (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
+                         cond_branch_not_taken, NULL, NULL_TREE, 0,
+                         vect_prologue);
+
+  if (epilogue_need_br_taken_cost)
+    (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
+                         NULL, NULL_TREE, 0, vect_epilogue);
+
+  if (epilogue_need_br_not_taken_cost)
+    (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
+                         cond_branch_not_taken, NULL, NULL_TREE, 0,
+                         vect_epilogue);
+
+  /* Take care of special costs for rgroup controls of partial vectors.  */
+  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+    {
+      /* Calculate how many masks we need to generate.  */
+      unsigned int num_masks = 0;
+      rgroup_controls *rgm;
+      unsigned int num_vectors_m1;
+      FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
+       if (rgm->type)
+         num_masks += num_vectors_m1 + 1;
+      gcc_assert (num_masks > 0);
+
+      /* In the worst case, we need to generate each mask in the prologue
+        and in the loop body.  One of the loop body mask instructions
         replaces the comparison in the scalar loop, and since we don't
         count the scalar comparison against the scalar body, we shouldn't
         count that vector instruction against the vector body either.
@@ -3557,88 +3996,62 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
         simpler and safer to use the worst-case cost; if this ends up
         being the tie-breaker between vectorizing or not, then it's
         probably better not to vectorize.  */
-      (void) add_stmt_cost (loop_vinfo,
-                           target_cost_data, num_masks, vector_stmt,
-                           NULL, NULL_TREE, 0, vect_prologue);
-      (void) add_stmt_cost (loop_vinfo,
-                           target_cost_data, num_masks - 1, vector_stmt,
-                           NULL, NULL_TREE, 0, vect_body);
+      (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
+                           vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
+      (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
+                           vector_stmt, NULL, NULL_TREE, 0, vect_body);
     }
-  else if (npeel < 0)
+  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
     {
-      peel_iters_prologue = assumed_vf / 2;
-      if (dump_enabled_p ())
-       dump_printf (MSG_NOTE, "cost model: "
-                    "prologue peel iters set to vf/2.\n");
+      /* Referring to the functions vect_set_loop_condition_partial_vectors
+        and vect_set_loop_controls_directly, we need to generate each
+        length in the prologue and in the loop body if required.  Although
+        there are some possible optimizations, we consider the worst case
+        here.  */
 
-      /* If peeling for alignment is unknown, loop bound of main loop becomes
-         unknown.  */
-      peel_iters_epilogue = assumed_vf / 2;
-      if (dump_enabled_p ())
-       dump_printf (MSG_NOTE, "cost model: "
-                    "epilogue peel iters set to vf/2 because "
-                    "peeling for alignment is unknown.\n");
+      bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
+      bool need_iterate_p
+       = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+          && !vect_known_niters_smaller_than_vf (loop_vinfo));
 
-      /* If peeled iterations are unknown, count a taken branch and a not taken
-         branch per peeled loop. Even if scalar loop iterations are known,
-         vector iterations are not known since peeled prologue iterations are
-         not known. Hence guards remain the same.  */
-      (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
-                           NULL, NULL_TREE, 0, vect_prologue);
-      (void) add_stmt_cost (loop_vinfo,
-                           target_cost_data, 1, cond_branch_not_taken,
-                           NULL, NULL_TREE, 0, vect_prologue);
-      (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
-                           NULL, NULL_TREE, 0, vect_epilogue);
-      (void) add_stmt_cost (loop_vinfo,
-                           target_cost_data, 1, cond_branch_not_taken,
-                           NULL, NULL_TREE, 0, vect_epilogue);
-      stmt_info_for_cost *si;
-      int j;
-      FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
-       {
-         (void) add_stmt_cost (loop_vinfo, target_cost_data,
-                               si->count * peel_iters_prologue,
-                               si->kind, si->stmt_info, si->vectype,
-                               si->misalign,
-                               vect_prologue);
-         (void) add_stmt_cost (loop_vinfo, target_cost_data,
-                               si->count * peel_iters_epilogue,
-                               si->kind, si->stmt_info, si->vectype,
-                               si->misalign,
-                               vect_epilogue);
-       }
-    }
-  else
-    {
-      stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
-      stmt_info_for_cost *si;
-      int j;
-      void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
+      /* Calculate how many statements to be added.  */
+      unsigned int prologue_stmts = 0;
+      unsigned int body_stmts = 0;
 
-      prologue_cost_vec.create (2);
-      epilogue_cost_vec.create (2);
-      peel_iters_prologue = npeel;
+      rgroup_controls *rgc;
+      unsigned int num_vectors_m1;
+      FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
+       if (rgc->type)
+         {
+           /* May need one SHIFT for nitems_total computation.  */
+           unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
+           if (nitems != 1 && !niters_known_p)
+             prologue_stmts += 1;
+
+           /* May need one MAX and one MINUS for wrap around.  */
+           if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
+             prologue_stmts += 2;
 
-      (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
-                                         &peel_iters_epilogue,
-                                         &LOOP_VINFO_SCALAR_ITERATION_COST
-                                           (loop_vinfo),
-                                         &prologue_cost_vec,
-                                         &epilogue_cost_vec);
+           /* Need one MAX and one MINUS for each batch limit except for
+              the first one.  */
+           prologue_stmts += num_vectors_m1 * 2;
 
-      FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
-       (void) add_stmt_cost (loop_vinfo,
-                             data, si->count, si->kind, si->stmt_info,
-                             si->vectype, si->misalign, vect_prologue);
+           unsigned int num_vectors = num_vectors_m1 + 1;
 
-      FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
-       (void) add_stmt_cost (loop_vinfo,
-                             data, si->count, si->kind, si->stmt_info,
-                             si->vectype, si->misalign, vect_epilogue);
+           /* Need to set up lengths in the prologue; only one MIN is
+              required for each since the start index is zero.  */
+           prologue_stmts += num_vectors;
 
-      prologue_cost_vec.release ();
-      epilogue_cost_vec.release ();
+           /* Each may need two MINs and one MINUS to update lengths in body
+              for next iteration.  */
+           if (need_iterate_p)
+             body_stmts += 3 * num_vectors;
+         }
+
+      (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
+                           scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
+      (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
+                           scalar_stmt, NULL, NULL_TREE, 0, vect_body);
     }
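
To make the counting above concrete, a standalone sketch for a single length rgroup with made-up inputs; the per-rgroup totals simply follow the comments in the loop body:

  #include <stdbool.h>
  #include <stdio.h>

  int
  main (void)
  {
    unsigned nitems = 2;      /* max_nscalars_per_iter * factor */
    bool niters_known = false;
    bool might_wrap = true;   /* vect_rgroup_iv_might_wrap_p () */
    unsigned num_vectors = 2; /* num_vectors_m1 + 1 */
    bool need_iterate = true; /* more than one vector iteration needed */

    unsigned prologue = 0, body = 0;
    if (nitems != 1 && !niters_known)
      prologue += 1;                    /* SHIFT for nitems_total */
    if (might_wrap)
      prologue += 2;                    /* MAX + MINUS for wrap around */
    prologue += (num_vectors - 1) * 2;  /* MAX + MINUS per extra batch limit */
    prologue += num_vectors;            /* one MIN per length set-up */
    if (need_iterate)
      body += 3 * num_vectors;          /* two MINs + one MINUS per length */

    printf ("prologue stmts %u, body stmts %u\n", prologue, body);
    return 0;
  }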
 
   /* FORNOW: The scalar outside cost is incremented in one of the
@@ -3774,8 +4187,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
     }
 
   /* ??? The "if" arm is written to handle all cases; see below for what
-     we would do for !LOOP_VINFO_FULLY_MASKED_P.  */
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+     we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
+  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
     {
       /* Rewriting the condition above in terms of the number of
         vector iterations (vniters) rather than the number of
@@ -3802,7 +4215,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
        dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
                     min_vec_niters);
 
-      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        {
          /* Now that we know the minimum number of vector iterations,
             find the minimum niters for which the scalar cost is larger:
@@ -3857,6 +4270,10 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
     /* We want the vectorized loop to execute at least once.  */
     min_profitable_iters = assumed_vf + peel_iters_prologue;
+  else if (min_profitable_iters < peel_iters_prologue)
+    /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
+       vectorized loop executes at least once.  */
+    min_profitable_iters = peel_iters_prologue;
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
@@ -3874,7 +4291,9 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 
   if (vec_outside_cost <= 0)
     min_profitable_estimate = 0;
-  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+  /* ??? This "else if" arm is written to handle all cases; see below for
+     what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
+  else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
     {
       /* This is a repeat of the code above, but with + SOC rather
         than - SOC.  */
@@ -3886,7 +4305,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
       if (outside_overhead > 0)
        min_vec_niters = outside_overhead / saving_per_viter + 1;
 
-      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        {
          int threshold = (vec_inside_cost * min_vec_niters
                           + vec_outside_cost
@@ -4130,34 +4549,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
 }
 
 
-/* Function vect_model_induction_cost.
-
-   Models cost for induction operations.  */
-
-static void
-vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
-                          stmt_vector_for_cost *cost_vec)
-{
-  unsigned inside_cost, prologue_cost;
-
-  if (PURE_SLP_STMT (stmt_info))
-    return;
-
-  /* loop cost for vec_loop.  */
-  inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
-                                 stmt_info, 0, vect_body);
-
-  /* prologue cost for vec_init and vec_step.  */
-  prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
-                                   stmt_info, 0, vect_prologue);
-
-  if (dump_enabled_p ())
-    dump_printf_loc (MSG_NOTE, vect_location,
-                     "vect_model_induction_cost: inside_cost = %d, "
-                     "prologue_cost = %d .\n", inside_cost, prologue_cost);
-}
-
-
 
 /* Function get_initial_def_for_reduction
 
@@ -4431,7 +4822,8 @@ info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
 {
   stmt_info = vect_orig_stmt (stmt_info);
   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
-  if (!is_a <gphi *> (stmt_info->stmt))
+  if (!is_a <gphi *> (stmt_info->stmt)
+      || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
   gphi *phi = as_a <gphi *> (stmt_info->stmt);
   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
@@ -6022,9 +6414,28 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
     {
       if (is_a <gphi *> (stmt_info->stmt))
-       /* Analysis for double-reduction is done on the outer
-          loop PHI, nested cycles have no further restrictions.  */
-       STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
+       {
+         if (slp_node)
+           {
+             /* We eventually need to set a vector type on invariant
+                arguments.  */
+             unsigned j;
+             slp_tree child;
+             FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
+               if (!vect_maybe_update_slp_op_vectype
+                      (child, SLP_TREE_VECTYPE (slp_node)))
+                 {
+                   if (dump_enabled_p ())
+                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                      "incompatible vector types for "
+                                      "invariants\n");
+                   return false;
+                 }
+           }
+         /* Analysis for double-reduction is done on the outer
+            loop PHI, nested cycles have no further restrictions.  */
+         STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
+       }
       else
        STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
       return true;
@@ -6066,12 +6477,16 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
 
   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
-     and compute the reduction chain length.  */
-  tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
-                                         loop_latch_edge (loop));
+     and compute the reduction chain length.  Discover the real
+     reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
+  tree reduc_def
+    = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
+                            loop_latch_edge
+                              (gimple_bb (reduc_def_phi)->loop_father));
   unsigned reduc_chain_length = 0;
   bool only_slp_reduc_chain = true;
   stmt_info = NULL;
+  slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
   while (reduc_def != PHI_RESULT (reduc_def_phi))
     {
       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
@@ -6114,6 +6529,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
        stmt_info = vdef;
       reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
       reduc_chain_length++;
+      if (!stmt_info && slp_node)
+       slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
     }
   /* PHIs should not participate in patterns.  */
   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
@@ -6200,17 +6617,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
      The last use is the reduction variable.  In case of nested cycle this
      assumption is not true: we use reduc_index to record the index of the
      reduction variable.  */
-  /* ???  To get at invariant/constant uses on the SLP node we have to
-     get to it here, slp_node is still the reduction PHI.  */
-  slp_tree slp_for_stmt_info = NULL;
-  if (slp_node)
-    {
-      slp_for_stmt_info = slp_node_instance->root;
-      /* And then there's reduction chain with a conversion ...  */
-      if (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) != stmt_info)
-       slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
-      gcc_assert (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) == stmt_info);
-    }
   slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
   /* We need to skip an extra operand for COND_EXPRs with embedded
      comparison.  */
@@ -7091,15 +7497,24 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
   if (slp_node)
     {
       vec_initial_defs.reserve (vec_num);
-      gcc_assert (slp_node == slp_node_instance->reduc_phis);
-      stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
-      tree neutral_op
-       = neutral_op_for_slp_reduction (slp_node, vectype_out,
-                                       STMT_VINFO_REDUC_CODE (reduc_info),
-                                       first != NULL);
-      get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
-                                     &vec_initial_defs, vec_num,
-                                     first != NULL, neutral_op);
+      if (nested_cycle)
+       {
+         unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
+         vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
+                            &vec_initial_defs);
+       }
+      else
+       {
+         gcc_assert (slp_node == slp_node_instance->reduc_phis);
+         stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
+         tree neutral_op
+             = neutral_op_for_slp_reduction (slp_node, vectype_out,
+                                             STMT_VINFO_REDUC_CODE (reduc_info),
+                                             first != NULL);
+         get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
+                                         &vec_initial_defs, vec_num,
+                                         first != NULL, neutral_op);
+       }
     }
   else
     {
@@ -7205,6 +7620,17 @@ vectorizable_lc_phi (loop_vec_info loop_vinfo,
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      /* Deal with copies from externs or constants that are disguised as
+        loop-closed PHI nodes (PR97886).  */
+      if (slp_node
+         && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
+                                               SLP_TREE_VECTYPE (slp_node)))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "incompatible vector types for invariants\n");
+         return false;
+       }
       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
       return true;
     }
@@ -7234,6 +7660,81 @@ vectorizable_lc_phi (loop_vec_info loop_vinfo,
   return true;
 }
 
+/* Vectorizes PHIs.  */
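+
+/* A sketch of the effect (illustrative only, SSA names assumed): for a
+   two-lane SLP node of the PHIs
+
+     x_1 = PHI <a_2(e1), c_4(e2)>
+     y_3 = PHI <b_5(e1), d_6(e2)>
+
+   one vector PHI per vector stmt is created in the same block, e.g.
+
+     vect_x.7 = PHI <vect_ab.8(e1), vect_cd.9(e2)>
+
+   with arguments taken from the already vectorized defs of each child;
+   arguments for children not vectorized yet (backedge defs) are filled
+   in later.  */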
+
+bool
+vectorizable_phi (vec_info *,
+                 stmt_vec_info stmt_info, gimple **vec_stmt,
+                 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
+{
+  if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
+    return false;
+
+  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
+    return false;
+
+  tree vectype = SLP_TREE_VECTYPE (slp_node);
+
+  if (!vec_stmt) /* transformation not required.  */
+    {
+      slp_tree child;
+      unsigned i;
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
+       if (!child)
+         {
+           if (dump_enabled_p ())
+             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                              "PHI node with unvectorized backedge def\n");
+           return false;
+         }
+       else if (!vect_maybe_update_slp_op_vectype (child, vectype))
+         {
+           if (dump_enabled_p ())
+             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                              "incompatible vector types for invariants\n");
+           return false;
+         }
+      record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
+                       vector_stmt, stmt_info, vectype, 0, vect_body);
+      STMT_VINFO_TYPE (stmt_info) = phi_info_type;
+      return true;
+    }
+
+  tree scalar_dest = gimple_phi_result (stmt_info->stmt);
+  basic_block bb = gimple_bb (stmt_info->stmt);
+  tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
+  auto_vec<gphi *> new_phis;
+  for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
+    {
+      slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
+
+      /* Skip not yet vectorized defs.  */
+      if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
+         && SLP_TREE_VEC_STMTS (child).is_empty ())
+       continue;
+
+      auto_vec<tree> vec_oprnds;
+      vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
+      if (!new_phis.exists ())
+       {
+         new_phis.create (vec_oprnds.length ());
+         for (unsigned j = 0; j < vec_oprnds.length (); j++)
+           {
+             /* Create the vectorized LC PHI node.  */
+             new_phis.quick_push (create_phi_node (vec_dest, bb));
+             SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
+           }
+       }
+      edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
+      for (unsigned j = 0; j < vec_oprnds.length (); j++)
+       add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
+    }
+  /* We should have at least one already vectorized child.  */
+  gcc_assert (new_phis.exists ());
+
+  return true;
+}
+
 
 /* Function vect_min_worthwhile_factor.
 
@@ -7304,7 +7805,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   unsigned i;
   tree expr;
-  gimple_seq stmts;
   gimple_stmt_iterator si;
 
   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
@@ -7344,10 +7844,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
          return false;
        }
 
-      /* FORNOW: outer loop induction with SLP not supported.  */
-      if (STMT_SLP_TYPE (stmt_info))
-       return false;
-
       exit_phi = NULL;
       latch_e = loop_latch_edge (loop->inner);
       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
@@ -7386,7 +7882,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 
   if (slp_node && !nunits.is_constant ())
     {
-      /* The current SLP code creates the initial value element-by-element.  */
+      /* The current SLP code creates the step value element-by-element.  */
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "SLP induction not supported for variable-length"
@@ -7396,9 +7892,50 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      unsigned inside_cost = 0, prologue_cost = 0;
+      if (slp_node)
+       {
+         /* We eventually need to set a vector type on invariant
+            arguments.  */
+         unsigned j;
+         slp_tree child;
+         FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
+           if (!vect_maybe_update_slp_op_vectype
+               (child, SLP_TREE_VECTYPE (slp_node)))
+             {
+               if (dump_enabled_p ())
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  "incompatible vector types for "
+                                  "invariants\n");
+               return false;
+             }
+         /* loop cost for vec_loop.  */
+         inside_cost
+           = record_stmt_cost (cost_vec,
+                               SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
+                               vector_stmt, stmt_info, 0, vect_body);
+         /* prologue cost for vec_init (if not nested) and step.  */
+         prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
+                                           scalar_to_vec,
+                                           stmt_info, 0, vect_prologue);
+       }
+      else /* if (!slp_node) */
+       {
+         /* loop cost for vec_loop.  */
+         inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
+                                         stmt_info, 0, vect_body);
+         /* prologue cost for vec_init and vec_step.  */
+         prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
+                                           stmt_info, 0, vect_prologue);
+       }
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "vect_model_induction_cost: inside_cost = %d, "
+                        "prologue_cost = %d .\n", inside_cost,
+                        prologue_cost);
+
       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
       DUMP_VECT_SCOPE ("vectorizable_induction");
-      vect_model_induction_cost (stmt_info, ncopies, cost_vec);
       return true;
     }
 
@@ -7417,164 +7954,195 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
 
   pe = loop_preheader_edge (iv_loop);
-  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
-                                    loop_preheader_edge (iv_loop));
-
-  stmts = NULL;
-  if (!nested_in_vect_loop)
-    {
-      /* Convert the initial value to the IV update type.  */
-      tree new_type = TREE_TYPE (step_expr);
-      init_expr = gimple_convert (&stmts, new_type, init_expr);
-
-      /* If we are using the loop mask to "peel" for alignment then we need
-        to adjust the start value here.  */
-      tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
-      if (skip_niters != NULL_TREE)
-       {
-         if (FLOAT_TYPE_P (vectype))
-           skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
-                                       skip_niters);
-         else
-           skip_niters = gimple_convert (&stmts, new_type, skip_niters);
-         tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
-                                        skip_niters, step_expr);
-         init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
-                                   init_expr, skip_step);
-       }
-    }
-
-  if (stmts)
-    {
-      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
-      gcc_assert (!new_bb);
-    }
-
   /* Find the first insertion point in the BB.  */
   basic_block bb = gimple_bb (phi);
   si = gsi_after_labels (bb);
 
   /* For SLP induction we have to generate several IVs as for example
-     with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
-     [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
-     [VF*S, VF*S, VF*S, VF*S] for all.  */
+     with group size 3 we need
+       [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
+       [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
   if (slp_node)
     {
       /* Enforced above.  */
       unsigned int const_nunits = nunits.to_constant ();
 
-      /* Generate [VF*S, VF*S, ... ].  */
-      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
+      /* The initial values are vectorized, but any lanes > group_size
+        need adjustment.  */
+      slp_tree init_node
+       = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
+
+      /* Gather steps.  Since we do not vectorize inductions as
+        cycles we have to reconstruct the step from SCEV data.  */
+      unsigned group_size = SLP_TREE_LANES (slp_node);
+      tree *steps = XALLOCAVEC (tree, group_size);
+      tree *inits = XALLOCAVEC (tree, group_size);
+      stmt_vec_info phi_info;
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
        {
-         expr = build_int_cst (integer_type_node, vf);
-         expr = fold_convert (TREE_TYPE (step_expr), expr);
+         steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
+         if (!init_node)
+           inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
+                                          pe->dest_idx);
        }
-      else
-       expr = build_int_cst (TREE_TYPE (step_expr), vf);
-      new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
-                             expr, step_expr);
-      if (! CONSTANT_CLASS_P (new_name))
-       new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
-                                    TREE_TYPE (step_expr), NULL);
-      new_vec = build_vector_from_val (step_vectype, new_name);
-      vec_step = vect_init_vector (loop_vinfo, stmt_info,
-                                  new_vec, step_vectype, NULL);
 
       /* Now generate the IVs.  */
-      unsigned group_size = SLP_TREE_LANES (slp_node);
       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
-      unsigned elts = const_nunits * nvects;
-      /* Compute the number of distinct IVs we need.  First reduce
-        group_size if it is a multiple of const_nunits so we get
-        one IV for a group_size of 4 but const_nunits 2.  */
-      unsigned group_sizep = group_size;
-      if (group_sizep % const_nunits == 0)
-       group_sizep = group_sizep / const_nunits;
-      unsigned nivs = least_common_multiple (group_sizep,
-                                            const_nunits) / const_nunits;
-      gcc_assert (elts % group_size == 0);
-      tree elt = init_expr;
+      gcc_assert ((const_nunits * nvects) % group_size == 0);
+      unsigned nivs;
+      if (nested_in_vect_loop)
+       nivs = nvects;
+      else
+       {
+         /* Compute the number of distinct IVs we need.  First reduce
+            group_size if it is a multiple of const_nunits so we get
+            one IV for a group_size of 4 but const_nunits 2.  */
+         unsigned group_sizep = group_size;
+         if (group_sizep % const_nunits == 0)
+           group_sizep = group_sizep / const_nunits;
+         nivs = least_common_multiple (group_sizep,
+                                       const_nunits) / const_nunits;
+       }
+      tree stept = TREE_TYPE (step_vectype);
+      tree lupdate_mul = NULL_TREE;
+      if (!nested_in_vect_loop)
+       {
+         /* The number of iterations covered in one vector iteration.  */
+         unsigned lup_mul = (nvects * const_nunits) / group_size;
+         lupdate_mul
+           = build_vector_from_val (step_vectype,
+                                    SCALAR_FLOAT_TYPE_P (stept)
+                                    ? build_real_from_wide (stept, lup_mul,
+                                                            UNSIGNED)
+                                    : build_int_cstu (stept, lup_mul));
+       }
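+      /* When loop masks are used to "peel" for alignment the vector IVs
+        must account for the scalar iterations that are skipped; remember
+        that count as a vector multiplier of the step so the start values
+        can be adjusted below.  */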
+      tree peel_mul = NULL_TREE;
+      gimple_seq init_stmts = NULL;
+      if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
+       {
+         if (SCALAR_FLOAT_TYPE_P (stept))
+           peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
+                                    LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
+         else
+           peel_mul = gimple_convert (&init_stmts, stept,
+                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
+         peel_mul = gimple_build_vector_from_val (&init_stmts,
+                                                  step_vectype, peel_mul);
+       }
       unsigned ivn;
+      auto_vec<tree> vec_steps;
       for (ivn = 0; ivn < nivs; ++ivn)
        {
-         tree_vector_builder elts (step_vectype, const_nunits, 1);
-         stmts = NULL;
+         tree_vector_builder step_elts (step_vectype, const_nunits, 1);
+         tree_vector_builder init_elts (vectype, const_nunits, 1);
+         tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
          for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
            {
-             if (ivn*const_nunits + eltn >= group_size
-                 && (ivn * const_nunits + eltn) % group_size == 0)
-               elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
-                                   elt, step_expr);
-             elts.quick_push (elt);
-           }
-         vec_init = gimple_build_vector (&stmts, &elts);
-         vec_init = gimple_convert (&stmts, vectype, vec_init);
-         if (stmts)
-           {
-             new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
-             gcc_assert (!new_bb);
+             /* The scalar steps of the IVs.  */
+             tree elt = steps[(ivn*const_nunits + eltn) % group_size];
+             elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
+             step_elts.quick_push (elt);
+             if (!init_node)
+               {
+                 /* The scalar inits of the IVs if not vectorized.  */
+                 elt = inits[(ivn*const_nunits + eltn) % group_size];
+                 if (!useless_type_conversion_p (TREE_TYPE (vectype),
+                                                 TREE_TYPE (elt)))
+                   elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
+                                       TREE_TYPE (vectype), elt);
+                 init_elts.quick_push (elt);
+               }
+             /* The number of steps to add to the initial values.  */
+             unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
+             mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
+                                  ? build_real_from_wide (stept,
+                                                          mul_elt, UNSIGNED)
+                                  : build_int_cstu (stept, mul_elt));
            }
+         vec_step = gimple_build_vector (&init_stmts, &step_elts);
+         vec_steps.safe_push (vec_step);
+         tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
+         if (peel_mul)
+           step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
+                                    step_mul, peel_mul);
+         if (!init_node)
+           vec_init = gimple_build_vector (&init_stmts, &init_elts);
 
          /* Create the induction-phi that defines the induction-operand.  */
-         vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
+         vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
+                                           "vec_iv_");
          induction_phi = create_phi_node (vec_dest, iv_loop->header);
          induc_def = PHI_RESULT (induction_phi);
 
          /* Create the iv update inside the loop  */
+         tree up = vec_step;
+         if (lupdate_mul)
+           up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+                              vec_step, lupdate_mul);
          gimple_seq stmts = NULL;
          vec_def = gimple_convert (&stmts, step_vectype, induc_def);
          vec_def = gimple_build (&stmts,
-                                 PLUS_EXPR, step_vectype, vec_def, vec_step);
+                                 PLUS_EXPR, step_vectype, vec_def, up);
          vec_def = gimple_convert (&stmts, vectype, vec_def);
          gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
+         add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
+                      UNKNOWN_LOCATION);
+
+         if (init_node)
+           vec_init = vect_get_slp_vect_def (init_node, ivn);
+         if (!nested_in_vect_loop
+             && !integer_zerop (step_mul))
+           {
+             vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
+             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+                                vec_step, step_mul);
+             vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
+                                     vec_def, up);
+             vec_init = gimple_convert (&init_stmts, vectype, vec_def);
+           }
 
          /* Set the arguments of the phi node:  */
          add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
-         add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
-                      UNKNOWN_LOCATION);
 
          SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
        }
-      /* Fill up to the number of vectors we need for the whole group.  */
-      nivs = least_common_multiple (group_size,
-                                   const_nunits) / const_nunits;
-      for (; ivn < nivs; ++ivn)
-       SLP_TREE_VEC_STMTS (slp_node)
-         .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
+      if (!nested_in_vect_loop)
+       {
+         /* Fill up to the number of vectors we need for the whole group.  */
+         nivs = least_common_multiple (group_size,
+                                       const_nunits) / const_nunits;
+         for (; ivn < nivs; ++ivn)
+           {
+             SLP_TREE_VEC_STMTS (slp_node)
+               .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
+             vec_steps.safe_push (vec_steps[0]);
+           }
+       }
 
-      /* Re-use IVs when we can.  */
+      /* Re-use IVs when we can.  We are generating further vector
+        stmts by adding VF' * stride to the IVs generated above.  */
       if (ivn < nvects)
        {
          unsigned vfp
            = least_common_multiple (group_size, const_nunits) / group_size;
-         /* Generate [VF'*S, VF'*S, ... ].  */
-         if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
-           {
-             expr = build_int_cst (integer_type_node, vfp);
-             expr = fold_convert (TREE_TYPE (step_expr), expr);
-           }
-         else
-           expr = build_int_cst (TREE_TYPE (step_expr), vfp);
-         new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
-                                 expr, step_expr);
-         if (! CONSTANT_CLASS_P (new_name))
-           new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
-                                        TREE_TYPE (step_expr), NULL);
-         new_vec = build_vector_from_val (step_vectype, new_name);
-         vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec,
-                                      step_vectype, NULL);
+         tree lupdate_mul
+           = build_vector_from_val (step_vectype,
+                                    SCALAR_FLOAT_TYPE_P (stept)
+                                    ? build_real_from_wide (stept,
+                                                            vfp, UNSIGNED)
+                                    : build_int_cstu (stept, vfp));
          for (; ivn < nvects; ++ivn)
            {
              gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
-             tree def;
-             if (gimple_code (iv) == GIMPLE_PHI)
-               def = gimple_phi_result (iv);
-             else
-               def = gimple_assign_lhs (iv);
+             tree def = gimple_get_lhs (iv);
+             if (ivn < 2*nivs)
+               vec_steps[ivn - nivs]
+                 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+                                 vec_steps[ivn - nivs], lupdate_mul);
              gimple_seq stmts = NULL;
              def = gimple_convert (&stmts, step_vectype, def);
-             def = gimple_build (&stmts,
-                                 PLUS_EXPR, step_vectype, def, vec_step);
+             def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
+                                 def, vec_steps[ivn % nivs]);
              def = gimple_convert (&stmts, vectype, def);
              if (gimple_code (iv) == GIMPLE_PHI)
                gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
@@ -7588,9 +8156,45 @@ vectorizable_induction (loop_vec_info loop_vinfo,
            }
        }
 
+      new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
+      gcc_assert (!new_bb);
+
       return true;
     }
 
+  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
+                                    loop_preheader_edge (iv_loop));
+
+  gimple_seq stmts = NULL;
+  if (!nested_in_vect_loop)
+    {
+      /* Convert the initial value to the IV update type.  */
+      tree new_type = TREE_TYPE (step_expr);
+      init_expr = gimple_convert (&stmts, new_type, init_expr);
+
+      /* If we are using the loop mask to "peel" for alignment then we need
+        to adjust the start value here.  */
+      tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+      if (skip_niters != NULL_TREE)
+       {
+         if (FLOAT_TYPE_P (vectype))
+           skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
+                                       skip_niters);
+         else
+           skip_niters = gimple_convert (&stmts, new_type, skip_niters);
+         tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
+                                        skip_niters, step_expr);
+         init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
+                                   init_expr, skip_step);
+       }
+    }
+
+  if (stmts)
+    {
+      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+      gcc_assert (!new_bb);
+    }
+
   /* Create the vector that holds the initial_value of the induction.  */
   if (nested_in_vect_loop)
     {
@@ -7803,17 +8407,19 @@ vectorizable_induction (loop_vec_info loop_vinfo,
    it can be supported.  */
 
 bool
-vectorizable_live_operation (loop_vec_info loop_vinfo,
+vectorizable_live_operation (vec_info *vinfo,
                             stmt_vec_info stmt_info,
                             gimple_stmt_iterator *gsi,
                             slp_tree slp_node, slp_instance slp_node_instance,
                             int slp_index, bool vec_stmt_p,
-                            stmt_vector_for_cost *)
+                            stmt_vector_for_cost *cost_vec)
 {
-  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   imm_use_iterator imm_iter;
   tree lhs, lhs_type, bitsize, vec_bitsize;
-  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  tree vectype = (slp_node
+                 ? SLP_TREE_VECTYPE (slp_node)
+                 : STMT_VINFO_VECTYPE (stmt_info));
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   int ncopies;
   gimple *use_stmt;
@@ -7855,10 +8461,6 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
       return true;
     }
 
-  /* FORNOW.  CHECKME.  */
-  if (nested_in_vect_loop_p (loop, stmt_info))
-    return false;
-
   /* If STMT is not relevant and it is a simple assignment and its inputs are
      invariant then it can remain in place, unvectorized.  The original last
      scalar value that it computes will be used.  */
@@ -7881,12 +8483,11 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
     {
       gcc_assert (slp_index >= 0);
 
-      int num_scalar = SLP_TREE_LANES (slp_node);
-      int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
-
       /* Get the last occurrence of the scalar index from the concatenation of
         all the slp vectors. Calculate which slp vector it is and the index
         within.  */
+      int num_scalar = SLP_TREE_LANES (slp_node);
+      int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
 
       /* Calculate which vector contains the result, and which lane of
@@ -7904,7 +8505,7 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
   if (!vec_stmt_p)
     {
       /* No transformation required.  */
-      if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+      if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
        {
          if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
                                               OPTIMIZE_FOR_SPEED))
@@ -7941,14 +8542,20 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
                                     1, vectype, NULL);
            }
        }
+      /* ???  Enable for loop costing as well.  */
+      if (!loop_vinfo)
+       record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
+                         0, vect_epilogue);
       return true;
     }
 
   /* Use the lhs of the original scalar statement.  */
   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
+                    "stmt %G", stmt);
 
-  lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
-       : gimple_get_lhs (stmt);
+  lhs = gimple_get_lhs (stmt);
   lhs_type = TREE_TYPE (lhs);
 
   bitsize = vector_element_bits_tree (vectype);
@@ -7956,16 +8563,14 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
 
   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
   tree vec_lhs, bitstart;
+  gimple *vec_stmt;
   if (slp_node)
     {
-      gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
 
       /* Get the correct slp vectorized stmt.  */
-      gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
-      if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
-       vec_lhs = gimple_phi_result (phi);
-      else
-       vec_lhs = gimple_get_lhs (vec_stmt);
+      vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
+      vec_lhs = gimple_get_lhs (vec_stmt);
 
       /* Get entry to use.  */
       bitstart = bitsize_int (vec_index);
@@ -7974,102 +8579,192 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
   else
     {
       /* For multiple copies, get the last copy.  */
-      vec_lhs = gimple_get_lhs (STMT_VINFO_VEC_STMTS (stmt_info).last ());
+      vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
+      vec_lhs = gimple_get_lhs (vec_stmt);
 
       /* Get the last lane in the vector.  */
       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
     }
 
-  /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
-     requirement, insert one phi node for it.  It looks like:
-        loop;
-       BB:
-        # lhs' = PHI <lhs>
-     ==>
-        loop;
-       BB:
-        # vec_lhs' = PHI <vec_lhs>
-        new_tree = lane_extract <vec_lhs', ...>;
-        lhs' = new_tree;  */
+  if (loop_vinfo)
+    {
+      /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
+        requirement, insert one phi node for it.  It looks like:
+          loop;
+        BB:
+          # lhs' = PHI <lhs>
+        ==>
+          loop;
+        BB:
+          # vec_lhs' = PHI <vec_lhs>
+          new_tree = lane_extract <vec_lhs', ...>;
+          lhs' = new_tree;  */
+
+      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+      basic_block exit_bb = single_exit (loop)->dest;
+      gcc_assert (single_pred_p (exit_bb));
+
+      tree vec_lhs_phi = copy_ssa_name (vec_lhs);
+      gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
+      SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
 
-  basic_block exit_bb = single_exit (loop)->dest;
-  gcc_assert (single_pred_p (exit_bb));
+      gimple_seq stmts = NULL;
+      tree new_tree;
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+       {
+         /* Emit:
 
-  tree vec_lhs_phi = copy_ssa_name (vec_lhs);
-  gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
-  SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
+              SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
 
-  gimple_seq stmts = NULL;
-  tree new_tree;
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
-    {
-      /* Emit:
+            where VEC_LHS is the vectorized live-out result and MASK is
+            the loop mask for the final iteration.  */
+         gcc_assert (ncopies == 1 && !slp_node);
+         tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
+         tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
+                                         1, vectype, 0);
+         tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
+                                         mask, vec_lhs_phi);
+
+         /* Convert the extracted vector element to the scalar type.  */
+         new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+       }
+      else
+       {
+         tree bftype = TREE_TYPE (vectype);
+         if (VECTOR_BOOLEAN_TYPE_P (vectype))
+           bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
+         new_tree = build3 (BIT_FIELD_REF, bftype,
+                            vec_lhs_phi, bitsize, bitstart);
+         new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
+                                          &stmts, true, NULL_TREE);
+       }
 
-          SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
+      if (stmts)
+       {
+         gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
+         gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
 
-        where VEC_LHS is the vectorized live-out result and MASK is
-        the loop mask for the final iteration.  */
-      gcc_assert (ncopies == 1 && !slp_node);
-      tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
-      tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
-                                     vectype, 0);
-      tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
-                                     mask, vec_lhs_phi);
+         /* Remove existing phi from lhs and create one copy from new_tree.  */
+         tree lhs_phi = NULL_TREE;
+         gimple_stmt_iterator gsi;
+         for (gsi = gsi_start_phis (exit_bb);
+              !gsi_end_p (gsi); gsi_next (&gsi))
+           {
+             gimple *phi = gsi_stmt (gsi);
+             if ((gimple_phi_arg_def (phi, 0) == lhs))
+               {
+                 remove_phi_node (&gsi, false);
+                 lhs_phi = gimple_phi_result (phi);
+                 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
+                 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
+                 break;
+               }
+           }
+       }
 
-      /* Convert the extracted vector element to the required scalar type.  */
-      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+      /* Replace use of lhs with newly computed result.  If the use stmt is a
+        single arg PHI, just replace all uses of PHI result.  It's necessary
+        because lcssa PHI defining lhs may be before newly inserted stmt.  */
+      use_operand_p use_p;
+      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
+           && !is_gimple_debug (use_stmt))
+         {
+           if (gimple_code (use_stmt) == GIMPLE_PHI
+               && gimple_phi_num_args (use_stmt) == 1)
+             {
+               replace_uses_by (gimple_phi_result (use_stmt), new_tree);
+             }
+           else
+             {
+               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+                   SET_USE (use_p, new_tree);
+             }
+           update_stmt (use_stmt);
+         }
     }
   else
     {
+      /* For basic-block vectorization simply insert the lane-extraction.  */
       tree bftype = TREE_TYPE (vectype);
       if (VECTOR_BOOLEAN_TYPE_P (vectype))
        bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
-      new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
+      tree new_tree = build3 (BIT_FIELD_REF, bftype,
+                             vec_lhs, bitsize, bitstart);
+      gimple_seq stmts = NULL;
       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
                                       &stmts, true, NULL_TREE);
-    }
-
-  if (stmts)
-    {
-      gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
-      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
-
-      /* Remove existing phi from lhs and create one copy from new_tree.  */
-      tree lhs_phi = NULL_TREE;
-      gimple_stmt_iterator gsi;
-      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
-       {
-         gimple *phi = gsi_stmt (gsi);
-         if ((gimple_phi_arg_def (phi, 0) == lhs))
-           {
-             remove_phi_node (&gsi, false);
-             lhs_phi = gimple_phi_result (phi);
-             gimple *copy = gimple_build_assign (lhs_phi, new_tree);
-             gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
-             break;
-           }
-       }
-    }
-
-  /* Replace use of lhs with newly computed result.  If the use stmt is a
-     single arg PHI, just replace all uses of PHI result.  It's necessary
-     because lcssa PHI defining lhs may be before newly inserted stmt.  */
-  use_operand_p use_p;
-  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
-    if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
-       && !is_gimple_debug (use_stmt))
-    {
-      if (gimple_code (use_stmt) == GIMPLE_PHI
-         && gimple_phi_num_args (use_stmt) == 1)
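+      /* The lane extract replaces uses of the scalar LHS below; if that
+        LHS occurred in an abnormal PHI the new SSA name needs to carry
+        the same marker.  */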
+      if (TREE_CODE (new_tree) == SSA_NAME
+         && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
+       SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
+      if (is_a <gphi *> (vec_stmt))
        {
-         replace_uses_by (gimple_phi_result (use_stmt), new_tree);
+         gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
+         gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
        }
       else
        {
-         FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
-           SET_USE (use_p, new_tree);
+         gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
+         gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
        }
-      update_stmt (use_stmt);
+
+      /* Replace use of lhs with newly computed result.  If the use stmt is a
+        single arg PHI, just replace all uses of PHI result.  It's necessary
+        because lcssa PHI defining lhs may be before newly inserted stmt.  */
+      use_operand_p use_p;
+      stmt_vec_info use_stmt_info;
+      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+       if (!is_gimple_debug (use_stmt)
+           && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
+               || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
+         {
+           /* ???  This can happen when the live lane ends up being
+              used in a vector construction code-generated by an
+              external SLP node (and code-generation for that already
+              happened).  See gcc.dg/vect/bb-slp-47.c.
+              Doing this is what would happen if that vector CTOR
+              were not code-generated yet so it is not too bad.
+              ???  In fact we'd likely want to avoid this situation
+              in the first place.  */
+           if (TREE_CODE (new_tree) == SSA_NAME
+               && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
+               && gimple_code (use_stmt) != GIMPLE_PHI
+               && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
+                                               use_stmt))
+             {
+               enum tree_code code = gimple_assign_rhs_code (use_stmt);
+               gcc_assert (code == CONSTRUCTOR
+                           || code == VIEW_CONVERT_EXPR
+                           || CONVERT_EXPR_CODE_P (code));
+               if (dump_enabled_p ())
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  "Using original scalar computation for "
+                                  "live lane because use precedes vector "
+                                  "def\n");
+               continue;
+             }
+           /* ???  It can also happen that we end up pulling a def into
+              a loop where replacing out-of-loop uses would require
+              a new LC SSA PHI node.  Retain the original scalar in
+              those cases as well.  PR98064.  */
+           if (TREE_CODE (new_tree) == SSA_NAME
+               && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
+               && (gimple_bb (use_stmt)->loop_father
+                   != gimple_bb (vec_stmt)->loop_father)
+               && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
+                                       gimple_bb (use_stmt)->loop_father))
+             {
+               if (dump_enabled_p ())
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  "Using original scalar computation for "
+                                  "live lane because there is an out-of-loop "
+                                  "definition for it\n");
+               continue;
+             }
+           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+             SET_USE (use_p, new_tree);
+           update_stmt (use_stmt);
+         }
     }
 
   return true;
@@ -8178,7 +8873,7 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
 {
   gcc_assert (nvectors != 0);
   if (masks->length () < nvectors)
-    masks->safe_grow_cleared (nvectors);
+    masks->safe_grow_cleared (nvectors, true);
   rgroup_controls *rgm = &(*masks)[nvectors - 1];
   /* The number of scalars per iteration and the number of vectors are
      both compile-time constants.  */
@@ -8196,6 +8891,7 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
     {
       rgm->max_nscalars_per_iter = nscalars_per_iter;
       rgm->type = truth_type_for (vectype);
+      rgm->factor = 1;
     }
 }
 
@@ -8217,7 +8913,7 @@ vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
      used it.  */
   if (rgm->controls.is_empty ())
     {
-      rgm->controls.safe_grow_cleared (nvectors);
+      rgm->controls.safe_grow_cleared (nvectors, true);
       for (unsigned int i = 0; i < nvectors; ++i)
        {
          tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
@@ -8248,6 +8944,69 @@ vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
   return mask;
 }
 
+/* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
+   lengths for controlling an operation on VECTYPE.  The operation splits
+   each element of VECTYPE into FACTOR separate subelements, measuring the
+   length as a number of these subelements.  */
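+
+/* For instance (an illustrative case): if length-controlled loads are only
+   available for VnQI, a load of V4SI vectors would be recorded with FACTOR 4
+   so that each length counts the four QImode subelements of every SImode
+   element.  */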
+
+void
+vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
+                     unsigned int nvectors, tree vectype, unsigned int factor)
+{
+  gcc_assert (nvectors != 0);
+  if (lens->length () < nvectors)
+    lens->safe_grow_cleared (nvectors, true);
+  rgroup_controls *rgl = &(*lens)[nvectors - 1];
+
+  /* The number of scalars per iteration, the bytes each scalar occupies
+     and the number of vectors are all compile-time constants.  */
+  unsigned int nscalars_per_iter
+    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
+                LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
+
+  if (rgl->max_nscalars_per_iter < nscalars_per_iter)
+    {
+      /* For now, we only support cases in which all loads and stores fall back
+        to VnQI or none do.  */
+      gcc_assert (!rgl->max_nscalars_per_iter
+                 || (rgl->factor == 1 && factor == 1)
+                 || (rgl->max_nscalars_per_iter * rgl->factor
+                     == nscalars_per_iter * factor));
+      rgl->max_nscalars_per_iter = nscalars_per_iter;
+      rgl->type = vectype;
+      rgl->factor = factor;
+    }
+}
+
+/* Given a complete set of lengths LENS, extract length number INDEX for an
+   rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS.  */
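+
+/* As with vect_get_loop_mask above, the SSA names handed out here start
+   out with dummy definitions; the real length computations are emitted
+   later when the loop controls are set up.  */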
+
+tree
+vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
+                  unsigned int nvectors, unsigned int index)
+{
+  rgroup_controls *rgl = &(*lens)[nvectors - 1];
+
+  /* Populate the rgroup's len array, if this is the first time we've
+     used it.  */
+  if (rgl->controls.is_empty ())
+    {
+      rgl->controls.safe_grow_cleared (nvectors, true);
+      for (unsigned int i = 0; i < nvectors; ++i)
+       {
+         tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+         gcc_assert (len_type != NULL_TREE);
+         tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
+
+         /* Provide a dummy definition until the real one is available.  */
+         SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
+         rgl->controls[i] = len;
+       }
+    }
+
+  return rgl->controls[index];
+}
+
 /* Scale profiling counters by estimation for LOOP which is vectorized
    by factor VF.  */
 
@@ -8282,11 +9041,47 @@ scale_profile_for_vect_loop (class loop *loop, unsigned vf)
     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
 }
 
+/* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
+   latch edge values originally defined by it.  */
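+
+/* For example (names assumed): given the scalar reduction PHI
+     sum_1 = PHI <0.0(preheader), sum_2(latch)>
+   once the stmt defining sum_2 has been vectorized this adds the
+   corresponding vectorized defs as latch arguments of the vectorized
+   PHIs created for sum_1.  */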
+
+static void
+maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
+                                    stmt_vec_info def_stmt_info)
+{
+  tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
+  if (!def || TREE_CODE (def) != SSA_NAME)
+    return;
+  stmt_vec_info phi_info;
+  imm_use_iterator iter;
+  use_operand_p use_p;
+  FOR_EACH_IMM_USE_FAST (use_p, iter, def)
+    if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
+      if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
+         && (phi_info = loop_vinfo->lookup_stmt (phi))
+         && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
+         && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
+         && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
+       {
+         loop_p loop = gimple_bb (phi)->loop_father;
+         edge e = loop_latch_edge (loop);
+         if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
+           {
+             vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
+             vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
+             gcc_assert (phi_defs.length () == latch_defs.length ());
+             for (unsigned i = 0; i < phi_defs.length (); ++i)
+               add_phi_arg (as_a <gphi *> (phi_defs[i]),
+                            gimple_get_lhs (latch_defs[i]), e,
+                            gimple_phi_arg_location (phi, e->dest_idx));
+           }
+       }
+}
+
 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
    stmt_vec_info.  */
 
-static void
+static bool
 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
                          gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
 {
@@ -8302,7 +9097,7 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
 
   if (!STMT_VINFO_RELEVANT_P (stmt_info)
       && !STMT_VINFO_LIVE_P (stmt_info))
-    return;
+    return false;
 
   if (STMT_VINFO_VECTYPE (stmt_info))
     {
@@ -8319,13 +9114,15 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
   /* Pure SLP statements have already been vectorized.  We still need
      to apply loop vectorization to hybrid SLP statements.  */
   if (PURE_SLP_STMT (stmt_info))
-    return;
+    return false;
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
 
   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
     *seen_store = stmt_info;
+
+  return true;
 }
 
 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
@@ -8374,6 +9171,7 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
   basic_block *epilogue_bbs = get_loop_body (epilogue);
   unsigned i;
 
+  free (LOOP_VINFO_BBS (epilogue_vinfo));
   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
 
   /* Advance data_reference's with the number of iterations of the previous
@@ -8617,9 +9415,15 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
                             LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
          step_vector = build_one_cst (TREE_TYPE (niters));
        }
-      else
+      else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
        vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
                                     &step_vector, niters_no_overflow);
+      else
+       /* vect_do_peeling subtracted the number of peeled prologue
+          iterations from LOOP_VINFO_NITERS.  */
+       vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
+                                    &niters_vector, &step_vector,
+                                    niters_no_overflow);
     }
 
   /* 1) Make sure the loop header has exactly two entries
@@ -8629,8 +9433,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 
   split_edge (loop_preheader_edge (loop));
 
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
-      && vect_use_loop_mask_for_alignment_p (loop_vinfo))
+  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
     /* This will deal with any possible peeling.  */
     vect_prepare_for_masked_peels (loop_vinfo);
 
@@ -8639,7 +9442,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
   if (!loop_vinfo->slp_instances.is_empty ())
     {
       DUMP_VECT_SCOPE ("scheduling SLP instances");
-      vect_schedule_slp (loop_vinfo);
+      vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
     }
 
   /* FORNOW: the vectorizer supports only loops whose body consists
@@ -8654,7 +9457,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 
       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
-        {
+       {
          gphi *phi = si.phi ();
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
@@ -8689,6 +9492,27 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
            }
        }
 
+      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
+          gsi_next (&si))
+       {
+         gphi *phi = si.phi ();
+         stmt_info = loop_vinfo->lookup_stmt (phi);
+         if (!stmt_info)
+           continue;
+
+         if (!STMT_VINFO_RELEVANT_P (stmt_info)
+             && !STMT_VINFO_LIVE_P (stmt_info))
+           continue;
+
+         if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
+              || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
+              || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
+              || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
+              || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
+             && ! PURE_SLP_STMT (stmt_info))
+           maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
+       }
+
       for (gimple_stmt_iterator si = gsi_start_bb (bb);
           !gsi_end_p (si);)
        {
@@ -8724,11 +9548,18 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
                        }
                      stmt_vec_info pat_stmt_info
                        = STMT_VINFO_RELATED_STMT (stmt_info);
-                     vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
-                                               &seen_store);
+                     if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
+                                                   &si, &seen_store))
+                       maybe_set_vectorized_backedge_value (loop_vinfo,
+                                                            pat_stmt_info);
+                   }
+                 else
+                   {
+                     if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
+                                                   &seen_store))
+                       maybe_set_vectorized_backedge_value (loop_vinfo,
+                                                            stmt_info);
                    }
-                 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
-                                           &seen_store);
                }
              gsi_next (&si);
              if (seen_store)
@@ -8853,7 +9684,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
      won't work.  */
   slp_instance instance;
   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
-    vect_free_slp_instance (instance, true);
+    vect_free_slp_instance (instance);
   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
   /* Clear-up safelen field since its value is invalid after vectorization
      since vectorized loop can have loop-carried dependencies.  */
@@ -9123,3 +9954,25 @@ vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
   return iv_limit;
 }
 
+/* For the given rgroup_controls RGC, check whether an induction variable
+   would ever hit a value that produces a set of all-false masks or zero
+   lengths before wrapping around.  Return true if it's possible to wrap
+   around before hitting the desired value, otherwise return false.  */
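+
+/* A worked example (illustrative): with a 16-bit compare type, an IV limit
+   of 40000 and two items per scalar iteration the product 80000 needs 17
+   bits of precision, which exceeds the 16 available, so the IV could wrap
+   before the controls become all-false / zero and we return true.  */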
+
+bool
+vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
+{
+  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
+
+  if (iv_limit == -1)
+    return true;
+
+  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
+  unsigned int compare_precision = TYPE_PRECISION (compare_type);
+  unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
+
+  if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
+    return true;
+
+  return false;
+}