[gcc.git] / gcc / tree-vect-loop.c
index 50abb2b2f3cc44dce321e771d1ef75b865d32308..72bbec4b45d225d73eddfaca69468cd23842a42a 100644
@@ -666,27 +666,50 @@ vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
   unsigned i;
 
   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
-    if (STMT_VINFO_IN_PATTERN_P (first))
-      {
-       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
-       while (next)
-         {
-           if (! STMT_VINFO_IN_PATTERN_P (next)
-               || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
-             break;
-           next = REDUC_GROUP_NEXT_ELEMENT (next);
-         }
-       /* If not all stmt in the chain are patterns or if we failed
-          to update STMT_VINFO_REDUC_IDX try to handle the chain
-          without patterns.  */
-       if (! next
-           && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
-         {
-           vect_fixup_reduc_chain (first);
-           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
-             = STMT_VINFO_RELATED_STMT (first);
-         }
-      }
+    {
+      stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
+      while (next)
+       {
+         if ((STMT_VINFO_IN_PATTERN_P (next)
+              != STMT_VINFO_IN_PATTERN_P (first))
+             || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
+           break;
+         next = REDUC_GROUP_NEXT_ELEMENT (next);
+       }
+      /* If all reduction chain members are well-formed patterns, adjust
+        the group to group the pattern stmts instead.  */
+      if (! next
+         && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
+       {
+         if (STMT_VINFO_IN_PATTERN_P (first))
+           {
+             vect_fixup_reduc_chain (first);
+             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
+               = STMT_VINFO_RELATED_STMT (first);
+           }
+       }
+      /* If not all stmts in the chain are patterns or if we failed
+        to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
+        it as a regular reduction instead.  */
+      else
+       {
+         stmt_vec_info vinfo = first;
+         stmt_vec_info last = NULL;
+         while (vinfo)
+           {
+             next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
+             REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
+             REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
+             last = vinfo;
+             vinfo = next;
+           }
+         STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
+           = vect_internal_def;
+         loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
+         LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
+         --i;
+       }
+    }
 }
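
For reference, the kind of reduction chain this fixup walks can be written as the standalone C sketch below (function and array names invented, not from the patch); when only some of the chain statements are replaced by pattern statements, the reworked code above now dissolves the chain and records it as a regular reduction instead of giving up on it.

/* Illustration only, not part of the patch: a reduction chain, i.e. the
   same accumulator updated by several statements per iteration.  The
   short->int widening is the sort of thing pattern recognition may
   rewrite for some but not all of the chain members.  */
int
sum_groups_of_4 (const short *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    {
      s += a[4 * i + 0];
      s += a[4 * i + 1];
      s += a[4 * i + 2];
      s += a[4 * i + 3];
    }
  return s;
}
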
 
 /* Function vect_get_loop_niters.
@@ -814,7 +837,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     vec_outside_cost (0),
     vec_inside_cost (0),
     vectorizable (false),
-    can_use_partial_vectors_p (true),
+    can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
     using_partial_vectors_p (false),
     epil_using_partial_vectors_p (false),
     peeling_for_gaps (false),
@@ -991,6 +1014,51 @@ vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
   return wi::min_precision (max_ni * factor, UNSIGNED);
 }
 
+/* True if the loop needs peeling or partial vectors when vectorized.  */
+
+static bool
+vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
+{
+  unsigned HOST_WIDE_INT const_vf;
+  HOST_WIDE_INT max_niter
+    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
+
+  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
+  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
+    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
+                                         (loop_vinfo));
+
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+    {
+      /* Work out the (constant) number of iterations that need to be
+        peeled for reasons other than niters.  */
+      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+       peel_niter += 1;
+      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
+                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
+       return true;
+    }
+  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+      /* ??? When peeling for gaps but not alignment, we could
+        try to check whether the (variable) niters is known to be
+        VF * N + 1.  That's something of a niche case though.  */
+      || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+      || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
+      || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
+          < (unsigned) exact_log2 (const_vf))
+         /* In case of versioning, check if the maximum number of
+            iterations is greater than th.  If they are identical,
+            the epilogue is unnecessary.  */
+         && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
+             || ((unsigned HOST_WIDE_INT) max_niter
+                 > (th / const_vf) * const_vf))))
+    return true;
+
+  return false;
+}
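
The constant-trip-count branch of this new predicate reduces to a divisibility check; a minimal standalone sketch, with invented numbers and a hypothetical helper name, is:

/* Sketch (not GCC code) of the LOOP_VINFO_NITERS_KNOWN_P case above:
   subtract the iterations peeled for alignment and for gaps, then ask
   whether a whole number of vector iterations remains.  For example
   niters = 100, 3 peeled for alignment, 1 for gaps, VF = 8:
   (100 - 4) % 8 == 0, so neither an epilogue nor partial vectors are
   needed on account of niters.  */
#include <stdbool.h>

static bool
needs_peeling_or_partial_vectors (unsigned niters, unsigned peel_align,
                                  bool peel_for_gaps, unsigned vf)
{
  unsigned peel_niter = peel_align + (peel_for_gaps ? 1 : 0);
  return (niters - peel_niter) % vf != 0;
}
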
+
 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
    whether we can actually generate the masks required.  Return true if so,
    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
@@ -1759,6 +1827,19 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
        }
     }
 
+  /* If using the "very cheap" model, reject cases in which we'd keep
+     a copy of the scalar code (even if we might be able to vectorize it).  */
+  if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
+      && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+         || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+         || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "some scalar iterations would need to be peeled\n");
+      return 0;
+    }
+
   int min_profitable_iters, min_profitable_estimate;
   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
                                      &min_profitable_estimate);
@@ -1817,6 +1898,20 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
       min_profitable_estimate = min_profitable_iters;
     }
 
+  /* If the vector loop needs multiple iterations to be beneficial then
+     things are probably too close to call, and the conservative thing
+     would be to stick with the scalar code.  */
+  if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
+      && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "one iteration of the vector loop would be"
+                        " more expensive than the equivalent number of"
+                        " iterations of the scalar loop\n");
+      return 0;
+    }
+
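
An invented numerical example of the two very cheap cost model rejections added above: with VF = 4 and min_profitable_estimate = 9, more than one vector iteration is needed before vectorization pays off, so the loop stays scalar; likewise any peeling (for alignment, gaps or niters) keeps a scalar copy of the loop and is rejected outright. A hedged sketch of the second check:

/* Sketch (not GCC code) mirroring the check above: under the very
   cheap model, the break-even point must not exceed one vector
   iteration's worth of scalar iterations.  */
#include <stdbool.h>

static bool
very_cheap_model_accepts (int min_profitable_estimate, int vf_for_cost)
{
  return min_profitable_estimate <= vf_for_cost;
}
/* very_cheap_model_accepts (9, 4) == false -> keep the scalar loop.  */
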
   HOST_WIDE_INT estimated_niter;
 
   /* If we are vectorizing an epilogue then we know the maximum number of
@@ -1958,56 +2053,123 @@ vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
     }
 }
 
+/* Determine if operating on full vectors for LOOP_VINFO might leave
+   some scalar iterations still to do.  If so, decide how we should
+   handle those scalar iterations.  The possibilities are:
 
-/* Decides whether we need to create an epilogue loop to handle
-   remaining scalar iterations and sets PEELING_FOR_NITERS accordingly.  */
+   (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
+       In this case:
 
-void
-determine_peel_for_niter (loop_vec_info loop_vinfo)
+        LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
+        LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
+        LOOP_VINFO_PEELING_FOR_NITER == false
+
+   (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
+       to handle the remaining scalar iterations.  In this case:
+
+        LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
+        LOOP_VINFO_PEELING_FOR_NITER == true
+
+       There are two choices:
+
+       (2a) Consider vectorizing the epilogue loop at the same VF as the
+           main loop, but using partial vectors instead of full vectors.
+           In this case:
+
+             LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
+
+       (2b) Consider vectorizing the epilogue loop at lower VFs only.
+           In this case:
+
+             LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
+
+   When FOR_EPILOGUE_P is true, make this determination based on the
+   assumption that LOOP_VINFO is an epilogue loop, otherwise make it
+   based on the assumption that LOOP_VINFO is the main loop.  The caller
+   has made sure that the number of iterations is set appropriately for
+   this value of FOR_EPILOGUE_P.  */
+
+opt_result
+vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
+                                           bool for_epilogue_p)
 {
-  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
+  /* Determine whether there would be any scalar iterations left over.  */
+  bool need_peeling_or_partial_vectors_p
+    = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
 
-  unsigned HOST_WIDE_INT const_vf;
-  HOST_WIDE_INT max_niter
-    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
+  /* Decide whether to vectorize the loop with partial vectors.  */
+  LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+  LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && need_peeling_or_partial_vectors_p)
+    {
+      /* For partial-vector-usage=1, try to push the handling of partial
+        vectors to the epilogue, with the main loop continuing to operate
+        on full vectors.
+
+        ??? We could then end up failing to use partial vectors if we
+        decide to peel iterations into a prologue, and if the main loop
+        then ends up processing fewer than VF iterations.  */
+      if (param_vect_partial_vector_usage == 1
+         && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+         && !vect_known_niters_smaller_than_vf (loop_vinfo))
+       LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+      else
+       LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+    }
 
-  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
-  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
-    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
-                                         (loop_vinfo));
+  if (dump_enabled_p ())
+    {
+      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "operating on partial vectors%s.\n",
+                        for_epilogue_p ? " for epilogue loop" : "");
+      else
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "operating only on full vectors%s.\n",
+                        for_epilogue_p ? " for epilogue loop" : "");
+    }
 
-  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
-    /* The main loop handles all iterations.  */
-    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
-  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-          && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
+  if (for_epilogue_p)
     {
-      /* Work out the (constant) number of iterations that need to be
-        peeled for reasons other than niters.  */
-      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
-      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
-       peel_niter += 1;
-      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
-                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
-       LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
+      loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
+      gcc_assert (orig_loop_vinfo);
+      if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+       gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+                             LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
+    }
+
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+    {
+      /* Check that the loop processes at least one full vector.  */
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
+      if (known_lt (wi::to_widest (scalar_niters), vf))
+       return opt_result::failure_at (vect_location,
+                                      "loop does not have enough iterations"
+                                      " to support vectorization.\n");
+
+      /* If we need to peel an extra epilogue iteration to handle data
+        accesses with gaps, check that there are enough scalar iterations
+        available.
+
+        The check above is redundant with this one when peeling for gaps,
+        but the distinction is useful for diagnostics.  */
+      tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+         && known_lt (wi::to_widest (scalar_nitersm1), vf))
+       return opt_result::failure_at (vect_location,
+                                      "loop does not have enough iterations"
+                                      " to support peeling for gaps.\n");
     }
-  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
-          /* ??? When peeling for gaps but not alignment, we could
-             try to check whether the (variable) niters is known to be
-             VF * N + 1.  That's something of a niche case though.  */
-          || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
-          || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
-          || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
-               < (unsigned) exact_log2 (const_vf))
-              /* In case of versioning, check if the maximum number of
-                 iterations is greater than th.  If they are identical,
-                 the epilogue is unnecessary.  */
-              && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
-                  || ((unsigned HOST_WIDE_INT) max_niter
-                      > (th / const_vf) * const_vf))))
-    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
-}
 
+  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
+    = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+       && need_peeling_or_partial_vectors_p);
+
+  return opt_result::success ();
+}
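
A source-level illustration of the three outcomes documented above, for an invented loop whose trip count n need not be a multiple of the vectorization factor:

/* Illustration only.  For this loop the new function chooses between:
   (1)  a main loop using partial vectors (masks/lengths) that also
        covers the ragged tail;
   (2a) a full-vector main loop plus an epilogue vectorized at the same
        VF with partial vectors;
   (2b) a full-vector main loop plus an epilogue considered only at
        lower VFs (and ultimately a scalar epilogue).  */
void
add_arrays (int *restrict a, const int *restrict b,
            const int *restrict c, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = b[i] + c[i];
}
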
 
 /* Function vect_analyze_loop_2.
 
@@ -2163,6 +2325,9 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
 
       /* Optimize the SLP graph with the vectorization factor fixed.  */
       vect_optimize_slp (loop_vinfo);
+
+      /* Gather the loads reachable from the SLP graph entries.  */
+      vect_gather_slp_loads (loop_vinfo);
     }
 
   bool saved_can_use_partial_vectors_p
@@ -2230,6 +2395,79 @@ start_over:
                                       "unsupported SLP instances\n");
          goto again;
        }
+
+      /* Check whether any load in ALL SLP instances is possibly permuted.  */
+      slp_tree load_node, slp_root;
+      unsigned i, x;
+      slp_instance instance;
+      bool can_use_lanes = true;
+      FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
+       {
+         slp_root = SLP_INSTANCE_TREE (instance);
+         int group_size = SLP_TREE_LANES (slp_root);
+         tree vectype = SLP_TREE_VECTYPE (slp_root);
+         bool loads_permuted = false;
+         FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
+           {
+             if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
+               continue;
+             unsigned j;
+             stmt_vec_info load_info;
+             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
+               if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
+                 {
+                   loads_permuted = true;
+                   break;
+                 }
+           }
+
+         /* If the loads and stores can be handled with load/store-lane
+            instructions record it and move on to the next instance.  */
+         if (loads_permuted
+             && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
+             && vect_store_lanes_supported (vectype, group_size, false))
+           {
+             FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
+               {
+                 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
+                     (SLP_TREE_SCALAR_STMTS (load_node)[0]);
+                 /* Use SLP for strided accesses (or if we can't
+                    load-lanes).  */
+                 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
+                     || ! vect_load_lanes_supported
+                           (STMT_VINFO_VECTYPE (stmt_vinfo),
+                            DR_GROUP_SIZE (stmt_vinfo), false))
+                   break;
+               }
+
+             can_use_lanes
+               = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
+
+             if (can_use_lanes && dump_enabled_p ())
+               dump_printf_loc (MSG_NOTE, vect_location,
+                                "SLP instance %p can use load/store-lanes\n",
+                                instance);
+           }
+         else
+           {
+             can_use_lanes = false;
+             break;
+           }
+       }
+
+      /* If all SLP instances can use load/store-lanes, abort SLP and try again
+        with SLP disabled.  */
+      if (can_use_lanes)
+       {
+         ok = opt_result::failure_at (vect_location,
+                                      "Built SLP cancelled: can use "
+                                      "load/store-lanes\n");
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "Built SLP cancelled: all SLP instances support "
+                            "load/store-lanes\n");
+         goto again;
+       }
     }
 
   /* Dissolve SLP-only groups.  */
@@ -2261,70 +2499,32 @@ start_over:
       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
     }
 
-  /* Decide whether to vectorize a loop with partial vectors for
-     this vectorization factor.  */
-  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
-    {
-      if (param_vect_partial_vector_usage == 0)
-       LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
-      else if (vect_verify_full_masking (loop_vinfo)
-              || vect_verify_loop_lens (loop_vinfo))
-       {
-         /* The epilogue and other known niters less than VF
-           cases can still use vector access with length fully.  */
-         if (param_vect_partial_vector_usage == 1
-             && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
-             && !vect_known_niters_smaller_than_vf (loop_vinfo))
-           {
-             LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
-             LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
-           }
-         else
-           LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
-       }
-      else
-       LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
-    }
-  else
-    LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
-
-  if (dump_enabled_p ())
-    {
-      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "operating on partial vectors.\n");
-      else
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "operating only on full vectors.\n");
-    }
-
-  /* If epilog loop is required because of data accesses with gaps,
-     one additional iteration needs to be peeled.  Check if there is
-     enough iterations for vectorization.  */
-  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
-      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
-    {
-      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-      tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
-
-      if (known_lt (wi::to_widest (scalar_niters), vf))
-       return opt_result::failure_at (vect_location,
-                                      "loop has no enough iterations to"
-                                      " support peeling for gaps.\n");
-    }
+  /* If we still have the option of using partial vectors,
+     check whether we can generate the necessary loop controls.  */
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && !vect_verify_full_masking (loop_vinfo)
+      && !vect_verify_loop_lens (loop_vinfo))
+    LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
 
   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
      to be able to handle fewer than VF scalars, or needs to have a lower VF
      than the main loop.  */
   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
-      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+      && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
                   LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
     return opt_result::failure_at (vect_location,
                                   "Vectorization factor too high for"
                                   " epilogue loop.\n");
 
+  /* Decide whether this loop_vinfo should use partial vectors or peeling,
+     assuming that the loop will be used as a main loop.  We will redo
+     this analysis later if we instead decide to use the loop as an
+     epilogue loop.  */
+  ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
+  if (!ok)
+    return ok;
+
   /* Check the costings of the loop make vectorizing worthwhile.  */
   res = vect_analyze_loop_costing (loop_vinfo);
   if (res < 0)
@@ -2337,7 +2537,6 @@ start_over:
     return opt_result::failure_at (vect_location,
                                   "Loop costings not worthwhile.\n");
 
-  determine_peel_for_niter (loop_vinfo);
   /* If an epilogue loop is required make sure we can create one.  */
   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
@@ -2467,7 +2666,7 @@ again:
   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
   /* Free the SLP instances.  */
   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
-    vect_free_slp_instance (instance, false);
+    vect_free_slp_instance (instance);
   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
   /* Reset SLP type to loop_vect on all stmts.  */
   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
@@ -3157,14 +3356,17 @@ pop:
          fail = true;
          break;
        }
-      /* Check there's only a single stmt the op is used on inside
-         of the loop.  */
+      /* Check there's only a single stmt the op is used on.  For the
+        non-value-changing tail and the last stmt, allow out-of-loop uses.
+        ???  We could relax this and handle arbitrary live stmts by
+        forcing a scalar epilogue for example.  */
       imm_use_iterator imm_iter;
       gimple *op_use_stmt;
       unsigned cnt = 0;
       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
        if (!is_gimple_debug (op_use_stmt)
-           && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
+           && (*code != ERROR_MARK
+               || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
          {
            /* We want to allow x + x but not x < 1 ? x : 2.  */
            if (is_gimple_assign (op_use_stmt)
@@ -4089,6 +4291,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 
   if (vec_outside_cost <= 0)
     min_profitable_estimate = 0;
+  /* ??? This "else if" arm is written to handle all cases; see below for
+     what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
     {
       /* This is a repeat of the code above, but with + SOC rather
@@ -4345,34 +4549,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
 }
 
 
-/* Function vect_model_induction_cost.
-
-   Models cost for induction operations.  */
-
-static void
-vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
-                          stmt_vector_for_cost *cost_vec)
-{
-  unsigned inside_cost, prologue_cost;
-
-  if (PURE_SLP_STMT (stmt_info))
-    return;
-
-  /* loop cost for vec_loop.  */
-  inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
-                                 stmt_info, 0, vect_body);
-
-  /* prologue cost for vec_init and vec_step.  */
-  prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
-                                   stmt_info, 0, vect_prologue);
-
-  if (dump_enabled_p ())
-    dump_printf_loc (MSG_NOTE, vect_location,
-                     "vect_model_induction_cost: inside_cost = %d, "
-                     "prologue_cost = %d .\n", inside_cost, prologue_cost);
-}
-
-
 
 /* Function get_initial_def_for_reduction
 
@@ -6238,9 +6414,28 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
     {
       if (is_a <gphi *> (stmt_info->stmt))
-       /* Analysis for double-reduction is done on the outer
-          loop PHI, nested cycles have no further restrictions.  */
-       STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
+       {
+         if (slp_node)
+           {
+             /* We eventually need to set a vector type on invariant
+                arguments.  */
+             unsigned j;
+             slp_tree child;
+             FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
+               if (!vect_maybe_update_slp_op_vectype
+                      (child, SLP_TREE_VECTYPE (slp_node)))
+                 {
+                   if (dump_enabled_p ())
+                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                      "incompatible vector types for "
+                                      "invariants\n");
+                   return false;
+                 }
+           }
+         /* Analysis for double-reduction is done on the outer
+            loop PHI, nested cycles have no further restrictions.  */
+         STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
+       }
       else
        STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
       return true;
@@ -6282,12 +6477,16 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
 
   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
-     and compute the reduction chain length.  */
-  tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
-                                         loop_latch_edge (loop));
+     and compute the reduction chain length.  Discover the real
+     reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
+  tree reduc_def
+    = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
+                            loop_latch_edge
+                              (gimple_bb (reduc_def_phi)->loop_father));
   unsigned reduc_chain_length = 0;
   bool only_slp_reduc_chain = true;
   stmt_info = NULL;
+  slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
   while (reduc_def != PHI_RESULT (reduc_def_phi))
     {
       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
@@ -6330,6 +6529,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
        stmt_info = vdef;
       reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
       reduc_chain_length++;
+      if (!stmt_info && slp_node)
+       slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
     }
   /* PHIs should not participate in patterns.  */
   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
@@ -6416,17 +6617,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
      The last use is the reduction variable.  In case of nested cycle this
      assumption is not true: we use reduc_index to record the index of the
      reduction variable.  */
-  /* ???  To get at invariant/constant uses on the SLP node we have to
-     get to it here, slp_node is still the reduction PHI.  */
-  slp_tree slp_for_stmt_info = NULL;
-  if (slp_node)
-    {
-      slp_for_stmt_info = slp_node_instance->root;
-      /* And then there's reduction chain with a conversion ...  */
-      if (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) != stmt_info)
-       slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
-      gcc_assert (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) == stmt_info);
-    }
   slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
   /* We need to skip an extra operand for COND_EXPRs with embedded
      comparison.  */
@@ -7307,15 +7497,24 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
   if (slp_node)
     {
       vec_initial_defs.reserve (vec_num);
-      gcc_assert (slp_node == slp_node_instance->reduc_phis);
-      stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
-      tree neutral_op
-       = neutral_op_for_slp_reduction (slp_node, vectype_out,
-                                       STMT_VINFO_REDUC_CODE (reduc_info),
-                                       first != NULL);
-      get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
-                                     &vec_initial_defs, vec_num,
-                                     first != NULL, neutral_op);
+      if (nested_cycle)
+       {
+         unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
+         vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
+                            &vec_initial_defs);
+       }
+      else
+       {
+         gcc_assert (slp_node == slp_node_instance->reduc_phis);
+         stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
+         tree neutral_op
+             = neutral_op_for_slp_reduction (slp_node, vectype_out,
+                                             STMT_VINFO_REDUC_CODE (reduc_info),
+                                             first != NULL);
+         get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
+                                         &vec_initial_defs, vec_num,
+                                         first != NULL, neutral_op);
+       }
     }
   else
     {
@@ -7421,6 +7620,17 @@ vectorizable_lc_phi (loop_vec_info loop_vinfo,
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      /* Deal with copies from externs or constants that masquerade as
+        loop-closed PHI nodes (PR97886).  */
+      if (slp_node
+         && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
+                                               SLP_TREE_VECTYPE (slp_node)))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "incompatible vector types for invariants\n");
+         return false;
+       }
       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
       return true;
     }
@@ -7450,6 +7660,81 @@ vectorizable_lc_phi (loop_vec_info loop_vinfo,
   return true;
 }
 
+/* Vectorizes PHIs.  */
+
+bool
+vectorizable_phi (vec_info *,
+                 stmt_vec_info stmt_info, gimple **vec_stmt,
+                 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
+{
+  if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
+    return false;
+
+  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
+    return false;
+
+  tree vectype = SLP_TREE_VECTYPE (slp_node);
+
+  if (!vec_stmt) /* transformation not required.  */
+    {
+      slp_tree child;
+      unsigned i;
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
+       if (!child)
+         {
+           if (dump_enabled_p ())
+             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                              "PHI node with unvectorized backedge def\n");
+           return false;
+         }
+       else if (!vect_maybe_update_slp_op_vectype (child, vectype))
+         {
+           if (dump_enabled_p ())
+             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                              "incompatible vector types for invariants\n");
+           return false;
+         }
+      record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
+                       vector_stmt, stmt_info, vectype, 0, vect_body);
+      STMT_VINFO_TYPE (stmt_info) = phi_info_type;
+      return true;
+    }
+
+  tree scalar_dest = gimple_phi_result (stmt_info->stmt);
+  basic_block bb = gimple_bb (stmt_info->stmt);
+  tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
+  auto_vec<gphi *> new_phis;
+  for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
+    {
+      slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
+
+      /* Skip not yet vectorized defs.  */
+      if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
+         && SLP_TREE_VEC_STMTS (child).is_empty ())
+       continue;
+
+      auto_vec<tree> vec_oprnds;
+      vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
+      if (!new_phis.exists ())
+       {
+         new_phis.create (vec_oprnds.length ());
+         for (unsigned j = 0; j < vec_oprnds.length (); j++)
+           {
+             /* Create the vectorized LC PHI node.  */
+             new_phis.quick_push (create_phi_node (vec_dest, bb));
+             SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
+           }
+       }
+      edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
+      for (unsigned j = 0; j < vec_oprnds.length (); j++)
+       add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
+    }
+  /* We should have at least one already vectorized child.  */
+  gcc_assert (new_phis.exists ());
+
+  return true;
+}
+
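
The new vectorizable_phi appears to be aimed at PHIs that basic-block SLP builds at control-flow merges; a minimal invented source-level example of such a merge is:

/* Illustration only: x0 and x1 become PHI nodes at the if/else join.
   When the two stores are grouped by BB SLP, the pair of join PHIs
   must be vectorized as well.  */
void
select_pair (int *restrict out, const int *a, const int *b, int flag)
{
  int x0, x1;
  if (flag)
    {
      x0 = a[0];
      x1 = a[1];
    }
  else
    {
      x0 = b[0];
      x1 = b[1];
    }
  out[0] = x0;
  out[1] = x1;
}
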
 
 /* Function vect_min_worthwhile_factor.
 
@@ -7520,7 +7805,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   unsigned i;
   tree expr;
-  gimple_seq stmts;
   gimple_stmt_iterator si;
 
   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
@@ -7560,10 +7844,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
          return false;
        }
 
-      /* FORNOW: outer loop induction with SLP not supported.  */
-      if (STMT_SLP_TYPE (stmt_info))
-       return false;
-
       exit_phi = NULL;
       latch_e = loop_latch_edge (loop->inner);
       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
@@ -7602,7 +7882,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 
   if (slp_node && !nunits.is_constant ())
     {
-      /* The current SLP code creates the initial value element-by-element.  */
+      /* The current SLP code creates the step value element-by-element.  */
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "SLP induction not supported for variable-length"
@@ -7612,9 +7892,50 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      unsigned inside_cost = 0, prologue_cost = 0;
+      if (slp_node)
+       {
+         /* We eventually need to set a vector type on invariant
+            arguments.  */
+         unsigned j;
+         slp_tree child;
+         FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
+           if (!vect_maybe_update_slp_op_vectype
+               (child, SLP_TREE_VECTYPE (slp_node)))
+             {
+               if (dump_enabled_p ())
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  "incompatible vector types for "
+                                  "invariants\n");
+               return false;
+             }
+         /* loop cost for vec_loop.  */
+         inside_cost
+           = record_stmt_cost (cost_vec,
+                               SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
+                               vector_stmt, stmt_info, 0, vect_body);
+         /* prologue cost for vec_init (if not nested) and step.  */
+         prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
+                                           scalar_to_vec,
+                                           stmt_info, 0, vect_prologue);
+       }
+      else /* if (!slp_node) */
+       {
+         /* loop cost for vec_loop.  */
+         inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
+                                         stmt_info, 0, vect_body);
+         /* prologue cost for vec_init and vec_step.  */
+         prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
+                                           stmt_info, 0, vect_prologue);
+       }
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "vect_model_induction_cost: inside_cost = %d, "
+                        "prologue_cost = %d .\n", inside_cost,
+                        prologue_cost);
+
       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
       DUMP_VECT_SCOPE ("vectorizable_induction");
-      vect_model_induction_cost (stmt_info, ncopies, cost_vec);
       return true;
     }
 
@@ -7633,164 +7954,195 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
 
   pe = loop_preheader_edge (iv_loop);
-  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
-                                    loop_preheader_edge (iv_loop));
-
-  stmts = NULL;
-  if (!nested_in_vect_loop)
-    {
-      /* Convert the initial value to the IV update type.  */
-      tree new_type = TREE_TYPE (step_expr);
-      init_expr = gimple_convert (&stmts, new_type, init_expr);
-
-      /* If we are using the loop mask to "peel" for alignment then we need
-        to adjust the start value here.  */
-      tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
-      if (skip_niters != NULL_TREE)
-       {
-         if (FLOAT_TYPE_P (vectype))
-           skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
-                                       skip_niters);
-         else
-           skip_niters = gimple_convert (&stmts, new_type, skip_niters);
-         tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
-                                        skip_niters, step_expr);
-         init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
-                                   init_expr, skip_step);
-       }
-    }
-
-  if (stmts)
-    {
-      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
-      gcc_assert (!new_bb);
-    }
-
   /* Find the first insertion point in the BB.  */
   basic_block bb = gimple_bb (phi);
   si = gsi_after_labels (bb);
 
   /* For SLP induction we have to generate several IVs as for example
-     with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
-     [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
-     [VF*S, VF*S, VF*S, VF*S] for all.  */
+     with group size 3 we need
+       [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
+       [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
   if (slp_node)
     {
       /* Enforced above.  */
       unsigned int const_nunits = nunits.to_constant ();
 
-      /* Generate [VF*S, VF*S, ... ].  */
-      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
+      /* The initial values are vectorized, but any lanes > group_size
+        need adjustment.  */
+      slp_tree init_node
+       = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
+
+      /* Gather steps.  Since we do not vectorize inductions as
+        cycles we have to reconstruct the step from SCEV data.  */
+      unsigned group_size = SLP_TREE_LANES (slp_node);
+      tree *steps = XALLOCAVEC (tree, group_size);
+      tree *inits = XALLOCAVEC (tree, group_size);
+      stmt_vec_info phi_info;
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
        {
-         expr = build_int_cst (integer_type_node, vf);
-         expr = fold_convert (TREE_TYPE (step_expr), expr);
+         steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
+         if (!init_node)
+           inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
+                                          pe->dest_idx);
        }
-      else
-       expr = build_int_cst (TREE_TYPE (step_expr), vf);
-      new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
-                             expr, step_expr);
-      if (! CONSTANT_CLASS_P (new_name))
-       new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
-                                    TREE_TYPE (step_expr), NULL);
-      new_vec = build_vector_from_val (step_vectype, new_name);
-      vec_step = vect_init_vector (loop_vinfo, stmt_info,
-                                  new_vec, step_vectype, NULL);
 
       /* Now generate the IVs.  */
-      unsigned group_size = SLP_TREE_LANES (slp_node);
       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
-      unsigned elts = const_nunits * nvects;
-      /* Compute the number of distinct IVs we need.  First reduce
-        group_size if it is a multiple of const_nunits so we get
-        one IV for a group_size of 4 but const_nunits 2.  */
-      unsigned group_sizep = group_size;
-      if (group_sizep % const_nunits == 0)
-       group_sizep = group_sizep / const_nunits;
-      unsigned nivs = least_common_multiple (group_sizep,
-                                            const_nunits) / const_nunits;
-      gcc_assert (elts % group_size == 0);
-      tree elt = init_expr;
+      gcc_assert ((const_nunits * nvects) % group_size == 0);
+      unsigned nivs;
+      if (nested_in_vect_loop)
+       nivs = nvects;
+      else
+       {
+         /* Compute the number of distinct IVs we need.  First reduce
+            group_size if it is a multiple of const_nunits so we get
+            one IV for a group_size of 4 but const_nunits 2.  */
+         unsigned group_sizep = group_size;
+         if (group_sizep % const_nunits == 0)
+           group_sizep = group_sizep / const_nunits;
+         nivs = least_common_multiple (group_sizep,
+                                       const_nunits) / const_nunits;
+       }
+      tree stept = TREE_TYPE (step_vectype);
+      tree lupdate_mul = NULL_TREE;
+      if (!nested_in_vect_loop)
+       {
+         /* The number of iterations covered in one vector iteration.  */
+         unsigned lup_mul = (nvects * const_nunits) / group_size;
+         lupdate_mul
+           = build_vector_from_val (step_vectype,
+                                    SCALAR_FLOAT_TYPE_P (stept)
+                                    ? build_real_from_wide (stept, lup_mul,
+                                                            UNSIGNED)
+                                    : build_int_cstu (stept, lup_mul));
+       }
+      tree peel_mul = NULL_TREE;
+      gimple_seq init_stmts = NULL;
+      if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
+       {
+         if (SCALAR_FLOAT_TYPE_P (stept))
+           peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
+                                    LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
+         else
+           peel_mul = gimple_convert (&init_stmts, stept,
+                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
+         peel_mul = gimple_build_vector_from_val (&init_stmts,
+                                                  step_vectype, peel_mul);
+       }
       unsigned ivn;
+      auto_vec<tree> vec_steps;
       for (ivn = 0; ivn < nivs; ++ivn)
        {
-         tree_vector_builder elts (step_vectype, const_nunits, 1);
-         stmts = NULL;
+         tree_vector_builder step_elts (step_vectype, const_nunits, 1);
+         tree_vector_builder init_elts (vectype, const_nunits, 1);
+         tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
          for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
            {
-             if (ivn*const_nunits + eltn >= group_size
-                 && (ivn * const_nunits + eltn) % group_size == 0)
-               elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
-                                   elt, step_expr);
-             elts.quick_push (elt);
-           }
-         vec_init = gimple_build_vector (&stmts, &elts);
-         vec_init = gimple_convert (&stmts, vectype, vec_init);
-         if (stmts)
-           {
-             new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
-             gcc_assert (!new_bb);
+             /* The scalar steps of the IVs.  */
+             tree elt = steps[(ivn*const_nunits + eltn) % group_size];
+             elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
+             step_elts.quick_push (elt);
+             if (!init_node)
+               {
+                 /* The scalar inits of the IVs if not vectorized.  */
+                 elt = inits[(ivn*const_nunits + eltn) % group_size];
+                 if (!useless_type_conversion_p (TREE_TYPE (vectype),
+                                                 TREE_TYPE (elt)))
+                   elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
+                                       TREE_TYPE (vectype), elt);
+                 init_elts.quick_push (elt);
+               }
+             /* The number of steps to add to the initial values.  */
+             unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
+             mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
+                                  ? build_real_from_wide (stept,
+                                                          mul_elt, UNSIGNED)
+                                  : build_int_cstu (stept, mul_elt));
            }
+         vec_step = gimple_build_vector (&init_stmts, &step_elts);
+         vec_steps.safe_push (vec_step);
+         tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
+         if (peel_mul)
+           step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
+                                    step_mul, peel_mul);
+         if (!init_node)
+           vec_init = gimple_build_vector (&init_stmts, &init_elts);
 
          /* Create the induction-phi that defines the induction-operand.  */
-         vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
+         vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
+                                           "vec_iv_");
          induction_phi = create_phi_node (vec_dest, iv_loop->header);
          induc_def = PHI_RESULT (induction_phi);
 
          /* Create the iv update inside the loop  */
+         tree up = vec_step;
+         if (lupdate_mul)
+           up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+                              vec_step, lupdate_mul);
          gimple_seq stmts = NULL;
          vec_def = gimple_convert (&stmts, step_vectype, induc_def);
          vec_def = gimple_build (&stmts,
-                                 PLUS_EXPR, step_vectype, vec_def, vec_step);
+                                 PLUS_EXPR, step_vectype, vec_def, up);
          vec_def = gimple_convert (&stmts, vectype, vec_def);
          gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
+         add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
+                      UNKNOWN_LOCATION);
+
+         if (init_node)
+           vec_init = vect_get_slp_vect_def (init_node, ivn);
+         if (!nested_in_vect_loop
+             && !integer_zerop (step_mul))
+           {
+             vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
+             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+                                vec_step, step_mul);
+             vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
+                                     vec_def, up);
+             vec_init = gimple_convert (&init_stmts, vectype, vec_def);
+           }
 
          /* Set the arguments of the phi node:  */
          add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
-         add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
-                      UNKNOWN_LOCATION);
 
          SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
        }
-      /* Fill up to the number of vectors we need for the whole group.  */
-      nivs = least_common_multiple (group_size,
-                                   const_nunits) / const_nunits;
-      for (; ivn < nivs; ++ivn)
-       SLP_TREE_VEC_STMTS (slp_node)
-         .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
+      if (!nested_in_vect_loop)
+       {
+         /* Fill up to the number of vectors we need for the whole group.  */
+         nivs = least_common_multiple (group_size,
+                                       const_nunits) / const_nunits;
+         for (; ivn < nivs; ++ivn)
+           {
+             SLP_TREE_VEC_STMTS (slp_node)
+               .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
+             vec_steps.safe_push (vec_steps[0]);
+           }
+       }
 
-      /* Re-use IVs when we can.  */
+      /* Re-use IVs when we can.  We are generating further vector
+        stmts by adding VF' * stride to the IVs generated above.  */
       if (ivn < nvects)
        {
          unsigned vfp
            = least_common_multiple (group_size, const_nunits) / group_size;
-         /* Generate [VF'*S, VF'*S, ... ].  */
-         if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
-           {
-             expr = build_int_cst (integer_type_node, vfp);
-             expr = fold_convert (TREE_TYPE (step_expr), expr);
-           }
-         else
-           expr = build_int_cst (TREE_TYPE (step_expr), vfp);
-         new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
-                                 expr, step_expr);
-         if (! CONSTANT_CLASS_P (new_name))
-           new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
-                                        TREE_TYPE (step_expr), NULL);
-         new_vec = build_vector_from_val (step_vectype, new_name);
-         vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec,
-                                      step_vectype, NULL);
+         tree lupdate_mul
+           = build_vector_from_val (step_vectype,
+                                    SCALAR_FLOAT_TYPE_P (stept)
+                                    ? build_real_from_wide (stept,
+                                                            vfp, UNSIGNED)
+                                    : build_int_cstu (stept, vfp));
          for (; ivn < nvects; ++ivn)
            {
              gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
-             tree def;
-             if (gimple_code (iv) == GIMPLE_PHI)
-               def = gimple_phi_result (iv);
-             else
-               def = gimple_assign_lhs (iv);
+             tree def = gimple_get_lhs (iv);
+             if (ivn < 2*nivs)
+               vec_steps[ivn - nivs]
+                 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+                                 vec_steps[ivn - nivs], lupdate_mul);
              gimple_seq stmts = NULL;
              def = gimple_convert (&stmts, step_vectype, def);
-             def = gimple_build (&stmts,
-                                 PLUS_EXPR, step_vectype, def, vec_step);
+             def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
+                                 def, vec_steps[ivn % nivs]);
              def = gimple_convert (&stmts, vectype, def);
              if (gimple_code (iv) == GIMPLE_PHI)
                gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
@@ -7804,9 +8156,45 @@ vectorizable_induction (loop_vec_info loop_vinfo,
            }
        }
 
+      new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
+      gcc_assert (!new_bb);
+
       return true;
     }
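
To make the group-size-3 comment near the top of this branch concrete, take invented starts i0=0, i1=10, i2=20, steps S0=1, S1=2, S2=3 and four lanes per vector: the three initial IV vectors are [0,10,20,1], [12,23,2,14], [26,3,16,29], and each latch update adds four times the per-lane step, since the three vectors together cover four scalar iterations of each induction. A source loop producing such an SLP induction group is:

/* Illustration only: three independent inductions stored interleaved,
   forming an SLP group of size 3 with per-lane steps 1, 2 and 3.  */
void
fill3 (int *restrict out, int n)
{
  int j0 = 0, j1 = 10, j2 = 20;
  for (int i = 0; i < n; i++)
    {
      out[3 * i] = j0;
      out[3 * i + 1] = j1;
      out[3 * i + 2] = j2;
      j0 += 1;
      j1 += 2;
      j2 += 3;
    }
}
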
 
+  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
+                                    loop_preheader_edge (iv_loop));
+
+  gimple_seq stmts = NULL;
+  if (!nested_in_vect_loop)
+    {
+      /* Convert the initial value to the IV update type.  */
+      tree new_type = TREE_TYPE (step_expr);
+      init_expr = gimple_convert (&stmts, new_type, init_expr);
+
+      /* If we are using the loop mask to "peel" for alignment then we need
+        to adjust the start value here.  */
+      tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+      if (skip_niters != NULL_TREE)
+       {
+         if (FLOAT_TYPE_P (vectype))
+           skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
+                                       skip_niters);
+         else
+           skip_niters = gimple_convert (&stmts, new_type, skip_niters);
+         tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
+                                        skip_niters, step_expr);
+         init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
+                                   init_expr, skip_step);
+       }
+    }
+
+  if (stmts)
+    {
+      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+      gcc_assert (!new_bb);
+    }
+
   /* Create the vector that holds the initial_value of the induction.  */
   if (nested_in_vect_loop)
     {
@@ -8019,17 +8407,19 @@ vectorizable_induction (loop_vec_info loop_vinfo,
    it can be supported.  */
 
 bool
-vectorizable_live_operation (loop_vec_info loop_vinfo,
+vectorizable_live_operation (vec_info *vinfo,
                             stmt_vec_info stmt_info,
                             gimple_stmt_iterator *gsi,
                             slp_tree slp_node, slp_instance slp_node_instance,
                             int slp_index, bool vec_stmt_p,
-                            stmt_vector_for_cost *)
+                            stmt_vector_for_cost *cost_vec)
 {
-  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
   imm_use_iterator imm_iter;
   tree lhs, lhs_type, bitsize, vec_bitsize;
-  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
+  tree vectype = (slp_node
+                 ? SLP_TREE_VECTYPE (slp_node)
+                 : STMT_VINFO_VECTYPE (stmt_info));
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   int ncopies;
   gimple *use_stmt;
@@ -8071,10 +8461,6 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
       return true;
     }
 
-  /* FORNOW.  CHECKME.  */
-  if (nested_in_vect_loop_p (loop, stmt_info))
-    return false;
-
   /* If STMT is not relevant and it is a simple assignment and its inputs are
      invariant then it can remain in place, unvectorized.  The original last
      scalar value that it computes will be used.  */
@@ -8097,12 +8483,11 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
     {
       gcc_assert (slp_index >= 0);
 
-      int num_scalar = SLP_TREE_LANES (slp_node);
-      int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
-
       /* Get the last occurrence of the scalar index from the concatenation of
         all the slp vectors. Calculate which slp vector it is and the index
         within.  */
+      int num_scalar = SLP_TREE_LANES (slp_node);
+      int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
 
       /* Calculate which vector contains the result, and which lane of
@@ -8120,7 +8505,7 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
   if (!vec_stmt_p)
     {
       /* No transformation required.  */
-      if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
+      if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
        {
          if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
                                               OPTIMIZE_FOR_SPEED))
@@ -8157,14 +8542,20 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
                                     1, vectype, NULL);
            }
        }
+      /* ???  Enable for loop costing as well.  */
+      if (!loop_vinfo)
+       record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
+                         0, vect_epilogue);
       return true;
     }
 
   /* Use the lhs of the original scalar statement.  */
   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
+                    "stmt %G", stmt);
 
-  lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
-       : gimple_get_lhs (stmt);
+  lhs = gimple_get_lhs (stmt);
   lhs_type = TREE_TYPE (lhs);
 
   bitsize = vector_element_bits_tree (vectype);
@@ -8172,16 +8563,14 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
 
   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
   tree vec_lhs, bitstart;
+  gimple *vec_stmt;
   if (slp_node)
     {
-      gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
+      gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
 
       /* Get the correct slp vectorized stmt.  */
-      gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
-      if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
-       vec_lhs = gimple_phi_result (phi);
-      else
-       vec_lhs = gimple_get_lhs (vec_stmt);
+      vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
+      vec_lhs = gimple_get_lhs (vec_stmt);
 
       /* Get entry to use.  */
       bitstart = bitsize_int (vec_index);
@@ -8190,102 +8579,192 @@ vectorizable_live_operation (loop_vec_info loop_vinfo,
   else
     {
       /* For multiple copies, get the last copy.  */
-      vec_lhs = gimple_get_lhs (STMT_VINFO_VEC_STMTS (stmt_info).last ());
+      vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
+      vec_lhs = gimple_get_lhs (vec_stmt);
 
       /* Get the last lane in the vector.  */
       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
     }
 
-  /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
-     requirement, insert one phi node for it.  It looks like:
-        loop;
-       BB:
-        # lhs' = PHI <lhs>
-     ==>
-        loop;
-       BB:
-        # vec_lhs' = PHI <vec_lhs>
-        new_tree = lane_extract <vec_lhs', ...>;
-        lhs' = new_tree;  */
+  if (loop_vinfo)
+    {
+      /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
+        requirement, insert one phi node for it.  It looks like:
+          loop;
+        BB:
+          # lhs' = PHI <lhs>
+        ==>
+          loop;
+        BB:
+          # vec_lhs' = PHI <vec_lhs>
+          new_tree = lane_extract <vec_lhs', ...>;
+          lhs' = new_tree;  */
+
+      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+      basic_block exit_bb = single_exit (loop)->dest;
+      gcc_assert (single_pred_p (exit_bb));
+
+      tree vec_lhs_phi = copy_ssa_name (vec_lhs);
+      gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
+      SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
 
-  basic_block exit_bb = single_exit (loop)->dest;
-  gcc_assert (single_pred_p (exit_bb));
+      gimple_seq stmts = NULL;
+      tree new_tree;
+      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
+       {
+         /* Emit:
 
-  tree vec_lhs_phi = copy_ssa_name (vec_lhs);
-  gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
-  SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
+              SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
 
-  gimple_seq stmts = NULL;
-  tree new_tree;
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
-    {
-      /* Emit:
+            where VEC_LHS is the vectorized live-out result and MASK is
+            the loop mask for the final iteration.  */
+         gcc_assert (ncopies == 1 && !slp_node);
+         tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
+         tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
+                                         1, vectype, 0);
+         tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
+                                         mask, vec_lhs_phi);
+
+         /* Convert the extracted vector element to the scalar type.  */
+         new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+       }
+      else
+       {
+         tree bftype = TREE_TYPE (vectype);
+         if (VECTOR_BOOLEAN_TYPE_P (vectype))
+           bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
+         new_tree = build3 (BIT_FIELD_REF, bftype,
+                            vec_lhs_phi, bitsize, bitstart);
+         new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
+                                          &stmts, true, NULL_TREE);
+       }
 
-          SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
+      if (stmts)
+       {
+         gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
+         gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
 
-        where VEC_LHS is the vectorized live-out result and MASK is
-        the loop mask for the final iteration.  */
-      gcc_assert (ncopies == 1 && !slp_node);
-      tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
-      tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
-                                     vectype, 0);
-      tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
-                                     mask, vec_lhs_phi);
+         /* Remove existing phi from lhs and create one copy from new_tree.  */
+         tree lhs_phi = NULL_TREE;
+         gimple_stmt_iterator gsi;
+         for (gsi = gsi_start_phis (exit_bb);
+              !gsi_end_p (gsi); gsi_next (&gsi))
+           {
+             gimple *phi = gsi_stmt (gsi);
+             if ((gimple_phi_arg_def (phi, 0) == lhs))
+               {
+                 remove_phi_node (&gsi, false);
+                 lhs_phi = gimple_phi_result (phi);
+                 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
+                 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
+                 break;
+               }
+           }
+       }
 
-      /* Convert the extracted vector element to the required scalar type.  */
-      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
+      /* Replace use of lhs with newly computed result.  If the use stmt is a
+        single arg PHI, just replace all uses of PHI result.  It's necessary
+        because lcssa PHI defining lhs may be before newly inserted stmt.  */
+      use_operand_p use_p;
+      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
+           && !is_gimple_debug (use_stmt))
+         {
+           if (gimple_code (use_stmt) == GIMPLE_PHI
+               && gimple_phi_num_args (use_stmt) == 1)
+             {
+               replace_uses_by (gimple_phi_result (use_stmt), new_tree);
+             }
+           else
+             {
+               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+                   SET_USE (use_p, new_tree);
+             }
+           update_stmt (use_stmt);
+         }
     }
   else
     {
+      /* For basic-block vectorization simply insert the lane-extraction.  */
       tree bftype = TREE_TYPE (vectype);
       if (VECTOR_BOOLEAN_TYPE_P (vectype))
        bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
-      new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
+      tree new_tree = build3 (BIT_FIELD_REF, bftype,
+                             vec_lhs, bitsize, bitstart);
+      gimple_seq stmts = NULL;
       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
                                       &stmts, true, NULL_TREE);
-    }
-
-  if (stmts)
-    {
-      gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
-      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
-
-      /* Remove existing phi from lhs and create one copy from new_tree.  */
-      tree lhs_phi = NULL_TREE;
-      gimple_stmt_iterator gsi;
-      for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
+      if (TREE_CODE (new_tree) == SSA_NAME
+         && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
+       SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
+      if (is_a <gphi *> (vec_stmt))
        {
-         gimple *phi = gsi_stmt (gsi);
-         if ((gimple_phi_arg_def (phi, 0) == lhs))
-           {
-             remove_phi_node (&gsi, false);
-             lhs_phi = gimple_phi_result (phi);
-             gimple *copy = gimple_build_assign (lhs_phi, new_tree);
-             gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
-             break;
-           }
-       }
-    }
-
-  /* Replace use of lhs with newly computed result.  If the use stmt is a
-     single arg PHI, just replace all uses of PHI result.  It's necessary
-     because lcssa PHI defining lhs may be before newly inserted stmt.  */
-  use_operand_p use_p;
-  FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
-    if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
-       && !is_gimple_debug (use_stmt))
-    {
-      if (gimple_code (use_stmt) == GIMPLE_PHI
-         && gimple_phi_num_args (use_stmt) == 1)
-       {
-         replace_uses_by (gimple_phi_result (use_stmt), new_tree);
+         gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
+         gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
        }
       else
        {
-         FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
-           SET_USE (use_p, new_tree);
+         gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
+         gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
        }
-      update_stmt (use_stmt);
+
+      /* Replace use of lhs with newly computed result.  If the use stmt is a
+        single arg PHI, just replace all uses of PHI result.  It's necessary
+        because lcssa PHI defining lhs may be before newly inserted stmt.  */
+      use_operand_p use_p;
+      stmt_vec_info use_stmt_info;
+      FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
+       if (!is_gimple_debug (use_stmt)
+           && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
+               || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
+         {
+           /* ???  This can happen when the live lane ends up being
+              used in a vector construction code-generated by an
+              external SLP node (and code-generation for that already
+              happened).  See gcc.dg/vect/bb-slp-47.c.
+              Doing this is what would happen if that vector CTOR
+              were not code-generated yet, so it is not too bad.
+              ???  In fact we'd likely want to avoid this situation
+              in the first place.  */
+           if (TREE_CODE (new_tree) == SSA_NAME
+               && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
+               && gimple_code (use_stmt) != GIMPLE_PHI
+               && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
+                                               use_stmt))
+             {
+               enum tree_code code = gimple_assign_rhs_code (use_stmt);
+               gcc_assert (code == CONSTRUCTOR
+                           || code == VIEW_CONVERT_EXPR
+                           || CONVERT_EXPR_CODE_P (code));
+               if (dump_enabled_p ())
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  "Using original scalar computation for "
+                                  "live lane because use precedes vector "
+                                  "def\n");
+               continue;
+             }
+           /* ???  It can also happen that we end up pulling a def into
+              a loop where replacing out-of-loop uses would require
+              a new LC SSA PHI node.  Retain the original scalar in
+              those cases as well.  PR98064.  */
+           if (TREE_CODE (new_tree) == SSA_NAME
+               && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
+               && (gimple_bb (use_stmt)->loop_father
+                   != gimple_bb (vec_stmt)->loop_father)
+               && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
+                                       gimple_bb (use_stmt)->loop_father))
+             {
+               if (dump_enabled_p ())
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  "Using original scalar computation for "
+                                  "live lane because there is an out-of-loop "
+                                  "definition for it\n");
+               continue;
+             }
+           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
+             SET_USE (use_p, new_tree);
+           update_stmt (use_stmt);
+         }
     }
 
   return true;
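
    [editor's note: illustrative aside, not part of the patch]  Both the loop and
    the basic-block paths above materialize the live lane with a BIT_FIELD_REF of
    width BITSIZE at offset BITSTART, which at the source level is just a vector
    subscript.  A hedged illustration using GCC's generic vector extension (not
    the GIMPLE actually emitted):

      /* Reading one lane of a vector; BITSTART in the code above plays the
         role of LANE * BITSIZE.  Illustrative only.  */
      typedef int v4si __attribute__ ((vector_size (16)));

      int
      read_lane (v4si v, int lane)
      {
        return v[lane];
      }
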
@@ -8394,7 +8873,7 @@ vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
 {
   gcc_assert (nvectors != 0);
   if (masks->length () < nvectors)
-    masks->safe_grow_cleared (nvectors);
+    masks->safe_grow_cleared (nvectors, true);
   rgroup_controls *rgm = &(*masks)[nvectors - 1];
   /* The number of scalars per iteration and the number of vectors are
      both compile-time constants.  */
@@ -8434,7 +8913,7 @@ vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
      used it.  */
   if (rgm->controls.is_empty ())
     {
-      rgm->controls.safe_grow_cleared (nvectors);
+      rgm->controls.safe_grow_cleared (nvectors, true);
       for (unsigned int i = 0; i < nvectors; ++i)
        {
          tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
@@ -8476,7 +8955,7 @@ vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
 {
   gcc_assert (nvectors != 0);
   if (lens->length () < nvectors)
-    lens->safe_grow_cleared (nvectors);
+    lens->safe_grow_cleared (nvectors, true);
   rgroup_controls *rgl = &(*lens)[nvectors - 1];
 
   /* The number of scalars per iteration, scalar occupied bytes and
@@ -8512,7 +8991,7 @@ vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
      used it.  */
   if (rgl->controls.is_empty ())
     {
-      rgl->controls.safe_grow_cleared (nvectors);
+      rgl->controls.safe_grow_cleared (nvectors, true);
       for (unsigned int i = 0; i < nvectors; ++i)
        {
          tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
@@ -8562,11 +9041,47 @@ scale_profile_for_vect_loop (class loop *loop, unsigned vf)
     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
 }
 
+/* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
+   latch edge values originally defined by it.  */
+
+static void
+maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
+                                    stmt_vec_info def_stmt_info)
+{
+  tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
+  if (!def || TREE_CODE (def) != SSA_NAME)
+    return;
+  stmt_vec_info phi_info;
+  imm_use_iterator iter;
+  use_operand_p use_p;
+  FOR_EACH_IMM_USE_FAST (use_p, iter, def)
+    if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
+      if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
+         && (phi_info = loop_vinfo->lookup_stmt (phi))
+         && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
+         && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
+         && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
+       {
+         loop_p loop = gimple_bb (phi)->loop_father;
+         edge e = loop_latch_edge (loop);
+         if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
+           {
+             vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
+             vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
+             gcc_assert (phi_defs.length () == latch_defs.length ());
+             for (unsigned i = 0; i < phi_defs.length (); ++i)
+               add_phi_arg (as_a <gphi *> (phi_defs[i]),
+                            gimple_get_lhs (latch_defs[i]), e,
+                            gimple_phi_arg_location (phi, e->dest_idx));
+           }
+       }
+}
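
    [editor's note: illustrative aside, not part of the patch]  The helper above
    walks the uses of a freshly vectorized definition and, for every vectorized
    cycle PHI that consumes it on the latch edge, adds the matching vector latch
    arguments.  The scalar shape it services is the familiar loop-carried cycle,
    shown here as a hedged example (placeholder function, not taken from the
    testsuite):

      /* SUM is a loop-header PHI; the in-loop add is the latch definition
         the PHI consumes on the back edge.  Once that add is vectorized,
         the vector PHIs get their latch arguments filled in.  */
      int
      reduc_sum (const int *a, int n)
      {
        int sum = 0;                /* header PHI <0 (preheader), sum' (latch)>  */
        for (int i = 0; i < n; ++i)
          sum = sum + a[i];         /* latch definition of the cycle  */
        return sum;
      }
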
+
 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
    stmt_vec_info.  */
 
-static void
+static bool
 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
                          gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
 {
@@ -8582,7 +9097,7 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
 
   if (!STMT_VINFO_RELEVANT_P (stmt_info)
       && !STMT_VINFO_LIVE_P (stmt_info))
-    return;
+    return false;
 
   if (STMT_VINFO_VECTYPE (stmt_info))
     {
@@ -8599,13 +9114,15 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
   /* Pure SLP statements have already been vectorized.  We still need
      to apply loop vectorization to hybrid SLP statements.  */
   if (PURE_SLP_STMT (stmt_info))
-    return;
+    return false;
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
 
   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
     *seen_store = stmt_info;
+
+  return true;
 }
 
 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
@@ -8654,6 +9171,7 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
   basic_block *epilogue_bbs = get_loop_body (epilogue);
   unsigned i;
 
+  free (LOOP_VINFO_BBS (epilogue_vinfo));
   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
 
   /* Advance data_reference's with the number of iterations of the previous
@@ -8915,8 +9433,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 
   split_edge (loop_preheader_edge (loop));
 
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
-      && vect_use_loop_mask_for_alignment_p (loop_vinfo))
+  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
     /* This will deal with any possible peeling.  */
     vect_prepare_for_masked_peels (loop_vinfo);
 
@@ -8925,7 +9442,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
   if (!loop_vinfo->slp_instances.is_empty ())
     {
       DUMP_VECT_SCOPE ("scheduling SLP instances");
-      vect_schedule_slp (loop_vinfo);
+      vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
     }
 
   /* FORNOW: the vectorizer supports only loops which body consist
@@ -8940,7 +9457,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 
       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
-        {
+       {
          gphi *phi = si.phi ();
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
@@ -8975,6 +9492,27 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
            }
        }
 
+      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
+          gsi_next (&si))
+       {
+         gphi *phi = si.phi ();
+         stmt_info = loop_vinfo->lookup_stmt (phi);
+         if (!stmt_info)
+           continue;
+
+         if (!STMT_VINFO_RELEVANT_P (stmt_info)
+             && !STMT_VINFO_LIVE_P (stmt_info))
+           continue;
+
+         if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
+              || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
+              || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
+              || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
+              || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
+             && ! PURE_SLP_STMT (stmt_info))
+           maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
+       }
+
       for (gimple_stmt_iterator si = gsi_start_bb (bb);
           !gsi_end_p (si);)
        {
@@ -9010,11 +9548,18 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
                        }
                      stmt_vec_info pat_stmt_info
                        = STMT_VINFO_RELATED_STMT (stmt_info);
-                     vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
-                                               &seen_store);
+                     if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
+                                                   &si, &seen_store))
+                       maybe_set_vectorized_backedge_value (loop_vinfo,
+                                                            pat_stmt_info);
+                   }
+                 else
+                   {
+                     if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
+                                                   &seen_store))
+                       maybe_set_vectorized_backedge_value (loop_vinfo,
+                                                            stmt_info);
                    }
-                 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
-                                           &seen_store);
                }
              gsi_next (&si);
              if (seen_store)
@@ -9032,38 +9577,6 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
            }
        }
 
-      /* Fill in backedge defs of reductions.  */
-      for (unsigned i = 0; i < loop_vinfo->reduc_latch_defs.length (); ++i)
-       {
-         stmt_vec_info stmt_info = loop_vinfo->reduc_latch_defs[i];
-         stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
-         vec<gimple *> &phi_info
-           = STMT_VINFO_VEC_STMTS (STMT_VINFO_REDUC_DEF (orig_stmt_info));
-         vec<gimple *> &vec_stmt
-           = STMT_VINFO_VEC_STMTS (stmt_info);
-         gcc_assert (phi_info.length () == vec_stmt.length ());
-         gphi *phi
-           = dyn_cast <gphi *> (STMT_VINFO_REDUC_DEF (orig_stmt_info)->stmt);
-         edge e = loop_latch_edge (gimple_bb (phi_info[0])->loop_father);
-         for (unsigned j = 0; j < phi_info.length (); ++j)
-           add_phi_arg (as_a <gphi *> (phi_info[j]),
-                        gimple_get_lhs (vec_stmt[j]), e,
-                        gimple_phi_arg_location (phi, e->dest_idx));
-       }
-      for (unsigned i = 0; i < loop_vinfo->reduc_latch_slp_defs.length (); ++i)
-       {
-         slp_tree slp_node = loop_vinfo->reduc_latch_slp_defs[i].first;
-         slp_tree phi_node = loop_vinfo->reduc_latch_slp_defs[i].second;
-         gphi *phi = as_a <gphi *> (SLP_TREE_SCALAR_STMTS (phi_node)[0]->stmt);
-         e = loop_latch_edge (gimple_bb (phi)->loop_father);
-         gcc_assert (SLP_TREE_VEC_STMTS (phi_node).length ()
-                     == SLP_TREE_VEC_STMTS (slp_node).length ());
-         for (unsigned j = 0; j < SLP_TREE_VEC_STMTS (phi_node).length (); ++j)
-           add_phi_arg (as_a <gphi *> (SLP_TREE_VEC_STMTS (phi_node)[j]),
-                        vect_get_slp_vect_def (slp_node, j),
-                        e, gimple_phi_arg_location (phi, e->dest_idx));
-       }
-
       /* Stub out scalar statements that must not survive vectorization.
         Doing this here helps with grouped statements, or statements that
         are involved in patterns.  */
@@ -9171,7 +9684,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
      won't work.  */
   slp_instance instance;
   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
-    vect_free_slp_instance (instance, true);
+    vect_free_slp_instance (instance);
   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
   /* Clear-up safelen field since its value is invalid after vectorization
      since vectorized loop can have loop-carried dependencies.  */