diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 3021be3b241adce8695167fbec0d2c374d6916f5..72bbec4b45d225d73eddfaca69468cd23842a42a 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -666,27 +666,50 @@ vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
   unsigned i;
 
   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
-    if (STMT_VINFO_IN_PATTERN_P (first))
-      {
-       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
-       while (next)
-         {
-           if (! STMT_VINFO_IN_PATTERN_P (next)
-               || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
-             break;
-           next = REDUC_GROUP_NEXT_ELEMENT (next);
-         }
-       /* If not all stmt in the chain are patterns or if we failed
-          to update STMT_VINFO_REDUC_IDX try to handle the chain
-          without patterns.  */
-       if (! next
-           && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
-         {
-           vect_fixup_reduc_chain (first);
-           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
-             = STMT_VINFO_RELATED_STMT (first);
-         }
-      }
+    {
+      stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
+      while (next)
+       {
+         if ((STMT_VINFO_IN_PATTERN_P (next)
+              != STMT_VINFO_IN_PATTERN_P (first))
+             || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
+           break;
+         next = REDUC_GROUP_NEXT_ELEMENT (next);
+       }
+      /* If all reduction chain members are well-formed patterns, adjust
+        the group to group the pattern stmts instead.  */
+      if (! next
+         && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
+       {
+         if (STMT_VINFO_IN_PATTERN_P (first))
+           {
+             vect_fixup_reduc_chain (first);
+             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
+               = STMT_VINFO_RELATED_STMT (first);
+           }
+       }
+      /* If not all stmts in the chain are patterns or if we failed
+        to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
+        it as a regular reduction instead.  */
+      else
+       {
+         stmt_vec_info vinfo = first;
+         stmt_vec_info last = NULL;
+         while (vinfo)
+           {
+             next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
+             REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
+             REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
+             last = vinfo;
+             vinfo = next;
+           }
+         STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
+           = vect_internal_def;
+         loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
+         LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
+         --i;
+       }
+    }
 }
 
 /* Function vect_get_loop_niters.
@@ -814,7 +837,7 @@ _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
     vec_outside_cost (0),
     vec_inside_cost (0),
     vectorizable (false),
-    can_use_partial_vectors_p (true),
+    can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
     using_partial_vectors_p (false),
     epil_using_partial_vectors_p (false),
     peeling_for_gaps (false),
@@ -1804,6 +1827,19 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
        }
     }
 
+  /* If using the "very cheap" model, reject cases in which we'd keep
+     a copy of the scalar code (even if we might be able to vectorize it).  */
+  if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
+      && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+         || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+         || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "some scalar iterations would need to be peeled\n");
+      return 0;
+    }
+
   int min_profitable_iters, min_profitable_estimate;
   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
                                      &min_profitable_estimate);
@@ -1862,6 +1898,20 @@ vect_analyze_loop_costing (loop_vec_info loop_vinfo)
       min_profitable_estimate = min_profitable_iters;
     }
 
+  /* If the vector loop needs multiple iterations to be beneficial then
+     things are probably too close to call, and the conservative thing
+     would be to stick with the scalar code.  */
+  if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
+      && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "one iteration of the vector loop would be"
+                        " more expensive than the equivalent number of"
+                        " iterations of the scalar loop\n");
+      return 0;
+    }
+
   HOST_WIDE_INT estimated_niter;
 
   /* If we are vectorizing an epilogue then we know the maximum number of
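
To make the two new very-cheap-model bail-outs above concrete, here is a minimal standalone sketch; it is not GCC code, and the parameter names are hypothetical stand-ins for the loop_vinfo queries used in the patch.

/* Sketch only: mirrors the two VECT_COST_MODEL_VERY_CHEAP rejections added
   above.  "needs_peeling" stands for any of the LOOP_VINFO_PEELING_FOR_*
   conditions being set.  */
static int
very_cheap_model_allows_vectorization (int needs_peeling,
				       int min_profitable_estimate,
				       int vf_for_cost)
{
  if (needs_peeling)
    return 0;	/* A copy of the scalar loop would have to be kept.  */
  if (min_profitable_estimate > vf_for_cost)
    return 0;	/* One vector iteration would not beat VF scalar iterations.  */
  return 1;
}
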
@@ -2003,22 +2053,123 @@ vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
     }
 }
 
+/* Determine if operating on full vectors for LOOP_VINFO might leave
+   some scalar iterations still to do.  If so, decide how we should
+   handle those scalar iterations.  The possibilities are:
 
-/* Decides whether we need to create an epilogue loop to handle
-   remaining scalar iterations and sets PEELING_FOR_NITERS accordingly.  */
+   (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
+       In this case:
 
-void
-determine_peel_for_niter (loop_vec_info loop_vinfo)
+        LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
+        LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
+        LOOP_VINFO_PEELING_FOR_NITER == false
+
+   (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
+       to handle the remaining scalar iterations.  In this case:
+
+        LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
+        LOOP_VINFO_PEELING_FOR_NITER == true
+
+       There are two choices:
+
+       (2a) Consider vectorizing the epilogue loop at the same VF as the
+           main loop, but using partial vectors instead of full vectors.
+           In this case:
+
+             LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
+
+       (2b) Consider vectorizing the epilogue loop at lower VFs only.
+           In this case:
+
+             LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
+
+   When FOR_EPILOGUE_P is true, make this determination based on the
+   assumption that LOOP_VINFO is an epilogue loop, otherwise make it
+   based on the assumption that LOOP_VINFO is the main loop.  The caller
+   has made sure that the number of iterations is set appropriately for
+   this value of FOR_EPILOGUE_P.  */
+
+opt_result
+vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
+                                           bool for_epilogue_p)
 {
-  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
+  /* Determine whether there would be any scalar iterations left over.  */
+  bool need_peeling_or_partial_vectors_p
+    = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
 
-  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
-    /* The main loop handles all iterations.  */
-    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
-  else if (vect_need_peeling_or_partial_vectors_p (loop_vinfo))
-    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
-}
+  /* Decide whether to vectorize the loop with partial vectors.  */
+  LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+  LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && need_peeling_or_partial_vectors_p)
+    {
+      /* For partial-vector-usage=1, try to push the handling of partial
+        vectors to the epilogue, with the main loop continuing to operate
+        on full vectors.
+
+        ??? We could then end up failing to use partial vectors if we
+        decide to peel iterations into a prologue, and if the main loop
+        then ends up processing fewer than VF iterations.  */
+      if (param_vect_partial_vector_usage == 1
+         && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
+         && !vect_known_niters_smaller_than_vf (loop_vinfo))
+       LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+      else
+       LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
+    }
+
+  if (dump_enabled_p ())
+    {
+      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "operating on partial vectors%s.\n",
+                        for_epilogue_p ? " for epilogue loop" : "");
+      else
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "operating only on full vectors%s.\n",
+                        for_epilogue_p ? " for epilogue loop" : "");
+    }
 
+  if (for_epilogue_p)
+    {
+      loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
+      gcc_assert (orig_loop_vinfo);
+      if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+       gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
+                             LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
+    }
+
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
+    {
+      /* Check that the loop processes at least one full vector.  */
+      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+      tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
+      if (known_lt (wi::to_widest (scalar_niters), vf))
+       return opt_result::failure_at (vect_location,
+                                      "loop does not have enough iterations"
+                                      " to support vectorization.\n");
+
+      /* If we need to peel an extra epilogue iteration to handle data
+        accesses with gaps, check that there are enough scalar iterations
+        available.
+
+        The check above is redundant with this one when peeling for gaps,
+        but the distinction is useful for diagnostics.  */
+      tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+         && known_lt (wi::to_widest (scalar_nitersm1), vf))
+       return opt_result::failure_at (vect_location,
+                                      "loop does not have enough iterations"
+                                      " to support peeling for gaps.\n");
+    }
+
+  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
+    = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+       && need_peeling_or_partial_vectors_p);
+
+  return opt_result::success ();
+}
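
As a rough illustration of the outcomes documented in the comment above, the code shapes roughly correspond to the plain-C loops below; this is illustrative only, not vectorizer output, and the saxpy-style functions and names are invented for the example (outcomes (2a) and (2b) differ only in how the epilogue loop is later vectorized, so they share the second shape).

/* (1) Partial vectors: one loop, with the final vector iteration masked,
   assuming VF == 4.  */
void
saxpy_partial (float *x, float *y, float a, int n)
{
  for (int i = 0; i < n; i += 4)
    for (int j = i; j < i + 4; ++j)
      if (j < n)			/* mask for the last, partial vector */
	y[j] += a * x[j];
}

/* (2) Full vectors plus an epilogue for the n % 4 leftover scalars;
   LOOP_VINFO_PEELING_FOR_NITER corresponds to the second loop.  */
void
saxpy_epilogue (float *x, float *y, float a, int n)
{
  int i = 0;
  for (; i + 4 <= n; i += 4)		/* main loop, full vectors only */
    for (int j = i; j < i + 4; ++j)
      y[j] += a * x[j];
  for (; i < n; ++i)			/* scalar (or lower-VF) epilogue */
    y[i] += a * x[i];
}
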
 
 /* Function vect_analyze_loop_2.
 
@@ -2174,6 +2325,9 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
 
       /* Optimize the SLP graph with the vectorization factor fixed.  */
       vect_optimize_slp (loop_vinfo);
+
+      /* Gather the loads reachable from the SLP graph entries.  */
+      vect_gather_slp_loads (loop_vinfo);
     }
 
   bool saved_can_use_partial_vectors_p
@@ -2241,6 +2395,79 @@ start_over:
                                       "unsupported SLP instances\n");
          goto again;
        }
+
+      /* Check whether any load in ALL SLP instances is possibly permuted.  */
+      slp_tree load_node, slp_root;
+      unsigned i, x;
+      slp_instance instance;
+      bool can_use_lanes = true;
+      FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
+       {
+         slp_root = SLP_INSTANCE_TREE (instance);
+         int group_size = SLP_TREE_LANES (slp_root);
+         tree vectype = SLP_TREE_VECTYPE (slp_root);
+         bool loads_permuted = false;
+         FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
+           {
+             if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
+               continue;
+             unsigned j;
+             stmt_vec_info load_info;
+             FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
+               if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
+                 {
+                   loads_permuted = true;
+                   break;
+                 }
+           }
+
+         /* If the loads and stores can be handled with load/store-lane
+            instructions, record it and move on to the next instance.  */
+         if (loads_permuted
+             && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
+             && vect_store_lanes_supported (vectype, group_size, false))
+           {
+             FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
+               {
+                 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
+                     (SLP_TREE_SCALAR_STMTS (load_node)[0]);
+                 /* Use SLP for strided accesses (or if we can't
+                    load-lanes).  */
+                 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
+                     || ! vect_load_lanes_supported
+                           (STMT_VINFO_VECTYPE (stmt_vinfo),
+                            DR_GROUP_SIZE (stmt_vinfo), false))
+                   break;
+               }
+
+             can_use_lanes
+               = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
+
+             if (can_use_lanes && dump_enabled_p ())
+               dump_printf_loc (MSG_NOTE, vect_location,
+                                "SLP instance %p can use load/store-lanes\n",
+                                instance);
+           }
+         else
+           {
+             can_use_lanes = false;
+             break;
+           }
+       }
+
+      /* If all SLP instances can use load/store-lanes, abort SLP and try
+        again with SLP disabled.  */
+      if (can_use_lanes)
+       {
+         ok = opt_result::failure_at (vect_location,
+                                      "Built SLP cancelled: can use "
+                                      "load/store-lanes\n");
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "Built SLP cancelled: all SLP instances support "
+                            "load/store-lanes\n");
+         goto again;
+       }
     }
 
   /* Dissolve SLP-only groups.  */
@@ -2272,72 +2499,32 @@ start_over:
       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
     }
 
-  /* Decide whether to vectorize a loop with partial vectors for
-     this vectorization factor.  */
-  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
-    {
-      /* Don't use partial vectors if we don't need to peel the loop.  */
-      if (param_vect_partial_vector_usage == 0
-         || !vect_need_peeling_or_partial_vectors_p (loop_vinfo))
-       LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
-      else if (vect_verify_full_masking (loop_vinfo)
-              || vect_verify_loop_lens (loop_vinfo))
-       {
-         /* The epilogue and other known niters less than VF
-           cases can still use vector access with length fully.  */
-         if (param_vect_partial_vector_usage == 1
-             && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
-             && !vect_known_niters_smaller_than_vf (loop_vinfo))
-           {
-             LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
-             LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
-           }
-         else
-           LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
-       }
-      else
-       LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
-    }
-  else
-    LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
-
-  if (dump_enabled_p ())
-    {
-      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "operating on partial vectors.\n");
-      else
-       dump_printf_loc (MSG_NOTE, vect_location,
-                        "operating only on full vectors.\n");
-    }
-
-  /* If epilog loop is required because of data accesses with gaps,
-     one additional iteration needs to be peeled.  Check if there is
-     enough iterations for vectorization.  */
-  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
-      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
-    {
-      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-      tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
-
-      if (known_lt (wi::to_widest (scalar_niters), vf))
-       return opt_result::failure_at (vect_location,
-                                      "loop has no enough iterations to"
-                                      " support peeling for gaps.\n");
-    }
+  /* If we still have the option of using partial vectors,
+     check whether we can generate the necessary loop controls.  */
+  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
+      && !vect_verify_full_masking (loop_vinfo)
+      && !vect_verify_loop_lens (loop_vinfo))
+    LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
 
   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
      to be able to handle fewer than VF scalars, or needs to have a lower VF
      than the main loop.  */
   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
-      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
+      && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
       && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
                   LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
     return opt_result::failure_at (vect_location,
                                   "Vectorization factor too high for"
                                   " epilogue loop.\n");
 
+  /* Decide whether this loop_vinfo should use partial vectors or peeling,
+     assuming that the loop will be used as a main loop.  We will redo
+     this analysis later if we instead decide to use the loop as an
+     epilogue loop.  */
+  ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
+  if (!ok)
+    return ok;
+
   /* Check the costings of the loop make vectorizing worthwhile.  */
   res = vect_analyze_loop_costing (loop_vinfo);
   if (res < 0)
@@ -2350,7 +2537,6 @@ start_over:
     return opt_result::failure_at (vect_location,
                                   "Loop costings not worthwhile.\n");
 
-  determine_peel_for_niter (loop_vinfo);
   /* If an epilogue loop is required make sure we can create one.  */
   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
@@ -3170,14 +3356,17 @@ pop:
          fail = true;
          break;
        }
-      /* Check there's only a single stmt the op is used on inside
-         of the loop.  */
+      /* Check there's only a single stmt the op is used on.  For the
+        non-value-changing tail and the last stmt, allow out-of-loop uses.
+        ???  We could relax this and handle arbitrary live stmts by
+        forcing a scalar epilogue, for example.  */
       imm_use_iterator imm_iter;
       gimple *op_use_stmt;
       unsigned cnt = 0;
       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
        if (!is_gimple_debug (op_use_stmt)
-           && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
+           && (*code != ERROR_MARK
+               || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
          {
            /* We want to allow x + x but not x < 1 ? x : 2.  */
            if (is_gimple_assign (op_use_stmt)
@@ -4360,34 +4549,6 @@ vect_model_reduction_cost (loop_vec_info loop_vinfo,
 }
 
 
-/* Function vect_model_induction_cost.
-
-   Models cost for induction operations.  */
-
-static void
-vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
-                          stmt_vector_for_cost *cost_vec)
-{
-  unsigned inside_cost, prologue_cost;
-
-  if (PURE_SLP_STMT (stmt_info))
-    return;
-
-  /* loop cost for vec_loop.  */
-  inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
-                                 stmt_info, 0, vect_body);
-
-  /* prologue cost for vec_init and vec_step.  */
-  prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
-                                   stmt_info, 0, vect_prologue);
-
-  if (dump_enabled_p ())
-    dump_printf_loc (MSG_NOTE, vect_location,
-                     "vect_model_induction_cost: inside_cost = %d, "
-                     "prologue_cost = %d .\n", inside_cost, prologue_cost);
-}
-
-
 
 /* Function get_initial_def_for_reduction
 
@@ -6253,9 +6414,28 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
     {
       if (is_a <gphi *> (stmt_info->stmt))
-       /* Analysis for double-reduction is done on the outer
-          loop PHI, nested cycles have no further restrictions.  */
-       STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
+       {
+         if (slp_node)
+           {
+             /* We eventually need to set a vector type on invariant
+                arguments.  */
+             unsigned j;
+             slp_tree child;
+             FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
+               if (!vect_maybe_update_slp_op_vectype
+                      (child, SLP_TREE_VECTYPE (slp_node)))
+                 {
+                   if (dump_enabled_p ())
+                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                      "incompatible vector types for "
+                                      "invariants\n");
+                   return false;
+                 }
+           }
+         /* Analysis for double-reduction is done on the outer
+            loop PHI, nested cycles have no further restrictions.  */
+         STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
+       }
       else
        STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
       return true;
@@ -6297,12 +6477,16 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
 
   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
-     and compute the reduction chain length.  */
-  tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
-                                         loop_latch_edge (loop));
+     and compute the reduction chain length.  Discover the real
+     reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
+  tree reduc_def
+    = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
+                            loop_latch_edge
+                              (gimple_bb (reduc_def_phi)->loop_father));
   unsigned reduc_chain_length = 0;
   bool only_slp_reduc_chain = true;
   stmt_info = NULL;
+  slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
   while (reduc_def != PHI_RESULT (reduc_def_phi))
     {
       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
@@ -6345,6 +6529,8 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
        stmt_info = vdef;
       reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
       reduc_chain_length++;
+      if (!stmt_info && slp_node)
+       slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
     }
   /* PHIs should not participate in patterns.  */
   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
@@ -6431,17 +6617,6 @@ vectorizable_reduction (loop_vec_info loop_vinfo,
      The last use is the reduction variable.  In case of nested cycle this
      assumption is not true: we use reduc_index to record the index of the
      reduction variable.  */
-  /* ???  To get at invariant/constant uses on the SLP node we have to
-     get to it here, slp_node is still the reduction PHI.  */
-  slp_tree slp_for_stmt_info = NULL;
-  if (slp_node)
-    {
-      slp_for_stmt_info = slp_node_instance->root;
-      /* And then there's reduction chain with a conversion ...  */
-      if (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) != stmt_info)
-       slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
-      gcc_assert (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) == stmt_info);
-    }
   slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
   /* We need to skip an extra operand for COND_EXPRs with embedded
      comparison.  */
@@ -7322,15 +7497,24 @@ vect_transform_cycle_phi (loop_vec_info loop_vinfo,
   if (slp_node)
     {
       vec_initial_defs.reserve (vec_num);
-      gcc_assert (slp_node == slp_node_instance->reduc_phis);
-      stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
-      tree neutral_op
-       = neutral_op_for_slp_reduction (slp_node, vectype_out,
-                                       STMT_VINFO_REDUC_CODE (reduc_info),
-                                       first != NULL);
-      get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
-                                     &vec_initial_defs, vec_num,
-                                     first != NULL, neutral_op);
+      if (nested_cycle)
+       {
+         unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
+         vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
+                            &vec_initial_defs);
+       }
+      else
+       {
+         gcc_assert (slp_node == slp_node_instance->reduc_phis);
+         stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
+         tree neutral_op
+             = neutral_op_for_slp_reduction (slp_node, vectype_out,
+                                             STMT_VINFO_REDUC_CODE (reduc_info),
+                                             first != NULL);
+         get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
+                                         &vec_initial_defs, vec_num,
+                                         first != NULL, neutral_op);
+       }
     }
   else
     {
@@ -7436,6 +7620,17 @@ vectorizable_lc_phi (loop_vec_info loop_vinfo,
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      /* Deal with copies from externs or constants that are disguised
+        as loop-closed PHI nodes (PR97886).  */
+      if (slp_node
+         && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
+                                               SLP_TREE_VECTYPE (slp_node)))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "incompatible vector types for invariants\n");
+         return false;
+       }
       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
       return true;
     }
@@ -7465,6 +7660,81 @@ vectorizable_lc_phi (loop_vec_info loop_vinfo,
   return true;
 }
 
+/* Vectorizes PHIs.  */
+
+bool
+vectorizable_phi (vec_info *,
+                 stmt_vec_info stmt_info, gimple **vec_stmt,
+                 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
+{
+  if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
+    return false;
+
+  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
+    return false;
+
+  tree vectype = SLP_TREE_VECTYPE (slp_node);
+
+  if (!vec_stmt) /* transformation not required.  */
+    {
+      slp_tree child;
+      unsigned i;
+      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
+       if (!child)
+         {
+           if (dump_enabled_p ())
+             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                              "PHI node with unvectorized backedge def\n");
+           return false;
+         }
+       else if (!vect_maybe_update_slp_op_vectype (child, vectype))
+         {
+           if (dump_enabled_p ())
+             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                              "incompatible vector types for invariants\n");
+           return false;
+         }
+      record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
+                       vector_stmt, stmt_info, vectype, 0, vect_body);
+      STMT_VINFO_TYPE (stmt_info) = phi_info_type;
+      return true;
+    }
+
+  tree scalar_dest = gimple_phi_result (stmt_info->stmt);
+  basic_block bb = gimple_bb (stmt_info->stmt);
+  tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
+  auto_vec<gphi *> new_phis;
+  for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
+    {
+      slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
+
+      /* Skip not yet vectorized defs.  */
+      if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
+         && SLP_TREE_VEC_STMTS (child).is_empty ())
+       continue;
+
+      auto_vec<tree> vec_oprnds;
+      vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
+      if (!new_phis.exists ())
+       {
+         new_phis.create (vec_oprnds.length ());
+         for (unsigned j = 0; j < vec_oprnds.length (); j++)
+           {
+             /* Create the vectorized LC PHI node.  */
+             new_phis.quick_push (create_phi_node (vec_dest, bb));
+             SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
+           }
+       }
+      edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
+      for (unsigned j = 0; j < vec_oprnds.length (); j++)
+       add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
+    }
+  /* We should have at least one already vectorized child.  */
+  gcc_assert (new_phis.exists ());
+
+  return true;
+}
+
 
 /* Function vect_min_worthwhile_factor.
 
@@ -7535,7 +7805,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   unsigned i;
   tree expr;
-  gimple_seq stmts;
   gimple_stmt_iterator si;
 
   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
@@ -7575,10 +7844,6 @@ vectorizable_induction (loop_vec_info loop_vinfo,
          return false;
        }
 
-      /* FORNOW: outer loop induction with SLP not supported.  */
-      if (STMT_SLP_TYPE (stmt_info))
-       return false;
-
       exit_phi = NULL;
       latch_e = loop_latch_edge (loop->inner);
       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
@@ -7617,7 +7882,7 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 
   if (slp_node && !nunits.is_constant ())
     {
-      /* The current SLP code creates the initial value element-by-element.  */
+      /* The current SLP code creates the step value element-by-element.  */
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "SLP induction not supported for variable-length"
@@ -7627,9 +7892,50 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 
   if (!vec_stmt) /* transformation not required.  */
     {
+      unsigned inside_cost = 0, prologue_cost = 0;
+      if (slp_node)
+       {
+         /* We eventually need to set a vector type on invariant
+            arguments.  */
+         unsigned j;
+         slp_tree child;
+         FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
+           if (!vect_maybe_update_slp_op_vectype
+               (child, SLP_TREE_VECTYPE (slp_node)))
+             {
+               if (dump_enabled_p ())
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  "incompatible vector types for "
+                                  "invariants\n");
+               return false;
+             }
+         /* loop cost for vec_loop.  */
+         inside_cost
+           = record_stmt_cost (cost_vec,
+                               SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
+                               vector_stmt, stmt_info, 0, vect_body);
+         /* prologue cost for vec_init (if not nested) and step.  */
+         prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
+                                           scalar_to_vec,
+                                           stmt_info, 0, vect_prologue);
+       }
+      else /* if (!slp_node) */
+       {
+         /* loop cost for vec_loop.  */
+         inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
+                                         stmt_info, 0, vect_body);
+         /* prologue cost for vec_init and vec_step.  */
+         prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
+                                           stmt_info, 0, vect_prologue);
+       }
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "vect_model_induction_cost: inside_cost = %d, "
+                        "prologue_cost = %d .\n", inside_cost,
+                        prologue_cost);
+
       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
       DUMP_VECT_SCOPE ("vectorizable_induction");
-      vect_model_induction_cost (stmt_info, ncopies, cost_vec);
       return true;
     }
 
@@ -7648,164 +7954,195 @@ vectorizable_induction (loop_vec_info loop_vinfo,
   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
 
   pe = loop_preheader_edge (iv_loop);
-  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
-                                    loop_preheader_edge (iv_loop));
-
-  stmts = NULL;
-  if (!nested_in_vect_loop)
-    {
-      /* Convert the initial value to the IV update type.  */
-      tree new_type = TREE_TYPE (step_expr);
-      init_expr = gimple_convert (&stmts, new_type, init_expr);
-
-      /* If we are using the loop mask to "peel" for alignment then we need
-        to adjust the start value here.  */
-      tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
-      if (skip_niters != NULL_TREE)
-       {
-         if (FLOAT_TYPE_P (vectype))
-           skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
-                                       skip_niters);
-         else
-           skip_niters = gimple_convert (&stmts, new_type, skip_niters);
-         tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
-                                        skip_niters, step_expr);
-         init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
-                                   init_expr, skip_step);
-       }
-    }
-
-  if (stmts)
-    {
-      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
-      gcc_assert (!new_bb);
-    }
-
   /* Find the first insertion point in the BB.  */
   basic_block bb = gimple_bb (phi);
   si = gsi_after_labels (bb);
 
   /* For SLP induction we have to generate several IVs as for example
-     with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
-     [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
-     [VF*S, VF*S, VF*S, VF*S] for all.  */
+     with group size 3 we need
+       [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
+       [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
   if (slp_node)
     {
       /* Enforced above.  */
       unsigned int const_nunits = nunits.to_constant ();
 
-      /* Generate [VF*S, VF*S, ... ].  */
-      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
+      /* The initial values are vectorized, but any lanes > group_size
+        need adjustment.  */
+      slp_tree init_node
+       = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
+
+      /* Gather steps.  Since we do not vectorize inductions as
+        cycles we have to reconstruct the step from SCEV data.  */
+      unsigned group_size = SLP_TREE_LANES (slp_node);
+      tree *steps = XALLOCAVEC (tree, group_size);
+      tree *inits = XALLOCAVEC (tree, group_size);
+      stmt_vec_info phi_info;
+      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
        {
-         expr = build_int_cst (integer_type_node, vf);
-         expr = fold_convert (TREE_TYPE (step_expr), expr);
+         steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
+         if (!init_node)
+           inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
+                                          pe->dest_idx);
        }
-      else
-       expr = build_int_cst (TREE_TYPE (step_expr), vf);
-      new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
-                             expr, step_expr);
-      if (! CONSTANT_CLASS_P (new_name))
-       new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
-                                    TREE_TYPE (step_expr), NULL);
-      new_vec = build_vector_from_val (step_vectype, new_name);
-      vec_step = vect_init_vector (loop_vinfo, stmt_info,
-                                  new_vec, step_vectype, NULL);
 
       /* Now generate the IVs.  */
-      unsigned group_size = SLP_TREE_LANES (slp_node);
       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
-      unsigned elts = const_nunits * nvects;
-      /* Compute the number of distinct IVs we need.  First reduce
-        group_size if it is a multiple of const_nunits so we get
-        one IV for a group_size of 4 but const_nunits 2.  */
-      unsigned group_sizep = group_size;
-      if (group_sizep % const_nunits == 0)
-       group_sizep = group_sizep / const_nunits;
-      unsigned nivs = least_common_multiple (group_sizep,
-                                            const_nunits) / const_nunits;
-      gcc_assert (elts % group_size == 0);
-      tree elt = init_expr;
+      gcc_assert ((const_nunits * nvects) % group_size == 0);
+      unsigned nivs;
+      if (nested_in_vect_loop)
+       nivs = nvects;
+      else
+       {
+         /* Compute the number of distinct IVs we need.  First reduce
+            group_size if it is a multiple of const_nunits so we get
+            one IV for a group_size of 4 but const_nunits 2.  */
+         unsigned group_sizep = group_size;
+         if (group_sizep % const_nunits == 0)
+           group_sizep = group_sizep / const_nunits;
+         nivs = least_common_multiple (group_sizep,
+                                       const_nunits) / const_nunits;
+       }
+      tree stept = TREE_TYPE (step_vectype);
+      tree lupdate_mul = NULL_TREE;
+      if (!nested_in_vect_loop)
+       {
+         /* The number of iterations covered in one vector iteration.  */
+         unsigned lup_mul = (nvects * const_nunits) / group_size;
+         lupdate_mul
+           = build_vector_from_val (step_vectype,
+                                    SCALAR_FLOAT_TYPE_P (stept)
+                                    ? build_real_from_wide (stept, lup_mul,
+                                                            UNSIGNED)
+                                    : build_int_cstu (stept, lup_mul));
+       }
+      tree peel_mul = NULL_TREE;
+      gimple_seq init_stmts = NULL;
+      if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
+       {
+         if (SCALAR_FLOAT_TYPE_P (stept))
+           peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
+                                    LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
+         else
+           peel_mul = gimple_convert (&init_stmts, stept,
+                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
+         peel_mul = gimple_build_vector_from_val (&init_stmts,
+                                                  step_vectype, peel_mul);
+       }
       unsigned ivn;
+      auto_vec<tree> vec_steps;
       for (ivn = 0; ivn < nivs; ++ivn)
        {
-         tree_vector_builder elts (step_vectype, const_nunits, 1);
-         stmts = NULL;
+         tree_vector_builder step_elts (step_vectype, const_nunits, 1);
+         tree_vector_builder init_elts (vectype, const_nunits, 1);
+         tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
          for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
            {
-             if (ivn*const_nunits + eltn >= group_size
-                 && (ivn * const_nunits + eltn) % group_size == 0)
-               elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
-                                   elt, step_expr);
-             elts.quick_push (elt);
-           }
-         vec_init = gimple_build_vector (&stmts, &elts);
-         vec_init = gimple_convert (&stmts, vectype, vec_init);
-         if (stmts)
-           {
-             new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
-             gcc_assert (!new_bb);
+             /* The scalar steps of the IVs.  */
+             tree elt = steps[(ivn*const_nunits + eltn) % group_size];
+             elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
+             step_elts.quick_push (elt);
+             if (!init_node)
+               {
+                 /* The scalar inits of the IVs if not vectorized.  */
+                 elt = inits[(ivn*const_nunits + eltn) % group_size];
+                 if (!useless_type_conversion_p (TREE_TYPE (vectype),
+                                                 TREE_TYPE (elt)))
+                   elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
+                                       TREE_TYPE (vectype), elt);
+                 init_elts.quick_push (elt);
+               }
+             /* The number of steps to add to the initial values.  */
+             unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
+             mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
+                                  ? build_real_from_wide (stept,
+                                                          mul_elt, UNSIGNED)
+                                  : build_int_cstu (stept, mul_elt));
            }
+         vec_step = gimple_build_vector (&init_stmts, &step_elts);
+         vec_steps.safe_push (vec_step);
+         tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
+         if (peel_mul)
+           step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
+                                    step_mul, peel_mul);
+         if (!init_node)
+           vec_init = gimple_build_vector (&init_stmts, &init_elts);
 
          /* Create the induction-phi that defines the induction-operand.  */
-         vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
+         vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
+                                           "vec_iv_");
          induction_phi = create_phi_node (vec_dest, iv_loop->header);
          induc_def = PHI_RESULT (induction_phi);
 
          /* Create the iv update inside the loop  */
+         tree up = vec_step;
+         if (lupdate_mul)
+           up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+                              vec_step, lupdate_mul);
          gimple_seq stmts = NULL;
          vec_def = gimple_convert (&stmts, step_vectype, induc_def);
          vec_def = gimple_build (&stmts,
-                                 PLUS_EXPR, step_vectype, vec_def, vec_step);
+                                 PLUS_EXPR, step_vectype, vec_def, up);
          vec_def = gimple_convert (&stmts, vectype, vec_def);
          gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
+         add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
+                      UNKNOWN_LOCATION);
+
+         if (init_node)
+           vec_init = vect_get_slp_vect_def (init_node, ivn);
+         if (!nested_in_vect_loop
+             && !integer_zerop (step_mul))
+           {
+             vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
+             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+                                vec_step, step_mul);
+             vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
+                                     vec_def, up);
+             vec_init = gimple_convert (&init_stmts, vectype, vec_def);
+           }
 
          /* Set the arguments of the phi node:  */
          add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
-         add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
-                      UNKNOWN_LOCATION);
 
          SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
        }
-      /* Fill up to the number of vectors we need for the whole group.  */
-      nivs = least_common_multiple (group_size,
-                                   const_nunits) / const_nunits;
-      for (; ivn < nivs; ++ivn)
-       SLP_TREE_VEC_STMTS (slp_node)
-         .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
+      if (!nested_in_vect_loop)
+       {
+         /* Fill up to the number of vectors we need for the whole group.  */
+         nivs = least_common_multiple (group_size,
+                                       const_nunits) / const_nunits;
+         for (; ivn < nivs; ++ivn)
+           {
+             SLP_TREE_VEC_STMTS (slp_node)
+               .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
+             vec_steps.safe_push (vec_steps[0]);
+           }
+       }
 
-      /* Re-use IVs when we can.  */
+      /* Re-use IVs when we can.  We are generating further vector
+        stmts by adding VF' * stride to the IVs generated above.  */
       if (ivn < nvects)
        {
          unsigned vfp
            = least_common_multiple (group_size, const_nunits) / group_size;
-         /* Generate [VF'*S, VF'*S, ... ].  */
-         if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
-           {
-             expr = build_int_cst (integer_type_node, vfp);
-             expr = fold_convert (TREE_TYPE (step_expr), expr);
-           }
-         else
-           expr = build_int_cst (TREE_TYPE (step_expr), vfp);
-         new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
-                                 expr, step_expr);
-         if (! CONSTANT_CLASS_P (new_name))
-           new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
-                                        TREE_TYPE (step_expr), NULL);
-         new_vec = build_vector_from_val (step_vectype, new_name);
-         vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec,
-                                      step_vectype, NULL);
+         tree lupdate_mul
+           = build_vector_from_val (step_vectype,
+                                    SCALAR_FLOAT_TYPE_P (stept)
+                                    ? build_real_from_wide (stept,
+                                                            vfp, UNSIGNED)
+                                    : build_int_cstu (stept, vfp));
          for (; ivn < nvects; ++ivn)
            {
              gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
-             tree def;
-             if (gimple_code (iv) == GIMPLE_PHI)
-               def = gimple_phi_result (iv);
-             else
-               def = gimple_assign_lhs (iv);
+             tree def = gimple_get_lhs (iv);
+             if (ivn < 2*nivs)
+               vec_steps[ivn - nivs]
+                 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
+                                 vec_steps[ivn - nivs], lupdate_mul);
              gimple_seq stmts = NULL;
              def = gimple_convert (&stmts, step_vectype, def);
-             def = gimple_build (&stmts,
-                                 PLUS_EXPR, step_vectype, def, vec_step);
+             def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
+                                 def, vec_steps[ivn % nivs]);
              def = gimple_convert (&stmts, vectype, def);
              if (gimple_code (iv) == GIMPLE_PHI)
                gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
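
The lane layout described in the comment at the top of the SLP branch ("with group size 3 we need [i0, i1, i2, i0 + S0] ...") can be reproduced with a small standalone program.  This is a worked illustration only, with made-up initial values and steps, following the same (lane % group_size, lane / group_size) indexing used by the step_elts/mul_elts builders above.

/* Prints the three vectors of IV lanes for group_size == 3, nunits == 4.  */
#include <stdio.h>

int
main (void)
{
  const int group_size = 3, nunits = 4, nvects = 3;
  const double init[3] = { 10.0, 20.0, 30.0 };	/* i0, i1, i2 (invented)  */
  const double step[3] = { 1.0, 2.0, 3.0 };	/* S0, S1, S2 (invented)  */

  for (int v = 0; v < nvects; ++v)
    {
      printf ("vector %d:", v);
      for (int k = 0; k < nunits; ++k)
	{
	  int lane = v * nunits + k;
	  int iv = lane % group_size;	/* which scalar IV feeds this lane  */
	  int mul = lane / group_size;	/* how many steps to add  */
	  printf (" %g", init[iv] + mul * step[iv]);
	}
      printf ("\n");
    }
  return 0;
}
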
@@ -7819,9 +8156,45 @@ vectorizable_induction (loop_vec_info loop_vinfo,
            }
        }
 
+      new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
+      gcc_assert (!new_bb);
+
       return true;
     }
 
+  init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
+                                    loop_preheader_edge (iv_loop));
+
+  gimple_seq stmts = NULL;
+  if (!nested_in_vect_loop)
+    {
+      /* Convert the initial value to the IV update type.  */
+      tree new_type = TREE_TYPE (step_expr);
+      init_expr = gimple_convert (&stmts, new_type, init_expr);
+
+      /* If we are using the loop mask to "peel" for alignment then we need
+        to adjust the start value here.  */
+      tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
+      if (skip_niters != NULL_TREE)
+       {
+         if (FLOAT_TYPE_P (vectype))
+           skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
+                                       skip_niters);
+         else
+           skip_niters = gimple_convert (&stmts, new_type, skip_niters);
+         tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
+                                        skip_niters, step_expr);
+         init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
+                                   init_expr, skip_step);
+       }
+    }
+
+  if (stmts)
+    {
+      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+      gcc_assert (!new_bb);
+    }
+
   /* Create the vector that holds the initial_value of the induction.  */
   if (nested_in_vect_loop)
     {
@@ -8045,7 +8418,7 @@ vectorizable_live_operation (vec_info *vinfo,
   imm_use_iterator imm_iter;
   tree lhs, lhs_type, bitsize, vec_bitsize;
   tree vectype = (slp_node
-                 ? SLP_TREE_VECTYPE (SLP_TREE_REPRESENTATIVE (slp_node))
+                 ? SLP_TREE_VECTYPE (slp_node)
                  : STMT_VINFO_VECTYPE (stmt_info));
   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
   int ncopies;
@@ -8321,8 +8694,19 @@ vectorizable_live_operation (vec_info *vinfo,
       gimple_seq stmts = NULL;
       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
                                       &stmts, true, NULL_TREE);
-
-      gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+      if (TREE_CODE (new_tree) == SSA_NAME
+         && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
+       SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
+      if (is_a <gphi *> (vec_stmt))
+       {
+         gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
+         gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
+       }
+      else
+       {
+         gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
+         gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
+       }
 
       /* Replace use of lhs with newly computed result.  If the use stmt is a
         single arg PHI, just replace all uses of PHI result.  It's necessary
@@ -8342,11 +8726,16 @@ vectorizable_live_operation (vec_info *vinfo,
               were not code-generated yet so it is not too bad.
               ???  In fact we'd likely want to avoid this situation
               in the first place.  */
-           if (gimple_code (use_stmt) != GIMPLE_PHI
-               && !vect_stmt_dominates_stmt_p (gsi_stmt (*gsi), use_stmt))
+           if (TREE_CODE (new_tree) == SSA_NAME
+               && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
+               && gimple_code (use_stmt) != GIMPLE_PHI
+               && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
+                                               use_stmt))
              {
-               gcc_assert (is_gimple_assign (use_stmt)
-                           && gimple_assign_rhs_code (use_stmt) == CONSTRUCTOR);
+               enum tree_code code = gimple_assign_rhs_code (use_stmt);
+               gcc_assert (code == CONSTRUCTOR
+                           || code == VIEW_CONVERT_EXPR
+                           || CONVERT_EXPR_CODE_P (code));
                if (dump_enabled_p ())
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Using original scalar computation for "
@@ -8354,6 +8743,24 @@ vectorizable_live_operation (vec_info *vinfo,
                                   "def\n");
                continue;
              }
+           /* ???  It can also happen that we end up pulling a def into
+              a loop where replacing out-of-loop uses would require
+              a new LC SSA PHI node.  Retain the original scalar in
+              those cases as well.  PR98064.  */
+           if (TREE_CODE (new_tree) == SSA_NAME
+               && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
+               && (gimple_bb (use_stmt)->loop_father
+                   != gimple_bb (vec_stmt)->loop_father)
+               && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
+                                       gimple_bb (use_stmt)->loop_father))
+             {
+               if (dump_enabled_p ())
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  "Using original scalar computation for "
+                                  "live lane because there is an out-of-loop "
+                                  "definition for it\n");
+               continue;
+             }
            FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
              SET_USE (use_p, new_tree);
            update_stmt (use_stmt);
@@ -8674,7 +9081,7 @@ maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
    stmt_vec_info.  */
 
-static void
+static bool
 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
                          gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
 {
@@ -8690,7 +9097,7 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
 
   if (!STMT_VINFO_RELEVANT_P (stmt_info)
       && !STMT_VINFO_LIVE_P (stmt_info))
-    return;
+    return false;
 
   if (STMT_VINFO_VECTYPE (stmt_info))
     {
@@ -8707,13 +9114,15 @@ vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
   /* Pure SLP statements have already been vectorized.  We still need
      to apply loop vectorization to hybrid SLP statements.  */
   if (PURE_SLP_STMT (stmt_info))
-    return;
+    return false;
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
 
   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
     *seen_store = stmt_info;
+
+  return true;
 }
 
 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
@@ -8762,6 +9171,7 @@ update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
   basic_block *epilogue_bbs = get_loop_body (epilogue);
   unsigned i;
 
+  free (LOOP_VINFO_BBS (epilogue_vinfo));
   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
 
   /* Advance data_reference's with the number of iterations of the previous
@@ -9023,8 +9433,7 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
 
   split_edge (loop_preheader_edge (loop));
 
-  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
-      && vect_use_loop_mask_for_alignment_p (loop_vinfo))
+  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
     /* This will deal with any possible peeling.  */
     vect_prepare_for_masked_peels (loop_vinfo);
 
@@ -9139,17 +9548,17 @@ vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
                        }
                      stmt_vec_info pat_stmt_info
                        = STMT_VINFO_RELATED_STMT (stmt_info);
-                     vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
-                                               &seen_store);
-                     maybe_set_vectorized_backedge_value (loop_vinfo,
-                                                          pat_stmt_info);
+                     if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
+                                                   &si, &seen_store))
+                       maybe_set_vectorized_backedge_value (loop_vinfo,
+                                                            pat_stmt_info);
                    }
                  else
                    {
-                     vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
-                                               &seen_store);
-                     maybe_set_vectorized_backedge_value (loop_vinfo,
-                                                          stmt_info);
+                     if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
+                                                   &seen_store))
+                       maybe_set_vectorized_backedge_value (loop_vinfo,
+                                                            stmt_info);
                    }
                }
              gsi_next (&si);