diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index cd8c3afb1c7618afd1a142680ca24c8c77cd14b8..70405d86f3493a1ddf320c21e8e9857f6015d566 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1,6 +1,5 @@
 /* Loop Vectorization
-   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
-   Free Software Foundation, Inc.
+   Copyright (C) 2003-2014 Free Software Foundation, Inc.
    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
    Ira Rosen <irar@il.ibm.com>
 
@@ -23,16 +22,44 @@ along with GCC; see the file COPYING3.  If not see
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
+#include "dumpfile.h"
 #include "tm.h"
-#include "ggc.h"
 #include "tree.h"
+#include "stor-layout.h"
+#include "predict.h"
+#include "vec.h"
+#include "hashtab.h"
+#include "hash-set.h"
+#include "machmode.h"
+#include "hard-reg-set.h"
+#include "input.h"
+#include "function.h"
+#include "dominance.h"
+#include "cfg.h"
+#include "cfganal.h"
 #include "basic-block.h"
 #include "gimple-pretty-print.h"
-#include "tree-flow.h"
+#include "tree-ssa-alias.h"
+#include "internal-fn.h"
+#include "gimple-expr.h"
+#include "is-a.h"
+#include "gimple.h"
+#include "gimplify.h"
+#include "gimple-iterator.h"
+#include "gimplify-me.h"
+#include "gimple-ssa.h"
+#include "tree-phinodes.h"
+#include "ssa-iterators.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
+#include "tree-ssa-loop-ivopts.h"
+#include "tree-ssa-loop-manip.h"
+#include "tree-ssa-loop-niter.h"
 #include "tree-pass.h"
 #include "cfgloop.h"
 #include "expr.h"
 #include "recog.h"
+#include "insn-codes.h"
 #include "optabs.h"
 #include "params.h"
 #include "diagnostic-core.h"
@@ -139,6 +166,8 @@ along with GCC; see the file COPYING3.  If not see
    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 */
 
+static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
+
 /* Function vect_determine_vectorization_factor
 
    Determine the vectorization factor (VF).  VF is the number of data elements
@@ -184,8 +213,9 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
   gimple_stmt_iterator pattern_def_si = gsi_none ();
   bool analyze_pattern_stmt = false;
 
-  if (vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                     "=== vect_determine_vectorization_factor ===\n");
 
   for (i = 0; i < nbbs; i++)
     {
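
As the function comment above says, VF is the number of scalar data elements processed per vector iteration, so it falls out of the target's vector width divided by the size of the narrowest scalar type used in the loop. A minimal standalone sketch, assuming a hypothetical 16-byte vector unit (not part of the patch):

    #include <stdio.h>

    /* VF = number of elements of the narrowest scalar type that fit
       in one vector register.  */
    static int
    sketch_vf (int vector_bytes, int smallest_scalar_bytes)
    {
      return vector_bytes / smallest_scalar_bytes;
    }

    int
    main (void)
    {
      /* 2-byte shorts in a 16-byte vector: VF = 8.  */
      printf ("VF = %d\n", sketch_vf (16, 2));
      return 0;
    }
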
@@ -195,10 +225,11 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
        {
          phi = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (phi);
-         if (vect_print_dump_info (REPORT_DETAILS))
+         if (dump_enabled_p ())
            {
-             fprintf (vect_dump, "==> examining phi: ");
-             print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
+             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
+             dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
+             dump_printf (MSG_NOTE, "\n");
            }
 
          gcc_assert (stmt_info);
@@ -208,34 +239,41 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 
-             if (vect_print_dump_info (REPORT_DETAILS))
+             if (dump_enabled_p ())
                {
-                 fprintf (vect_dump, "get vectype for scalar type:  ");
-                 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
+                 dump_printf_loc (MSG_NOTE, vect_location,
+                                   "get vectype for scalar type:  ");
+                 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
+                  dump_printf (MSG_NOTE, "\n");
                }
 
              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
-                 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
+                 if (dump_enabled_p ())
                    {
-                     fprintf (vect_dump,
-                              "not vectorized: unsupported data-type ");
-                     print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
+                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                       "not vectorized: unsupported "
+                                       "data-type ");
+                     dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                                         scalar_type);
+                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;
 
-             if (vect_print_dump_info (REPORT_DETAILS))
+             if (dump_enabled_p ())
                {
-                 fprintf (vect_dump, "vectype: ");
-                 print_generic_expr (vect_dump, vectype, TDF_SLIM);
+                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
+                 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
+                  dump_printf (MSG_NOTE, "\n");
                }
 
              nunits = TYPE_VECTOR_SUBPARTS (vectype);
-             if (vect_print_dump_info (REPORT_DETAILS))
-               fprintf (vect_dump, "nunits = %d", nunits);
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
+                                 nunits);
 
              if (!vectorization_factor
                  || (nunits > vectorization_factor))
@@ -254,17 +292,20 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 
           stmt_info = vinfo_for_stmt (stmt);
 
-         if (vect_print_dump_info (REPORT_DETAILS))
+         if (dump_enabled_p ())
            {
-             fprintf (vect_dump, "==> examining statement: ");
-             print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+             dump_printf_loc (MSG_NOTE, vect_location,
+                               "==> examining statement: ");
+             dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+              dump_printf (MSG_NOTE, "\n");
            }
 
          gcc_assert (stmt_info);
 
          /* Skip stmts which do not need to be vectorized.  */
-         if (!STMT_VINFO_RELEVANT_P (stmt_info)
-             && !STMT_VINFO_LIVE_P (stmt_info))
+         if ((!STMT_VINFO_RELEVANT_P (stmt_info)
+              && !STMT_VINFO_LIVE_P (stmt_info))
+             || gimple_clobber_p (stmt))
             {
               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
@@ -273,16 +314,18 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                 {
                   stmt = pattern_stmt;
                   stmt_info = vinfo_for_stmt (pattern_stmt);
-                  if (vect_print_dump_info (REPORT_DETAILS))
+                  if (dump_enabled_p ())
                     {
-                      fprintf (vect_dump, "==> examining pattern statement: ");
-                      print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+                      dump_printf_loc (MSG_NOTE, vect_location,
+                                       "==> examining pattern statement: ");
+                      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+                      dump_printf (MSG_NOTE, "\n");
                     }
                 }
               else
                {
-                 if (vect_print_dump_info (REPORT_DETAILS))
-                   fprintf (vect_dump, "skip.");
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
                   gsi_next (&si);
                  continue;
                 }
@@ -321,12 +364,13 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 
                  if (!gsi_end_p (pattern_def_si))
                    {
-                     if (vect_print_dump_info (REPORT_DETAILS))
+                     if (dump_enabled_p ())
                        {
-                         fprintf (vect_dump,
-                                  "==> examining pattern def stmt: ");
-                         print_gimple_stmt (vect_dump, pattern_def_stmt, 0,
-                                            TDF_SLIM);
+                         dump_printf_loc (MSG_NOTE, vect_location,
+                                           "==> examining pattern def stmt: ");
+                         dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
+                                            pattern_def_stmt, 0);
+                          dump_printf (MSG_NOTE, "\n");
                        }
 
                      stmt = pattern_def_stmt;
@@ -342,22 +386,44 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                analyze_pattern_stmt = false;
            }
 
-         if (gimple_get_lhs (stmt) == NULL_TREE)
+         if (gimple_get_lhs (stmt) == NULL_TREE
+             /* MASK_STORE has no lhs, but is ok.  */
+             && (!is_gimple_call (stmt)
+                 || !gimple_call_internal_p (stmt)
+                 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
            {
-             if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
+             if (is_gimple_call (stmt))
+               {
+                 /* Ignore calls with no lhs.  These must be calls to
+                    #pragma omp simd functions, and what vectorization factor
+                    it really needs can't be determined until
+                    vectorizable_simd_clone_call.  */
+                 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
+                   {
+                     pattern_def_seq = NULL;
+                     gsi_next (&si);
+                   }
+                 continue;
+               }
+             if (dump_enabled_p ())
                {
-                 fprintf (vect_dump, "not vectorized: irregular stmt.");
-                 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                   "not vectorized: irregular stmt.");
+                 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
+                                    0);
+                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }
 
          if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
            {
-             if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
+             if (dump_enabled_p ())
                {
-                 fprintf (vect_dump, "not vectorized: vector stmt in loop:");
-                 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                   "not vectorized: vector stmt in loop:");
+                 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }
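
For context on the IFN_MASK_STORE special case introduced above: a conditional store like the one below can be vectorized as an internal MASK_STORE call, which has no lhs; the stored value is the call's argument at index 3, which is where the next hunk reads the scalar type from. Illustrative source only, not taken from the patch:

    /* The if-guarded store becomes, roughly,
         MASK_STORE (&a[i], align, mask, value);
       in gimple, with the comparison providing the mask.  */
    void
    cond_store (int *a, const int *b, int n)
    {
      for (int i = 0; i < n; i++)
        if (b[i] > 0)       /* becomes the vector mask */
          a[i] = b[i] + 1;  /* becomes the masked store value */
    }
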
@@ -376,25 +442,42 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
          else
            {
              gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
-             scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
-             if (vect_print_dump_info (REPORT_DETAILS))
+             if (is_gimple_call (stmt)
+                 && gimple_call_internal_p (stmt)
+                 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
+               scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
+             else
+               scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
+             if (dump_enabled_p ())
                {
-                 fprintf (vect_dump, "get vectype for scalar type:  ");
-                 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
+                 dump_printf_loc (MSG_NOTE, vect_location,
+                                   "get vectype for scalar type:  ");
+                 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
+                  dump_printf (MSG_NOTE, "\n");
                }
              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
-                 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
+                 if (dump_enabled_p ())
                    {
-                     fprintf (vect_dump,
-                              "not vectorized: unsupported data-type ");
-                     print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
+                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                       "not vectorized: unsupported "
+                                       "data-type ");
+                     dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                                         scalar_type);
+                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
 
              STMT_VINFO_VECTYPE (stmt_info) = vectype;
+
+             if (dump_enabled_p ())
+               {
+                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
+                 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
+                  dump_printf (MSG_NOTE, "\n");
+               }
             }
 
          /* The vectorization factor is according to the smallest
@@ -402,19 +485,23 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
             support one vector size per loop).  */
          scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
                                                       &dummy);
-         if (vect_print_dump_info (REPORT_DETAILS))
+         if (dump_enabled_p ())
            {
-             fprintf (vect_dump, "get vectype for scalar type:  ");
-             print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
+             dump_printf_loc (MSG_NOTE, vect_location,
+                               "get vectype for scalar type:  ");
+             dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
+              dump_printf (MSG_NOTE, "\n");
            }
          vf_vectype = get_vectype_for_scalar_type (scalar_type);
          if (!vf_vectype)
            {
-             if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
+             if (dump_enabled_p ())
                {
-                 fprintf (vect_dump,
-                          "not vectorized: unsupported data-type ");
-                 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                   "not vectorized: unsupported data-type ");
+                 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                                     scalar_type);
+                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }
@@ -422,28 +509,31 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
          if ((GET_MODE_SIZE (TYPE_MODE (vectype))
               != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
            {
-             if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
+             if (dump_enabled_p ())
                {
-                 fprintf (vect_dump,
-                          "not vectorized: different sized vector "
-                          "types in statement, ");
-                 print_generic_expr (vect_dump, vectype, TDF_SLIM);
-                 fprintf (vect_dump, " and ");
-                 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                   "not vectorized: different sized vector "
+                                   "types in statement, ");
+                 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                                     vectype);
+                 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
+                 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                                     vf_vectype);
+                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }
 
-         if (vect_print_dump_info (REPORT_DETAILS))
+         if (dump_enabled_p ())
            {
-             fprintf (vect_dump, "vectype: ");
-             print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
+             dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
+             dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
+              dump_printf (MSG_NOTE, "\n");
            }
 
          nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
-         if (vect_print_dump_info (REPORT_DETAILS))
-           fprintf (vect_dump, "nunits = %d", nunits);
-
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
          if (!vectorization_factor
              || (nunits > vectorization_factor))
            vectorization_factor = nunits;
@@ -457,12 +547,14 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
     }
 
   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
-  if (vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
+                     vectorization_factor);
   if (vectorization_factor <= 1)
     {
-      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
-        fprintf (vect_dump, "not vectorized: unsupported data-type");
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                         "not vectorized: unsupported data-type\n");
       return false;
     }
   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
@@ -474,7 +566,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 /* Function vect_is_simple_iv_evolution.
 
    FORNOW: A simple evolution of an induction variable in the loop is
-   considered a polynomial evolution with constant step.  */
+   considered a polynomial evolution.  */
 
 static bool
 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
@@ -483,6 +575,7 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
   tree init_expr;
   tree step_expr;
   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
+  basic_block bb;
 
   /* When there is no evolution in this loop, the evolution function
      is not "simple".  */
@@ -497,21 +590,31 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
   step_expr = evolution_part;
   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 
-  if (vect_print_dump_info (REPORT_DETAILS))
+  if (dump_enabled_p ())
     {
-      fprintf (vect_dump, "step: ");
-      print_generic_expr (vect_dump, step_expr, TDF_SLIM);
-      fprintf (vect_dump, ",  init: ");
-      print_generic_expr (vect_dump, init_expr, TDF_SLIM);
+      dump_printf_loc (MSG_NOTE, vect_location, "step: ");
+      dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
+      dump_printf (MSG_NOTE, ",  init: ");
+      dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
+      dump_printf (MSG_NOTE, "\n");
     }
 
   *init = init_expr;
   *step = step_expr;
 
-  if (TREE_CODE (step_expr) != INTEGER_CST)
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "step unknown.");
+  if (TREE_CODE (step_expr) != INTEGER_CST
+      && (TREE_CODE (step_expr) != SSA_NAME
+         || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
+             && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
+         || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
+             && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
+                 || !flag_associative_math)))
+      && (TREE_CODE (step_expr) != REAL_CST
+         || !flag_associative_math))
+    {
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                         "step unknown.\n");
       return false;
     }
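
The widened test above accepts, besides literal integer steps, a step that is an SSA name defined outside the loop (unconditionally for integral steps; for floating-point steps, or literal real steps, only under -fassociative-math). A loop that now qualifies, assuming the step really is loop-invariant (illustrative, not from the patch):

    void
    strided_fill (int *a, int n, int step)
    {
      int x = 0;
      for (int i = 0; i < n; i++)
        {
          a[i] = x;
          x += step;  /* step is an SSA_NAME defined outside the loop */
        }
    }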
 
@@ -529,13 +632,14 @@ static void
 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 {
   basic_block bb = loop->header;
-  tree dumy;
-  VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
+  tree init, step;
+  auto_vec<gimple, 64> worklist;
   gimple_stmt_iterator gsi;
   bool double_reduc;
 
-  if (vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                     "=== vect_analyze_scalar_cycles ===\n");
 
   /* First - identify all inductions.  Reduction detection assumes that all the
      inductions have been identified, therefore, this order must not be
@@ -547,15 +651,16 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
       tree def = PHI_RESULT (phi);
       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 
-      if (vect_print_dump_info (REPORT_DETAILS))
+      if (dump_enabled_p ())
        {
-         fprintf (vect_dump, "Analyze phi: ");
-         print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
+         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
+         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
+          dump_printf (MSG_NOTE, "\n");
        }
 
       /* Skip virtual phi's.  The data dependences that are associated with
          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
-      if (!is_gimple_reg (def))
+      if (virtual_operand_p (def))
        continue;
 
       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
@@ -565,47 +670,52 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
       if (access_fn)
        {
          STRIP_NOPS (access_fn);
-         if (vect_print_dump_info (REPORT_DETAILS))
+         if (dump_enabled_p ())
            {
-             fprintf (vect_dump, "Access function of PHI: ");
-             print_generic_expr (vect_dump, access_fn, TDF_SLIM);
+             dump_printf_loc (MSG_NOTE, vect_location,
+                               "Access function of PHI: ");
+             dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
+              dump_printf (MSG_NOTE, "\n");
            }
          STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
            = evolution_part_in_loop_num (access_fn, loop->num);
        }
 
       if (!access_fn
-         || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
+         || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
+         || (LOOP_VINFO_LOOP (loop_vinfo) != loop
+             && TREE_CODE (step) != INTEGER_CST))
        {
-         VEC_safe_push (gimple, heap, worklist, phi);
+         worklist.safe_push (phi);
          continue;
        }
 
       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "Detected induction.");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
     }
 
 
   /* Second - identify all reductions and nested cycles.  */
-  while (VEC_length (gimple, worklist) > 0)
+  while (worklist.length () > 0)
     {
-      gimple phi = VEC_pop (gimple, worklist);
+      gimple phi = worklist.pop ();
       tree def = PHI_RESULT (phi);
       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
       gimple reduc_stmt;
       bool nested_cycle;
 
-      if (vect_print_dump_info (REPORT_DETAILS))
+      if (dump_enabled_p ())
         {
-          fprintf (vect_dump, "Analyze phi: ");
-          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
+          dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
+          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
+          dump_printf (MSG_NOTE, "\n");
         }
 
-      gcc_assert (is_gimple_reg (def));
-      gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
+      gcc_assert (!virtual_operand_p (def)
+                 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 
       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
@@ -614,8 +724,9 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
         {
           if (double_reduc)
             {
-              if (vect_print_dump_info (REPORT_DETAILS))
-                fprintf (vect_dump, "Detected double reduction.");
+              if (dump_enabled_p ())
+                dump_printf_loc (MSG_NOTE, vect_location,
+                                "Detected double reduction.\n");
 
               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
@@ -625,8 +736,9 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
             {
               if (nested_cycle)
                 {
-                  if (vect_print_dump_info (REPORT_DETAILS))
-                    fprintf (vect_dump, "Detected vectorizable nested cycle.");
+                  if (dump_enabled_p ())
+                    dump_printf_loc (MSG_NOTE, vect_location,
+                                    "Detected vectorizable nested cycle.\n");
 
                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
@@ -634,26 +746,24 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
                 }
               else
                 {
-                  if (vect_print_dump_info (REPORT_DETAILS))
-                    fprintf (vect_dump, "Detected reduction.");
+                  if (dump_enabled_p ())
+                    dump_printf_loc (MSG_NOTE, vect_location,
+                                    "Detected reduction.\n");
 
                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                                                            vect_reduction_def;
                   /* Store the reduction cycles for possible vectorization in
                      loop-aware SLP.  */
-                  VEC_safe_push (gimple, heap,
-                                 LOOP_VINFO_REDUCTIONS (loop_vinfo),
-                                 reduc_stmt);
+                  LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
                 }
             }
         }
       else
-        if (vect_print_dump_info (REPORT_DETAILS))
-          fprintf (vect_dump, "Unknown def-use cycle pattern.");
+        if (dump_enabled_p ())
+          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                          "Unknown def-use cycle pattern.\n");
     }
-
-  VEC_free (gimple, heap, worklist);
 }
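
The worklist changes in this function are part of GCC's transition from the VEC (T, heap) macros to the C++ vec<> / auto_vec<> API: safe_push, pop and length become member calls, and auto_vec releases its own storage, which is why the trailing VEC_free disappears. A sketch of the ownership pattern with std::vector standing in for auto_vec (the real type lives in GCC's vec.h; this is only an analogy):

    #include <cstdio>
    #include <vector>

    static void
    worklist_sketch ()
    {
      std::vector<int> worklist;       /* auto_vec<gimple, 64> in the patch */
      worklist.push_back (1);          /* worklist.safe_push (phi) */
      while (worklist.size () > 0)     /* worklist.length () > 0 */
        {
          int phi = worklist.back ();  /* worklist.pop () also returns it */
          worklist.pop_back ();
          std::printf ("%d\n", phi);
        }
      /* Storage released by the destructor: no explicit VEC_free.  */
    }

    int
    main ()
    {
      worklist_sketch ();
      return 0;
    }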
 
 
@@ -698,34 +808,36 @@ vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 }
 
+
 /* Function vect_get_loop_niters.
 
-   Determine how many iterations the loop is executed.
-   If an expression that represents the number of iterations
-   can be constructed, place it in NUMBER_OF_ITERATIONS.
+   Determine how many iterations the loop is executed and place it
+   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
+   in NUMBER_OF_ITERATIONSM1.
+
    Return the loop exit condition.  */
 
 static gimple
-vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
+vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
+                     tree *number_of_iterationsm1)
 {
   tree niters;
 
-  if (vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "=== get_loop_niters ===");
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "=== get_loop_niters ===\n");
 
-  niters = number_of_exit_cond_executions (loop);
+  niters = number_of_latch_executions (loop);
+  *number_of_iterationsm1 = niters;
 
-  if (niters != NULL_TREE
-      && niters != chrec_dont_know)
-    {
-      *number_of_iterations = niters;
-
-      if (vect_print_dump_info (REPORT_DETAILS))
-        {
-          fprintf (vect_dump, "==> get_loop_niters:" );
-          print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
-        }
-    }
+  /* We want the number of loop header executions which is the number
+     of latch executions plus one.
+     ???  For UINT_MAX latch executions this number overflows to zero
+     for loops like do { n++; } while (n != 0);  */
+  if (niters && !chrec_contains_undetermined (niters))
+    niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
+                         build_int_cst (TREE_TYPE (niters), 1));
+  *number_of_iterations = niters;
 
   return get_loop_exit_condition (loop);
 }
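
A worked illustration of the distinction the new code introduces: NITERSM1 is the number of latch executions, and the loop header executes NITERSM1 + 1 times. As the ??? comment warns, the + 1 wraps to zero when the latch runs UINT_MAX times, e.g. do { n++; } while (n != 0) entered with n == 0. Standalone sketch, not from the patch:

    #include <limits.h>
    #include <stdio.h>

    int
    main (void)
    {
      unsigned latch = 7;                              /* NITERSM1 */
      printf ("header executions = %u\n", latch + 1);  /* NITERS = 8 */

      /* The overflow corner case from the ??? comment.  */
      latch = UINT_MAX;
      printf ("header executions = %u\n", latch + 1U); /* wraps to 0 */
      return 0;
    }
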
@@ -829,30 +941,30 @@ new_loop_vec_info (struct loop *loop)
    gcc_assert (nbbs == loop->num_nodes);
 
   LOOP_VINFO_BBS (res) = bbs;
+  LOOP_VINFO_NITERSM1 (res) = NULL;
   LOOP_VINFO_NITERS (res) = NULL;
   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
+  LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
-  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
+  LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
   LOOP_VINFO_VECT_FACTOR (res) = 0;
-  LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
-  LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
-  LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
+  LOOP_VINFO_LOOP_NEST (res).create (3);
+  LOOP_VINFO_DATAREFS (res).create (10);
+  LOOP_VINFO_DDRS (res).create (10 * 10);
   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
-  LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
-    VEC_alloc (gimple, heap,
-               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
-  LOOP_VINFO_MAY_ALIAS_DDRS (res) =
-    VEC_alloc (ddr_p, heap,
-               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
-  LOOP_VINFO_GROUPED_STORES (res) = VEC_alloc (gimple, heap, 10);
-  LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
-  LOOP_VINFO_REDUCTION_CHAINS (res) = VEC_alloc (gimple, heap, 10);
-  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
+  LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
+            PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
+  LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
+            PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
+  LOOP_VINFO_GROUPED_STORES (res).create (10);
+  LOOP_VINFO_REDUCTIONS (res).create (10);
+  LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
+  LOOP_VINFO_SLP_INSTANCES (res).create (10);
   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
-  LOOP_VINFO_PEELING_HTAB (res) = NULL;
   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
+  LOOP_VINFO_PEELING_FOR_NITER (res) = false;
   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
 
   return res;
@@ -872,7 +984,7 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
   int nbbs;
   gimple_stmt_iterator si;
   int j;
-  VEC (slp_instance, heap) *slp_instances;
+  vec<slp_instance> slp_instances;
   slp_instance instance;
   bool swapped;
 
@@ -882,23 +994,9 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
   loop = LOOP_VINFO_LOOP (loop_vinfo);
 
   bbs = LOOP_VINFO_BBS (loop_vinfo);
-  nbbs = loop->num_nodes;
+  nbbs = clean_stmts ? loop->num_nodes : 0;
   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
 
-  if (!clean_stmts)
-    {
-      free (LOOP_VINFO_BBS (loop_vinfo));
-      free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
-      free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
-      VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
-      VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
-      VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
-
-      free (loop_vinfo);
-      loop->aux = NULL;
-      return;
-    }
-
   for (j = 0; j < nbbs; j++)
     {
       basic_block bb = bbs[j];
@@ -919,9 +1017,9 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
                   || code == POINTER_PLUS_EXPR
                   || code == MULT_EXPR)
                  && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
-               swap_tree_operands (stmt,
-                                   gimple_assign_rhs1_ptr (stmt),
-                                   gimple_assign_rhs2_ptr (stmt));
+               swap_ssa_operands (stmt,
+                                  gimple_assign_rhs1_ptr (stmt),
+                                  gimple_assign_rhs2_ptr (stmt));
            }
 
          /* Free stmt_vec_info.  */
@@ -931,22 +1029,22 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
     }
 
   free (LOOP_VINFO_BBS (loop_vinfo));
-  free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
+  vect_destroy_datarefs (loop_vinfo, NULL);
   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
-  VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
-  VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
-  VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
+  LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
+  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
+  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
-  FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
+  FOR_EACH_VEC_ELT (slp_instances, j, instance)
     vect_free_slp_instance (instance);
 
-  VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
-  VEC_free (gimple, heap, LOOP_VINFO_GROUPED_STORES (loop_vinfo));
-  VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
-  VEC_free (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo));
+  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
+  LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
+  LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
+  LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
 
-  if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
-    htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
+  delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
+  LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
 
   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
 
@@ -968,16 +1066,18 @@ vect_analyze_loop_1 (struct loop *loop)
 {
   loop_vec_info loop_vinfo;
 
-  if (vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "===== analyze_loop_nest_1 =====\n");
 
   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */
 
   loop_vinfo = vect_analyze_loop_form (loop);
   if (!loop_vinfo)
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "bad inner-loop form.");
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "bad inner-loop form.\n");
       return NULL;
     }
 
@@ -998,11 +1098,12 @@ vect_analyze_loop_form (struct loop *loop)
 {
   loop_vec_info loop_vinfo;
   gimple loop_cond;
-  tree number_of_iterations = NULL;
+  tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
   loop_vec_info inner_loop_vinfo = NULL;
 
-  if (vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "=== vect_analyze_loop_form ===");
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "=== vect_analyze_loop_form ===\n");
 
   /* Different restrictions apply when we are considering an inner-most loop,
      vs. an outer (nested) loop.
@@ -1024,17 +1125,19 @@ vect_analyze_loop_form (struct loop *loop)
 
       if (loop->num_nodes != 2)
         {
-          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
-            fprintf (vect_dump, "not vectorized: control flow in loop.");
+          if (dump_enabled_p ())
+            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: control flow in loop.\n");
           return NULL;
         }
 
       if (empty_block_p (loop->header))
-    {
-          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
-            fprintf (vect_dump, "not vectorized: empty loop.");
-      return NULL;
-    }
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: empty loop.\n");
+         return NULL;
+       }
     }
   else
     {
@@ -1060,8 +1163,9 @@ vect_analyze_loop_form (struct loop *loop)
 
       if ((loop->inner)->inner || (loop->inner)->next)
        {
-         if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
-           fprintf (vect_dump, "not vectorized: multiple nested loops.");
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: multiple nested loops.\n");
          return NULL;
        }
 
@@ -1069,25 +1173,28 @@ vect_analyze_loop_form (struct loop *loop)
       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
       if (!inner_loop_vinfo)
        {
-         if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
-            fprintf (vect_dump, "not vectorized: Bad inner loop.");
+         if (dump_enabled_p ())
+            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: Bad inner loop.\n");
          return NULL;
        }
 
       if (!expr_invariant_in_loop_p (loop,
                                        LOOP_VINFO_NITERS (inner_loop_vinfo)))
        {
-         if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
-           fprintf (vect_dump,
-                    "not vectorized: inner-loop count not invariant.");
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: inner-loop count not"
+                             " invariant.\n");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
 
       if (loop->num_nodes != 5)
         {
-         if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
-           fprintf (vect_dump, "not vectorized: control flow in loop.");
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: control flow in loop.\n");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
         }
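
The outer-loop checks above (a single nested loop, an inner iteration count invariant in the outer loop, the expected 5-block shape) allow forms like this one, where the outer loop is the vectorization candidate. Illustrative source, not from the patch:

    /* Outer-loop vectorization candidate: exactly one inner loop, and
       the inner iteration count m does not change inside the outer loop.  */
    void
    outer_candidate (float *a, int n, int m)
    {
      for (int i = 0; i < n; i++)    /* outer loop: considered for vectorization */
        for (int j = 0; j < m; j++)  /* inner loop: invariant niters */
          a[i] += 1.0f;
    }
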
@@ -1101,25 +1208,29 @@ vect_analyze_loop_form (struct loop *loop)
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
        {
-         if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
-           fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: unsupported outerloop form.\n");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
 
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "Considering outer-loop vectorization.");
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_NOTE, vect_location,
+                        "Considering outer-loop vectorization.\n");
     }
 
   if (!single_exit (loop)
       || EDGE_COUNT (loop->header->preds) != 2)
     {
-      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
+      if (dump_enabled_p ())
         {
           if (!single_exit (loop))
-            fprintf (vect_dump, "not vectorized: multiple exits.");
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: multiple exits.\n");
           else if (EDGE_COUNT (loop->header->preds) != 2)
-            fprintf (vect_dump, "not vectorized: too many incoming edges.");
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: too many incoming edges.\n");
         }
       if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
@@ -1131,10 +1242,11 @@ vect_analyze_loop_form (struct loop *loop)
      before the loop if needed), where the loop header contains all the
      executable statements, and the latch is empty.  */
   if (!empty_block_p (loop->latch)
-        || !gimple_seq_empty_p (phi_nodes (loop->latch)))
+      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
     {
-      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
-        fprintf (vect_dump, "not vectorized: unexpected loop form.");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: latch block not empty.\n");
       if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
       return NULL;
@@ -1147,68 +1259,69 @@ vect_analyze_loop_form (struct loop *loop)
       if (!(e->flags & EDGE_ABNORMAL))
        {
          split_loop_exit_edge (e);
-         if (vect_print_dump_info (REPORT_DETAILS))
-           fprintf (vect_dump, "split exit edge.");
+         if (dump_enabled_p ())
+           dump_printf (MSG_NOTE, "split exit edge.\n");
        }
       else
        {
-         if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
-           fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: abnormal loop exit edge.\n");
          if (inner_loop_vinfo)
            destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
     }
 
-  loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
+  loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
+                                   &number_of_iterationsm1);
   if (!loop_cond)
     {
-      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
-       fprintf (vect_dump, "not vectorized: complicated exit condition.");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: complicated exit condition.\n");
       if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
       return NULL;
     }
 
-  if (!number_of_iterations)
+  if (!number_of_iterations
+      || chrec_contains_undetermined (number_of_iterations))
     {
-      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
-       fprintf (vect_dump,
-                "not vectorized: number of iterations cannot be computed.");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: number of iterations cannot be "
+                        "computed.\n");
       if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
       return NULL;
     }
 
-  if (chrec_contains_undetermined (number_of_iterations))
+  if (integer_zerop (number_of_iterations))
     {
-      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
-        fprintf (vect_dump, "Infinite number of iterations.");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: number of iterations = 0.\n");
       if (inner_loop_vinfo)
-       destroy_loop_vec_info (inner_loop_vinfo, true);
+        destroy_loop_vec_info (inner_loop_vinfo, true);
       return NULL;
     }
 
-  if (!NITERS_KNOWN_P (number_of_iterations))
+  loop_vinfo = new_loop_vec_info (loop);
+  LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
+  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
+  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
+
+  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
+      if (dump_enabled_p ())
         {
-          fprintf (vect_dump, "Symbolic number of iterations is ");
-          print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
+          dump_printf_loc (MSG_NOTE, vect_location,
+                          "Symbolic number of iterations is ");
+         dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
+          dump_printf (MSG_NOTE, "\n");
         }
     }
-  else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
-    {
-      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
-        fprintf (vect_dump, "not vectorized: number of iterations = 0.");
-      if (inner_loop_vinfo)
-        destroy_loop_vec_info (inner_loop_vinfo, false);
-      return NULL;
-    }
-
-  loop_vinfo = new_loop_vec_info (loop);
-  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
-  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
 
   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
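
Among the loop forms the preceding checks reject, the most common in practice is a loop with more than one exit. A minimal example that trips the single_exit test ("not vectorized: multiple exits"); illustrative, not from the patch:

    int
    find_first (const int *a, int n, int key)
    {
      int i;
      for (i = 0; i < n; i++)
        if (a[i] == key)
          break;          /* second exit edge out of the loop */
      return i;
    }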
 
@@ -1243,9 +1356,12 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
   unsigned int th;
   bool only_slp_in_loop = true, ok;
   HOST_WIDE_INT max_niter;
+  HOST_WIDE_INT estimated_niter;
+  int min_profitable_estimate;
 
-  if (vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "=== vect_analyze_loop_operations ===\n");
 
   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
@@ -1279,9 +1395,10 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
 
       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "Updating vectorization factor to %d ",
-                           vectorization_factor);
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "Updating vectorization factor to %d\n",
+                        vectorization_factor);
     }
 
   for (i = 0; i < nbbs; i++)
@@ -1294,10 +1411,11 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
           ok = true;
 
           stmt_info = vinfo_for_stmt (phi);
-          if (vect_print_dump_info (REPORT_DETAILS))
+          if (dump_enabled_p ())
             {
-              fprintf (vect_dump, "examining phi: ");
-              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
+              dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
+              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
+              dump_printf (MSG_NOTE, "\n");
             }
 
           /* Inner-loop loop-closed exit phi in outer-loop vectorization
@@ -1313,9 +1431,10 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
                   && STMT_VINFO_DEF_TYPE (stmt_info)
                      != vect_double_reduction_def)
                 {
-                  if (vect_print_dump_info (REPORT_DETAILS))
-                    fprintf (vect_dump,
-                             "Unsupported loop-closed phi in outer-loop.");
+                  if (dump_enabled_p ())
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "Unsupported loop-closed phi in "
+                                    "outer-loop.\n");
                   return false;
                 }
 
@@ -1334,7 +1453,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
                     return false;
 
                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
-                 if (!op_def_stmt
+                 if (gimple_nop_p (op_def_stmt)
                      || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
                      || !vinfo_for_stmt (op_def_stmt))
                     return false;
@@ -1354,8 +1473,9 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
           if (STMT_VINFO_LIVE_P (stmt_info))
             {
               /* FORNOW: not yet supported.  */
-              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
-                fprintf (vect_dump, "not vectorized: value used after loop.");
+              if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "not vectorized: value used after loop.\n");
               return false;
             }
 
@@ -1363,8 +1483,9 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
             {
               /* A scalar-dependence cycle that we don't support.  */
-              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
-                fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
+              if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "not vectorized: scalar dependence cycle.\n");
               return false;
             }
 
@@ -1377,11 +1498,13 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
 
           if (!ok)
             {
-              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
+              if (dump_enabled_p ())
                 {
-                  fprintf (vect_dump,
-                           "not vectorized: relevant phi not supported: ");
-                  print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  "not vectorized: relevant phi not "
+                                  "supported: ");
+                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
+                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                 }
              return false;
             }
@@ -1390,7 +1513,8 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
         {
           gimple stmt = gsi_stmt (si);
-         if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
+         if (!gimple_clobber_p (stmt)
+             && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
            return false;
         }
     } /* bbs */
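
The gimple_clobber_p statements now skipped above (and in vect_determine_vectorization_factor) are end-of-scope markers, not computations, so they need no vector statement. A source pattern that produces one, assuming the usual gimplification of block-local variables (illustrative, not from the patch):

    void
    uses_scoped_temp (int *out, int n)
    {
      for (int i = 0; i < n; i++)
        {
          int tmp[4] = { 0, 1, 2, 3 };  /* emits "tmp = {CLOBBER};" at '}' */
          out[i] = tmp[i & 3];
        }
    }
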
@@ -1402,31 +1526,34 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
      touching this loop.  */
   if (!need_to_vectorize)
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump,
-                 "All the computation can be taken out of the loop.");
-      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
-        fprintf (vect_dump,
-                 "not vectorized: redundant loop. no profit to vectorize.");
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_NOTE, vect_location,
+                        "All the computation can be taken out of the loop.\n");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: redundant loop. no profit to "
+                        "vectorize.\n");
       return false;
     }
 
-  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      && vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump,
-        "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
-        vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "vectorization_factor = %d, niters = "
+                    HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
+                    LOOP_VINFO_INT_NITERS (loop_vinfo));
 
   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
       || ((max_niter = max_stmt_executions_int (loop)) != -1
          && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
     {
-      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
-        fprintf (vect_dump, "not vectorized: iteration count too small.");
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump,"not vectorized: iteration count smaller than "
-                 "vectorization factor.");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: iteration count too small.\n");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: iteration count smaller than "
+                        "vectorization factor.\n");
       return false;
     }
 
@@ -1436,22 +1563,26 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
      vector stmts depends on VF.  */
   vect_update_slp_costs_according_to_vf (loop_vinfo);
 
-  min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
+  vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
+                                     &min_profitable_estimate);
   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
 
   if (min_profitable_iters < 0)
     {
-      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
-        fprintf (vect_dump, "not vectorized: vectorization not profitable.");
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "not vectorized: vector version will never be "
-                 "profitable.");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: vectorization not profitable.\n");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: vector version will never be "
+                        "profitable.\n");
       return false;
     }
 
   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
                             * vectorization_factor) - 1);
 
+
   /* Use the cost model only if it is more conservative than user specified
      threshold.  */
 
@@ -1461,39 +1592,37 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
           || min_profitable_iters > min_scalar_loop_bound))
     th = (unsigned) min_profitable_iters;
 
+  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
+
   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
     {
-      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
-        fprintf (vect_dump, "not vectorized: vectorization not "
-                 "profitable.");
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "not vectorized: iteration count smaller than "
-                 "user specified loop bound parameter or minimum "
-                 "profitable iterations (whichever is more conservative).");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: vectorization not profitable.\n");
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_NOTE, vect_location,
+                        "not vectorized: iteration count smaller than user "
+                        "specified loop bound parameter or minimum profitable "
+                        "iterations (whichever is more conservative).\n");
       return false;
     }
 
-  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
-      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "epilog loop required.");
-      if (!vect_can_advance_ivs_p (loop_vinfo))
-        {
-          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
-            fprintf (vect_dump,
-                     "not vectorized: can't create epilog loop 1.");
-          return false;
-        }
-      if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
-        {
-          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
-            fprintf (vect_dump,
-                     "not vectorized: can't create epilog loop 2.");
-          return false;
-        }
+  if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
+      && ((unsigned HOST_WIDE_INT) estimated_niter
+          <= MAX (th, (unsigned)min_profitable_estimate)))
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: estimated iteration count too "
+                        "small.\n");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "not vectorized: estimated iteration count smaller "
+                        "than specified loop bound parameter or minimum "
+                        "profitable iterations (whichever is more "
+                        "conservative).\n");
+      return false;
     }
 
   return true;
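
A toy calculation makes the threshold interplay above concrete (all numbers invented): with --param min-vect-loop-bound=2 and VF = 4, min_scalar_loop_bound is 2*4 - 1 = 7, and a cost-model result of min_profitable_iters = 11 is more conservative, so th becomes 11 and a loop known to run at most 11 iterations is rejected. A rough stand-alone sketch of that choice, not the exact guard used here:

#include <stdio.h>

/* Hypothetical rendering of the threshold selection; the real code
   reads these values from params and loop_vinfo, and also checks
   whether the cost model is enabled at all.  */
int
main (void)
{
  int min_vect_loop_bound = 2;    /* --param min-vect-loop-bound */
  int vf = 4;                     /* vectorization factor */
  int min_profitable_iters = 11;  /* from the cost model */

  int min_scalar_loop_bound = min_vect_loop_bound * vf - 1;  /* 7 */

  /* Keep whichever threshold is more conservative (larger).  */
  unsigned th;
  if (min_profitable_iters > min_scalar_loop_bound)
    th = (unsigned) min_profitable_iters;
  else
    th = (unsigned) min_scalar_loop_bound;

  /* A loop known to run <= th iterations is not vectorized.  */
  printf ("th = %u\n", th);   /* prints: th = 11 */
  return 0;
}
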
@@ -1511,6 +1640,8 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
   bool ok, slp = false;
   int max_vf = MAX_VECTORIZATION_FACTOR;
   int min_vf = 2;
+  unsigned int th;
+  unsigned int n_stmts = 0;
 
   /* Find all data references in the loop (which correspond to vdefs/vuses)
      and analyze their evolution in the loop.  Also adjust the minimal
@@ -1519,11 +1650,24 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
      FORNOW: Handle only simple, array references, which
      alignment can be forced, and aligned pointer-references.  */
 
-  ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
+  ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
+  if (!ok)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "bad data references.\n");
+      return false;
+    }
+
+  /* Analyze the access patterns of the data-refs in the loop (consecutive,
+     complex, etc.). FORNOW: Only handle consecutive access pattern.  */
+
+  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
   if (!ok)
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "bad data references.");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "bad data access.\n");
       return false;
     }
 
@@ -1539,8 +1683,9 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
   if (!ok)
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "unexpected pattern.");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "unexpected pattern.\n");
       return false;
     }
 
@@ -1549,26 +1694,29 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
      the dependences.
      FORNOW: fail at the first data dependence that we encounter.  */
 
-  ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
+  ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
   if (!ok
       || max_vf < min_vf)
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "bad data dependence.");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "bad data dependence.\n");
       return false;
     }
 
   ok = vect_determine_vectorization_factor (loop_vinfo);
   if (!ok)
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "can't determine vectorization factor.");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "can't determine vectorization factor.\n");
       return false;
     }
   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "bad data dependence.");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "bad data dependence.\n");
       return false;
     }
 
@@ -1578,19 +1726,9 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
   if (!ok)
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "bad data alignment.");
-      return false;
-    }
-
-  /* Analyze the access patterns of the data-refs in the loop (consecutive,
-     complex, etc.). FORNOW: Only handle consecutive access pattern.  */
-
-  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
-  if (!ok)
-    {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "bad data access.");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "bad data alignment.\n");
       return false;
     }
 
@@ -1600,9 +1738,12 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
   if (!ok)
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "too long list of versioning for alias "
-                           "run-time tests.");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "number of versioning for alias "
+                        "run-time tests exceeds %d "
+                        "(--param vect-max-version-for-alias-checks)\n",
+                        PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
       return false;
     }
 
@@ -1612,13 +1753,14 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
   ok = vect_enhance_data_refs_alignment (loop_vinfo);
   if (!ok)
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "bad data alignment.");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "bad data alignment.\n");
       return false;
     }
 
   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
-  ok = vect_analyze_slp (loop_vinfo, NULL);
+  ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
   if (ok)
     {
       /* Decide which possible SLP instances to SLP.  */
@@ -1636,11 +1778,57 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
   ok = vect_analyze_loop_operations (loop_vinfo, slp);
   if (!ok)
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "bad operation or unsupported loop bound.");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "bad operation or unsupported loop bound.\n");
       return false;
     }
 
+  /* Decide whether we need to create an epilogue loop to handle
+     remaining scalar iterations.  */
+  th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
+        / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
+       * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
+    {
+      if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
+                  - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
+         < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
+       LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
+    }
+  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+          || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
+              < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
+               /* In case of versioning, check if the maximum number of
+                  iterations is greater than th.  If they are identical,
+                  the epilogue is unnecessary.  */
+              && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
+                   && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
+                   || (unsigned HOST_WIDE_INT)max_stmt_executions_int
+                       (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
+    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
+
+  /* If an epilogue loop is required make sure we can create one.  */
+  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+      || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
+      if (!vect_can_advance_ivs_p (loop_vinfo)
+         || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
+                                          single_exit (LOOP_VINFO_LOOP
+                                                        (loop_vinfo))))
+        {
+          if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: can't create required "
+                            "epilog loop\n");
+          return false;
+        }
+    }
+
   return true;
 }
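
The ctz tests in the peeling-for-niter decision above are divisibility checks: a count is a multiple of VF (a power of two) exactly when it has at least exact_log2 (VF) trailing zero bits. A minimal sketch, with the GCC builtins __builtin_ctz/__builtin_ctzll standing in for exact_log2 and ctz_hwi, and assuming niters - peel is nonzero:

#include <assert.h>
#include <stdbool.h>

/* Returns true if an epilogue is needed because (niters - peel) is
   not a multiple of vf; vf must be a power of two and niters - peel
   must be nonzero (ctz of zero is undefined).  */
static bool
needs_epilogue (unsigned long long niters, unsigned long long peel,
                unsigned vf)
{
  unsigned log2_vf = __builtin_ctz (vf);  /* exact_log2 for powers of 2 */
  return __builtin_ctzll (niters - peel) < log2_vf;
}

int
main (void)
{
  assert (!needs_epilogue (16, 0, 4));  /* 16 % 4 == 0: no epilogue */
  assert (needs_epilogue (18, 0, 4));   /* 18 % 4 != 0: epilogue */
  assert (!needs_epilogue (19, 3, 8));  /* (19 - 3) % 8 == 0 */
  return 0;
}
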
 
@@ -1659,15 +1847,17 @@ vect_analyze_loop (struct loop *loop)
   current_vector_size = 0;
   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
 
-  if (vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "===== analyze_loop_nest =====");
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "===== analyze_loop_nest =====\n");
 
   if (loop_outer (loop)
       && loop_vec_info_for_loop (loop_outer (loop))
       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "outer-loop already vectorized.");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "outer-loop already vectorized.\n");
       return NULL;
     }
 
@@ -1677,8 +1867,9 @@ vect_analyze_loop (struct loop *loop)
       loop_vinfo = vect_analyze_loop_form (loop);
       if (!loop_vinfo)
        {
-         if (vect_print_dump_info (REPORT_DETAILS))
-           fprintf (vect_dump, "bad loop form.");
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "bad loop form.\n");
          return NULL;
        }
 
@@ -1698,9 +1889,10 @@ vect_analyze_loop (struct loop *loop)
 
       /* Try the next biggest vector size.  */
       current_vector_size = 1 << floor_log2 (vector_sizes);
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "***** Re-trying analysis with "
-                "vector size %d\n", current_vector_size);
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "***** Re-trying analysis with "
+                        "vector size %d\n", current_vector_size);
     }
 }
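
vector_sizes is assumed here to be a bitmask of the vector byte sizes the target supports, which is consistent with how the retry loop peels off 1 << floor_log2 (vector_sizes) each round, largest first. A toy illustration of that iteration order:

#include <stdio.h>

int
main (void)
{
  /* Hypothetical target supporting 32-, 16- and 8-byte vectors.  */
  unsigned vector_sizes = 32 | 16 | 8;

  while (vector_sizes)
    {
      /* 1 << floor_log2 (mask) is the highest set bit.  */
      unsigned current_vector_size
        = 1u << (31 - __builtin_clz (vector_sizes));
      printf ("trying vector size %u\n", current_vector_size);
      /* Pretend the analysis failed; retry with the next size.  */
      vector_sizes &= ~current_vector_size;
    }
  return 0;
}
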
 
@@ -1712,9 +1904,9 @@ vect_analyze_loop (struct loop *loop)
 
    Output:
    REDUC_CODE - the corresponding tree-code to be used to reduce the
-      vector of partial results into a single scalar result (which
-      will also reside in a vector) or ERROR_MARK if the operation is
-      a supported reduction operation, but does not have such tree-code.
+      vector of partial results into a single scalar result, or ERROR_MARK
+      if the operation is a supported reduction operation, but does not have
+      such a tree-code.
 
    Return FALSE if CODE currently cannot be vectorized as reduction.  */
 
@@ -1754,10 +1946,11 @@ reduction_code_for_scalar_code (enum tree_code code,
    STMT is printed with a message MSG. */
 
 static void
-report_vect_op (gimple stmt, const char *msg)
+report_vect_op (int msg_type, gimple stmt, const char *msg)
 {
-  fprintf (vect_dump, "%s", msg);
-  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+  dump_printf_loc (msg_type, vect_location, "%s", msg);
+  dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
+  dump_printf (msg_type, "\n");
 }
 
 
@@ -1803,10 +1996,8 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
         {
          gimple use_stmt = USE_STMT (use_p);
-          if (is_gimple_debug (use_stmt))
-            continue;
-
-         use_stmt = USE_STMT (use_p);
+         if (is_gimple_debug (use_stmt))
+           continue;
 
           /* Check if we got back to the reduction phi.  */
          if (use_stmt == phi)
@@ -1929,15 +2120,16 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
                                   == vect_internal_def
                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
            {
-             if (vect_print_dump_info (REPORT_DETAILS))
+             if (dump_enabled_p ())
                {
-                 fprintf (vect_dump, "swapping oprnds: ");
-                 print_gimple_stmt (vect_dump, next_stmt, 0, TDF_SLIM);
+                 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
+                 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
+                 dump_printf (MSG_NOTE, "\n");
                }
 
-             swap_tree_operands (next_stmt,
-                                 gimple_assign_rhs1_ptr (next_stmt),
-                                  gimple_assign_rhs2_ptr (next_stmt));
+             swap_ssa_operands (next_stmt,
+                                gimple_assign_rhs1_ptr (next_stmt),
+                                 gimple_assign_rhs2_ptr (next_stmt));
              update_stmt (next_stmt);
 
              if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
@@ -1953,7 +2145,7 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
 
   /* Save the chain for further analysis in SLP detection.  */
   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
-  VEC_safe_push (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_info), first);
+  LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
   GROUP_SIZE (vinfo_for_stmt (first)) = size;
 
   return true;
@@ -1970,6 +2162,13 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
      a3 = ...
      a2 = operation (a3, a1)
 
+   or
+
+   a3 = ...
+   loop_header:
+     a1 = phi < a0, a2 >
+     a2 = operation (a3, a1)
+
    such that:
    1. operation is commutative and associative and it is safe to
       change the order of the computation (if CHECK_REDUCTION is true)
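
In C terms, the shapes above are ordinary accumulation loops; for floating point, reassociating such a loop additionally requires -ffast-math, which is what the unsafe-fp check later in this function enforces. For example:

#include <stdio.h>

/* In the notation of the comment above, 'sum' carries the a0/a1/a2
   cycle: a1 is the PHI value on loop entry, a3 is the loaded element,
   and a2 = operation (a3, a1) is the new value of sum.  */
static double
dot_self (const double *a, int n)
{
  double sum = 0.0;              /* a0 */
  for (int i = 0; i < n; i++)
    sum += a[i] * a[i];          /* a2 = operation (a3, a1) */
  return sum;
}

int
main (void)
{
  double a[4] = { 1.0, 2.0, 3.0, 4.0 };
  printf ("%f\n", dot_self (a, 4));   /* prints: 30.000000 */
  return 0;
}
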
@@ -2022,6 +2221,12 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
 
   name = PHI_RESULT (phi);
+  /* ???  If there are no uses of the PHI result the inner loop reduction
+     won't be detected as possibly double-reduction by vectorizable_reduction
+     because that tries to walk the PHI arg from the preheader edge which
+     can be constant.  See PR60382.  */
+  if (has_zero_uses (name))
+    return NULL;
   nloop_uses = 0;
   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
     {
@@ -2031,8 +2236,9 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
 
       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
         {
-          if (vect_print_dump_info (REPORT_DETAILS))
-            fprintf (vect_dump, "intermediate value used outside loop.");
+          if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "intermediate value used outside loop.\n");
 
           return NULL;
         }
@@ -2042,18 +2248,21 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
         nloop_uses++;
       if (nloop_uses > 1)
         {
-          if (vect_print_dump_info (REPORT_DETAILS))
-            fprintf (vect_dump, "reduction used in loop.");
+          if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "reduction used in loop.\n");
           return NULL;
         }
     }
 
   if (TREE_CODE (loop_arg) != SSA_NAME)
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
+      if (dump_enabled_p ())
        {
-         fprintf (vect_dump, "reduction: not ssa_name: ");
-         print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
+         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                          "reduction: not ssa_name: ");
+         dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
+         dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
       return NULL;
     }
@@ -2061,15 +2270,19 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
   if (!def_stmt)
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       fprintf (vect_dump, "reduction: no def_stmt.");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "reduction: no def_stmt.\n");
       return NULL;
     }
 
   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
+      if (dump_enabled_p ())
+        {
+          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
+          dump_printf (MSG_NOTE, "\n");
+        }
       return NULL;
     }
 
@@ -2096,8 +2309,9 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
        nloop_uses++;
       if (nloop_uses > 1)
        {
-         if (vect_print_dump_info (REPORT_DETAILS))
-           fprintf (vect_dump, "reduction used in loop.");
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "reduction used in loop.\n");
          return NULL;
        }
     }
@@ -2111,20 +2325,23 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
       if (gimple_phi_num_args (def_stmt) != 1
           || TREE_CODE (op1) != SSA_NAME)
         {
-          if (vect_print_dump_info (REPORT_DETAILS))
-            fprintf (vect_dump, "unsupported phi node definition.");
+          if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "unsupported phi node definition.\n");
 
           return NULL;
         }
 
       def1 = SSA_NAME_DEF_STMT (op1);
-      if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
+      if (gimple_bb (def1)
+         && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
           && loop->inner
           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
           && is_gimple_assign (def1))
         {
-          if (vect_print_dump_info (REPORT_DETAILS))
-            report_vect_op (def_stmt, "detected double reduction: ");
+          if (dump_enabled_p ())
+            report_vect_op (MSG_NOTE, def_stmt,
+                           "detected double reduction: ");
 
           *double_reduc = true;
           return def_stmt;
@@ -2149,8 +2366,9 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
   if (check_reduction
       && (!commutative_tree_code (code) || !associative_tree_code (code)))
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        report_vect_op (def_stmt, "reduction: not commutative/associative: ");
+      if (dump_enabled_p ())
+        report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
+                       "reduction: not commutative/associative: ");
       return NULL;
     }
 
@@ -2158,8 +2376,9 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
     {
       if (code != COND_EXPR)
         {
-          if (vect_print_dump_info (REPORT_DETAILS))
-           report_vect_op (def_stmt, "reduction: not binary operation: ");
+         if (dump_enabled_p ())
+           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
+                           "reduction: not binary operation: ");
 
           return NULL;
         }
@@ -2176,8 +2395,9 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
 
       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
         {
-          if (vect_print_dump_info (REPORT_DETAILS))
-            report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
+          if (dump_enabled_p ())
+            report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
+                           "reduction: uses not ssa_names: ");
 
           return NULL;
         }
@@ -2189,8 +2409,9 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
 
       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
         {
-          if (vect_print_dump_info (REPORT_DETAILS))
-           report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
+          if (dump_enabled_p ())
+           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
+                           "reduction: uses not ssa_names: ");
 
           return NULL;
         }
@@ -2206,25 +2427,31 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
       || (op4 && TREE_CODE (op4) == SSA_NAME
           && !types_compatible_p (type, TREE_TYPE (op4))))
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
+      if (dump_enabled_p ())
         {
-          fprintf (vect_dump, "reduction: multiple types: operation type: ");
-          print_generic_expr (vect_dump, type, TDF_SLIM);
-          fprintf (vect_dump, ", operands types: ");
-          print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
-          fprintf (vect_dump, ",");
-          print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
+          dump_printf_loc (MSG_NOTE, vect_location,
+                          "reduction: multiple types: operation type: ");
+          dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
+          dump_printf (MSG_NOTE, ", operands types: ");
+          dump_generic_expr (MSG_NOTE, TDF_SLIM,
+                            TREE_TYPE (op1));
+          dump_printf (MSG_NOTE, ",");
+          dump_generic_expr (MSG_NOTE, TDF_SLIM,
+                            TREE_TYPE (op2));
           if (op3)
             {
-              fprintf (vect_dump, ",");
-              print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
+              dump_printf (MSG_NOTE, ",");
+              dump_generic_expr (MSG_NOTE, TDF_SLIM,
+                                TREE_TYPE (op3));
             }
 
           if (op4)
             {
-              fprintf (vect_dump, ",");
-              print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
+              dump_printf (MSG_NOTE, ",");
+              dump_generic_expr (MSG_NOTE, TDF_SLIM,
+                                TREE_TYPE (op4));
             }
+          dump_printf (MSG_NOTE, "\n");
         }
 
       return NULL;
@@ -2243,23 +2470,25 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
       && check_reduction)
     {
       /* Changing the order of operations changes the semantics.  */
-      if (vect_print_dump_info (REPORT_DETAILS))
-       report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
+      if (dump_enabled_p ())
+       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
+                       "reduction: unsafe fp math optimization: ");
       return NULL;
     }
   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
           && check_reduction)
     {
       /* Changing the order of operations changes the semantics.  */
-      if (vect_print_dump_info (REPORT_DETAILS))
-       report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
+      if (dump_enabled_p ())
+       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
+                       "reduction: unsafe int math optimization: ");
       return NULL;
     }
   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
     {
       /* Changing the order of operations changes the semantics.  */
-      if (vect_print_dump_info (REPORT_DETAILS))
-       report_vect_op (def_stmt,
+      if (dump_enabled_p ())
+       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
                        "reduction: unsafe fixed-point math optimization: ");
       return NULL;
     }
@@ -2270,7 +2499,7 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
   if (orig_code == MINUS_EXPR)
     {
       tree rhs = gimple_assign_rhs2 (def_stmt);
-      tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
+      tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
                                                         rhs, NULL);
       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
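
The MINUS_EXPR handling just above rewrites the subtraction into a negate feeding an addition, so the rest of the reduction machinery only ever sees the commutative PLUS_EXPR form. In source terms (illustration only):

#include <stdio.h>

static int
sub_reduc_before (const int *a, int n)
{
  int s = 100;
  for (int i = 0; i < n; i++)
    s = s - a[i];                /* MINUS_EXPR */
  return s;
}

static int
sub_reduc_after (const int *a, int n)
{
  int s = 100;
  for (int i = 0; i < n; i++)
    {
      int negrhs = -a[i];        /* the inserted NEGATE_EXPR stmt */
      s = s + negrhs;            /* def_stmt's code becomes PLUS_EXPR */
    }
  return s;
}

int
main (void)
{
  int a[4] = { 1, 2, 3, 4 };
  /* Both print 90: the rewrite preserves the result.  */
  printf ("%d %d\n", sub_reduc_before (a, 4), sub_reduc_after (a, 4));
  return 0;
}
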
@@ -2295,8 +2524,8 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
   if (code != COND_EXPR
       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       report_vect_op (def_stmt, "reduction: no defs for operands: ");
+      if (dump_enabled_p ())
+       report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
       return NULL;
     }
 
@@ -2307,6 +2536,7 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
   if (def2 && def2 == phi
       && (code == COND_EXPR
          || !def1 || gimple_nop_p (def1)
+         || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
               && (is_gimple_assign (def1)
                  || is_gimple_call (def1)
@@ -2317,14 +2547,15 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
                           == vect_internal_def
                      && !is_loop_header_bb_p (gimple_bb (def1)))))))
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-       report_vect_op (def_stmt, "detected reduction: ");
+      if (dump_enabled_p ())
+       report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
       return def_stmt;
     }
 
   if (def1 && def1 == phi
       && (code == COND_EXPR
          || !def2 || gimple_nop_p (def2)
+         || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
              && (is_gimple_assign (def2)
                  || is_gimple_call (def2)
@@ -2340,20 +2571,20 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
           /* Swap operands (just for simplicity - so that the rest of the code
             can assume that the reduction variable is always the last (second)
             argument).  */
-          if (vect_print_dump_info (REPORT_DETAILS))
-           report_vect_op (def_stmt,
+          if (dump_enabled_p ())
+           report_vect_op (MSG_NOTE, def_stmt,
                            "detected reduction: need to swap operands: ");
 
-          swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
-                             gimple_assign_rhs2_ptr (def_stmt));
+          swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
+                            gimple_assign_rhs2_ptr (def_stmt));
 
          if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
            LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
         }
       else
         {
-          if (vect_print_dump_info (REPORT_DETAILS))
-            report_vect_op (def_stmt, "detected reduction: ");
+          if (dump_enabled_p ())
+            report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
         }
 
       return def_stmt;
@@ -2362,14 +2593,16 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
   /* Try to find SLP reduction chain.  */
   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        report_vect_op (def_stmt, "reduction: detected reduction chain: ");
+      if (dump_enabled_p ())
+        report_vect_op (MSG_NOTE, def_stmt,
+                       "reduction: detected reduction chain: ");
 
       return def_stmt;
     }
 
-  if (vect_print_dump_info (REPORT_DETAILS))
-    report_vect_op (def_stmt, "reduction: unknown pattern: ");
+  if (dump_enabled_p ())
+    report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
+                   "reduction: unknown pattern: ");
        
   return NULL;
 }
@@ -2475,10 +2708,10 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
     {
       *peel_iters_epilogue = vf/2;
-      if (vect_print_dump_info (REPORT_COST))
-        fprintf (vect_dump, "cost model: "
-                            "epilogue peel iters set to vf/2 because "
-                            "loop iterations are unknown .");
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_NOTE, vect_location,
+                        "cost model: epilogue peel iters set to vf/2 "
+                        "because loop iterations are unknown.\n");
 
       /* If peeled iterations are known but number of scalar loop
          iterations are unknown, count a taken branch per peeled loop.  */
@@ -2512,15 +2745,15 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
 
    Return the number of iterations required for the vector version of the
    loop to be profitable relative to the cost of the scalar version of the
-   loop.
-
-   TODO: Take profile info into account before making vectorization
-   decisions, if available.  */
+   loop.  */
 
-int
-vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
+static void
+vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
+                                   int *ret_min_profitable_niters,
+                                   int *ret_min_profitable_estimate)
 {
   int min_profitable_iters;
+  int min_profitable_estimate;
   int peel_iters_prologue;
   int peel_iters_epilogue;
   unsigned vec_inside_cost = 0;
@@ -2530,40 +2763,40 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
   int scalar_single_iter_cost = 0;
   int scalar_outside_cost = 0;
   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-  int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
+  int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
 
   /* Cost model disabled.  */
-  if (!flag_vect_cost_model)
+  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
     {
-      if (vect_print_dump_info (REPORT_COST))
-        fprintf (vect_dump, "cost model disabled.");
-      return 0;
+      dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
+      *ret_min_profitable_niters = 0;
+      *ret_min_profitable_estimate = 0;
+      return;
     }
 
   /* Requires loop versioning tests to handle misalignment.  */
   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
     {
       /*  FIXME: Make cost depend on complexity of individual check.  */
-      unsigned len = VEC_length (gimple,
-                                LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
+      unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
                            vect_prologue);
-      if (vect_print_dump_info (REPORT_COST))
-        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
-                 "versioning to treat misalignment.\n");
+      dump_printf (MSG_NOTE,
+                   "cost model: Adding cost of checks for loop "
+                   "versioning to treat misalignment.\n");
     }
 
   /* Requires loop versioning with alias checks.  */
   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
     {
       /*  FIXME: Make cost depend on complexity of individual check.  */
-      unsigned len = VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
+      unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
                            vect_prologue);
-      if (vect_print_dump_info (REPORT_COST))
-        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
-                 "versioning aliasing.\n");
+      dump_printf (MSG_NOTE,
+                   "cost model: Adding cost of checks for loop "
+                   "versioning aliasing.\n");
     }
 
   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
@@ -2593,17 +2826,15 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
   if (npeel  < 0)
     {
       peel_iters_prologue = vf/2;
-      if (vect_print_dump_info (REPORT_COST))
-        fprintf (vect_dump, "cost model: "
-                 "prologue peel iters set to vf/2.");
+      dump_printf (MSG_NOTE, "cost model: "
+                   "prologue peel iters set to vf/2.\n");
 
       /* If peeling for alignment is unknown, loop bound of main loop becomes
          unknown.  */
       peel_iters_epilogue = vf/2;
-      if (vect_print_dump_info (REPORT_COST))
-        fprintf (vect_dump, "cost model: "
-                 "epilogue peel iters set to vf/2 because "
-                 "peeling for alignment is unknown .");
+      dump_printf (MSG_NOTE, "cost model: "
+                   "epilogue peel iters set to vf/2 because "
+                   "peeling for alignment is unknown.\n");
 
       /* If peeled iterations are unknown, count a taken branch and a not taken
          branch per peeled loop. Even if scalar loop iterations are known,
@@ -2629,8 +2860,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
       int j;
       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
 
-      prologue_cost_vec = VEC_alloc (stmt_info_for_cost, heap, 2);
-      epilogue_cost_vec = VEC_alloc (stmt_info_for_cost, heap, 2);
+      prologue_cost_vec.create (2);
+      epilogue_cost_vec.create (2);
       peel_iters_prologue = npeel;
 
       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
@@ -2639,7 +2870,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
                                          &prologue_cost_vec,
                                          &epilogue_cost_vec);
 
-      FOR_EACH_VEC_ELT (stmt_info_for_cost, prologue_cost_vec, j, si)
+      FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
        {
          struct _stmt_vec_info *stmt_info
            = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
@@ -2647,7 +2878,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
                                si->misalign, vect_prologue);
        }
 
-      FOR_EACH_VEC_ELT (stmt_info_for_cost, epilogue_cost_vec, j, si)
+      FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
        {
          struct _stmt_vec_info *stmt_info
            = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
@@ -2655,8 +2886,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
                                si->misalign, vect_epilogue);
        }
 
-      VEC_free (stmt_info_for_cost, heap, prologue_cost_vec);
-      VEC_free (stmt_info_for_cost, heap, epilogue_cost_vec);
+      prologue_cost_vec.release ();
+      epilogue_cost_vec.release ();
     }
 
   /* FORNOW: The scalar outside cost is incremented in one of the
@@ -2723,7 +2954,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
       else
        {
          /* Cost model check occurs at prologue generation.  */
-         if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
+         if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
            scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
              + vect_get_stmt_cost (cond_branch_not_taken); 
          /* Cost model check occurs at epilogue generation.  */
@@ -2769,32 +3000,45 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
   /* vector version will never be profitable.  */
   else
     {
-      if (vect_print_dump_info (REPORT_COST))
-        fprintf (vect_dump, "cost model: the vector iteration cost = %d "
-                "divided by the scalar iteration cost = %d "
-                "is greater or equal to the vectorization factor = %d.",
-                 vec_inside_cost, scalar_single_iter_cost, vf);
-      return -1;
-    }
-
-  if (vect_print_dump_info (REPORT_COST))
-    {
-      fprintf (vect_dump, "Cost model analysis: \n");
-      fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
-              vec_inside_cost);
-      fprintf (vect_dump, "  Vector prologue cost: %d\n",
-              vec_prologue_cost);
-      fprintf (vect_dump, "  Vector epilogue cost: %d\n",
-              vec_epilogue_cost);
-      fprintf (vect_dump, "  Scalar iteration cost: %d\n",
-              scalar_single_iter_cost);
-      fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
-      fprintf (vect_dump, "  prologue iterations: %d\n",
-               peel_iters_prologue);
-      fprintf (vect_dump, "  epilogue iterations: %d\n",
-               peel_iters_epilogue);
-      fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
-              min_profitable_iters);
+      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
+       warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
+                   "did not happen for a simd loop");
+
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "cost model: the vector iteration cost = %d "
+                        "divided by the scalar iteration cost = %d "
+                        "is greater than or equal to the vectorization "
+                        "factor = %d.\n",
+                        vec_inside_cost, scalar_single_iter_cost, vf);
+      *ret_min_profitable_niters = -1;
+      *ret_min_profitable_estimate = -1;
+      return;
+    }
+
+  if (dump_enabled_p ())
+    {
+      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
+      dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
+                   vec_inside_cost);
+      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
+                   vec_prologue_cost);
+      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
+                   vec_epilogue_cost);
+      dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
+                   scalar_single_iter_cost);
+      dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
+                   scalar_outside_cost);
+      dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
+                   vec_outside_cost);
+      dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
+                   peel_iters_prologue);
+      dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
+                   peel_iters_epilogue);
+      dump_printf (MSG_NOTE,
+                   "  Calculated minimum iters for profitability: %d\n",
+                   min_profitable_iters);
+      dump_printf (MSG_NOTE, "\n");
     }
 
   min_profitable_iters =
@@ -2805,13 +3049,75 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
        then skip the vectorized loop.  */
   min_profitable_iters--;
 
-  if (vect_print_dump_info (REPORT_COST))
-    fprintf (vect_dump, "  Profitability threshold = %d\n",
-            min_profitable_iters);
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                     "  Runtime profitability threshold = %d\n",
+                     min_profitable_iters);
+
+  *ret_min_profitable_niters = min_profitable_iters;
+
+  /* Calculate number of iterations required to make the vector version
+     profitable, relative to the loop bodies only.
+
+     The non-vectorized variant costs SIC * niters and it must win over
+     the vector variant on the expected loop trip count, i.e. the
+     following condition must hold true:
+     SIC * niters > VIC * ((niters - PL_ITERS - EP_ITERS) / VF) + VOC + SOC  */
+
+  if (vec_outside_cost <= 0)
+    min_profitable_estimate = 1;
+  else
+    {
+      min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
+                                - vec_inside_cost * peel_iters_prologue
+                                - vec_inside_cost * peel_iters_epilogue)
+                                / ((scalar_single_iter_cost * vf)
+                                  - vec_inside_cost);
+    }
+  min_profitable_estimate--;
+  min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                     "  Static estimate profitability threshold = %d\n",
+                     min_profitable_estimate);
 
-  return min_profitable_iters;
+  *ret_min_profitable_estimate = min_profitable_estimate;
 }
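
Plugging invented numbers into the condition from the comment: with SIC = 4, VIC = 6, VF = 4, VOC = 20, SOC = 8 and no peeling, SIC * niters > VIC * niters / VF + 28 solves to niters > 11.2, matching the integer formula above. A stand-alone rendering of that arithmetic (all names hypothetical, not the real cost-model hooks):

#include <stdio.h>

int
main (void)
{
  int sic = 4;   /* scalar_single_iter_cost */
  int vic = 6;   /* vec_inside_cost, per vector iteration */
  int vf = 4;
  int voc = 20;  /* vec_outside_cost */
  int soc = 8;   /* scalar_outside_cost */
  int peel_prologue = 0, peel_epilogue = 0;

  int min_profitable_estimate
    = ((voc + soc) * vf
       - vic * peel_prologue
       - vic * peel_epilogue)
      / (sic * vf - vic);

  /* (28 * 4) / (16 - 6) = 11; the loop must run strictly more than
     this many iterations for the vector version to win.  */
  printf ("estimate threshold ~ %d\n", min_profitable_estimate);
  return 0;
}
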
 
+/* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
+   vector elements (not bits) for a vector of mode MODE.  */
+static void
+calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
+                             unsigned char *sel)
+{
+  unsigned int i, nelt = GET_MODE_NUNITS (mode);
+
+  for (i = 0; i < nelt; i++)
+    sel[i] = (i + offset) & (2*nelt - 1);
+}
+
+/* Checks whether the target supports whole-vector shifts for vectors of mode
+   MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
+   it supports vec_perm_const with masks for all necessary shift amounts.  */
+static bool
+have_whole_vector_shift (enum machine_mode mode)
+{
+  if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
+    return true;
+
+  if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
+    return false;
+
+  unsigned int i, nelt = GET_MODE_NUNITS (mode);
+  unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
+
+  for (i = nelt / 2; i >= 1; i /= 2)
+    {
+      calc_vec_perm_mask_for_shift (mode, i, sel);
+      if (!can_vec_perm_p (mode, false, sel))
+       return false;
+    }
+  return true;
+}
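
For an 8-element mode, have_whole_vector_shift asks can_vec_perm_p for shifts of 4, 2 and 1 elements; the mask for offset 2 is {2, 3, 4, 5, 6, 7, 8, 9}, where indices >= nelt select (zero) elements from the permute's second operand. The same arithmetic, detached from machine modes:

#include <stdio.h>

/* Same computation as calc_vec_perm_mask_for_shift above: nelt
   elements, shift right by OFFSET elements.  */
static void
shift_mask (unsigned nelt, unsigned offset, unsigned char *sel)
{
  for (unsigned i = 0; i < nelt; i++)
    sel[i] = (i + offset) & (2 * nelt - 1);
}

int
main (void)
{
  unsigned char sel[8];
  shift_mask (8, 2, sel);
  for (int i = 0; i < 8; i++)
    printf ("%d ", sel[i]);     /* prints: 2 3 4 5 6 7 8 9 */
  printf ("\n");
  return 0;
}
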
 
 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
    functions. Design better to avoid maintenance issues.  */
@@ -2832,7 +3138,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
   tree vectype;
   gimple stmt, orig_stmt;
   tree reduction_op;
-  enum machine_mode mode;
+  machine_mode mode;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
@@ -2864,10 +3170,13 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
   if (!vectype)
     {
-      if (vect_print_dump_info (REPORT_COST))
+      if (dump_enabled_p ())
         {
-          fprintf (vect_dump, "unsupported data-type ");
-          print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
+         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                          "unsupported data-type ");
+          dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                            TREE_TYPE (reduction_op));
+          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
         }
       return false;
    }
@@ -2900,10 +3209,10 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
        }
       else
        {
-         int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
+         int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
          tree bitsize =
            TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
-         int element_bitsize = tree_low_cst (bitsize, 1);
+         int element_bitsize = tree_to_uhwi (bitsize);
          int nelements = vec_size_in_bits / element_bitsize;
 
          optab = optab_for_tree_code (code, vectype, optab_default);
@@ -2911,7 +3220,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
          /* We have a whole vector shift available.  */
          if (VECTOR_MODE_P (mode)
              && optab_handler (optab, mode) != CODE_FOR_nothing
-             && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
+             && have_whole_vector_shift (mode))
            {
              /* Final reduction via vector shifts and the reduction operator.
                 Also requires scalar extract.  */
@@ -2933,10 +3242,11 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
        }
     }
 
-  if (vect_print_dump_info (REPORT_COST))
-    fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
-             "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
-            prologue_cost, epilogue_cost);
+  if (dump_enabled_p ())
+    dump_printf (MSG_NOTE,
+                 "vect_model_reduction_cost: inside_cost = %d, "
+                 "prologue_cost = %d, epilogue_cost = %d.\n", inside_cost,
+                 prologue_cost, epilogue_cost);
 
   return true;
 }
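
The whole-vector-shift reduction priced above halves the live lanes each step, hence exact_log2 (nelements) shift/operator pairs plus one extract. A scalar model of an 8-lane add reduction (illustration only):

#include <stdio.h>

int
main (void)
{
  /* Stand-in for an 8-lane vector; each outer step models one
     whole-vector shift (by 4, 2, then 1 elements) plus one vector
     add, which is why the cost is log2 (nelements) * (shift + op).  */
  int v[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };

  for (int shift = 4; shift >= 1; shift /= 2)
    for (int i = 0; i < shift; i++)
      v[i] += v[i + shift];     /* v = v op (v >> shift elements) */

  printf ("reduction = %d\n", v[0]);  /* extract lane 0: prints 36 */
  return 0;
}
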
@@ -2961,9 +3271,10 @@ vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
                                 stmt_info, 0, vect_prologue);
 
-  if (vect_print_dump_info (REPORT_COST))
-    fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
-             "prologue_cost = %d .", inside_cost, prologue_cost);
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                     "vect_model_induction_cost: inside_cost = %d, "
+                     "prologue_cost = %d.\n", inside_cost, prologue_cost);
 }
 
 
@@ -2985,14 +3296,12 @@ get_initial_def_for_induction (gimple iv_phi)
   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-  tree scalar_type;
   tree vectype;
   int nunits;
   edge pe = loop_preheader_edge (loop);
   struct loop *iv_loop;
   basic_block new_bb;
-  tree vec, vec_init, vec_step, t;
-  tree access_fn;
+  tree new_vec, vec_init, vec_step, t;
   tree new_var;
   tree new_name;
   gimple init_stmt, induction_phi, new_stmt;
@@ -3000,7 +3309,6 @@ get_initial_def_for_induction (gimple iv_phi)
   tree init_expr, step_expr;
   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   int i;
-  bool ok;
   int ncopies;
   tree expr;
   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
@@ -3029,16 +3337,14 @@ get_initial_def_for_induction (gimple iv_phi)
   latch_e = loop_latch_edge (iv_loop);
   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
 
-  access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
-  gcc_assert (access_fn);
-  STRIP_NOPS (access_fn);
-  ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
-                                    &init_expr, &step_expr);
-  gcc_assert (ok);
+  step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
+  gcc_assert (step_expr != NULL_TREE);
+
   pe = loop_preheader_edge (iv_loop);
+  init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
+                                    loop_preheader_edge (iv_loop));
 
-  scalar_type = TREE_TYPE (init_expr);
-  vectype = get_vectype_for_scalar_type (scalar_type);
+  vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
   gcc_assert (vectype);
   nunits = TYPE_VECTOR_SUBPARTS (vectype);
@@ -3047,6 +3353,16 @@ get_initial_def_for_induction (gimple iv_phi)
   gcc_assert (phi_info);
   gcc_assert (ncopies >= 1);
 
+  /* Convert the step to the desired type.  */
+  step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
+                                                 step_expr),
+                                   &stmts, true, NULL_TREE);
+  if (stmts)
+    {
+      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+      gcc_assert (!new_bb);
+    }
+
   /* Find the first insertion point in the BB.  */
   si = gsi_after_labels (bb);
 
@@ -3056,49 +3372,72 @@ get_initial_def_for_induction (gimple iv_phi)
       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
         been created during vectorization of previous stmts.  We obtain it
         from the STMT_VINFO_VEC_STMT of the defining stmt.  */
-      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
-                                           loop_preheader_edge (iv_loop));
-      vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
+      vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
+      /* If the initial value is not of proper type, convert it.  */
+      if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
+       {
+         new_stmt = gimple_build_assign_with_ops
+             (VIEW_CONVERT_EXPR,
+              vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
+              build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
+         vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
+         gimple_assign_set_lhs (new_stmt, vec_init);
+         new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
+                                                new_stmt);
+         gcc_assert (!new_bb);
+         set_vinfo_for_stmt (new_stmt,
+                             new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
+       }
     }
   else
     {
-      VEC(constructor_elt,gc) *v;
+      vec<constructor_elt, va_gc> *v;
 
       /* iv_loop is the loop to be vectorized. Create:
         vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
-      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
-      new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
+      new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
+                                      vect_scalar_var, "var_");
+      new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
+                                                    init_expr),
+                                      &stmts, false, new_var);
       if (stmts)
        {
          new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
          gcc_assert (!new_bb);
        }
 
-      v = VEC_alloc (constructor_elt, gc, nunits);
+      vec_alloc (v, nunits);
+      bool constant_p = is_gimple_min_invariant (new_name);
       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
       for (i = 1; i < nunits; i++)
        {
          /* Create: new_name_i = new_name + step_expr  */
-         enum tree_code code = POINTER_TYPE_P (scalar_type)
-                               ? POINTER_PLUS_EXPR : PLUS_EXPR;
-         init_stmt = gimple_build_assign_with_ops (code, new_var,
-                                                   new_name, step_expr);
-         new_name = make_ssa_name (new_var, init_stmt);
-         gimple_assign_set_lhs (init_stmt, new_name);
-
-         new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
-         gcc_assert (!new_bb);
-
-         if (vect_print_dump_info (REPORT_DETAILS))
+         new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
+                                 new_name, step_expr);
+         if (!is_gimple_min_invariant (new_name))
            {
-             fprintf (vect_dump, "created new init_stmt: ");
-             print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
+             init_stmt = gimple_build_assign (new_var, new_name);
+             new_name = make_ssa_name (new_var, init_stmt);
+             gimple_assign_set_lhs (init_stmt, new_name);
+             new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
+             gcc_assert (!new_bb);
+             if (dump_enabled_p ())
+               {
+                 dump_printf_loc (MSG_NOTE, vect_location,
+                                  "created new init_stmt: ");
+                 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
+                 dump_printf (MSG_NOTE, "\n");
+               }
+             constant_p = false;
            }
          CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
        }
       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
-      vec = build_constructor (vectype, v);
-      vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
+      if (constant_p)
+       new_vec = build_vector_from_ctor (vectype, v);
+      else
+       new_vec = build_constructor (vectype, v);
+      vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
     }
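
Concretely, for an integer IV with init X = 5, step S = 3 and nunits = VF = 4, the constructor above yields vec_init = {5, 8, 11, 14}, and the code further down builds vec_step = {VF*S, ...} = {12, 12, 12, 12}, so each vector iteration advances all four lanes at once. A scalar model of the resulting def-use cycle:

#include <stdio.h>

int
main (void)
{
  /* vec_init = [X, X+S, X+2*S, X+3*S] with X = 5, S = 3.  */
  int vec[4] = { 5, 8, 11, 14 };
  const int vec_step = 4 * 3;   /* VF * S, splat across all lanes */

  for (int iter = 0; iter < 3; iter++)
    {
      printf ("%d %d %d %d\n", vec[0], vec[1], vec[2], vec[3]);
      for (int lane = 0; lane < 4; lane++)
        vec[lane] += vec_step;  /* vec_def = induc_def + vec_step */
    }
  return 0;
}
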
 
 
@@ -3111,17 +3450,27 @@ get_initial_def_for_induction (gimple iv_phi)
     {
       /* iv_loop is the loop to be vectorized. Generate:
          vec_step = [VF*S, VF*S, VF*S, VF*S]  */
-      expr = build_int_cst (TREE_TYPE (step_expr), vf);
+      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
+       {
+         expr = build_int_cst (integer_type_node, vf);
+         expr = fold_convert (TREE_TYPE (step_expr), expr);
+       }
+      else
+       expr = build_int_cst (TREE_TYPE (step_expr), vf);
       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
                              expr, step_expr);
+      if (TREE_CODE (step_expr) == SSA_NAME)
+       new_name = vect_init_vector (iv_phi, new_name,
+                                    TREE_TYPE (step_expr), NULL);
     }
 
   t = unshare_expr (new_name);
-  gcc_assert (CONSTANT_CLASS_P (new_name));
+  gcc_assert (CONSTANT_CLASS_P (new_name)
+             || TREE_CODE (new_name) == SSA_NAME);
   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
   gcc_assert (stepvectype);
-  vec = build_vector_from_val (stepvectype, t);
-  vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
+  new_vec = build_vector_from_val (stepvectype, t);
+  vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
 
 
   /* Create the following def-use cycle:
@@ -3170,13 +3519,23 @@ get_initial_def_for_induction (gimple iv_phi)
       gcc_assert (!nested_in_vect_loop);
 
       /* Create the vector that holds the step of the induction.  */
-      expr = build_int_cst (TREE_TYPE (step_expr), nunits);
+      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
+       {
+         expr = build_int_cst (integer_type_node, nunits);
+         expr = fold_convert (TREE_TYPE (step_expr), expr);
+       }
+      else
+       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
                              expr, step_expr);
+      if (TREE_CODE (step_expr) == SSA_NAME)
+       new_name = vect_init_vector (iv_phi, new_name,
+                                    TREE_TYPE (step_expr), NULL);
       t = unshare_expr (new_name);
-      gcc_assert (CONSTANT_CLASS_P (new_name));
-      vec = build_vector_from_val (stepvectype, t);
-      vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
+      gcc_assert (CONSTANT_CLASS_P (new_name)
+                 || TREE_CODE (new_name) == SSA_NAME);
+      new_vec = build_vector_from_val (stepvectype, t);
+      vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
 
       vec_def = induc_def;
       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
@@ -3216,9 +3575,13 @@ get_initial_def_for_induction (gimple iv_phi)
       exit_phi = NULL;
       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
         {
-         if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
+         gimple use_stmt = USE_STMT (use_p);
+         if (is_gimple_debug (use_stmt))
+           continue;
+
+         if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
            {
-             exit_phi = USE_STMT (use_p);
+             exit_phi = use_stmt;
              break;
            }
         }
@@ -3231,21 +3594,26 @@ get_initial_def_for_induction (gimple iv_phi)
                      && !STMT_VINFO_LIVE_P (stmt_vinfo));
 
          STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
-         if (vect_print_dump_info (REPORT_DETAILS))
+         if (dump_enabled_p ())
            {
-             fprintf (vect_dump, "vector of inductions after inner-loop:");
-             print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
+             dump_printf_loc (MSG_NOTE, vect_location,
+                              "vector of inductions after inner-loop:");
+             dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
+              dump_printf (MSG_NOTE, "\n");
            }
        }
     }
 
 
-  if (vect_print_dump_info (REPORT_DETAILS))
+  if (dump_enabled_p ())
     {
-      fprintf (vect_dump, "transform induction: created def-use cycle: ");
-      print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
-      fprintf (vect_dump, "\n");
-      print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
+      dump_printf_loc (MSG_NOTE, vect_location,
+                      "transform induction: created def-use cycle: ");
+      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
+      dump_printf (MSG_NOTE, "\n");
+      dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
+                       SSA_NAME_DEF_STMT (vec_def), 0);
+      dump_printf (MSG_NOTE, "\n");
     }
 
   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
@@ -3257,7 +3625,7 @@ get_initial_def_for_induction (gimple iv_phi)
          build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
       gimple_assign_set_lhs (new_stmt, induc_def);
-      si = gsi_start_bb (bb);
+      si = gsi_after_labels (bb);
       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
       set_vinfo_for_stmt (new_stmt,
                          new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
@@ -3374,6 +3742,7 @@ get_initial_def_for_reduction (gimple stmt, tree init_val,
     {
       case WIDEN_SUM_EXPR:
       case DOT_PROD_EXPR:
+      case SAD_EXPR:
       case PLUS_EXPR:
       case MINUS_EXPR:
       case BIT_IOR_EXPR:
@@ -3424,8 +3793,8 @@ get_initial_def_for_reduction (gimple stmt, tree init_val,
           init_def = build_vector (vectype, elts);
         else
          {
-           VEC(constructor_elt,gc) *v;
-           v = VEC_alloc (constructor_elt, gc, nunits);
+           vec<constructor_elt, va_gc> *v;
+           vec_alloc (v, nunits);
            CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
            for (i = 1; i < nunits; ++i)
              CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
@@ -3454,7 +3823,6 @@ get_initial_def_for_reduction (gimple stmt, tree init_val,
   return init_def;
 }
 
-
 /* Function vect_create_epilog_for_reduction
 
    Create code at the loop-epilog to finalize the result of a reduction
@@ -3517,16 +3885,16 @@ get_initial_def_for_reduction (gimple stmt, tree init_val,
 */
 
 static void
-vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
+vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
                                  int ncopies, enum tree_code reduc_code,
-                                 VEC (gimple, heap) *reduction_phis,
+                                 vec<gimple> reduction_phis,
                                   int reduc_index, bool double_reduc, 
                                   slp_tree slp_node)
 {
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   stmt_vec_info prev_phi_info;
   tree vectype;
-  enum machine_mode mode;
+  machine_mode mode;
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
   basic_block exit_bb;
@@ -3539,30 +3907,29 @@ vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
   gimple epilog_stmt = NULL;
   enum tree_code code = gimple_assign_rhs_code (stmt);
   gimple exit_phi;
-  tree bitsize, bitpos;
+  tree bitsize;
   tree adjustment_def = NULL;
   tree vec_initial_def = NULL;
   tree reduction_op, expr, def;
   tree orig_name, scalar_result;
   imm_use_iterator imm_iter, phi_imm_iter;
   use_operand_p use_p, phi_use_p;
-  bool extract_scalar_result = false;
   gimple use_stmt, orig_stmt, reduction_phi = NULL;
   bool nested_in_vect_loop = false;
-  VEC (gimple, heap) *new_phis = NULL;
-  VEC (gimple, heap) *inner_phis = NULL;
+  auto_vec<gimple> new_phis;
+  auto_vec<gimple> inner_phis;
   enum vect_def_type dt = vect_unknown_def_type;
   int j, i;
-  VEC (tree, heap) *scalar_results = NULL;
+  auto_vec<tree> scalar_results;
   unsigned int group_size = 1, k, ratio;
-  VEC (tree, heap) *vec_initial_defs = NULL;
-  VEC (gimple, heap) *phis;
+  auto_vec<tree> vec_initial_defs;
+  auto_vec<gimple> phis;
   bool slp_reduc = false;
   tree new_phi_result;
   gimple inner_phi = NULL;
 
   if (slp_node)
-    group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node)); 
+    group_size = SLP_TREE_SCALAR_STMTS (slp_node).length (); 
 
   if (nested_in_vect_loop_p (loop, stmt))
     {
@@ -3620,20 +3987,24 @@ vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
                        NULL, slp_node, reduc_index);
   else
     {
-      vec_initial_defs = VEC_alloc (tree, heap, 1);
+      vec_initial_defs.create (1);
      /* For the case of reduction, vect_get_vec_def_for_operand returns
         the scalar def before the loop, that defines the initial value
         of the reduction variable.  */
       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
                                                       &adjustment_def);
-      VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
+      vec_initial_defs.quick_push (vec_initial_def);
     }
 
   /* Set phi nodes arguments.  */
-  FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
-    {
-      tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
-      tree def = VEC_index (tree, vect_defs, i);
+  FOR_EACH_VEC_ELT (reduction_phis, i, phi)
+    {
+      tree vec_init_def, def;
+      gimple_seq stmts;
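+      /* The initial def is not necessarily a gimple value; gimplify it
+        and emit any required stmts on the preheader edge.  */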
+      vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
+                                          true, NULL_TREE);
+      gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
+      def = vect_defs[i];
       for (j = 0; j < ncopies; j++)
         {
           /* Set the loop-entry arg of the reduction-phi.  */
@@ -3646,22 +4017,20 @@ vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
 
           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
 
-          if (vect_print_dump_info (REPORT_DETAILS))
+          if (dump_enabled_p ())
             {
-              fprintf (vect_dump, "transform reduction: created def-use"
-                                  " cycle: ");
-              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
-              fprintf (vect_dump, "\n");
-              print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
-                                 TDF_SLIM);
+              dump_printf_loc (MSG_NOTE, vect_location,
+                              "transform reduction: created def-use cycle: ");
+              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
+              dump_printf (MSG_NOTE, "\n");
+              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
+              dump_printf (MSG_NOTE, "\n");
             }
 
           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
         }
     }
 
-  VEC_free (tree, heap, vec_initial_defs);
-
   /* 2. Create epilog code.
         The reduction epilog code operates across the elements of the vector
         of partial results computed by the vectorized loop.
@@ -3695,15 +4064,16 @@ vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
 
   exit_bb = single_exit (loop)->dest;
   prev_phi_info = NULL;
-  new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
-  FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
+  new_phis.create (vect_defs.length ());
+  FOR_EACH_VEC_ELT (vect_defs, i, def)
     {
       for (j = 0; j < ncopies; j++)
         {
-          phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
+         tree new_def = copy_ssa_name (def, NULL);
+          phi = create_phi_node (new_def, exit_bb);
           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
           if (j == 0)
-            VEC_quick_push (gimple, new_phis, phi);
+            new_phis.quick_push (phi);
           else
            {
              def = vect_get_vec_def_for_stmt_copy (dt, def);
@@ -3721,23 +4091,23 @@ vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
     {
       loop = outer_loop;
       exit_bb = single_exit (loop)->dest;
-      inner_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
-      FOR_EACH_VEC_ELT (gimple, new_phis, i, phi)
+      inner_phis.create (vect_defs.length ());
+      FOR_EACH_VEC_ELT (new_phis, i, phi)
        {
-         gimple outer_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (phi)),
-                                             exit_bb);
+         tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
+         gimple outer_phi = create_phi_node (new_result, exit_bb);
          SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
                           PHI_RESULT (phi));
          set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
                                                            loop_vinfo, NULL));
-         VEC_quick_push (gimple, inner_phis, phi);
-         VEC_replace (gimple, new_phis, i, outer_phi);
+         inner_phis.quick_push (phi);
+         new_phis[i] = outer_phi;
          prev_phi_info = vinfo_for_stmt (outer_phi);
           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
             {
              phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
-             outer_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (phi)),
-                                          exit_bb);
+             new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
+             outer_phi = create_phi_node (new_result, exit_bb);
              SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
                               PHI_RESULT (phi));
              set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
@@ -3781,7 +4151,7 @@ vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
   
   scalar_dest = gimple_assign_lhs (orig_stmt);
   scalar_type = TREE_TYPE (scalar_dest);
-  scalar_results = VEC_alloc (tree, heap, group_size); 
+  scalar_results.create (group_size); 
   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
   bitsize = TYPE_SIZE (scalar_type);
 
@@ -3810,14 +4180,14 @@ vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
      one vector.  */
   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
     {
-      tree first_vect = PHI_RESULT (VEC_index (gimple, new_phis, 0));
+      tree first_vect = PHI_RESULT (new_phis[0]);
       tree tmp;
       gimple new_vec_stmt = NULL;
 
       vec_dest = vect_create_destination_var (scalar_dest, vectype);
-      for (k = 1; k < VEC_length (gimple, new_phis); k++)
+      for (k = 1; k < new_phis.length (); k++)
         {
-          gimple next_phi = VEC_index (gimple, new_phis, k);
+          gimple next_phi = new_phis[k];
           tree second_vect = PHI_RESULT (next_phi);
 
           tmp = build2 (code, vectype,  first_vect, second_vect);
@@ -3830,12 +4200,12 @@ vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
       new_phi_result = first_vect;
       if (new_vec_stmt)
         {
-          VEC_truncate (gimple, new_phis, 0);
-          VEC_safe_push (gimple, heap, new_phis, new_vec_stmt);
+          new_phis.truncate (0);
+          new_phis.safe_push (new_vec_stmt);
         }
     }
   else
-    new_phi_result = PHI_RESULT (VEC_index (gimple, new_phis, 0));
+    new_phi_result = PHI_RESULT (new_phis[0]);
  
   /* 2.3 Create the reduction code, using one of the three schemes described
          above. In SLP we simply need to extract all the elements from the 
@@ -3843,36 +4213,43 @@ vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
   if (reduc_code != ERROR_MARK && !slp_reduc)
     {
       tree tmp;
+      tree vec_elem_type;
 
       /*** Case 1:  Create:
            v_out2 = reduc_expr <v_out1>  */
 
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "Reduce using direct vector reduction.");
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_NOTE, vect_location,
+                        "Reduce using direct vector reduction.\n");
 
-      vec_dest = vect_create_destination_var (scalar_dest, vectype);
-      tmp = build1 (reduc_code, vectype, new_phi_result);
-      epilog_stmt = gimple_build_assign (vec_dest, tmp);
-      new_temp = make_ssa_name (vec_dest, epilog_stmt);
+      vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
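+      /* The direct reduction produces the vector element type; if that
+        differs from the scalar result type, reduce in the element type
+        first and then convert the result.  */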
+      if (!useless_type_conversion_p (scalar_type, vec_elem_type))
+       {
+         tree tmp_dest
+           = vect_create_destination_var (scalar_dest, vec_elem_type);
+         tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
+         epilog_stmt = gimple_build_assign (tmp_dest, tmp);
+         new_temp = make_ssa_name (tmp_dest, epilog_stmt);
+         gimple_assign_set_lhs (epilog_stmt, new_temp);
+         gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+
+         tmp = build1 (NOP_EXPR, scalar_type, new_temp);
+       }
+      else
+       tmp = build1 (reduc_code, scalar_type, new_phi_result);
+      epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
+      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
       gimple_assign_set_lhs (epilog_stmt, new_temp);
       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
-
-      extract_scalar_result = true;
+      scalar_results.safe_push (new_temp);
     }
   else
     {
-      enum tree_code shift_code = ERROR_MARK;
-      bool have_whole_vector_shift = true;
-      int bit_offset;
-      int element_bitsize = tree_low_cst (bitsize, 1);
-      int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
+      bool reduce_with_shift = have_whole_vector_shift (mode);
+      int element_bitsize = tree_to_uhwi (bitsize);
+      int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
       tree vec_temp;
 
-      if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
-        shift_code = VEC_RSHIFT_EXPR;
-      else
-        have_whole_vector_shift = false;
-
       /* Regardless of whether we have a whole vector shift, if we're
          emulating the operation via tree-vect-generic, we don't want
          to use it.  Only the first round of the reduction is likely
@@ -3880,36 +4257,46 @@ vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
       /* ??? It might be better to emit a reduction tree code here, so that
          tree-vect-generic can expand the first round via bit tricks.  */
       if (!VECTOR_MODE_P (mode))
-        have_whole_vector_shift = false;
+        reduce_with_shift = false;
       else
         {
           optab optab = optab_for_tree_code (code, vectype, optab_default);
           if (optab_handler (optab, mode) == CODE_FOR_nothing)
-            have_whole_vector_shift = false;
+            reduce_with_shift = false;
         }
 
-      if (have_whole_vector_shift && !slp_reduc)
+      if (reduce_with_shift && !slp_reduc)
         {
+          int nelements = vec_size_in_bits / element_bitsize;
+          unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
+
+          int elt_offset;
+
+          tree zero_vec = build_zero_cst (vectype);
           /*** Case 2: Create:
-             for (offset = VS/2; offset >= element_size; offset/=2)
+             for (offset = nelements/2; offset >= 1; offset/=2)
                 {
                   Create:  va' = vec_shift <va, offset>
                   Create:  va = vop <va, va'>
                 }  */
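+          /* E.g. for nelements == 4 and a PLUS reduction of
+             {v0, v1, v2, v3}: offset 2 gives {v0+v2, v1+v3, v2, v3},
+             then offset 1 leaves the full sum in element 0, from
+             where the scalar result is extracted below.  */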
 
-          if (vect_print_dump_info (REPORT_DETAILS))
-            fprintf (vect_dump, "Reduce using vector shifts");
+          tree rhs;
+
+          if (dump_enabled_p ())
+            dump_printf_loc (MSG_NOTE, vect_location,
+                            "Reduce using vector shifts\n");
 
           vec_dest = vect_create_destination_var (scalar_dest, vectype);
           new_temp = new_phi_result;
-          for (bit_offset = vec_size_in_bits/2;
-               bit_offset >= element_bitsize;
-               bit_offset /= 2)
+          for (elt_offset = nelements / 2;
+               elt_offset >= 1;
+               elt_offset /= 2)
             {
-              tree bitpos = size_int (bit_offset);
-
-              epilog_stmt = gimple_build_assign_with_ops (shift_code,
-                                               vec_dest, new_temp, bitpos);
+              calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
+              tree mask = vect_gen_perm_mask_any (vectype, sel);
+             epilog_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR,
+                                                         vec_dest, new_temp,
+                                                         zero_vec, mask);
               new_name = make_ssa_name (vec_dest, epilog_stmt);
               gimple_assign_set_lhs (epilog_stmt, new_name);
               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
@@ -3921,12 +4308,23 @@ vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
             }
 
-          extract_scalar_result = true;
+         /* 2.4  Extract the final scalar result.  Create:
+            s_out3 = extract_field <v_out2, bitpos>  */
+
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "extract scalar result\n");
+
+         rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
+                       bitsize, bitsize_zero_node);
+         epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
+         new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
+         gimple_assign_set_lhs (epilog_stmt, new_temp);
+         gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
+         scalar_results.safe_push (new_temp);
         }
       else
         {
-          tree rhs;
-
           /*** Case 3: Create:
              s = extract_field <v_out2, 0>
              for (offset = element_size;
@@ -3937,17 +4335,19 @@ vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
                  Create:  s = op <s, s'>  // For non SLP cases
                }  */
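+          /* For the non-SLP case this is a chain of nelements - 1
+             scalar operations; it is correct for any reduction code,
+             at linear rather than logarithmic latency.  */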
 
-          if (vect_print_dump_info (REPORT_DETAILS))
-            fprintf (vect_dump, "Reduce using scalar code. ");
+          if (dump_enabled_p ())
+            dump_printf_loc (MSG_NOTE, vect_location,
+                            "Reduce using scalar code.\n");
 
-          vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
-          FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
+          vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
+          FOR_EACH_VEC_ELT (new_phis, i, new_phi)
             {
+              int bit_offset;
               if (gimple_code (new_phi) == GIMPLE_PHI)
                 vec_temp = PHI_RESULT (new_phi);
               else
                 vec_temp = gimple_assign_lhs (new_phi);
-              rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
+              tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
                             bitsize_zero_node);
               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
@@ -3957,7 +4357,7 @@ vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
               /* In SLP we don't need to apply reduction operation, so we just
                  collect s' values in SCALAR_RESULTS.  */
               if (slp_reduc)
-                VEC_safe_push (tree, heap, scalar_results, new_temp);
+                scalar_results.safe_push (new_temp);
 
               for (bit_offset = element_bitsize;
                    bit_offset < vec_size_in_bits;
@@ -3977,7 +4377,7 @@ vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
                       /* In SLP we don't need to apply reduction operation, so 
                          we just collect s' values in SCALAR_RESULTS.  */
                       new_temp = new_name;
-                      VEC_safe_push (tree, heap, scalar_results, new_name);
+                      scalar_results.safe_push (new_name);
                     }
                   else
                     {
@@ -4000,50 +4400,23 @@ vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
               gimple new_stmt;
             
               /* Reduce multiple scalar results in case of SLP unrolling.  */
-              for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
+              for (j = group_size; scalar_results.iterate (j, &res);
                    j++)
                 {
-                  first_res = VEC_index (tree, scalar_results, j % group_size);
+                  first_res = scalar_results[j % group_size];
                   new_stmt = gimple_build_assign_with_ops (code,
                                               new_scalar_dest, first_res, res);
                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
                   gimple_assign_set_lhs (new_stmt, new_res);
                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
-                  VEC_replace (tree, scalar_results, j % group_size, new_res);
+                  scalar_results[j % group_size] = new_res;
                 }
             }
           else
             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
-            VEC_safe_push (tree, heap, scalar_results, new_temp);
-
-          extract_scalar_result = false;
+            scalar_results.safe_push (new_temp);
         }
     }
-
-  /* 2.4  Extract the final scalar result.  Create:
-          s_out3 = extract_field <v_out2, bitpos>  */
-
-  if (extract_scalar_result)
-    {
-      tree rhs;
-
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "extract scalar result");
-
-      if (BYTES_BIG_ENDIAN)
-        bitpos = size_binop (MULT_EXPR,
-                             bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
-                             TYPE_SIZE (scalar_type));
-      else
-        bitpos = bitsize_zero_node;
-
-      rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
-      epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
-      new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
-      gimple_assign_set_lhs (epilog_stmt, new_temp);
-      gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
-      VEC_safe_push (tree, heap, scalar_results, new_temp);
-    }
   
 vect_finalize_reduction:
 
@@ -4060,14 +4433,14 @@ vect_finalize_reduction:
       gcc_assert (!slp_reduc);
       if (nested_in_vect_loop)
        {
-          new_phi = VEC_index (gimple, new_phis, 0);
+          new_phi = new_phis[0];
          gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
          expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
          new_dest = vect_create_destination_var (scalar_dest, vectype);
        }
       else
        {
-          new_temp = VEC_index (tree, scalar_results, 0);
+          new_temp = scalar_results[0];
          gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
          expr = build2 (code, scalar_type, new_temp, adjustment_def);
          new_dest = vect_create_destination_var (scalar_dest, scalar_type);
@@ -4076,7 +4449,6 @@ vect_finalize_reduction:
       epilog_stmt = gimple_build_assign (new_dest, expr);
       new_temp = make_ssa_name (new_dest, epilog_stmt);
       gimple_assign_set_lhs (epilog_stmt, new_temp);
-      SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
       if (nested_in_vect_loop)
         {
@@ -4087,14 +4459,14 @@ vect_finalize_reduction:
                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
 
           if (!double_reduc)
-            VEC_quick_push (tree, scalar_results, new_temp);
+            scalar_results.quick_push (new_temp);
           else
-            VEC_replace (tree, scalar_results, 0, new_temp);
+            scalar_results[0] = new_temp;
         }
       else
-        VEC_replace (tree, scalar_results, 0, new_temp);
+        scalar_results[0] = new_temp;
 
-      VEC_replace (gimple, new_phis, 0, epilog_stmt);
+      new_phis[0] = epilog_stmt;
     }
 
   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
@@ -4129,9 +4501,8 @@ vect_finalize_reduction:
      exit phi node.  */
   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
     {
-      scalar_dest = gimple_assign_lhs (VEC_index (gimple,
-                                       SLP_TREE_SCALAR_STMTS (slp_node),
-                                       group_size - 1));
+      scalar_dest = gimple_assign_lhs
+                     (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
       group_size = 1;
     }
 
@@ -4141,10 +4512,10 @@ vect_finalize_reduction:
      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
      the first vector stmt, etc.  
      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */ 
-  if (group_size > VEC_length (gimple, new_phis))
+  if (group_size > new_phis.length ())
     {
-      ratio = group_size / VEC_length (gimple, new_phis);
-      gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
+      ratio = group_size / new_phis.length ();
+      gcc_assert (!(group_size % new_phis.length ()));
     }
   else
     ratio = 1;
@@ -4153,16 +4524,15 @@ vect_finalize_reduction:
     {
       if (k % ratio == 0)
         {
-          epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
-          reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
+          epilog_stmt = new_phis[k / ratio];
+          reduction_phi = reduction_phis[k / ratio];
          if (double_reduc)
-           inner_phi = VEC_index (gimple, inner_phis, k / ratio);
+           inner_phi = inner_phis[k / ratio];
         }
 
       if (slp_reduc)
         {
-          gimple current_stmt = VEC_index (gimple,
-                                       SLP_TREE_SCALAR_STMTS (slp_node), k);
+          gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
 
           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
           /* SLP statements can't participate in patterns.  */
@@ -4170,19 +4540,19 @@ vect_finalize_reduction:
           scalar_dest = gimple_assign_lhs (current_stmt);
         }
 
-      phis = VEC_alloc (gimple, heap, 3);
+      phis.create (3);
       /* Find the loop-closed-use at the loop exit of the original scalar
          result.  (The reduction result is expected to have two immediate uses -
          one at the latch block, and one at the loop exit).  */
       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
-        if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
-          VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
+        if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
+           && !is_gimple_debug (USE_STMT (use_p)))
+          phis.safe_push (USE_STMT (use_p));
 
-      /* We expect to have found an exit_phi because of loop-closed-ssa
-         form.  */
-      gcc_assert (!VEC_empty (gimple, phis));
+      /* While we expect to have found an exit_phi because of loop-closed-ssa
+         form, we can end up without one if the scalar cycle is dead.  */
 
-      FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
+      FOR_EACH_VEC_ELT (phis, i, exit_phi)
         {
           if (outer_loop)
             {
@@ -4265,11 +4635,12 @@ vect_finalize_reduction:
                                UNKNOWN_LOCATION);
                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
-                  if (vect_print_dump_info (REPORT_DETAILS))
+                  if (dump_enabled_p ())
                     {
-                      fprintf (vect_dump, "created double reduction phi "
-                                          "node: ");
-                      print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
+                      dump_printf_loc (MSG_NOTE, vect_location,
+                                      "created double reduction phi node: ");
+                      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
+                      dump_printf (MSG_NOTE, "\n");
                     }
 
                   vect_phi_res = PHI_RESULT (vect_phi);
@@ -4288,7 +4659,7 @@ vect_finalize_reduction:
             }
         }
 
-      VEC_free (gimple, heap, phis);
+      phis.release ();
       if (nested_in_vect_loop)
         {
           if (double_reduc)
@@ -4297,7 +4668,7 @@ vect_finalize_reduction:
             continue;
         }
 
-      phis = VEC_alloc (gimple, heap, 3);
+      phis.create (3);
       /* Find the loop-closed-use at the loop exit of the original scalar
          result.  (The reduction result is expected to have two immediate uses,
          one at the latch block, and one at the loop exit).  For double
@@ -4305,7 +4676,10 @@ vect_finalize_reduction:
       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
         {
           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
-            VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
+           {
+             if (!is_gimple_debug (USE_STMT (use_p)))
+               phis.safe_push (USE_STMT (use_p));
+           }
           else
             {
               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
@@ -4315,30 +4689,27 @@ vect_finalize_reduction:
                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
                     {
                       if (!flow_bb_inside_loop_p (loop,
-                                             gimple_bb (USE_STMT (phi_use_p))))
-                        VEC_safe_push (gimple, heap, phis,
-                                       USE_STMT (phi_use_p));
+                                             gimple_bb (USE_STMT (phi_use_p)))
+                         && !is_gimple_debug (USE_STMT (phi_use_p)))
+                        phis.safe_push (USE_STMT (phi_use_p));
                     }
                 }
             }
         }
 
-      FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
+      FOR_EACH_VEC_ELT (phis, i, exit_phi)
         {
           /* Replace the uses:  */
           orig_name = PHI_RESULT (exit_phi);
-          scalar_result = VEC_index (tree, scalar_results, k);
+          scalar_result = scalar_results[k];
           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
               SET_USE (use_p, scalar_result);
         }
 
-      VEC_free (gimple, heap, phis);
+      phis.release ();
     }
-
-  VEC_free (tree, heap, scalar_results);
-  VEC_free (gimple, heap, new_phis);
-} 
+}
 
 
 /* Function vectorizable_reduction.
@@ -4394,7 +4765,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
   enum tree_code code, orig_code, epilog_reduc_code;
-  enum machine_mode vec_mode;
+  machine_mode vec_mode;
   int op_type;
   optab optab, reduc_optab;
   tree new_temp = NULL_TREE;
@@ -4425,8 +4796,10 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
   struct loop * def_stmt_loop, *outer_loop = NULL;
   tree def_arg;
   gimple def_arg_stmt;
-  VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
-  VEC (gimple, heap) *phis = NULL;
+  auto_vec<tree> vec_oprnds0;
+  auto_vec<tree> vec_oprnds1;
+  auto_vec<tree> vect_defs;
+  auto_vec<gimple> phis;
   int vec_num;
   tree def0, def1, tem, op0, op1 = NULL_TREE;
 
@@ -4472,7 +4845,6 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
   if (orig_stmt)
     {
       orig_stmt_info = vinfo_for_stmt (orig_stmt);
-      gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
     }
@@ -4542,7 +4914,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
      The last use is the reduction variable.  In case of nested cycle this
      assumption is not true: we use reduc_index to record the index of the
      reduction variable.  */
-  for (i = 0; i < op_type-1; i++)
+  for (i = 0; i < op_type - 1; i++)
     {
       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
       if (i == 0 && code == COND_EXPR)
@@ -4574,11 +4946,18 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
   if (!vectype_in)
     vectype_in = tem;
   gcc_assert (is_simple_use);
-  gcc_assert (dt == vect_reduction_def
-              || dt == vect_nested_cycle
-              || ((dt == vect_internal_def || dt == vect_external_def
-                   || dt == vect_constant_def || dt == vect_induction_def)
-                   && nested_cycle && found_nested_cycle_def));
+  if (!(dt == vect_reduction_def
+       || dt == vect_nested_cycle
+       || ((dt == vect_internal_def || dt == vect_external_def
+            || dt == vect_constant_def || dt == vect_induction_def)
+           && nested_cycle && found_nested_cycle_def)))
+    {
+      /* For pattern recognized stmts, orig_stmt might be a reduction,
+        but some helper statements for the pattern might not, or
+        might be COND_EXPRs with reduction uses in the condition.  */
+      gcc_assert (orig_stmt);
+      return false;
+    }
   if (!found_nested_cycle_def)
     reduc_def_stmt = def_stmt;
 
@@ -4615,8 +4994,9 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
     {
       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
         {
-          if (vect_print_dump_info (REPORT_DETAILS))
-            fprintf (vect_dump, "unsupported condition in reduction");
+          if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "unsupported condition in reduction\n");
 
             return false;
         }
@@ -4625,28 +5005,40 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
     {
       /* 4. Supportable by target?  */
 
+      if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
+         || code == LROTATE_EXPR || code == RROTATE_EXPR)
+       {
+         /* Shifts and rotates are only supported by vectorizable_shift,
+            not by vectorizable_reduction.  */
+          if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "unsupported shift or rotation.\n");
+         return false;
+       }
+
       /* 4.1. check support for the operation in the loop  */
       optab = optab_for_tree_code (code, vectype_in, optab_default);
       if (!optab)
         {
-          if (vect_print_dump_info (REPORT_DETAILS))
-            fprintf (vect_dump, "no optab.");
+          if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "no optab.\n");
 
           return false;
         }
 
       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
         {
-          if (vect_print_dump_info (REPORT_DETAILS))
-            fprintf (vect_dump, "op not supported by target.");
+          if (dump_enabled_p ())
+            dump_printf (MSG_NOTE, "op not supported by target.\n");
 
           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
                  < vect_min_worthwhile_factor (code))
             return false;
 
-          if (vect_print_dump_info (REPORT_DETAILS))
-           fprintf (vect_dump, "proceeding using word mode.");
+          if (dump_enabled_p ())
+           dump_printf (MSG_NOTE, "proceeding using word mode.\n");
         }
 
       /* Worthwhile without SIMD support?  */
@@ -4654,8 +5046,9 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
             < vect_min_worthwhile_factor (code))
         {
-          if (vect_print_dump_info (REPORT_DETAILS))
-           fprintf (vect_dump, "not worthwhile without SIMD support.");
+          if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not worthwhile without SIMD support.\n");
 
           return false;
         }
@@ -4734,27 +5127,32 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
                                          optab_default);
       if (!reduc_optab)
         {
-          if (vect_print_dump_info (REPORT_DETAILS))
-            fprintf (vect_dump, "no optab for reduction.");
+          if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "no optab for reduction.\n");
 
           epilog_reduc_code = ERROR_MARK;
         }
-
-      if (reduc_optab
-          && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
+      else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
         {
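+          /* The reduc-to-scalar optab is not supported directly; try
+            the legacy vector-result reduction optab instead.  */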
-          if (vect_print_dump_info (REPORT_DETAILS))
-            fprintf (vect_dump, "reduc op not supported by target.");
+          optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
+          if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
+            {
+              if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "reduc op not supported by target.\n");
 
-          epilog_reduc_code = ERROR_MARK;
+             epilog_reduc_code = ERROR_MARK;
+           }
         }
     }
   else
     {
       if (!nested_cycle || double_reduc)
         {
-          if (vect_print_dump_info (REPORT_DETAILS))
-            fprintf (vect_dump, "no reduc code for scalar code.");
+          if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "no reduc code for scalar code.\n");
 
           return false;
         }
@@ -4762,8 +5160,9 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
 
   if (double_reduc && ncopies > 1)
     {
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "multiple types in double reduction");
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "multiple types in double reduction\n");
 
       return false;
     }
@@ -4780,8 +5179,9 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
       else
         {
-          if (vect_print_dump_info (REPORT_DETAILS))
-            fprintf (vect_dump, "invalid types in dot-prod");
+          if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "invalid types in dot-prod\n");
 
           return false;
         }
@@ -4797,8 +5197,8 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
 
   /** Transform.  **/
 
-  if (vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "transform reduction.");
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
 
   /* FORNOW: Multiple types are not supported for condition.  */
   if (code == COND_EXPR)
@@ -4853,15 +5253,15 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
   else
     {
       vec_num = 1;
-      vec_oprnds0 = VEC_alloc (tree, heap, 1);
+      vec_oprnds0.create (1);
       if (op_type == ternary_op)
-        vec_oprnds1 = VEC_alloc (tree, heap, 1);
+        vec_oprnds1.create (1);
     }
 
-  phis = VEC_alloc (gimple, heap, vec_num);
-  vect_defs = VEC_alloc (tree, heap, vec_num);
+  phis.create (vec_num);
+  vect_defs.create (vec_num);
   if (!slp_node)
-    VEC_quick_push (tree, vect_defs, NULL_TREE);
+    vect_defs.quick_push (NULL_TREE);
 
   for (j = 0; j < ncopies; j++)
     {
@@ -4876,7 +5276,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
                                   new_stmt_vec_info (new_phi, loop_vinfo,
                                                      NULL));
                if (j == 0 || slp_node)
-                 VEC_quick_push (gimple, phis, new_phi);
+                 phis.quick_push (new_phi);
             }
         }
 
@@ -4884,7 +5284,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
         {
           gcc_assert (!slp_node);
           vectorizable_condition (stmt, gsi, vec_stmt, 
-                                  PHI_RESULT (VEC_index (gimple, phis, 0)), 
+                                  PHI_RESULT (phis[0]), 
                                   reduc_index, NULL);
           /* Multiple types are not supported for condition.  */
           break;
@@ -4909,12 +5309,12 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
             {
               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
                                                             stmt, NULL);
-              VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
+              vec_oprnds0.quick_push (loop_vec_def0);
               if (op_type == ternary_op)
                {
                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
                                                                NULL);
-                 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
+                 vec_oprnds1.quick_push (loop_vec_def1);
                }
             }
         }
@@ -4930,14 +5330,14 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
                                   &dummy_stmt, &dummy, &dt);
               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
                                                               loop_vec_def0);
-              VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
+              vec_oprnds0[0] = loop_vec_def0;
               if (op_type == ternary_op)
                 {
                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
                                       &dummy, &dt);
                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
                                                                 loop_vec_def1);
-                  VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
+                  vec_oprnds1[0] = loop_vec_def1;
                 }
             }
 
@@ -4947,10 +5347,10 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
         }
 
-      FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
+      FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
         {
           if (slp_node)
-            reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
+            reduc_def = PHI_RESULT (phis[i]);
           else
             {
               if (!single_defuse_cycle || j == 0)
@@ -4958,7 +5358,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
             }
 
           def1 = ((op_type == ternary_op)
-                  ? VEC_index (tree, vec_oprnds1, i) : NULL);
+                  ? vec_oprnds1[i] : NULL);
           if (op_type == binary_op)
             {
               if (reduc_index == 0)
@@ -4986,11 +5386,11 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
 
           if (slp_node)
             {
-              VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
-              VEC_quick_push (tree, vect_defs, new_temp);
+              SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
+              vect_defs.quick_push (new_temp);
             }
           else
-            VEC_replace (tree, vect_defs, 0, new_temp);
+            vect_defs[0] = new_temp;
         }
 
       if (slp_node)
@@ -5010,18 +5410,13 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
     {
       new_temp = gimple_assign_lhs (*vec_stmt);
-      VEC_replace (tree, vect_defs, 0, new_temp);
+      vect_defs[0] = new_temp;
     }
 
   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
                                     epilog_reduc_code, phis, reduc_index,
                                     double_reduc, slp_node);
 
-  VEC_free (gimple, heap, phis);
-  VEC_free (tree, heap, vec_oprnds0);
-  if (vec_oprnds1)
-    VEC_free (tree, heap, vec_oprnds1);
-
   return true;
 }
 
@@ -5083,8 +5478,9 @@ vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
 
       if (ncopies > 1)
        {
-         if (vect_print_dump_info (REPORT_DETAILS))
-           fprintf (vect_dump, "multiple types in nested loop.");
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "multiple types in nested loop.\n");
          return false;
        }
 
@@ -5093,10 +5489,13 @@ vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
        {
-         if (!flow_bb_inside_loop_p (loop->inner,
-                                     gimple_bb (USE_STMT (use_p))))
+         gimple use_stmt = USE_STMT (use_p);
+         if (is_gimple_debug (use_stmt))
+           continue;
+
+         if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
            {
-             exit_phi = USE_STMT (use_p);
+             exit_phi = use_stmt;
              break;
            }
        }
@@ -5106,9 +5505,10 @@ vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
          if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
                && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
            {
-             if (vect_print_dump_info (REPORT_DETAILS))
-               fprintf (vect_dump, "inner-loop induction only used outside "
-                        "of the outer vectorized loop.");
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "inner-loop induction only used outside "
+                                "of the outer vectorized loop.\n");
              return false;
            }
        }
@@ -5129,16 +5529,17 @@ vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
   if (!vec_stmt) /* transformation not required.  */
     {
       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
-      if (vect_print_dump_info (REPORT_DETAILS))
-        fprintf (vect_dump, "=== vectorizable_induction ===");
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_NOTE, vect_location,
+                         "=== vectorizable_induction ===\n");
       vect_model_induction_cost (stmt_info, ncopies);
       return true;
     }
 
   /** Transform.  **/
 
-  if (vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "transform induction phi.");
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
 
   vec_def = get_initial_def_for_induction (phi);
   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
@@ -5153,7 +5554,7 @@ vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
 bool
 vectorizable_live_operation (gimple stmt,
                             gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
-                            gimple *vec_stmt ATTRIBUTE_UNUSED)
+                            gimple *vec_stmt)
 {
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
@@ -5173,7 +5574,41 @@ vectorizable_live_operation (gimple stmt,
     return false;
 
   if (!is_gimple_assign (stmt))
-    return false;
+    {
+      if (gimple_call_internal_p (stmt)
+         && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
+         && gimple_call_lhs (stmt)
+         && loop->simduid
+         && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
+         && loop->simduid
+            == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
+       {
+         edge e = single_exit (loop);
+         basic_block merge_bb = e->dest;
+         imm_use_iterator imm_iter;
+         use_operand_p use_p;
+         tree lhs = gimple_call_lhs (stmt);
+
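+         /* After the loop the lane number is known to be VF - 1, the
+            lane of the final vector iteration, so the exit phi argument
+            can be replaced by that constant.  */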
+         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
+           {
+             gimple use_stmt = USE_STMT (use_p);
+             if (gimple_code (use_stmt) == GIMPLE_PHI
+                 && gimple_bb (use_stmt) == merge_bb)
+               {
+                 if (vec_stmt)
+                   {
+                     tree vfm1
+                       = build_int_cst (unsigned_type_node,
+                                        loop_vinfo->vectorization_factor - 1);
+                     SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
+                   }
+                 return true;
+               }
+           }
+       }
+
+      return false;
+    }
 
   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
     return false;
@@ -5202,8 +5637,9 @@ vectorizable_live_operation (gimple stmt,
           && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
                                  &dt))
         {
-          if (vect_print_dump_info (REPORT_DETAILS))
-            fprintf (vect_dump, "use not simple.");
+          if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "use not simple.\n");
           return false;
         }
 
@@ -5240,8 +5676,9 @@ vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
            {
              if (gimple_debug_bind_p (ustmt))
                {
-                 if (vect_print_dump_info (REPORT_DETAILS))
-                   fprintf (vect_dump, "killing debug use");
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_NOTE, vect_location,
+                                     "killing debug use\n");
 
                  gimple_debug_bind_reset_value (ustmt);
                  update_stmt (ustmt);
@@ -5253,6 +5690,120 @@ vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
     }
 }
 
+
+/* This function builds ni_name = number of iterations.  Statements
+   are emitted on the loop preheader edge.  */
+
+static tree
+vect_build_loop_niters (loop_vec_info loop_vinfo)
+{
+  tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
+  if (TREE_CODE (ni) == INTEGER_CST)
+    return ni;
+  else
+    {
+      tree ni_name, var;
+      gimple_seq stmts = NULL;
+      edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
+
+      var = create_tmp_var (TREE_TYPE (ni), "niters");
+      ni_name = force_gimple_operand (ni, &stmts, false, var);
+      if (stmts)
+       gsi_insert_seq_on_edge_immediate (pe, stmts);
+
+      return ni_name;
+    }
+}
+
+
+/* This function generates the following statements:
+
+   ni_name = number of iterations loop executes
+   ratio = ni_name / vf
+   ratio_mult_vf_name = ratio * vf
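+   (e.g. ni_name == 10 with vf == 4 gives ratio == 2 and
+   ratio_mult_vf_name == 8, leaving 10 - 8 == 2 iterations for the
+   scalar epilogue)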
+
+   and places them on the loop preheader edge.  */
+
+static void
+vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
+                                tree ni_name,
+                                tree *ratio_mult_vf_name_ptr,
+                                tree *ratio_name_ptr)
+{
+  tree ni_minus_gap_name;
+  tree var;
+  tree ratio_name;
+  tree ratio_mult_vf_name;
+  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
+  tree log_vf;
+
+  log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
+
+  /* If epilogue loop is required because of data accesses with gaps, we
+     subtract one iteration from the total number of iterations here for
+     correct calculation of RATIO.  */
+  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+    {
+      ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
+                                      ni_name,
+                                      build_one_cst (TREE_TYPE (ni_name)));
+      if (!is_gimple_val (ni_minus_gap_name))
+       {
+         var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
+         gimple_seq stmts = NULL;
+          ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
+                                                   true, var);
+         gsi_insert_seq_on_edge_immediate (pe, stmts);
+        }
+    }
+  else
+    ni_minus_gap_name = ni_name;
+
+  /* Create: ratio = ni >> log2 (vf).  */
+  /* ???  As we have ni == number of latch executions + 1, ni could
+     have overflown to zero.  So avoid computing ratio based on ni
+     but compute it using the fact that we know ratio will be at least
+     one, thus via (ni - vf) >> log2(vf) + 1.  */
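+  /* E.g. a loop of 256 iterations with an 8-bit unsigned NI has NI
+     wrapped to 0; with vf == 4, ((0 - 4) >> 2) + 1 == (252 >> 2) + 1
+     == 64, the correct ratio.  */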
+  ratio_name
+    = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
+                  fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
+                               fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
+                                            ni_minus_gap_name,
+                                            build_int_cst
+                                              (TREE_TYPE (ni_name), vf)),
+                               log_vf),
+                  build_int_cst (TREE_TYPE (ni_name), 1));
+  if (!is_gimple_val (ratio_name))
+    {
+      var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
+      gimple_seq stmts = NULL;
+      ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
+      gsi_insert_seq_on_edge_immediate (pe, stmts);
+    }
+  *ratio_name_ptr = ratio_name;
+
+  /* Create: ratio_mult_vf = ratio << log2 (vf).  */
+
+  if (ratio_mult_vf_name_ptr)
+    {
+      ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
+                                       ratio_name, log_vf);
+      if (!is_gimple_val (ratio_mult_vf_name))
+       {
+         var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
+         gimple_seq stmts = NULL;
+         ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
+                                                    true, var);
+         gsi_insert_seq_on_edge_immediate (pe, stmts);
+       }
+      *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
+    }
+
+  return;
+}
+
+
 /* Function vect_transform_loop.
 
    The analysis phase has determined that the loop is vectorizable.
@@ -5271,48 +5822,62 @@ vect_transform_loop (loop_vec_info loop_vinfo)
   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
   bool grouped_store;
   bool slp_scheduled = false;
-  unsigned int nunits;
   gimple stmt, pattern_stmt;
   gimple_seq pattern_def_seq = NULL;
   gimple_stmt_iterator pattern_def_si = gsi_none ();
   bool transform_pattern_stmt = false;
-  bool check_profitability;
+  bool check_profitability = false;
   int th;
+  /* Record the number of iterations before we started tampering with
+     the profile.  */
+  gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
 
-  if (vect_print_dump_info (REPORT_DETAILS))
-    fprintf (vect_dump, "=== vec_transform_loop ===");
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
+
+  /* If the profile is imprecise, we have a chance to fix it up.  */
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+    expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
 
   /* Use the more conservative vectorization threshold.  If the number
      of iterations is constant assume the cost check has been performed
      by our caller.  If the threshold makes all loops profitable that
      run at least the vectorization factor number of times checking
      is pointless, too.  */
-  th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
-        * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
-  th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
+  th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
     {
-      if (vect_print_dump_info (REPORT_COST))
-       fprintf (vect_dump,
-                "Profitability threshold is %d loop iterations.", th);
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "Profitability threshold is %d loop iterations.\n",
+                        th);
       check_profitability = true;
     }
 
-  /* Peel the loop if there are data refs with unknown alignment.
-     Only one data ref with unknown store is allowed.  */
+  /* Version the loop first, if required, so the profitability check
+     comes first.  */
 
-  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
+  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
+      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
     {
-      vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
+      vect_loop_versioning (loop_vinfo, th, check_profitability);
       check_profitability = false;
     }
 
-  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
-      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
+  tree ni_name = vect_build_loop_niters (loop_vinfo);
+  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
+
+  /* Peel the loop if there are data refs with unknown alignment.
+     Only one store data ref with unknown alignment is allowed.  */
+
+  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
     {
-      vect_loop_versioning (loop_vinfo, th, check_profitability);
+      vect_do_peeling_for_alignment (loop_vinfo, ni_name,
+                                    th, check_profitability);
       check_profitability = false;
+      /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
+        be re-computed.  */
+      ni_name = NULL_TREE;
     }
 
   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
@@ -5323,15 +5888,26 @@ vect_transform_loop (loop_vec_info loop_vinfo)
      will remain scalar and will compute the remaining (n%VF) iterations.
      (VF is the vectorization factor).  */
 
-  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-       || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-          && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
-       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
-    vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
-                                   th, check_profitability);
-  else
+  if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
+      || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+    {
+      tree ratio_mult_vf;
+      if (!ni_name)
+       ni_name = vect_build_loop_niters (loop_vinfo);
+      vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
+                                      &ratio);
+      vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
+                                     th, check_profitability);
+    }
+  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
+  else
+    {
+      if (!ni_name)
+       ni_name = vect_build_loop_niters (loop_vinfo);
+      vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
+    }
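+
+  /* Illustrative summary: with a known trip count, say n = 100 and vf = 4,
+     ratio is just the constant 25; with a symbolic n, the preheader code
+     built above computes ratio (and ratio * vf when an epilogue is peeled)
+     at run time, and the epilogue loop picks up the remaining n % vf
+     iterations.  */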
 
   /* 1) Make sure the loop header has exactly two entries
      2) Make sure we have a preheader basic block.  */
@@ -5354,10 +5930,12 @@ vect_transform_loop (loop_vec_info loop_vinfo)
       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
         {
          phi = gsi_stmt (si);
-         if (vect_print_dump_info (REPORT_DETAILS))
+         if (dump_enabled_p ())
            {
-             fprintf (vect_dump, "------>vectorizing phi: ");
-             print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
+             dump_printf_loc (MSG_NOTE, vect_location,
+                               "------>vectorizing phi: ");
+             dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
+             dump_printf (MSG_NOTE, "\n");
            }
          stmt_info = vinfo_for_stmt (phi);
          if (!stmt_info)
@@ -5370,15 +5948,16 @@ vect_transform_loop (loop_vec_info loop_vinfo)
              && !STMT_VINFO_LIVE_P (stmt_info))
            continue;
 
-         if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
-               != (unsigned HOST_WIDE_INT) vectorization_factor)
-             && vect_print_dump_info (REPORT_DETAILS))
-           fprintf (vect_dump, "multiple-types.");
+         if (STMT_VINFO_VECTYPE (stmt_info)
+             && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
+                 != (unsigned HOST_WIDE_INT) vectorization_factor)
+             && dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
 
          if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
            {
-             if (vect_print_dump_info (REPORT_DETAILS))
-               fprintf (vect_dump, "transform phi.");
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
              vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
            }
        }
@@ -5391,12 +5970,24 @@ vect_transform_loop (loop_vec_info loop_vinfo)
           if (transform_pattern_stmt)
            stmt = pattern_stmt;
           else
-            stmt = gsi_stmt (si);
+           {
+             stmt = gsi_stmt (si);
+             /* During vectorization remove existing clobber stmts.  */
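+             /* A clobber (e.g. "x ={v} {CLOBBER};") only marks the end of
+                a variable's lifetime and computes no value, so it can be
+                dropped from the vectorized body once its virtual def is
+                unlinked.  */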
+             if (gimple_clobber_p (stmt))
+               {
+                 unlink_stmt_vdef (stmt);
+                 gsi_remove (&si, true);
+                 release_defs (stmt);
+                 continue;
+               }
+           }
 
-         if (vect_print_dump_info (REPORT_DETAILS))
+         if (dump_enabled_p ())
            {
-             fprintf (vect_dump, "------>vectorizing statement: ");
-             print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
+             dump_printf_loc (MSG_NOTE, vect_location,
+                              "------>vectorizing statement: ");
+             dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+             dump_printf (MSG_NOTE, "\n");
            }
 
          stmt_info = vinfo_for_stmt (stmt);
@@ -5464,12 +6055,14 @@ vect_transform_loop (loop_vec_info loop_vinfo)
 
                  if (!gsi_end_p (pattern_def_si))
                    {
-                     if (vect_print_dump_info (REPORT_DETAILS))
+                     if (dump_enabled_p ())
                        {
-                         fprintf (vect_dump, "==> vectorizing pattern def"
-                                             " stmt: ");
-                         print_gimple_stmt (vect_dump, pattern_def_stmt, 0,
-                                            TDF_SLIM);
+                         dump_printf_loc (MSG_NOTE, vect_location,
+                                          "==> vectorizing pattern def "
+                                          "stmt: ");
+                         dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
+                                           pattern_def_stmt, 0);
+                         dump_printf (MSG_NOTE, "\n");
                        }
 
                      stmt = pattern_def_stmt;
@@ -5485,15 +6078,18 @@ vect_transform_loop (loop_vec_info loop_vinfo)
                transform_pattern_stmt = false;
             }
 
-         gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
-         nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
-                                               STMT_VINFO_VECTYPE (stmt_info));
-         if (!STMT_SLP_TYPE (stmt_info)
-             && nunits != (unsigned int) vectorization_factor
-              && vect_print_dump_info (REPORT_DETAILS))
-           /* For SLP VF is set according to unrolling factor, and not to
-              vector size, hence for SLP this print is not valid.  */
-            fprintf (vect_dump, "multiple-types.");
+         if (STMT_VINFO_VECTYPE (stmt_info))
+           {
+             unsigned int nunits
+               = (unsigned int)
+                 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
+             if (!STMT_SLP_TYPE (stmt_info)
+                 && nunits != (unsigned int) vectorization_factor
+                 && dump_enabled_p ())
+                 /* For SLP, VF is set according to the unrolling factor,
+                    not the vector size, hence this diagnostic is not
+                    valid for SLP.  */
+               dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
+           }
 
          /* SLP. Schedule all the SLP instances when the first SLP stmt is
             reached.  */
@@ -5503,8 +6099,9 @@ vect_transform_loop (loop_vec_info loop_vinfo)
                {
                  slp_scheduled = true;
 
-                 if (vect_print_dump_info (REPORT_DETAILS))
-                   fprintf (vect_dump, "=== scheduling SLP instances ===");
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_NOTE, vect_location,
+                                    "=== scheduling SLP instances ===\n");
 
                  vect_schedule_slp (loop_vinfo, NULL);
                }
@@ -5522,8 +6119,8 @@ vect_transform_loop (loop_vec_info loop_vinfo)
            }
 
          /* -------- vectorize statement ------------ */
-         if (vect_print_dump_info (REPORT_DETAILS))
-           fprintf (vect_dump, "transform statement.");
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
 
          grouped_store = false;
          is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
@@ -5536,7 +6133,6 @@ vect_transform_loop (loop_vec_info loop_vinfo)
                     the chain.  */
                  gsi_next (&si);
                  vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
-                 continue;
                }
              else
                {
@@ -5546,11 +6142,13 @@ vect_transform_loop (loop_vec_info loop_vinfo)
                  unlink_stmt_vdef (store);
                  gsi_remove (&si, true);
                  release_defs (store);
-                 continue;
                }
-           }
 
-         if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
+             /* Stores can only appear at the end of pattern statements.  */
+             gcc_assert (!transform_pattern_stmt);
+             pattern_def_seq = NULL;
+           }
+         else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
            {
              pattern_def_seq = NULL;
              gsi_next (&si);
@@ -5560,13 +6158,30 @@ vect_transform_loop (loop_vec_info loop_vinfo)
 
   slpeel_make_loop_iterate_ntimes (loop, ratio);
 
-  /* The memory tags and pointers in vectorized statements need to
-     have their SSA forms updated.  FIXME, why can't this be delayed
-     until all the loops have been transformed?  */
-  update_ssa (TODO_update_ssa);
+  /* Scale the profile and iteration counts down by the vectorization
+     factor.  */
+  scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
+                     expected_iterations / vectorization_factor);
+  loop->nb_iterations_upper_bound
+    = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
+  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+      && loop->nb_iterations_upper_bound != 0)
+    loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
+  if (loop->any_estimate)
+    {
+      loop->nb_iterations_estimate
+        = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
+      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+          && loop->nb_iterations_estimate != 0)
+        loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
+    }
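+
+  /* Illustrative: an upper bound of 17 iterations with vf = 4 becomes
+     floor (17 / 4) = 4 vector iterations; with PEELING_FOR_GAPS the last
+     vector iteration is moved to the scalar epilogue, hence the extra
+     decrement by 1.  */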
 
-  if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
-    fprintf (vect_dump, "LOOP VECTORIZED.");
-  if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
-    fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
+  if (dump_enabled_p ())
+    {
+      dump_printf_loc (MSG_NOTE, vect_location,
+                      "LOOP VECTORIZED\n");
+      if (loop->inner)
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "OUTER LOOP VECTORIZED\n");
+      dump_printf (MSG_NOTE, "\n");
+    }
 }