re PR tree-optimization/59058 (wrong code at -O3 on x86_64-linux-gnu (affecting gcc...

[gcc.git] / gcc / tree-vect-loop.c
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c

index 1252c5a04fc06f332276bc474dd8f72dc6192356..ca8d3a6cde089574cf431201dac9bea6fa60e226 100644 (file)
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -24,11 +24,26 @@ along with GCC; see the file COPYING3.  If not see
  #include "coretypes.h"
  #include "dumpfile.h"
  #include "tm.h"
-#include "ggc.h"
  #include "tree.h"
+#include "stor-layout.h"
  #include "basic-block.h"
  #include "gimple-pretty-print.h"
-#include "tree-flow.h"
+#include "tree-ssa-alias.h"
+#include "internal-fn.h"
+#include "gimple-expr.h"
+#include "is-a.h"
+#include "gimple.h"
+#include "gimplify.h"
+#include "gimple-iterator.h"
+#include "gimplify-me.h"
+#include "gimple-ssa.h"
+#include "tree-phinodes.h"
+#include "ssa-iterators.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
+#include "tree-ssa-loop-ivopts.h"
+#include "tree-ssa-loop-manip.h"
+#include "tree-ssa-loop-niter.h"
  #include "tree-pass.h"
  #include "cfgloop.h"
  #include "expr.h"
@@ -188,7 +203,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
  
    if (dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location,
-                     "=== vect_determine_vectorization_factor ===");
+                     "=== vect_determine_vectorization_factor ===\n");
  
    for (i = 0; i < nbbs; i++)
      {
@@ -202,6 +217,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
             {
               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
+             dump_printf (MSG_NOTE, "\n");
             }
  
           gcc_assert (stmt_info);
@@ -216,6 +232,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                   dump_printf_loc (MSG_NOTE, vect_location,
                                     "get vectype for scalar type:  ");
                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
+                  dump_printf (MSG_NOTE, "\n");
                 }
  
               vectype = get_vectype_for_scalar_type (scalar_type);
@@ -228,6 +245,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                                         "data-type ");
                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                           scalar_type);
+                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                     }
                   return false;
                 }
@@ -237,11 +255,13 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                 {
                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
+                  dump_printf (MSG_NOTE, "\n");
                 }
  
               nunits = TYPE_VECTOR_SUBPARTS (vectype);
               if (dump_enabled_p ())
-               dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
+               dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
+                                 nunits);
  
               if (!vectorization_factor
                   || (nunits > vectorization_factor))
@@ -265,13 +285,15 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
               dump_printf_loc (MSG_NOTE, vect_location,
                                 "==> examining statement: ");
               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+              dump_printf (MSG_NOTE, "\n");
             }
  
           gcc_assert (stmt_info);
  
           /* Skip stmts which do not need to be vectorized.  */
-         if (!STMT_VINFO_RELEVANT_P (stmt_info)
-             && !STMT_VINFO_LIVE_P (stmt_info))
+         if ((!STMT_VINFO_RELEVANT_P (stmt_info)
+              && !STMT_VINFO_LIVE_P (stmt_info))
+             || gimple_clobber_p (stmt))
              {
                if (STMT_VINFO_IN_PATTERN_P (stmt_info)
                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
@@ -285,12 +307,13 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                        dump_printf_loc (MSG_NOTE, vect_location,
                                         "==> examining pattern statement: ");
                        dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+                      dump_printf (MSG_NOTE, "\n");
                      }
                  }
                else
                 {
                   if (dump_enabled_p ())
-                   dump_printf_loc (MSG_NOTE, vect_location, "skip.");
+                   dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
                    gsi_next (&si);
                   continue;
                  }
@@ -335,6 +358,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                                             "==> examining pattern def stmt: ");
                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
                                              pattern_def_stmt, 0);
+                          dump_printf (MSG_NOTE, "\n");
                         }
  
                       stmt = pattern_def_stmt;
@@ -352,12 +376,26 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
  
           if (gimple_get_lhs (stmt) == NULL_TREE)
             {
+             if (is_gimple_call (stmt))
+               {
+                 /* Ignore calls with no lhs.  These must be calls to
+                    #pragma omp simd functions, and what vectorization factor
+                    it really needs can't be determined until
+                    vectorizable_simd_clone_call.  */
+                 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
+                   {
+                     pattern_def_seq = NULL;
+                     gsi_next (&si);
+                   }
+                 continue;
+               }
               if (dump_enabled_p ())
                 {
                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "not vectorized: irregular stmt.");
                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
                                      0);
+                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                 }
               return false;
             }
@@ -369,6 +407,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "not vectorized: vector stmt in loop:");
                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
+                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                 }
               return false;
             }
@@ -393,6 +432,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                   dump_printf_loc (MSG_NOTE, vect_location,
                                     "get vectype for scalar type:  ");
                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
+                  dump_printf (MSG_NOTE, "\n");
                 }
               vectype = get_vectype_for_scalar_type (scalar_type);
               if (!vectype)
@@ -404,11 +444,19 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                                         "data-type ");
                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                           scalar_type);
+                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                     }
                   return false;
                 }
  
               STMT_VINFO_VECTYPE (stmt_info) = vectype;
+
+             if (dump_enabled_p ())
+               {
+                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
+                 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
+                  dump_printf (MSG_NOTE, "\n");
+               }
              }
  
           /* The vectorization factor is according to the smallest
@@ -421,6 +469,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
               dump_printf_loc (MSG_NOTE, vect_location,
                                 "get vectype for scalar type:  ");
               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
+              dump_printf (MSG_NOTE, "\n");
             }
           vf_vectype = get_vectype_for_scalar_type (scalar_type);
           if (!vf_vectype)
@@ -431,6 +480,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                                     "not vectorized: unsupported data-type ");
                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                       scalar_type);
+                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                 }
               return false;
             }
@@ -448,6 +498,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                       vf_vectype);
+                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                 }
               return false;
             }
@@ -456,11 +507,12 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
             {
               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
+              dump_printf (MSG_NOTE, "\n");
             }
  
           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
           if (dump_enabled_p ())
-           dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
+           dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
           if (!vectorization_factor
               || (nunits > vectorization_factor))
             vectorization_factor = nunits;
@@ -475,13 +527,13 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
  
    /* TODO: Analyze cost. Decide if worth while to vectorize.  */
    if (dump_enabled_p ())
-    dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d",
+    dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
                       vectorization_factor);
    if (vectorization_factor <= 1)
      {
        if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                         "not vectorized: unsupported data-type");
+                         "not vectorized: unsupported data-type\n");
        return false;
      }
    LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
@@ -493,7 +545,7 @@ vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
  /* Function vect_is_simple_iv_evolution.
  
     FORNOW: A simple evolution of an induction variables in the loop is
-   considered a polynomial evolution with constant step.  */
+   considered a polynomial evolution.  */
  
  static bool
  vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
@@ -502,6 +554,7 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
    tree init_expr;
    tree step_expr;
    tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
+  basic_block bb;
  
    /* When there is no evolution in this loop, the evolution function
       is not "simple".  */
@@ -522,16 +575,25 @@ vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
        dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
        dump_printf (MSG_NOTE, ",  init: ");
        dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
+      dump_printf (MSG_NOTE, "\n");
      }
  
    *init = init_expr;
    *step = step_expr;
  
-  if (TREE_CODE (step_expr) != INTEGER_CST)
+  if (TREE_CODE (step_expr) != INTEGER_CST
+      && (TREE_CODE (step_expr) != SSA_NAME
+         || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
+             && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
+         || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
+             && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
+                 || !flag_associative_math)))
+      && (TREE_CODE (step_expr) != REAL_CST
+         || !flag_associative_math))
      {
        if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                         "step unknown.");
+                         "step unknown.\n");
        return false;
      }
  
@@ -549,15 +611,14 @@ static void
  vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
  {
    basic_block bb = loop->header;
-  tree dumy;
-  vec<gimple> worklist;
-  worklist.create (64);
+  tree init, step;
+  stack_vec<gimple, 64> worklist;
    gimple_stmt_iterator gsi;
    bool double_reduc;
  
    if (dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location,
-                     "=== vect_analyze_scalar_cycles ===");
+                     "=== vect_analyze_scalar_cycles ===\n");
  
    /* First - identify all inductions.  Reduction detection assumes that all the
       inductions have been identified, therefore, this order must not be
@@ -573,6 +634,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
         {
           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
+          dump_printf (MSG_NOTE, "\n");
         }
  
        /* Skip virtual phi's.  The data dependences that are associated with
@@ -592,13 +654,16 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
               dump_printf_loc (MSG_NOTE, vect_location,
                                 "Access function of PHI: ");
               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
+              dump_printf (MSG_NOTE, "\n");
             }
           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
             = evolution_part_in_loop_num (access_fn, loop->num);
         }
  
        if (!access_fn
-         || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
+         || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
+         || (LOOP_VINFO_LOOP (loop_vinfo) != loop
+             && TREE_CODE (step) != INTEGER_CST))
         {
           worklist.safe_push (phi);
           continue;
@@ -607,7 +672,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
        gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
  
        if (dump_enabled_p ())
-       dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.");
+       dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
        STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
      }
  
@@ -625,6 +690,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
          {
            dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
            dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
+          dump_printf (MSG_NOTE, "\n");
          }
  
        gcc_assert (!virtual_operand_p (def)
@@ -639,7 +705,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
              {
                if (dump_enabled_p ())
                  dump_printf_loc (MSG_NOTE, vect_location,
-                                "Detected double reduction.");
+                                "Detected double reduction.\n");
  
                STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
                STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
@@ -651,7 +717,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
                  {
                    if (dump_enabled_p ())
                      dump_printf_loc (MSG_NOTE, vect_location,
-                                    "Detected vectorizable nested cycle.");
+                                    "Detected vectorizable nested cycle.\n");
  
                    STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                    STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
@@ -661,7 +727,7 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
                  {
                    if (dump_enabled_p ())
                      dump_printf_loc (MSG_NOTE, vect_location,
-                                    "Detected reduction.");
+                                    "Detected reduction.\n");
  
                    STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                    STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
@@ -675,10 +741,8 @@ vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
        else
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                          "Unknown def-use cycle pattern.");
+                          "Unknown def-use cycle pattern.\n");
      }
-
-  worklist.release ();
  }
  
  
@@ -723,34 +787,36 @@ vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
      vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
  }
  
+
  /* Function vect_get_loop_niters.
  
-   Determine how many iterations the loop is executed.
-   If an expression that represents the number of iterations
-   can be constructed, place it in NUMBER_OF_ITERATIONS.
+   Determine how many iterations the loop is executed and place it
+   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
+   in NUMBER_OF_ITERATIONSM1.
+
     Return the loop exit condition.  */
  
  static gimple
-vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
+vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
+                     tree *number_of_iterationsm1)
  {
    tree niters;
  
    if (dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location,
-                    "=== get_loop_niters ===");
-  niters = number_of_exit_cond_executions (loop);
+                    "=== get_loop_niters ===\n");
  
-  if (niters != NULL_TREE
-      && niters != chrec_dont_know)
-    {
-      *number_of_iterations = niters;
+  niters = number_of_latch_executions (loop);
+  *number_of_iterationsm1 = niters;
  
-      if (dump_enabled_p ())
-        {
-          dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
-          dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
-        }
-    }
+  /* We want the number of loop header executions which is the number
+     of latch executions plus one.
+     ???  For UINT_MAX latch executions this number overflows to zero
+     for loops like do { n++; } while (n != 0);  */
+  if (niters && !chrec_contains_undetermined (niters))
+    niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
+                         build_int_cst (TREE_TYPE (niters), 1));
+  *number_of_iterations = niters;
  
    return get_loop_exit_condition (loop);
  }
@@ -854,11 +920,12 @@ new_loop_vec_info (struct loop *loop)
     gcc_assert (nbbs == loop->num_nodes);
  
    LOOP_VINFO_BBS (res) = bbs;
+  LOOP_VINFO_NITERSM1 (res) = NULL;
    LOOP_VINFO_NITERS (res) = NULL;
    LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
    LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
    LOOP_VINFO_VECTORIZABLE_P (res) = 0;
-  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
+  LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
    LOOP_VINFO_VECT_FACTOR (res) = 0;
    LOOP_VINFO_LOOP_NEST (res).create (3);
    LOOP_VINFO_DATAREFS (res).create (10);
@@ -873,9 +940,9 @@ new_loop_vec_info (struct loop *loop)
    LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
    LOOP_VINFO_SLP_INSTANCES (res).create (10);
    LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
-  LOOP_VINFO_PEELING_HTAB (res) = NULL;
    LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
    LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
+  LOOP_VINFO_PEELING_FOR_NITER (res) = false;
    LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
  
    return res;
@@ -928,9 +995,9 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
                    || code == POINTER_PLUS_EXPR
                    || code == MULT_EXPR)
                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
-               swap_tree_operands (stmt,
-                                   gimple_assign_rhs1_ptr (stmt),
-                                   gimple_assign_rhs2_ptr (stmt));
+               swap_ssa_operands (stmt,
+                                  gimple_assign_rhs1_ptr (stmt),
+                                  gimple_assign_rhs2_ptr (stmt));
             }
  
           /* Free stmt_vec_info.  */
@@ -940,7 +1007,7 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
      }
  
    free (LOOP_VINFO_BBS (loop_vinfo));
-  free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
+  vect_destroy_datarefs (loop_vinfo, NULL);
    free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
    LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
    LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
@@ -954,8 +1021,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
    LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
    LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
  
-  if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
-    htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
+  if (LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
+    LOOP_VINFO_PEELING_HTAB (loop_vinfo).dispose ();
  
    destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
  
@@ -979,7 +1046,7 @@ vect_analyze_loop_1 (struct loop *loop)
  
    if (dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location,
-                    "===== analyze_loop_nest_1 =====");
+                    "===== analyze_loop_nest_1 =====\n");
  
    /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
  
@@ -988,7 +1055,7 @@ vect_analyze_loop_1 (struct loop *loop)
      {
        if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "bad inner-loop form.");
+                        "bad inner-loop form.\n");
        return NULL;
      }
  
@@ -1009,12 +1076,12 @@ vect_analyze_loop_form (struct loop *loop)
  {
    loop_vec_info loop_vinfo;
    gimple loop_cond;
-  tree number_of_iterations = NULL;
+  tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
    loop_vec_info inner_loop_vinfo = NULL;
  
    if (dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location,
-                    "=== vect_analyze_loop_form ===");
+                    "=== vect_analyze_loop_form ===\n");
  
    /* Different restrictions apply when we are considering an inner-most loop,
       vs. an outer (nested) loop.
@@ -1038,17 +1105,17 @@ vect_analyze_loop_form (struct loop *loop)
          {
            if (dump_enabled_p ())
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: control flow in loop.");
+                            "not vectorized: control flow in loop.\n");
            return NULL;
          }
  
        if (empty_block_p (loop->header))
-    {
-          if (dump_enabled_p ())
-            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: empty loop.");
-      return NULL;
-    }
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: empty loop.\n");
+         return NULL;
+       }
      }
    else
      {
@@ -1076,7 +1143,7 @@ vect_analyze_loop_form (struct loop *loop)
         {
           if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: multiple nested loops.");
+                            "not vectorized: multiple nested loops.\n");
           return NULL;
         }
  
@@ -1086,7 +1153,7 @@ vect_analyze_loop_form (struct loop *loop)
         {
           if (dump_enabled_p ())
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: Bad inner loop.");
+                            "not vectorized: Bad inner loop.\n");
           return NULL;
         }
  
@@ -1094,8 +1161,9 @@ vect_analyze_loop_form (struct loop *loop)
                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
         {
           if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                            "not vectorized: inner-loop count not invariant.");
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: inner-loop count not"
+                             " invariant.\n");
           destroy_loop_vec_info (inner_loop_vinfo, true);
           return NULL;
         }
@@ -1104,7 +1172,7 @@ vect_analyze_loop_form (struct loop *loop)
          {
           if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: control flow in loop.");
+                            "not vectorized: control flow in loop.\n");
           destroy_loop_vec_info (inner_loop_vinfo, true);
           return NULL;
          }
@@ -1119,15 +1187,15 @@ vect_analyze_loop_form (struct loop *loop)
           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
         {
           if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                            "not vectorized: unsupported outerloop form.");
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: unsupported outerloop form.\n");
           destroy_loop_vec_info (inner_loop_vinfo, true);
           return NULL;
         }
  
        if (dump_enabled_p ())
          dump_printf_loc (MSG_NOTE, vect_location,
-                        "Considering outer-loop vectorization.");
+                        "Considering outer-loop vectorization.\n");
      }
  
    if (!single_exit (loop)
@@ -1137,10 +1205,10 @@ vect_analyze_loop_form (struct loop *loop)
          {
            if (!single_exit (loop))
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: multiple exits.");
+                            "not vectorized: multiple exits.\n");
            else if (EDGE_COUNT (loop->header->preds) != 2)
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                            "not vectorized: too many incoming edges.");
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: too many incoming edges.\n");
          }
        if (inner_loop_vinfo)
         destroy_loop_vec_info (inner_loop_vinfo, true);
@@ -1156,7 +1224,7 @@ vect_analyze_loop_form (struct loop *loop)
      {
        if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: latch block not empty.");
+                        "not vectorized: latch block not empty.\n");
        if (inner_loop_vinfo)
         destroy_loop_vec_info (inner_loop_vinfo, true);
        return NULL;
@@ -1170,73 +1238,68 @@ vect_analyze_loop_form (struct loop *loop)
         {
           split_loop_exit_edge (e);
           if (dump_enabled_p ())
-           dump_printf (MSG_NOTE, "split exit edge.");
+           dump_printf (MSG_NOTE, "split exit edge.\n");
         }
        else
         {
           if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                            "not vectorized: abnormal loop exit edge.");
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: abnormal loop exit edge.\n");
           if (inner_loop_vinfo)
             destroy_loop_vec_info (inner_loop_vinfo, true);
           return NULL;
         }
      }
  
-  loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
+  loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
+                                   &number_of_iterationsm1);
    if (!loop_cond)
      {
        if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                        "not vectorized: complicated exit condition.");
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: complicated exit condition.\n");
        if (inner_loop_vinfo)
         destroy_loop_vec_info (inner_loop_vinfo, true);
        return NULL;
      }
  
-  if (!number_of_iterations)
+  if (!number_of_iterations
+      || chrec_contains_undetermined (number_of_iterations))
      {
        if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                          "not vectorized: number of iterations cannot be "
-                        "computed.");
+                        "computed.\n");
        if (inner_loop_vinfo)
         destroy_loop_vec_info (inner_loop_vinfo, true);
        return NULL;
      }
  
-  if (chrec_contains_undetermined (number_of_iterations))
+  if (integer_zerop (number_of_iterations))
      {
        if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "Infinite number of iterations.");
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "not vectorized: number of iterations = 0.\n");
        if (inner_loop_vinfo)
-       destroy_loop_vec_info (inner_loop_vinfo, true);
+        destroy_loop_vec_info (inner_loop_vinfo, true);
        return NULL;
      }
  
-  if (!NITERS_KNOWN_P (number_of_iterations))
+  loop_vinfo = new_loop_vec_info (loop);
+  LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
+  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
+  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
+
+  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
      {
        if (dump_enabled_p ())
          {
            dump_printf_loc (MSG_NOTE, vect_location,
                            "Symbolic number of iterations is ");
           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
+          dump_printf (MSG_NOTE, "\n");
          }
      }
-  else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: number of iterations = 0.");
-      if (inner_loop_vinfo)
-        destroy_loop_vec_info (inner_loop_vinfo, true);
-      return NULL;
-    }
-
-  loop_vinfo = new_loop_vec_info (loop);
-  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
-  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
  
    STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
  
@@ -1276,7 +1339,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
  
    if (dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location,
-                    "=== vect_analyze_loop_operations ===");
+                    "=== vect_analyze_loop_operations ===\n");
  
    gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
    vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
@@ -1312,7 +1375,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
        LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
        if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location,
-                        "Updating vectorization factor to %d ",
+                        "Updating vectorization factor to %d\n",
                          vectorization_factor);
      }
  
@@ -1330,6 +1393,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
              {
                dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
                dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
+              dump_printf (MSG_NOTE, "\n");
              }
  
            /* Inner-loop loop-closed exit phi in outer-loop vectorization
@@ -1346,9 +1410,9 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
                       != vect_double_reduction_def)
                  {
                    if (dump_enabled_p ())
-                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                      "Unsupported loop-closed phi in "
-                                    "outer-loop.");
+                                    "outer-loop.\n");
                    return false;
                  }
  
@@ -1367,7 +1431,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
                      return false;
  
                    op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
-                 if (!op_def_stmt
+                 if (gimple_nop_p (op_def_stmt)
                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
                       || !vinfo_for_stmt (op_def_stmt))
                      return false;
@@ -1389,7 +1453,7 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
                /* FORNOW: not yet supported.  */
                if (dump_enabled_p ())
                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                "not vectorized: value used after loop.");
+                                "not vectorized: value used after loop.\n");
                return false;
              }
  
@@ -1398,8 +1462,8 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
              {
                /* A scalar-dependence cycle that we don't support.  */
                if (dump_enabled_p ())
-               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                                "not vectorized: scalar dependence cycle.");
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "not vectorized: scalar dependence cycle.\n");
                return false;
              }
  
@@ -1414,10 +1478,11 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
              {
                if (dump_enabled_p ())
                  {
-                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                    "not vectorized: relevant phi not "
                                    "supported: ");
                    dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
+                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                  }
               return false;
              }
@@ -1426,7 +1491,8 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
        for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
          {
            gimple stmt = gsi_stmt (si);
-         if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
+         if (!gimple_clobber_p (stmt)
+             && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
             return false;
          }
      } /* bbs */
@@ -1440,18 +1506,18 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
      {
        if (dump_enabled_p ())
          dump_printf_loc (MSG_NOTE, vect_location,
-                        "All the computation can be taken out of the loop.");
+                        "All the computation can be taken out of the loop.\n");
        if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                          "not vectorized: redundant loop. no profit to "
-                        "vectorize.");
+                        "vectorize.\n");
        return false;
      }
  
    if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location,
                      "vectorization_factor = %d, niters = "
-                    HOST_WIDE_INT_PRINT_DEC, vectorization_factor,
+                    HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
                      LOOP_VINFO_INT_NITERS (loop_vinfo));
  
    if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
@@ -1461,11 +1527,11 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
      {
        if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: iteration count too small.");
+                        "not vectorized: iteration count too small.\n");
        if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                          "not vectorized: iteration count smaller than "
-                        "vectorization factor.");
+                        "vectorization factor.\n");
        return false;
      }
  
@@ -1483,11 +1549,11 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
      {
        if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: vectorization not profitable.");
+                        "not vectorized: vectorization not profitable.\n");
        if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                          "not vectorized: vector version will never be "
-                        "profitable.");
+                        "profitable.\n");
        return false;
      }
  
@@ -1509,12 +1575,12 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
      {
        if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: vectorization not profitable.");
+                        "not vectorized: vectorization not profitable.\n");
        if (dump_enabled_p ())
          dump_printf_loc (MSG_NOTE, vect_location,
                          "not vectorized: iteration count smaller than user "
                          "specified loop bound parameter or minimum profitable "
-                        "iterations (whichever is more conservative).");
+                        "iterations (whichever is more conservative).\n");
        return false;
      }
  
@@ -1525,38 +1591,16 @@ vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
        if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                          "not vectorized: estimated iteration count too "
-                         "small.");
+                         "small.\n");
        if (dump_enabled_p ())
          dump_printf_loc (MSG_NOTE, vect_location,
                          "not vectorized: estimated iteration count smaller "
                           "than specified loop bound parameter or minimum "
                           "profitable iterations (whichever is more "
-                         "conservative).");
+                         "conservative).\n");
        return false;
      }
  
-  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
-      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
-    {
-      if (dump_enabled_p ())
-        dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.");
-      if (!vect_can_advance_ivs_p (loop_vinfo))
-        {
-          if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                            "not vectorized: can't create epilog loop 1.");
-          return false;
-        }
-      if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
-        {
-          if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                            "not vectorized: can't create epilog loop 2.");
-          return false;
-        }
-    }
-
    return true;
  }
  
@@ -1585,7 +1629,19 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
      {
        if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "bad data references.");
+                        "bad data references.\n");
+      return false;
+    }
+
+  /* Analyze the access patterns of the data-refs in the loop (consecutive,
+     complex, etc.). FORNOW: Only handle consecutive access pattern.  */
+
+  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
+  if (!ok)
+    {
+      if (dump_enabled_p ())
+       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                        "bad data access.\n");
        return false;
      }
  
@@ -1603,7 +1659,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
      {
        if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "unexpected pattern.");
+                        "unexpected pattern.\n");
        return false;
      }
  
@@ -1612,13 +1668,13 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
       the dependences.
       FORNOW: fail at the first data dependence that we encounter.  */
  
-  ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
+  ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
    if (!ok
        || max_vf < min_vf)
      {
        if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "bad data dependence.");
+                            "bad data dependence.\n");
        return false;
      }
  
@@ -1627,14 +1683,14 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
      {
        if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "can't determine vectorization factor.");
+                        "can't determine vectorization factor.\n");
        return false;
      }
    if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
      {
        if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "bad data dependence.");
+                        "bad data dependence.\n");
        return false;
      }
  
@@ -1646,19 +1702,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
      {
        if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "bad data alignment.");
-      return false;
-    }
-
-  /* Analyze the access patterns of the data-refs in the loop (consecutive,
-     complex, etc.). FORNOW: Only handle consecutive access pattern.  */
-
-  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
-  if (!ok)
-    {
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "bad data access.");
+                        "bad data alignment.\n");
        return false;
      }
  
@@ -1671,7 +1715,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
        if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                          "too long list of versioning for alias "
-                        "run-time tests.");
+                        "run-time tests.\n");
        return false;
      }
  
@@ -1683,7 +1727,7 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
      {
        if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "bad data alignment.");
+                        "bad data alignment.\n");
        return false;
      }
  
@@ -1708,10 +1752,44 @@ vect_analyze_loop_2 (loop_vec_info loop_vinfo)
      {
        if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "bad operation or unsupported loop bound.");
+                        "bad operation or unsupported loop bound.\n");
        return false;
      }
  
+  /* Decide whether we need to create an epilogue loop to handle
+     remaining scalar iterations.  */
+  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
+      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
+    {
+      if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
+                  - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
+         < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
+       LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
+    }
+  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+          || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
+              < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
+    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
+
+  /* If an epilogue loop is required make sure we can create one.  */
+  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
+      || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
+    {
+      if (dump_enabled_p ())
+        dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
+      if (!vect_can_advance_ivs_p (loop_vinfo)
+         || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
+                                          single_exit (LOOP_VINFO_LOOP
+                                                        (loop_vinfo))))
+        {
+          if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: can't create required "
+                            "epilog loop\n");
+          return false;
+        }
+    }
+
    return true;
  }
  
@@ -1732,7 +1810,7 @@ vect_analyze_loop (struct loop *loop)
  
    if (dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location,
-                    "===== analyze_loop_nest =====");
+                    "===== analyze_loop_nest =====\n");
  
    if (loop_outer (loop)
        && loop_vec_info_for_loop (loop_outer (loop))
@@ -1740,7 +1818,7 @@ vect_analyze_loop (struct loop *loop)
      {
        if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location,
-                        "outer-loop already vectorized.");
+                        "outer-loop already vectorized.\n");
        return NULL;
      }
  
@@ -1752,7 +1830,7 @@ vect_analyze_loop (struct loop *loop)
         {
           if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "bad loop form.");
+                            "bad loop form.\n");
           return NULL;
         }
  
@@ -1833,6 +1911,7 @@ report_vect_op (int msg_type, gimple stmt, const char *msg)
  {
    dump_printf_loc (msg_type, vect_location, "%s", msg);
    dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
+  dump_printf (msg_type, "\n");
  }
  
  
@@ -2008,11 +2087,12 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
                 {
                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
+                  dump_printf (MSG_NOTE, "\n");
                 }
  
-             swap_tree_operands (next_stmt,
-                                 gimple_assign_rhs1_ptr (next_stmt),
-                                  gimple_assign_rhs2_ptr (next_stmt));
+             swap_ssa_operands (next_stmt,
+                                gimple_assign_rhs1_ptr (next_stmt),
+                                 gimple_assign_rhs2_ptr (next_stmt));
               update_stmt (next_stmt);
  
               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
@@ -2045,6 +2125,13 @@ vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
       a3 = ...
       a2 = operation (a3, a1)
  
+   or
+
+   a3 = ...
+   loop_header:
+     a1 = phi < a0, a2 >
+     a2 = operation (a3, a1)
+
     such that:
     1. operation is commutative and associative and it is safe to
        change the order of the computation (if CHECK_REDUCTION is true)
@@ -2108,7 +2195,7 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
          {
            if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "intermediate value used outside loop.");
+                            "intermediate value used outside loop.\n");
  
            return NULL;
          }
@@ -2120,7 +2207,7 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
          {
            if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "reduction used in loop.");
+                            "reduction used in loop.\n");
            return NULL;
          }
      }
@@ -2132,6 +2219,7 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                            "reduction: not ssa_name: ");
           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
+          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
         }
        return NULL;
      }
@@ -2141,14 +2229,17 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
      {
        if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "reduction: no def_stmt.");
+                        "reduction: no def_stmt.\n");
        return NULL;
      }
  
    if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
      {
        if (dump_enabled_p ())
-        dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
+        {
+          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
+          dump_printf (MSG_NOTE, "\n");
+        }
        return NULL;
      }
  
@@ -2177,7 +2268,7 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
         {
           if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "reduction used in loop.");
+                            "reduction used in loop.\n");
           return NULL;
         }
      }
@@ -2193,7 +2284,7 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
          {
            if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "unsupported phi node definition.");
+                            "unsupported phi node definition.\n");
  
            return NULL;
          }
@@ -2316,6 +2407,7 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
                dump_generic_expr (MSG_NOTE, TDF_SLIM,
                                  TREE_TYPE (op4));
              }
+          dump_printf (MSG_NOTE, "\n");
          }
  
        return NULL;
@@ -2400,6 +2492,7 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
    if (def2 && def2 == phi
        && (code == COND_EXPR
           || !def1 || gimple_nop_p (def1)
+         || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
            || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
                && (is_gimple_assign (def1)
                   || is_gimple_call (def1)
@@ -2418,6 +2511,7 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
    if (def1 && def1 == phi
        && (code == COND_EXPR
           || !def2 || gimple_nop_p (def2)
+         || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
            || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
               && (is_gimple_assign (def2)
                   || is_gimple_call (def2)
@@ -2437,8 +2531,8 @@ vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
             report_vect_op (MSG_NOTE, def_stmt,
                             "detected reduction: need to swap operands: ");
  
-          swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
-                             gimple_assign_rhs2_ptr (def_stmt));
+          swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
+                            gimple_assign_rhs2_ptr (def_stmt));
  
           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
@@ -2573,7 +2667,7 @@ vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
        if (dump_enabled_p ())
          dump_printf_loc (MSG_NOTE, vect_location,
                          "cost model: epilogue peel iters set to vf/2 "
-                        "because loop iterations are unknown .");
+                        "because loop iterations are unknown .\n");
  
        /* If peeled iterations are known but number of scalar loop
           iterations are unknown, count a taken branch per peeled loop.  */
@@ -2625,13 +2719,13 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
    int scalar_single_iter_cost = 0;
    int scalar_outside_cost = 0;
    int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
-  int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
+  int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
    void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
  
    /* Cost model disabled.  */
-  if (!flag_vect_cost_model)
+  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
      {
-      dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
+      dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
        *ret_min_profitable_niters = 0;
        *ret_min_profitable_estimate = 0;
        return;
@@ -2689,14 +2783,14 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
      {
        peel_iters_prologue = vf/2;
        dump_printf (MSG_NOTE, "cost model: "
-                   "prologue peel iters set to vf/2.");
+                   "prologue peel iters set to vf/2.\n");
  
        /* If peeling for alignment is unknown, loop bound of main loop becomes
           unknown.  */
        peel_iters_epilogue = vf/2;
        dump_printf (MSG_NOTE, "cost model: "
                     "epilogue peel iters set to vf/2 because "
-                   "peeling for alignment is unknown.");
+                   "peeling for alignment is unknown.\n");
  
        /* If peeled iterations are unknown, count a taken branch and a not taken
           branch per peeled loop. Even if scalar loop iterations are known,
@@ -2816,7 +2910,7 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
        else
         {
           /* Cost model check occurs at prologue generation.  */
-         if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
+         if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
               + vect_get_stmt_cost (cond_branch_not_taken); 
           /* Cost model check occurs at epilogue generation.  */
@@ -2862,11 +2956,16 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
    /* vector version will never be profitable.  */
    else
      {
+      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vect)
+       warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
+                   "did not happen for a simd loop");
+
        if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                          "cost model: the vector iteration cost = %d "
                          "divided by the scalar iteration cost = %d "
-                        "is greater or equal to the vectorization factor = %d.",
+                        "is greater or equal to the vectorization factor = %d"
+                         ".\n",
                          vec_inside_cost, scalar_single_iter_cost, vf);
        *ret_min_profitable_niters = -1;
        *ret_min_profitable_estimate = -1;
@@ -2892,9 +2991,10 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
                     peel_iters_prologue);
        dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
                     peel_iters_epilogue);
-      dump_printf (MSG_NOTE, 
+      dump_printf (MSG_NOTE,
                     "  Calculated minimum iters for profitability: %d\n",
                     min_profitable_iters);
+      dump_printf (MSG_NOTE, "\n");
      }
  
    min_profitable_iters =
@@ -2907,7 +3007,8 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
  
    if (dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location,
-                     "  Runtime profitability threshold = %d\n", min_profitable_iters);
+                     "  Runtime profitability threshold = %d\n",
+                     min_profitable_iters);
  
    *ret_min_profitable_niters = min_profitable_iters;
  
@@ -2996,6 +3097,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                            "unsupported data-type ");
            dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                              TREE_TYPE (reduction_op));
+          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
          }
        return false;
     }
@@ -3028,10 +3130,10 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
         }
        else
         {
-         int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
+         int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
           tree bitsize =
             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
-         int element_bitsize = tree_low_cst (bitsize, 1);
+         int element_bitsize = tree_to_uhwi (bitsize);
           int nelements = vec_size_in_bits / element_bitsize;
  
           optab = optab_for_tree_code (code, vectype, optab_default);
@@ -3064,7 +3166,7 @@ vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
    if (dump_enabled_p ())
      dump_printf (MSG_NOTE, 
                   "vect_model_reduction_cost: inside_cost = %d, "
-                 "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
+                 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
                   prologue_cost, epilogue_cost);
  
    return true;
@@ -3093,7 +3195,7 @@ vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
    if (dump_enabled_p ())
      dump_printf_loc (MSG_NOTE, vect_location,
                       "vect_model_induction_cost: inside_cost = %d, "
-                     "prologue_cost = %d .", inside_cost, prologue_cost);
+                     "prologue_cost = %d .\n", inside_cost, prologue_cost);
  }
  
  
@@ -3115,14 +3217,12 @@ get_initial_def_for_induction (gimple iv_phi)
    stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
    loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
    struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
-  tree scalar_type;
    tree vectype;
    int nunits;
    edge pe = loop_preheader_edge (loop);
    struct loop *iv_loop;
    basic_block new_bb;
    tree new_vec, vec_init, vec_step, t;
-  tree access_fn;
    tree new_var;
    tree new_name;
    gimple init_stmt, induction_phi, new_stmt;
@@ -3130,7 +3230,6 @@ get_initial_def_for_induction (gimple iv_phi)
    tree init_expr, step_expr;
    int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    int i;
-  bool ok;
    int ncopies;
    tree expr;
    stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
@@ -3159,16 +3258,14 @@ get_initial_def_for_induction (gimple iv_phi)
    latch_e = loop_latch_edge (iv_loop);
    loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
  
-  access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
-  gcc_assert (access_fn);
-  STRIP_NOPS (access_fn);
-  ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
-                                    &init_expr, &step_expr);
-  gcc_assert (ok);
+  step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
+  gcc_assert (step_expr != NULL_TREE);
+
    pe = loop_preheader_edge (iv_loop);
+  init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
+                                    loop_preheader_edge (iv_loop));
  
-  scalar_type = TREE_TYPE (init_expr);
-  vectype = get_vectype_for_scalar_type (scalar_type);
+  vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
    resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
    gcc_assert (vectype);
    nunits = TYPE_VECTOR_SUBPARTS (vectype);
@@ -3177,6 +3274,16 @@ get_initial_def_for_induction (gimple iv_phi)
    gcc_assert (phi_info);
    gcc_assert (ncopies >= 1);
  
+  /* Convert the step to the desired type.  */
+  step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
+                                                 step_expr),
+                                   &stmts, true, NULL_TREE);
+  if (stmts)
+    {
+      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
+      gcc_assert (!new_bb);
+    }
+
    /* Find the first insertion point in the BB.  */
    si = gsi_after_labels (bb);
  
@@ -3186,9 +3293,7 @@ get_initial_def_for_induction (gimple iv_phi)
        /* iv_loop is nested in the loop to be vectorized.  init_expr had already
          been created during vectorization of previous stmts.  We obtain it
          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
-      tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
-                                           loop_preheader_edge (iv_loop));
-      vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
+      vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
        /* If the initial value is not of proper type, convert it.  */
        if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
         {
@@ -3211,8 +3316,11 @@ get_initial_def_for_induction (gimple iv_phi)
  
        /* iv_loop is the loop to be vectorized. Create:
          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
-      new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
-      new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
+      new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
+                                      vect_scalar_var, "var_");
+      new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
+                                                    init_expr),
+                                      &stmts, false, new_var);
        if (stmts)
         {
           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
@@ -3220,30 +3328,36 @@ get_initial_def_for_induction (gimple iv_phi)
         }
  
        vec_alloc (v, nunits);
+      bool constant_p = is_gimple_min_invariant (new_name);
        CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
        for (i = 1; i < nunits; i++)
         {
           /* Create: new_name_i = new_name + step_expr  */
-         enum tree_code code = POINTER_TYPE_P (scalar_type)
-                               ? POINTER_PLUS_EXPR : PLUS_EXPR;
-         init_stmt = gimple_build_assign_with_ops (code, new_var,
-                                                   new_name, step_expr);
-         new_name = make_ssa_name (new_var, init_stmt);
-         gimple_assign_set_lhs (init_stmt, new_name);
-
-         new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
-         gcc_assert (!new_bb);
-
-         if (dump_enabled_p ())
+         new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
+                                 new_name, step_expr);
+         if (!is_gimple_min_invariant (new_name))
             {
-             dump_printf_loc (MSG_NOTE, vect_location,
-                              "created new init_stmt: ");
-             dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
+             init_stmt = gimple_build_assign (new_var, new_name);
+             new_name = make_ssa_name (new_var, init_stmt);
+             gimple_assign_set_lhs (init_stmt, new_name);
+             new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
+             gcc_assert (!new_bb);
+             if (dump_enabled_p ())
+               {
+                 dump_printf_loc (MSG_NOTE, vect_location,
+                                  "created new init_stmt: ");
+                 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
+                  dump_printf (MSG_NOTE, "\n");
+               }
+             constant_p = false;
             }
           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
         }
        /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
-      new_vec = build_constructor (vectype, v);
+      if (constant_p)
+       new_vec = build_vector_from_ctor (vectype, v);
+      else
+       new_vec = build_constructor (vectype, v);
        vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
      }
  
@@ -3257,13 +3371,23 @@ get_initial_def_for_induction (gimple iv_phi)
      {
        /* iv_loop is the loop to be vectorized. Generate:
           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
-      expr = build_int_cst (TREE_TYPE (step_expr), vf);
+      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
+       {
+         expr = build_int_cst (integer_type_node, vf);
+         expr = fold_convert (TREE_TYPE (step_expr), expr);
+       }
+      else
+       expr = build_int_cst (TREE_TYPE (step_expr), vf);
        new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
                               expr, step_expr);
+      if (TREE_CODE (step_expr) == SSA_NAME)
+       new_name = vect_init_vector (iv_phi, new_name,
+                                    TREE_TYPE (step_expr), NULL);
      }
  
    t = unshare_expr (new_name);
-  gcc_assert (CONSTANT_CLASS_P (new_name));
+  gcc_assert (CONSTANT_CLASS_P (new_name)
+             || TREE_CODE (new_name) == SSA_NAME);
    stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
    gcc_assert (stepvectype);
    new_vec = build_vector_from_val (stepvectype, t);
@@ -3316,11 +3440,21 @@ get_initial_def_for_induction (gimple iv_phi)
        gcc_assert (!nested_in_vect_loop);
  
        /* Create the vector that holds the step of the induction.  */
-      expr = build_int_cst (TREE_TYPE (step_expr), nunits);
+      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
+       {
+         expr = build_int_cst (integer_type_node, nunits);
+         expr = fold_convert (TREE_TYPE (step_expr), expr);
+       }
+      else
+       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
        new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
                               expr, step_expr);
+      if (TREE_CODE (step_expr) == SSA_NAME)
+       new_name = vect_init_vector (iv_phi, new_name,
+                                    TREE_TYPE (step_expr), NULL);
        t = unshare_expr (new_name);
-      gcc_assert (CONSTANT_CLASS_P (new_name));
+      gcc_assert (CONSTANT_CLASS_P (new_name)
+                 || TREE_CODE (new_name) == SSA_NAME);
        new_vec = build_vector_from_val (stepvectype, t);
        vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
  
@@ -3382,6 +3516,7 @@ get_initial_def_for_induction (gimple iv_phi)
               dump_printf_loc (MSG_NOTE, vect_location,
                                "vector of inductions after inner-loop:");
               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
+              dump_printf (MSG_NOTE, "\n");
             }
         }
      }
@@ -3395,6 +3530,7 @@ get_initial_def_for_induction (gimple iv_phi)
        dump_printf (MSG_NOTE, "\n");
        dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
                         SSA_NAME_DEF_STMT (vec_def), 0);
+      dump_printf (MSG_NOTE, "\n");
      }
  
    STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
@@ -3698,14 +3834,14 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
    bool extract_scalar_result = false;
    gimple use_stmt, orig_stmt, reduction_phi = NULL;
    bool nested_in_vect_loop = false;
-  vec<gimple> new_phis = vNULL;
-  vec<gimple> inner_phis = vNULL;
+  auto_vec<gimple> new_phis;
+  auto_vec<gimple> inner_phis;
    enum vect_def_type dt = vect_unknown_def_type;
    int j, i;
-  vec<tree> scalar_results = vNULL;
+  auto_vec<tree> scalar_results;
    unsigned int group_size = 1, k, ratio;
-  vec<tree> vec_initial_defs = vNULL;
-  vec<gimple> phis;
+  auto_vec<tree> vec_initial_defs;
+  auto_vec<gimple> phis;
    bool slp_reduc = false;
    tree new_phi_result;
    gimple inner_phi = NULL;
@@ -3802,14 +3938,13 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
                dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
                dump_printf (MSG_NOTE, "\n");
                dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
+              dump_printf (MSG_NOTE, "\n");
              }
  
            phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
          }
      }
  
-  vec_initial_defs.release ();
-
    /* 2. Create epilog code.
          The reduction epilog code operates across the elements of the vector
          of partial results computed by the vectorized loop.
@@ -3998,7 +4133,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
  
        if (dump_enabled_p ())
          dump_printf_loc (MSG_NOTE, vect_location,
-                        "Reduce using direct vector reduction.");
+                        "Reduce using direct vector reduction.\n");
  
        vec_dest = vect_create_destination_var (scalar_dest, vectype);
        tmp = build1 (reduc_code, vectype, new_phi_result);
@@ -4014,8 +4149,8 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
        enum tree_code shift_code = ERROR_MARK;
        bool have_whole_vector_shift = true;
        int bit_offset;
-      int element_bitsize = tree_low_cst (bitsize, 1);
-      int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
+      int element_bitsize = tree_to_uhwi (bitsize);
+      int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
        tree vec_temp;
  
        if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
@@ -4049,7 +4184,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
  
            if (dump_enabled_p ())
              dump_printf_loc (MSG_NOTE, vect_location,
-                            "Reduce using vector shifts");
+                            "Reduce using vector shifts\n");
  
            vec_dest = vect_create_destination_var (scalar_dest, vectype);
            new_temp = new_phi_result;
@@ -4090,9 +4225,9 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
  
            if (dump_enabled_p ())
              dump_printf_loc (MSG_NOTE, vect_location,
-                            "Reduce using scalar code. ");
+                            "Reduce using scalar code.\n");
  
-          vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
+          vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
            FOR_EACH_VEC_ELT (new_phis, i, new_phi)
              {
                if (gimple_code (new_phi) == GIMPLE_PHI)
@@ -4181,7 +4316,7 @@ vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
  
        if (dump_enabled_p ())
          dump_printf_loc (MSG_NOTE, vect_location,
-                        "extract scalar result");
+                        "extract scalar result\n");
  
        if (BYTES_BIG_ENDIAN)
          bitpos = size_binop (MULT_EXPR,
@@ -4229,7 +4364,6 @@ vect_finalize_reduction:
        epilog_stmt = gimple_build_assign (new_dest, expr);
        new_temp = make_ssa_name (new_dest, epilog_stmt);
        gimple_assign_set_lhs (epilog_stmt, new_temp);
-      SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
        gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
        if (nested_in_vect_loop)
          {
@@ -4326,12 +4460,12 @@ vect_finalize_reduction:
           result.  (The reduction result is expected to have two immediate uses -
           one at the latch block, and one at the loop exit).  */
        FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
-        if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
+        if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
+           && !is_gimple_debug (USE_STMT (use_p)))
            phis.safe_push (USE_STMT (use_p));
  
-      /* We expect to have found an exit_phi because of loop-closed-ssa
-         form.  */
-      gcc_assert (!phis.is_empty ());
+      /* While we expect to have found an exit_phi because of loop-closed-ssa
+         form we can end up without one if the scalar cycle is dead.  */
  
        FOR_EACH_VEC_ELT (phis, i, exit_phi)
          {
@@ -4421,6 +4555,7 @@ vect_finalize_reduction:
                        dump_printf_loc (MSG_NOTE, vect_location,
                                        "created double reduction phi node: ");
                        dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
+                      dump_printf (MSG_NOTE, "\n");
                      }
  
                    vect_phi_res = PHI_RESULT (vect_phi);
@@ -4456,7 +4591,10 @@ vect_finalize_reduction:
        FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
          {
            if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
-            phis.safe_push (USE_STMT (use_p));
+           {
+             if (!is_gimple_debug (USE_STMT (use_p)))
+               phis.safe_push (USE_STMT (use_p));
+           }
            else
              {
                if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
@@ -4466,7 +4604,8 @@ vect_finalize_reduction:
                    FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
                      {
                        if (!flow_bb_inside_loop_p (loop,
-                                             gimple_bb (USE_STMT (phi_use_p))))
+                                             gimple_bb (USE_STMT (phi_use_p)))
+                         && !is_gimple_debug (USE_STMT (phi_use_p)))
                          phis.safe_push (USE_STMT (phi_use_p));
                      }
                  }
@@ -4485,10 +4624,6 @@ vect_finalize_reduction:
  
        phis.release ();
      }
-
-  scalar_results.release ();
-  inner_phis.release ();
-  new_phis.release ();
  }
  
  
@@ -4576,10 +4711,10 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
    struct loop * def_stmt_loop, *outer_loop = NULL;
    tree def_arg;
    gimple def_arg_stmt;
-  vec<tree> vec_oprnds0 = vNULL;
-  vec<tree> vec_oprnds1 = vNULL;
-  vec<tree> vect_defs = vNULL;
-  vec<gimple> phis = vNULL;
+  auto_vec<tree> vec_oprnds0;
+  auto_vec<tree> vec_oprnds1;
+  auto_vec<tree> vect_defs;
+  auto_vec<gimple> phis;
    int vec_num;
    tree def0, def1, tem, op0, op1 = NULL_TREE;
  
@@ -4776,7 +4911,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
          {
            if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "unsupported condition in reduction");
+                            "unsupported condition in reduction\n");
  
              return false;
          }
@@ -4792,7 +4927,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
              not vectorizable_reduction.  */
            if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "unsupported shift or rotation.");
+                            "unsupported shift or rotation.\n");
           return false;
         }
  
@@ -4802,7 +4937,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
          {
            if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "no optab.");
+                            "no optab.\n");
  
            return false;
          }
@@ -4810,7 +4945,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
        if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
          {
            if (dump_enabled_p ())
-            dump_printf (MSG_NOTE, "op not supported by target.");
+            dump_printf (MSG_NOTE, "op not supported by target.\n");
  
            if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
                || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
@@ -4818,7 +4953,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
              return false;
  
            if (dump_enabled_p ())
-           dump_printf (MSG_NOTE, "proceeding using word mode.");
+           dump_printf (MSG_NOTE, "proceeding using word mode.\n");
          }
  
        /* Worthwhile without SIMD support?  */
@@ -4828,7 +4963,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
          {
            if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not worthwhile without SIMD support.");
+                            "not worthwhile without SIMD support.\n");
  
            return false;
          }
@@ -4909,7 +5044,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
          {
            if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "no optab for reduction.");
+                            "no optab for reduction.\n");
  
            epilog_reduc_code = ERROR_MARK;
          }
@@ -4919,7 +5054,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
          {
            if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "reduc op not supported by target.");
+                            "reduc op not supported by target.\n");
  
            epilog_reduc_code = ERROR_MARK;
          }
@@ -4930,7 +5065,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
          {
            if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "no reduc code for scalar code.");
+                            "no reduc code for scalar code.\n");
  
            return false;
          }
@@ -4940,7 +5075,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
      {
        if (dump_enabled_p ())
         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "multiple types in double reduction");
+                        "multiple types in double reduction\n");
  
        return false;
      }
@@ -4959,7 +5094,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
          {
            if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "invalid types in dot-prod");
+                            "invalid types in dot-prod\n");
  
            return false;
          }
@@ -4976,7 +5111,7 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
    /** Transform.  **/
  
    if (dump_enabled_p ())
-    dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.");
+    dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
  
    /* FORNOW: Multiple types are not supported for condition.  */
    if (code == COND_EXPR)
@@ -5195,11 +5330,6 @@ vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
                                      epilog_reduc_code, phis, reduc_index,
                                      double_reduc, slp_node);
  
-  phis.release ();
-  vect_defs.release ();
-  vec_oprnds0.release ();
-  vec_oprnds1.release ();
-
    return true;
  }
  
@@ -5263,7 +5393,7 @@ vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
         {
           if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "multiple types in nested loop.");
+                            "multiple types in nested loop.\n");
           return false;
         }
  
@@ -5286,9 +5416,9 @@ vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
             {
               if (dump_enabled_p ())
-               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                  "inner-loop induction only used outside "
-                                "of the outer vectorized loop.");
+                                "of the outer vectorized loop.\n");
               return false;
             }
         }
@@ -5311,7 +5441,7 @@ vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
        STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
        if (dump_enabled_p ())
          dump_printf_loc (MSG_NOTE, vect_location,
-                         "=== vectorizable_induction ===");
+                         "=== vectorizable_induction ===\n");
        vect_model_induction_cost (stmt_info, ncopies);
        return true;
      }
@@ -5319,7 +5449,7 @@ vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
    /** Transform.  **/
  
    if (dump_enabled_p ())
-    dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.");
+    dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
  
    vec_def = get_initial_def_for_induction (phi);
    *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
@@ -5334,7 +5464,7 @@ vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
  bool
  vectorizable_live_operation (gimple stmt,
                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
-                            gimple *vec_stmt ATTRIBUTE_UNUSED)
+                            gimple *vec_stmt)
  {
    stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
    loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
@@ -5354,7 +5484,41 @@ vectorizable_live_operation (gimple stmt,
      return false;
  
    if (!is_gimple_assign (stmt))
-    return false;
+    {
+      if (gimple_call_internal_p (stmt)
+         && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
+         && gimple_call_lhs (stmt)
+         && loop->simduid
+         && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
+         && loop->simduid == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
+       {
+         edge e = single_exit (loop);
+         basic_block merge_bb = e->dest;
+         imm_use_iterator imm_iter;
+         use_operand_p use_p;
+         tree lhs = gimple_call_lhs (stmt);
+
+         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
+           {
+             gimple use_stmt = USE_STMT (use_p);
+             /* Only a PHI in the exit block has an argument for E.  */
+             if (gimple_code (use_stmt) == GIMPLE_PHI
+                 && gimple_bb (use_stmt) == merge_bb)
+               {
+                 if (vec_stmt)
+                   {
+                     tree vfm1
+                       = build_int_cst (unsigned_type_node,
+                                        loop_vinfo->vectorization_factor - 1);
+                     SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
+                   }
+                 return true;
+               }
+           }
+       }
+
+      return false;
+    }
  
    if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
      return false;
@@ -5385,7 +5549,7 @@ vectorizable_live_operation (gimple stmt,
          {
            if (dump_enabled_p ())
             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "use not simple.");
+                            "use not simple.\n");
            return false;
          }
  
@@ -5424,7 +5588,7 @@ vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
                 {
                   if (dump_enabled_p ())
                     dump_printf_loc (MSG_NOTE, vect_location,
-                                     "killing debug use");
+                                     "killing debug use\n");
  
                   gimple_debug_bind_reset_value (ustmt);
                   update_stmt (ustmt);
@@ -5436,6 +5600,120 @@ vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
      }
  }
  
+
+/* Return the number of iterations of the loop described by LOOP_VINFO
+   as a gimple operand.  A constant count is returned directly; any
+   statements needed otherwise go on the loop preheader edge.  */
+
+static tree
+vect_build_loop_niters (loop_vec_info loop_vinfo)
+{
+  tree niters = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
+
+  /* An INTEGER_CST needs no gimplification.  */
+  if (TREE_CODE (niters) == INTEGER_CST)
+    return niters;
+
+  gimple_seq seq = NULL;
+  edge preheader = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
+  tree tmp = create_tmp_var (TREE_TYPE (niters), "niters");
+  tree result = force_gimple_operand (niters, &seq, false, tmp);
+
+  if (seq)
+    gsi_insert_seq_on_edge_immediate (preheader, seq);
+
+  return result;
+}
+
+
+/* Generate the following values from NI_NAME, the number of
+   iterations the loop executes (supplied by the caller):
+
+   ratio = ((ni_name [- 1 if peeling for gaps] - vf) >> log2 (vf)) + 1
+   ratio_mult_vf_name = ratio << log2 (vf)
+
+   placing any needed statements on the loop preheader edge.  RATIO is
+   stored in *RATIO_NAME_PTR; RATIO_MULT_VF_NAME is built and stored
+   in *RATIO_MULT_VF_NAME_PTR only if that pointer is non-NULL.  */
+
+static void
+vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
+                                tree ni_name,
+                                tree *ratio_mult_vf_name_ptr,
+                                tree *ratio_name_ptr)
+{
+  tree ni_minus_gap_name;
+  tree var;
+  tree ratio_name;
+  tree ratio_mult_vf_name;
+  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
+  tree log_vf;
+
+  log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
+
+  /* If epilogue loop is required because of data accesses with gaps, we
+     subtract one iteration from the total number of iterations here for
+     correct calculation of RATIO.  */
+  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+    {
+      ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
+                                      ni_name,
+                                      build_one_cst (TREE_TYPE (ni_name)));
+      if (!is_gimple_val (ni_minus_gap_name))
+       {
+         var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
+         gimple_seq stmts = NULL;
+         ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
+                                                   true, var);
+         gsi_insert_seq_on_edge_immediate (pe, stmts);
+       }
+    }
+  else
+    ni_minus_gap_name = ni_name;
+
+  /* Create: ratio = ((ni - vf) >> log2 (vf)) + 1.  */
+  /* ???  As we have ni == number of latch executions + 1, ni could
+     have overflown to zero.  So avoid computing ratio based on ni
+     but compute it using the fact that we know ratio will be at least
+     one, thus via (ni - vf) >> log2(vf) + 1.  */
+  ratio_name
+    = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
+                  fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
+                               fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
+                                            ni_minus_gap_name,
+                                            build_int_cst
+                                              (TREE_TYPE (ni_name), vf)),
+                               log_vf),
+                  build_int_cst (TREE_TYPE (ni_name), 1));
+  if (!is_gimple_val (ratio_name))
+    {
+      var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
+      gimple_seq stmts = NULL;
+      ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
+      gsi_insert_seq_on_edge_immediate (pe, stmts);
+    }
+  *ratio_name_ptr = ratio_name;
+
+  /* Create: ratio_mult_vf = ratio << log2 (vf).  */
+
+  if (ratio_mult_vf_name_ptr)
+    {
+      ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
+                                       ratio_name, log_vf);
+      if (!is_gimple_val (ratio_mult_vf_name))
+       {
+         var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
+         gimple_seq stmts = NULL;
+         ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
+                                                    true, var);
+         gsi_insert_seq_on_edge_immediate (pe, stmts);
+       }
+      *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
+    }
+}
+
+
  /* Function vect_transform_loop.
  
     The analysis phase has determined that the loop is vectorizable.
@@ -5454,7 +5732,6 @@ vect_transform_loop (loop_vec_info loop_vinfo)
    int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
    bool grouped_store;
    bool slp_scheduled = false;
-  unsigned int nunits;
    gimple stmt, pattern_stmt;
    gimple_seq pattern_def_seq = NULL;
    gimple_stmt_iterator pattern_def_si = gsi_none ();
@@ -5465,7 +5742,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
    gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
  
    if (dump_enabled_p ())
-    dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===");
+    dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
  
    /* If profile is inprecise, we have chance to fix it up.  */
    if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
@@ -5484,24 +5761,35 @@ vect_transform_loop (loop_vec_info loop_vinfo)
      {
        if (dump_enabled_p ())
         dump_printf_loc (MSG_NOTE, vect_location,
-                        "Profitability threshold is %d loop iterations.", th);
+                        "Profitability threshold is %d loop iterations.\n",
+                         th);
        check_profitability = true;
      }
  
-  /* Peel the loop if there are data refs with unknown alignment.
-     Only one data ref with unknown store is allowed.  */
+  /* Version the loop first, if required, so the profitability check
+     comes first.  */
  
-  if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
+  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
+      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
      {
-      vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
+      vect_loop_versioning (loop_vinfo, th, check_profitability);
        check_profitability = false;
      }
  
-  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
-      || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
+  tree ni_name = vect_build_loop_niters (loop_vinfo);
+  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
+
+  /* Peel the loop if there are data refs with unknown alignment.
+     Only one data ref with unknown store is allowed.  */
+
+  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
      {
-      vect_loop_versioning (loop_vinfo, th, check_profitability);
+      vect_do_peeling_for_alignment (loop_vinfo, ni_name,
+                                    th, check_profitability);
        check_profitability = false;
+      /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
+        be re-computed.  */
+      ni_name = NULL_TREE;
      }
  
    /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
@@ -5512,15 +5800,26 @@ vect_transform_loop (loop_vec_info loop_vinfo)
       will remain scalar and will compute the remaining (n%VF) iterations.
       (VF is the vectorization factor).  */
  
-  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-       || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
-          && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
-       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
-    vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
-                                   th, check_profitability);
-  else
+  if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
+      || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+    {
+      tree ratio_mult_vf;
+      if (!ni_name)
+       ni_name = vect_build_loop_niters (loop_vinfo);
+      vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
+                                      &ratio);
+      vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
+                                     th, check_profitability);
+    }
+  else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
      ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
+  else
+    {
+      if (!ni_name)
+       ni_name = vect_build_loop_niters (loop_vinfo);
+      vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
+    }
  
    /* 1) Make sure the loop header has exactly two entries
       2) Make sure we have a preheader basic block.  */
@@ -5548,6 +5847,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
               dump_printf_loc (MSG_NOTE, vect_location,
                                 "------>vectorizing phi: ");
               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
+              dump_printf (MSG_NOTE, "\n");
             }
           stmt_info = vinfo_for_stmt (phi);
           if (!stmt_info)
@@ -5563,12 +5863,12 @@ vect_transform_loop (loop_vec_info loop_vinfo)
           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
                 != (unsigned HOST_WIDE_INT) vectorization_factor)
               && dump_enabled_p ())
-           dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.");
+           dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
  
           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
             {
               if (dump_enabled_p ())
-               dump_printf_loc (MSG_NOTE, vect_location, "transform phi.");
+               dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
             }
         }
@@ -5581,13 +5881,24 @@ vect_transform_loop (loop_vec_info loop_vinfo)
            if (transform_pattern_stmt)
             stmt = pattern_stmt;
            else
-            stmt = gsi_stmt (si);
+           {
+             stmt = gsi_stmt (si);
+             /* During vectorization remove existing clobber stmts.  */
+             if (gimple_clobber_p (stmt))
+               {
+                 unlink_stmt_vdef (stmt);
+                 gsi_remove (&si, true);
+                 release_defs (stmt);
+                 continue;
+               }
+           }
  
           if (dump_enabled_p ())
             {
               dump_printf_loc (MSG_NOTE, vect_location,
                                "------>vectorizing statement: ");
               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+              dump_printf (MSG_NOTE, "\n");
             }
  
           stmt_info = vinfo_for_stmt (stmt);
@@ -5662,6 +5973,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
                                            "stmt: ");
                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
                                             pattern_def_stmt, 0);
+                          dump_printf (MSG_NOTE, "\n");
                         }
  
                       stmt = pattern_def_stmt;
@@ -5677,16 +5989,18 @@ vect_transform_loop (loop_vec_info loop_vinfo)
                 transform_pattern_stmt = false;
              }
  
-         gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
-         nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
-                                               STMT_VINFO_VECTYPE (stmt_info));
-         if (!STMT_SLP_TYPE (stmt_info)
-             && nunits != (unsigned int) vectorization_factor
-              && dump_enabled_p ())
-           /* For SLP VF is set according to unrolling factor, and not to
-              vector size, hence for SLP this print is not valid.  */
-            dump_printf_loc (MSG_NOTE, vect_location,
-                            "multiple-types.");
+         if (STMT_VINFO_VECTYPE (stmt_info))
+           {
+             unsigned int nunits
+               = (unsigned int)
+                 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
+             if (!STMT_SLP_TYPE (stmt_info)
+                 && nunits != (unsigned int) vectorization_factor
+                 && dump_enabled_p ())
+                 /* For SLP VF is set according to unrolling factor, and not
+                    to vector size, hence for SLP this print is not valid.  */
+               dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
+           }
  
           /* SLP. Schedule all the SLP instances when the first SLP stmt is
              reached.  */
@@ -5698,7 +6012,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
  
                   if (dump_enabled_p ())
                     dump_printf_loc (MSG_NOTE, vect_location,
-                                    "=== scheduling SLP instances ===");
+                                    "=== scheduling SLP instances ===\n");
  
                   vect_schedule_slp (loop_vinfo, NULL);
                 }
@@ -5717,7 +6031,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
  
           /* -------- vectorize statement ------------ */
           if (dump_enabled_p ())
-           dump_printf_loc (MSG_NOTE, vect_location, "transform statement.");
+           dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
  
           grouped_store = false;
           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
@@ -5755,7 +6069,7 @@ vect_transform_loop (loop_vec_info loop_vinfo)
    slpeel_make_loop_iterate_ntimes (loop, ratio);
  
    /* Reduce loop iterations by the vectorization factor.  */
-  scale_loop_profile (loop, RDIV (REG_BR_PROB_BASE , vectorization_factor),
+  scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
                       expected_iterations / vectorization_factor);
    loop->nb_iterations_upper_bound
      = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
@@ -5773,14 +6087,13 @@ vect_transform_loop (loop_vec_info loop_vinfo)
          loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
      }
  
-  /* The memory tags and pointers in vectorized statements need to
-     have their SSA forms updated.  FIXME, why can't this be delayed
-     until all the loops have been transformed?  */
-  update_ssa (TODO_update_ssa);
-
    if (dump_enabled_p ())
-    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, "LOOP VECTORIZED.");
-  if (loop->inner && dump_enabled_p ())
-    dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
-                    "OUTER LOOP VECTORIZED.");
+    {
+      dump_printf_loc (MSG_NOTE, vect_location,
+                      "LOOP VECTORIZED\n");
+      if (loop->inner)
+       dump_printf_loc (MSG_NOTE, vect_location,
+                        "OUTER LOOP VECTORIZED\n");
+      dump_printf (MSG_NOTE, "\n");
+    }
  }