diff --git a/gcc/tree-vect-data-refs.c b/gcc/tree-vect-data-refs.c
index e973b34a1a79b5538f72342bad8ff8975d2c4903..d0e20da04a62cafda9d9c2dda2e96479ef6a2ffc 100644
@@ -1,5 +1,5 @@
 /* Data References Analysis and Manipulation Utilities for Vectorization.
-   Copyright (C) 2003-2014 Free Software Foundation, Inc.
+   Copyright (C) 2003-2016 Free Software Foundation, Inc.
    Contributed by Dorit Naishlos <dorit@il.ibm.com>
    and Ira Rosen <irar@il.ibm.com>
 
@@ -22,41 +22,33 @@ along with GCC; see the file COPYING3.  If not see
 #include "config.h"
 #include "system.h"
 #include "coretypes.h"
-#include "dumpfile.h"
-#include "tm.h"
+#include "backend.h"
+#include "target.h"
+#include "rtl.h"
 #include "tree.h"
-#include "stor-layout.h"
+#include "gimple.h"
+#include "predict.h"
 #include "tm_p.h"
-#include "target.h"
-#include "basic-block.h"
-#include "gimple-pretty-print.h"
-#include "tree-ssa-alias.h"
-#include "internal-fn.h"
+#include "ssa.h"
+#include "optabs-tree.h"
+#include "cgraph.h"
+#include "dumpfile.h"
+#include "alias.h"
+#include "fold-const.h"
+#include "stor-layout.h"
 #include "tree-eh.h"
-#include "gimple-expr.h"
-#include "is-a.h"
-#include "gimple.h"
 #include "gimplify.h"
 #include "gimple-iterator.h"
 #include "gimplify-me.h"
-#include "gimple-ssa.h"
-#include "tree-phinodes.h"
-#include "ssa-iterators.h"
-#include "stringpool.h"
-#include "tree-ssanames.h"
 #include "tree-ssa-loop-ivopts.h"
 #include "tree-ssa-loop-manip.h"
 #include "tree-ssa-loop.h"
-#include "dumpfile.h"
 #include "cfgloop.h"
-#include "tree-chrec.h"
 #include "tree-scalar-evolution.h"
 #include "tree-vectorizer.h"
-#include "diagnostic-core.h"
-#include "cgraph.h"
-/* Need to include rtl.h, expr.h, etc. for optabs.  */
 #include "expr.h"
-#include "optabs.h"
+#include "builtins.h"
+#include "params.h"
 
 /* Return true if load- or store-lanes optab OPTAB is implemented for
    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
@@ -65,7 +57,7 @@ static bool
 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
                              tree vectype, unsigned HOST_WIDE_INT count)
 {
-  enum machine_mode mode, array_mode;
+  machine_mode mode, array_mode;
   bool limit_p;
 
   mode = TYPE_MODE (vectype);
@@ -118,7 +110,7 @@ vect_lanes_optab_supported_p (const char *name, convert_optab optab,
    types.  */
 
 tree
-vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
+vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
                                HOST_WIDE_INT *rhs_size_unit)
 {
   tree scalar_type = gimple_expr_type (stmt);
@@ -260,8 +252,8 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
          return false;
        }
 
-      if (STMT_VINFO_GATHER_P (stmtinfo_a)
-         || STMT_VINFO_GATHER_P (stmtinfo_b))
+      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
+         || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
@@ -308,8 +300,8 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
          return false;
        }
 
-      if (STMT_VINFO_GATHER_P (stmtinfo_a)
-         || STMT_VINFO_GATHER_P (stmtinfo_b))
+      if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
+         || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
        {
          if (dump_enabled_p ())
            {
@@ -373,13 +365,16 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
                .. = a[i+1];
             where we will end up loading { a[i], a[i+1] } once, make
             sure that inserting group loads before the first load and
-            stores after the last store will do the right thing.  */
-         if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
-              && GROUP_SAME_DR_STMT (stmtinfo_a))
-             || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
-                 && GROUP_SAME_DR_STMT (stmtinfo_b)))
+            stores after the last store will do the right thing.
+            Similar for groups like
+               a[i] = ...;
+               ... = a[i];
+               a[i+1] = ...;
+            where loads from the group interleave with the store.  */
+         if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
+             || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
            {
-             gimple earlier_stmt;
+             gimple *earlier_stmt;
              earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
              if (DR_IS_WRITE
                    (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
@@ -403,6 +398,13 @@ vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "dependence distance negative.\n");
+         /* Record a negative dependence distance to later limit the
+            amount of stmt copying / unrolling we can perform.
+            Only need to handle read-after-write dependence.  */
+         if (DR_IS_READ (drb)
+             && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
+                 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
+           STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
          continue;
        }
 
@@ -461,6 +463,9 @@ vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
     dump_printf_loc (MSG_NOTE, vect_location,
                      "=== vect_analyze_data_ref_dependences ===\n");
 
+  LOOP_VINFO_DDRS (loop_vinfo)
+    .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
+            * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
                                &LOOP_VINFO_DDRS (loop_vinfo),
@@ -532,24 +537,78 @@ vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
       dump_printf (MSG_NOTE,  "\n");
     }
 
-  /* We do not vectorize basic blocks with write-write dependencies.  */
-  if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
-    return true;
+  return true;
+}
+
+
+/* Analyze dependences involved in the transform of SLP NODE.  STORES
+   contain the vector of scalar stores of this instance if we are
+   disambiguating the loads.  */
 
-  /* If we have a read-write dependence check that the load is before the store.
-     When we vectorize basic blocks, vector load can be only before
-     corresponding scalar load, and vector store can be only after its
-     corresponding scalar store.  So the order of the acceses is preserved in
-     case the load is before the store.  */
-  gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
-  if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
+static bool
+vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
+                                  vec<gimple *> stores, gimple *last_store)
+{
+  /* This walks over all stmts involved in the SLP load/store done
+     in NODE verifying we can sink them up to the last stmt in the
+     group.  */
+  gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
+  for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
     {
-      /* That only holds for load-store pairs taking part in vectorization.  */
-      if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
-         && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
-       return false;
-    }
+      gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
+      if (access == last_access)
+       continue;
+      data_reference *dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (access));
+      for (gimple_stmt_iterator gsi = gsi_for_stmt (access);
+          gsi_stmt (gsi) != last_access; gsi_next (&gsi))
+       {
+         gimple *stmt = gsi_stmt (gsi);
+         if (! gimple_vuse (stmt)
+             || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
+           continue;
+
+         /* If we couldn't record a (single) data reference for this
+            stmt we have to give up.  */
+         /* ???  Here and below if dependence analysis fails we can resort
+            to the alias oracle which can handle more kinds of stmts.  */
+         data_reference *dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt));
+         if (!dr_b)
+           return false;
+
+         /* If we run into a store of this same instance (we've just
+            marked those) then delay dependence checking until we run
+            into the last store because this is where it will have
+            been sunk to (and we verify if we can do that as well).  */
+         if (gimple_visited_p (stmt))
+           {
+             if (stmt != last_store)
+               continue;
+             unsigned i;
+             gimple *store;
+             FOR_EACH_VEC_ELT (stores, i, store)
+               {
+                 data_reference *store_dr
+                   = STMT_VINFO_DATA_REF (vinfo_for_stmt (store));
+                 ddr_p ddr = initialize_data_dependence_relation
+                               (dr_a, store_dr, vNULL);
+                 if (vect_slp_analyze_data_ref_dependence (ddr))
+                   {
+                     free_dependence_relation (ddr);
+                     return false;
+                   }
+                 free_dependence_relation (ddr);
+               }
+           }
 
+         ddr_p ddr = initialize_data_dependence_relation (dr_a, dr_b, vNULL);
+         if (vect_slp_analyze_data_ref_dependence (ddr))
+           {
+             free_dependence_relation (ddr);
+             return false;
+           }
+         free_dependence_relation (ddr);
+       }
+    }
   return true;
 }
 
@@ -561,27 +620,53 @@ vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
    the maximum vectorization factor the data dependences allow.  */
 
 bool
-vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
+vect_slp_analyze_instance_dependence (slp_instance instance)
 {
-  struct data_dependence_relation *ddr;
-  unsigned int i;
-
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
-                     "=== vect_slp_analyze_data_ref_dependences ===\n");
+                     "=== vect_slp_analyze_instance_dependence ===\n");
 
-  if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
-                               &BB_VINFO_DDRS (bb_vinfo),
-                               vNULL, true))
-    return false;
+  /* The stores of this instance are at the root of the SLP tree.  */
+  slp_tree store = SLP_INSTANCE_TREE (instance);
+  if (! STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0])))
+    store = NULL;
 
-  FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
-    if (vect_slp_analyze_data_ref_dependence (ddr))
-      return false;
+  /* Verify we can sink stores to the vectorized stmt insert location.  */
+  gimple *last_store = NULL;
+  if (store)
+    {
+      if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
+       return false;
 
-  return true;
-}
+      /* Mark stores in this instance and remember the last one.  */
+      last_store = vect_find_last_scalar_stmt_in_slp (store);
+      for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
+       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], true);
+    }
+
+  bool res = true;
 
+  /* Verify we can sink loads to the vectorized stmt insert location,
+     special-casing stores of this instance.  */
+  slp_tree load;
+  unsigned int i;
+  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
+    if (! vect_slp_analyze_node_dependences (instance, load,
+                                            store
+                                            ? SLP_TREE_SCALAR_STMTS (store)
+                                            : vNULL, last_store))
+      {
+       res = false;
+       break;
+      }
+
+  /* Unset the visited flag.  */
+  if (store)
+    for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
+      gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k], false);
+
+  return res;
+}
 
 /* Function vect_compute_data_ref_alignment
 
@@ -595,19 +680,19 @@ vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
    FOR NOW: No analysis is actually performed. Misalignment is calculated
    only for trivial cases. TODO.  */
 
-static bool
+bool
 vect_compute_data_ref_alignment (struct data_reference *dr)
 {
-  gimple stmt = DR_STMT (dr);
+  gimple *stmt = DR_STMT (dr);
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   struct loop *loop = NULL;
   tree ref = DR_REF (dr);
   tree vectype;
   tree base, base_addr;
-  bool base_aligned;
-  tree misalign;
-  tree aligned_to, alignment;
+  tree misalign = NULL_TREE;
+  tree aligned_to;
+  unsigned HOST_WIDE_INT alignment;
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
@@ -619,12 +704,8 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
   /* Initialize misalignment to unknown.  */
   SET_DR_MISALIGNMENT (dr, -1);
 
-  /* Strided loads perform only component accesses, misalignment information
-     is irrelevant for them.  */
-  if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
-    return true;
-
-  misalign = DR_INIT (dr);
+  if (tree_fits_shwi_p (DR_STEP (dr)))
+    misalign = DR_INIT (dr);
   aligned_to = DR_ALIGNED_TO (dr);
   base_addr = DR_BASE_ADDRESS (dr);
   vectype = STMT_VINFO_VECTYPE (stmt_info);
@@ -638,9 +719,9 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
   if (loop && nested_in_vect_loop_p (loop, stmt))
     {
       tree step = DR_STEP (dr);
-      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
 
-      if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
+      if (tree_fits_shwi_p (step)
+         && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
         {
           if (dump_enabled_p ())
             dump_printf_loc (MSG_NOTE, vect_location,
@@ -658,61 +739,63 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
        }
     }
 
-  /* Similarly, if we're doing basic-block vectorization, we can only use
-     base and misalignment information relative to an innermost loop if the
-     misalignment stays the same throughout the execution of the loop.
-     As above, this is the case if the stride of the dataref evenly divides
-     by the vector size.  */
-  if (!loop)
+  /* Similarly we can only use base and misalignment information relative to
+     an innermost loop if the misalignment stays the same throughout the
+     execution of the loop.  As above, this is the case if the stride of
+     the dataref is evenly divisible by the vector size.  */
+  else
     {
       tree step = DR_STEP (dr);
-      HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
+      unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
 
-      if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
+      if (tree_fits_shwi_p (step)
+         && ((tree_to_shwi (step) * vf)
+             % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "SLP: step doesn't divide the vector-size.\n");
+                            "step doesn't divide the vector-size.\n");
          misalign = NULL_TREE;
        }
     }
 
-  base = build_fold_indirect_ref (base_addr);
-  alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
+  /* To look at alignment of the base we have to preserve an inner MEM_REF
+     as that carries alignment information of the actual access.  */
+  base = ref;
+  while (handled_component_p (base))
+    base = TREE_OPERAND (base, 0);
+  if (TREE_CODE (base) == MEM_REF)
+    base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
+                  build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
+  unsigned int base_alignment = get_object_alignment (base);
+
+  if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
+    DR_VECT_AUX (dr)->base_element_aligned = true;
 
-  if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
+  alignment = TYPE_ALIGN_UNIT (vectype);
+
+  if ((compare_tree_int (aligned_to, alignment) < 0)
       || !misalign)
     {
       if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown alignment for access: ");
-         dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
+         dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
          dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
        }
       return true;
     }
 
-  if ((DECL_P (base)
-       && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
-                               alignment) >= 0)
-      || (TREE_CODE (base_addr) == SSA_NAME
-         && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
-                                                     TREE_TYPE (base_addr)))),
-                                  alignment) >= 0)
-      || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
-    base_aligned = true;
-  else
-    base_aligned = false;
-
-  if (!base_aligned)
+  if (base_alignment < TYPE_ALIGN (vectype))
     {
-      /* Do not change the alignment of global variables here if
-        flag_section_anchors is enabled as we already generated
-        RTL for other functions.  Most global variables should
-        have been aligned during the IPA increase_alignment pass.  */
-      if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
-         || (TREE_STATIC (base) && flag_section_anchors))
+      /* Strip an inner MEM_REF to a bare decl if possible.  */
+      if (TREE_CODE (base) == MEM_REF
+         && integer_zerop (TREE_OPERAND (base, 1))
+         && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
+       base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
+
+      if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
        {
          if (dump_enabled_p ())
            {
@@ -734,14 +817,15 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
           dump_printf (MSG_NOTE, "\n");
         }
 
-      ((dataref_aux *)dr->aux)->base_decl = base;
-      ((dataref_aux *)dr->aux)->base_misaligned = true;
+      DR_VECT_AUX (dr)->base_decl = base;
+      DR_VECT_AUX (dr)->base_misaligned = true;
+      DR_VECT_AUX (dr)->base_element_aligned = true;
     }
 
   /* If this is a backward running DR then first access in the larger
      vectype actually is N-1 elements before the address in the DR.
      Adjust misalign accordingly.  */
-  if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
+  if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
     {
       tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
       /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
@@ -751,19 +835,8 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
       misalign = size_binop (PLUS_EXPR, misalign, offset);
     }
 
-  /* Modulo alignment.  */
-  misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
-
-  if (!tree_fits_uhwi_p (misalign))
-    {
-      /* Negative or overflowed misalignment value.  */
-      if (dump_enabled_p ())
-       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "unexpected misalign value\n");
-      return false;
-    }
-
-  SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));
+  SET_DR_MISALIGNMENT (dr,
+                      wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
 
   if (dump_enabled_p ())
     {
@@ -777,42 +850,6 @@ vect_compute_data_ref_alignment (struct data_reference *dr)
 }
 
 
-/* Function vect_compute_data_refs_alignment
-
-   Compute the misalignment of data references in the loop.
-   Return FALSE if a data reference is found that cannot be vectorized.  */
-
-static bool
-vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
-                                  bb_vec_info bb_vinfo)
-{
-  vec<data_reference_p> datarefs;
-  struct data_reference *dr;
-  unsigned int i;
-
-  if (loop_vinfo)
-    datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
-  else
-    datarefs = BB_VINFO_DATAREFS (bb_vinfo);
-
-  FOR_EACH_VEC_ELT (datarefs, i, dr)
-    if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
-        && !vect_compute_data_ref_alignment (dr))
-      {
-        if (bb_vinfo)
-          {
-            /* Mark unsupported statement as unvectorizable.  */
-            STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
-            continue;
-          }
-        else
-          return false;
-      }
-
-  return true;
-}
-
-
 /* Function vect_update_misalignment_for_peel
 
    DR - the data reference whose misalignment is to be adjusted.
@@ -872,67 +909,76 @@ vect_update_misalignment_for_peel (struct data_reference *dr,
 }
 
 
+/* Function verify_data_ref_alignment
+
+   Return TRUE if DR can be handled with respect to alignment.  */
+
+static bool
+verify_data_ref_alignment (data_reference_p dr)
+{
+  enum dr_alignment_support supportable_dr_alignment
+    = vect_supportable_dr_alignment (dr, false);
+  if (!supportable_dr_alignment)
+    {
+      if (dump_enabled_p ())
+       {
+         if (DR_IS_READ (dr))
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: unsupported unaligned load.");
+         else
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: unsupported unaligned "
+                            "store.");
+
+         dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
+                            DR_REF (dr));
+         dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
+       }
+      return false;
+    }
+
+  if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                    "Vectorizing an unaligned access.\n");
+
+  return true;
+}
+
 /* Function vect_verify_datarefs_alignment
 
    Return TRUE if all data references in the loop can be
    handled with respect to alignment.  */
 
 bool
-vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
+vect_verify_datarefs_alignment (loop_vec_info vinfo)
 {
-  vec<data_reference_p> datarefs;
+  vec<data_reference_p> datarefs = vinfo->datarefs;
   struct data_reference *dr;
-  enum dr_alignment_support supportable_dr_alignment;
   unsigned int i;
 
-  if (loop_vinfo)
-    datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
-  else
-    datarefs = BB_VINFO_DATAREFS (bb_vinfo);
-
   FOR_EACH_VEC_ELT (datarefs, i, dr)
     {
-      gimple stmt = DR_STMT (dr);
+      gimple *stmt = DR_STMT (dr);
       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 
       if (!STMT_VINFO_RELEVANT_P (stmt_info))
        continue;
 
-      /* For interleaving, only the alignment of the first access matters. 
-         Skip statements marked as not vectorizable.  */
-      if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
-           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
-          || !STMT_VINFO_VECTORIZABLE (stmt_info))
-        continue;
+      /* For interleaving, only the alignment of the first access matters.   */
+      if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
+         && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
+       continue;
 
-      /* Strided loads perform only component accesses, alignment is
+      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
-      if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
+      if (STMT_VINFO_STRIDED_P (stmt_info)
+         && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;
 
-      supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
-      if (!supportable_dr_alignment)
-        {
-          if (dump_enabled_p ())
-            {
-              if (DR_IS_READ (dr))
-                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                 "not vectorized: unsupported unaligned load.");
-              else
-                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                 "not vectorized: unsupported unaligned "
-                                 "store.");
-
-              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                                 DR_REF (dr));
-              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
-            }
-          return false;
-        }
-      if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
-        dump_printf_loc (MSG_NOTE, vect_location,
-                         "Vectorizing an unaligned access.\n");
+      if (! verify_data_ref_alignment (dr))
+       return false;
     }
+
   return true;
 }
 
@@ -957,7 +1003,7 @@ not_size_aligned (tree exp)
 static bool
 vector_alignment_reachable_p (struct data_reference *dr)
 {
-  gimple stmt = DR_STMT (dr);
+  gimple *stmt = DR_STMT (dr);
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 
@@ -1028,7 +1074,7 @@ vect_get_data_access_cost (struct data_reference *dr,
                            unsigned int *outside_cost,
                           stmt_vector_for_cost *body_cost_vec)
 {
-  gimple stmt = DR_STMT (dr);
+  gimple *stmt = DR_STMT (dr);
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
@@ -1048,10 +1094,48 @@ vect_get_data_access_cost (struct data_reference *dr,
 }
 
 
+typedef struct _vect_peel_info
+{
+  int npeel;
+  struct data_reference *dr;
+  unsigned int count;
+} *vect_peel_info;
+
+typedef struct _vect_peel_extended_info
+{
+  struct _vect_peel_info peel_info;
+  unsigned int inside_cost;
+  unsigned int outside_cost;
+  stmt_vector_for_cost body_cost_vec;
+} *vect_peel_extended_info;
+
+
+/* Peeling hashtable helpers.  */
+
+struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
+{
+  static inline hashval_t hash (const _vect_peel_info *);
+  static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
+};
+
+inline hashval_t
+peel_info_hasher::hash (const _vect_peel_info *peel_info)
+{
+  return (hashval_t) peel_info->npeel;
+}
+
+inline bool
+peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
+{
+  return (a->npeel == b->npeel);
+}
+
+
 /* Insert DR into peeling hash table with NPEEL as key.  */
 
 static void
-vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
+vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
+                         loop_vec_info loop_vinfo, struct data_reference *dr,
                           int npeel)
 {
   struct _vect_peel_info elem, *slot;
@@ -1059,7 +1143,7 @@ vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
 
   elem.npeel = npeel;
-  slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
+  slot = peeling_htab->find (&elem);
   if (slot)
     slot->count++;
   else
@@ -1068,7 +1152,7 @@ vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
       slot->npeel = npeel;
       slot->dr = dr;
       slot->count = 1;
-      new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
+      new_slot = peeling_htab->find_slot (slot, INSERT);
       *new_slot = slot;
     }
 
@@ -1110,13 +1194,12 @@ vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
   vect_peel_info elem = *slot;
   int save_misalignment, dummy;
   unsigned int inside_cost = 0, outside_cost = 0, i;
-  gimple stmt = DR_STMT (elem->dr);
+  gimple *stmt = DR_STMT (elem->dr);
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
   struct data_reference *dr;
   stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
-  int single_iter_cost;
 
   prologue_cost_vec.create (2);
   body_cost_vec.create (2);
@@ -1132,6 +1215,12 @@ vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
         continue;
 
+      /* Strided accesses perform only component accesses, alignment is
+         irrelevant for them.  */
+      if (STMT_VINFO_STRIDED_P (stmt_info)
+         && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
+       continue;
+
       save_misalignment = DR_MISALIGNMENT (dr);
       vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
       vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
@@ -1139,11 +1228,10 @@ vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
       SET_DR_MISALIGNMENT (dr, save_misalignment);
     }
 
-  single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
-  outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
-                                              &dummy, single_iter_cost,
-                                              &prologue_cost_vec,
-                                              &epilogue_cost_vec);
+  outside_cost += vect_get_known_peeling_cost
+    (loop_vinfo, elem->npeel, &dummy,
+     &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
+     &prologue_cost_vec, &epilogue_cost_vec);
 
   /* Prologue and epilogue costs are added to the target model later.
      These costs depend only on the scalar iteration cost, the
@@ -1174,7 +1262,8 @@ vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
    option that aligns as many accesses as possible.  */
 
 static struct data_reference *
-vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
+vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
+                                      loop_vec_info loop_vinfo,
                                        unsigned int *npeel,
                                       stmt_vector_for_cost *body_cost_vec)
 {
@@ -1187,16 +1276,14 @@ vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
      {
        res.inside_cost = INT_MAX;
        res.outside_cost = INT_MAX;
-       LOOP_VINFO_PEELING_HTAB (loop_vinfo)
-           .traverse <_vect_peel_extended_info *,
-                      vect_peeling_hash_get_lowest_cost> (&res);
+       peeling_htab->traverse <_vect_peel_extended_info *,
+                              vect_peeling_hash_get_lowest_cost> (&res);
      }
    else
      {
        res.peel_info.count = 0;
-       LOOP_VINFO_PEELING_HTAB (loop_vinfo)
-           .traverse <_vect_peel_extended_info *,
-                      vect_peeling_hash_get_most_frequent> (&res);
+       peeling_htab->traverse <_vect_peel_extended_info *,
+                              vect_peeling_hash_get_most_frequent> (&res);
      }
 
    *npeel = res.peel_info.npeel;
@@ -1308,7 +1395,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
   bool do_peeling = false;
   bool do_versioning = false;
   bool stat;
-  gimple stmt;
+  gimple *stmt;
   stmt_vec_info stmt_info;
   unsigned int npeel = 0;
   bool all_misalignments_unknown = true;
@@ -1317,11 +1404,16 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
   tree vectype;
   unsigned int nelements, mis, same_align_drs_max = 0;
   stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
+  hash_table<peel_info_hasher> peeling_htab (1);
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
                      "=== vect_enhance_data_refs_alignment ===\n");
 
+  /* Reset data so we can safely be called multiple times.  */
+  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
+  LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
+
   /* While cost model enhancements are expected in the future, the high level
      view of the code at this time is as follows:
 
@@ -1372,9 +1464,10 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
       if (integer_zerop (DR_STEP (dr)))
        continue;
 
-      /* Strided loads perform only component accesses, alignment is
+      /* Strided accesses perform only component accesses, alignment is
         irrelevant for them.  */
-      if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
+      if (STMT_VINFO_STRIDED_P (stmt_info)
+         && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
        continue;
 
       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
@@ -1388,9 +1481,6 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
                                                    size_zero_node) < 0;
 
               /* Save info about DR in the hash table.  */
-              if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
-                LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);
-
               vectype = STMT_VINFO_VECTYPE (stmt_info);
               nelements = TYPE_VECTOR_SUBPARTS (vectype);
               mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
@@ -1413,7 +1503,13 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
                  We do this automtically for cost model, since we calculate cost
                  for every peeling option.  */
               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
-                possible_npeel_number = vf /nelements;
+               {
+                 if (STMT_SLP_TYPE (stmt_info))
+                   possible_npeel_number
+                     = (vf * GROUP_SIZE (stmt_info)) / nelements;
+                 else
+                   possible_npeel_number = vf / nelements;
+               }
 
               /* Handle the aligned case. We may decide to align some other
                  access, making DR unaligned.  */
@@ -1426,8 +1522,8 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
 
               for (j = 0; j < possible_npeel_number; j++)
                 {
-                  gcc_assert (npeel_tmp <= vf);
-                  vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
+                  vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
+                                           dr, npeel_tmp);
                   npeel_tmp += nelements;
                 }
 
@@ -1498,13 +1594,14 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
 
   /* Check if we can possibly peel the loop.  */
   if (!vect_can_advance_ivs_p (loop_vinfo)
-      || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
+      || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
+      || loop->inner)
     do_peeling = false;
 
-  if (do_peeling && all_misalignments_unknown
+  if (do_peeling
+      && all_misalignments_unknown
       && vect_supportable_dr_alignment (dr0, false))
     {
-
       /* Check if the target requires to prefer stores over loads, i.e., if
          misaligned stores are more expensive than misaligned loads (taking
          drs with same alignment into account).  */
@@ -1569,12 +1666,17 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
         }
 
       /* In case there are only loads with different unknown misalignments, use
-         peeling only if it may help to align other accesses in the loop.  */
+         peeling only if it may help to align other accesses in the loop or
+        if it may help improve load bandwidth when we'd end up using
+        unaligned loads.  */
+      tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
       if (!first_store
          && !STMT_VINFO_SAME_ALIGN_REFS (
                  vinfo_for_stmt (DR_STMT (dr0))).length ()
-          && vect_supportable_dr_alignment (dr0, false)
-              != dr_unaligned_supported)
+         && (vect_supportable_dr_alignment (dr0, false)
+             != dr_unaligned_supported
+             || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
+                 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
         do_peeling = false;
     }
 
@@ -1587,7 +1689,8 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
       gcc_assert (!all_misalignments_unknown);
 
       /* Choose the best peeling from the hash table.  */
-      dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
+      dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
+                                                  loop_vinfo, &npeel,
                                                   &body_cost_vec);
       if (!dr0 || !npeel)
         do_peeling = false;
@@ -1645,9 +1748,10 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
              && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
            continue;
 
-         /* Strided loads perform only component accesses, alignment is
+         /* Strided accesses perform only component accesses, alignment is
             irrelevant for them.  */
-         if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
+         if (STMT_VINFO_STRIDED_P (stmt_info)
+             && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
            continue;
 
          save_misalignment = DR_MISALIGNMENT (dr);
@@ -1664,7 +1768,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
 
       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
         {
-          stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
+          stat = vect_verify_datarefs_alignment (loop_vinfo);
           if (!stat)
             do_peeling = false;
           else
@@ -1674,6 +1778,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
            }
         }
 
+      /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
       if (do_peeling)
         {
           unsigned max_allowed_peel
@@ -1683,7 +1788,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
               unsigned max_peel = npeel;
               if (max_peel == 0)
                 {
-                  gimple dr_stmt = DR_STMT (dr0);
+                 gimple *dr_stmt = DR_STMT (dr0);
                   stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
                   tree vtype = STMT_VINFO_VECTYPE (vinfo);
                   max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
@@ -1698,11 +1803,20 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
             }
         }
 
+      /* Cost model #2 - if peeling may result in a remaining loop not
+        iterating enough to be vectorized then do not peel.  */
+      if (do_peeling
+         && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
+       {
+         unsigned max_peel
+           = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
+         if (LOOP_VINFO_INT_NITERS (loop_vinfo)
+             < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
+           do_peeling = false;
+       }
+
       if (do_peeling)
         {
-         stmt_info_for_cost *si;
-         void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
-
           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
              If the misalignment of DR_i is identical to that of dr0 then set
              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
@@ -1728,22 +1842,12 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
               dump_printf_loc (MSG_NOTE, vect_location,
                                "Peeling for alignment will be applied.\n");
             }
-         /* We've delayed passing the inside-loop peeling costs to the
-            target cost model until we were sure peeling would happen.
-            Do so now.  */
-         if (body_cost_vec.exists ())
-           {
-             FOR_EACH_VEC_ELT (body_cost_vec, i, si)
-               {
-                 struct _stmt_vec_info *stmt_info
-                   = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
-                 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
-                                       si->misalign, vect_body);
-               }
-             body_cost_vec.release ();
-           }
+         /* The inside-loop cost will be accounted for in vectorizable_load
+            and vectorizable_store correctly with adjusted alignments.
+            Drop the body_cost_vec on the floor here.  */
+         body_cost_vec.release ();
 
-         stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
+         stat = vect_verify_datarefs_alignment (loop_vinfo);
          gcc_assert (stat);
           return stat;
         }
@@ -1778,16 +1882,21 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
                  && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
            continue;
 
-         /* Strided loads perform only component accesses, alignment is
-            irrelevant for them.  */
-         if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
-           continue;
+         if (STMT_VINFO_STRIDED_P (stmt_info))
+           {
+             /* Strided loads perform only component accesses, alignment is
+                irrelevant for them.  */
+             if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
+               continue;
+             do_versioning = false;
+             break;
+           }
 
          supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
 
           if (!supportable_dr_alignment)
             {
-              gimple stmt;
+             gimple *stmt;
               int mask;
               tree vectype;
 
@@ -1831,9 +1940,9 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
 
   if (do_versioning)
     {
-      vec<gimple> may_misalign_stmts
+      vec<gimple *> may_misalign_stmts
         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
-      gimple stmt;
+      gimple *stmt;
 
       /* It can now be assumed that the data references in the statements
          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
@@ -1855,7 +1964,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
       /* Peeling and versioning can't be done together at this time.  */
       gcc_assert (! (do_peeling && do_versioning));
 
-      stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
+      stat = vect_verify_datarefs_alignment (loop_vinfo);
       gcc_assert (stat);
       return stat;
     }
@@ -1863,7 +1972,7 @@ vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
   /* This point is reached if neither peeling nor versioning is being done.  */
   gcc_assert (! (do_peeling || do_versioning));
 
-  stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
+  stat = vect_verify_datarefs_alignment (loop_vinfo);
   return stat;
 }
 
@@ -1947,8 +2056,7 @@ vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
    Return FALSE if a data reference is found that cannot be vectorized.  */
 
 bool
-vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
-                                  bb_vec_info bb_vinfo)
+vect_analyze_data_refs_alignment (loop_vec_info vinfo)
 {
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
@@ -1956,55 +2064,150 @@ vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
 
   /* Mark groups of data references with same alignment using
      data dependence information.  */
-  if (loop_vinfo)
+  vec<ddr_p> ddrs = vinfo->ddrs;
+  struct data_dependence_relation *ddr;
+  unsigned int i;
+
+  FOR_EACH_VEC_ELT (ddrs, i, ddr)
+    vect_find_same_alignment_drs (ddr, vinfo);
+
+  vec<data_reference_p> datarefs = vinfo->datarefs;
+  struct data_reference *dr;
+
+  FOR_EACH_VEC_ELT (datarefs, i, dr)
     {
-      vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
-      struct data_dependence_relation *ddr;
-      unsigned int i;
+      stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
+      if (STMT_VINFO_VECTORIZABLE (stmt_info)
+         && !vect_compute_data_ref_alignment (dr))
+       {
+         /* Strided accesses perform only component accesses, misalignment
+            information is irrelevant for them.  */
+         if (STMT_VINFO_STRIDED_P (stmt_info)
+             && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
+           continue;
+
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "not vectorized: can't calculate alignment "
+                            "for data ref.\n");
 
-      FOR_EACH_VEC_ELT (ddrs, i, ddr)
-       vect_find_same_alignment_drs (ddr, loop_vinfo);
+         return false;
+       }
     }
 
-  if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
+  return true;
+}
+
+
+/* Analyze alignment of DRs of stmts in NODE.  */
+
+static bool
+vect_slp_analyze_and_verify_node_alignment (slp_tree node)
+{
+  /* We vectorize from the first scalar stmt in the node unless
+     the node is permuted in which case we start from the first
+     element in the group.  */
+  gimple *first_stmt = SLP_TREE_SCALAR_STMTS (node)[0];
+  data_reference_p first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
+  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
+    first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_stmt));
+
+  data_reference_p dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
+  if (! vect_compute_data_ref_alignment (dr)
+      /* For creating the data-ref pointer we need alignment of the
+        first element anyway.  */
+      || (dr != first_dr
+         && ! vect_compute_data_ref_alignment (first_dr))
+      || ! verify_data_ref_alignment (dr))
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                        "not vectorized: can't calculate alignment "
-                        "for data ref.\n");
+                        "not vectorized: bad data alignment in basic "
+                        "block.\n");
       return false;
     }
 
   return true;
 }
 
+/* Function vect_slp_analyze_and_verify_instance_alignment
+
+   Analyze the alignment of the data-references in the SLP instance.
+   Return FALSE if a data reference is found that cannot be vectorized.  */
+
+bool
+vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
+{
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+                     "=== vect_slp_analyze_and_verify_instance_alignment ===\n");
+
+  slp_tree node;
+  unsigned i;
+  FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
+    if (! vect_slp_analyze_and_verify_node_alignment (node))
+      return false;
+
+  node = SLP_INSTANCE_TREE (instance);
+  if (STMT_VINFO_DATA_REF (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]))
+      && ! vect_slp_analyze_and_verify_node_alignment
+            (SLP_INSTANCE_TREE (instance)))
+    return false;
+
+  return true;
+}
+
 
 /* Analyze groups of accesses: check that DR belongs to a group of
    accesses of legal size, step, etc.  Detect gaps, single element
    interleaving, and other special cases. Set grouped access info.
-   Collect groups of strided stores for further use in SLP analysis.  */
+   Collect groups of strided stores for further use in SLP analysis.
+   Worker for vect_analyze_group_access.  */
 
 static bool
-vect_analyze_group_access (struct data_reference *dr)
+vect_analyze_group_access_1 (struct data_reference *dr)
 {
   tree step = DR_STEP (dr);
   tree scalar_type = TREE_TYPE (DR_REF (dr));
   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
-  gimple stmt = DR_STMT (dr);
+  gimple *stmt = DR_STMT (dr);
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
-  HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
+  HOST_WIDE_INT dr_step = -1;
   HOST_WIDE_INT groupsize, last_accessed_element = 1;
   bool slp_impossible = false;
-  struct loop *loop = NULL;
-
-  if (loop_vinfo)
-    loop = LOOP_VINFO_LOOP (loop_vinfo);
 
   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
      size of the interleaving group (including gaps).  */
-  groupsize = absu_hwi (dr_step) / type_size;
+  if (tree_fits_shwi_p (step))
+    {
+      dr_step = tree_to_shwi (step);
+      /* Check that STEP is a multiple of type size.  Otherwise there is
+         a non-element-sized gap at the end of the group which we
+        cannot represent in GROUP_GAP or GROUP_SIZE.
+        ???  As we can handle non-constant step fine here we should
+        simply remove uses of GROUP_GAP between the last and first
+        element and instead rely on DR_STEP.  GROUP_SIZE then would
+        simply not include that gap.  */
+      if ((dr_step % type_size) != 0)
+       {
+         if (dump_enabled_p ())
+           {
+             dump_printf_loc (MSG_NOTE, vect_location,
+                              "Step ");
+             dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
+             dump_printf (MSG_NOTE,
+                          " is not a multiple of the element size for ");
+             dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
+             dump_printf (MSG_NOTE, "\n");
+           }
+         return false;
+       }
+      groupsize = absu_hwi (dr_step) / type_size;
+    }
+  else
+    groupsize = 0;
 
   /* Not consecutive access is possible only if it is a part of interleaving.  */
   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
@@ -2031,24 +2234,6 @@ vect_analyze_group_access (struct data_reference *dr)
              dump_printf (MSG_NOTE, "\n");
            }
 
-         if (loop_vinfo)
-           {
-             if (dump_enabled_p ())
-               dump_printf_loc (MSG_NOTE, vect_location,
-                                "Data access with gaps requires scalar "
-                                "epilogue loop\n");
-              if (loop->inner)
-                {
-                  if (dump_enabled_p ())
-                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                     "Peeling for outer loop is not"
-                                     " supported\n");
-                  return false;
-                }
-
-              LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
-           }
-
          return true;
        }
 
@@ -2057,7 +2242,6 @@ vect_analyze_group_access (struct data_reference *dr)
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not consecutive access ");
          dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
-         dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
         }
 
       if (bb_vinfo)
@@ -2067,19 +2251,20 @@ vect_analyze_group_access (struct data_reference *dr)
           return true;
         }
 
-      return false;
+      dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
+      STMT_VINFO_STRIDED_P (stmt_info) = true;
+      return true;
     }
 
   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
     {
       /* First stmt in the interleaving chain. Check the chain.  */
-      gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
+      gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
       struct data_reference *data_ref = dr;
       unsigned int count = 1;
       tree prev_init = DR_INIT (data_ref);
-      gimple prev = stmt;
+      gimple *prev = stmt;
       HOST_WIDE_INT diff, gaps = 0;
-      unsigned HOST_WIDE_INT count_in_bytes;
 
       while (next)
         {
@@ -2099,6 +2284,10 @@ vect_analyze_group_access (struct data_reference *dr)
                   return false;
                 }
 
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "Two or more load stmts share the same dr.\n");
+
               /* For load use the same data-ref load.  */
               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
 
@@ -2144,72 +2333,51 @@ vect_analyze_group_access (struct data_reference *dr)
           count++;
         }
 
-      /* COUNT is the number of accesses found, we multiply it by the size of
-         the type to get COUNT_IN_BYTES.  */
-      count_in_bytes = type_size * count;
+      if (groupsize == 0)
+        groupsize = count + gaps;
 
-      /* Check that the size of the interleaving (including gaps) is not
-         greater than STEP.  */
-      if (dr_step != 0
-         && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
-        {
-          if (dump_enabled_p ())
-            {
-              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                               "interleaving size is greater than step for ");
-              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                                 DR_REF (dr));
-              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
-            }
-          return false;
-        }
+      if (groupsize > UINT_MAX)
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "group is too large\n");
+         return false;
+       }
 
-      /* Check that the size of the interleaving is equal to STEP for stores,
+      /* Check that the size of the interleaving is equal to count for stores,
          i.e., that there are no gaps.  */
-      if (dr_step != 0
-         && absu_hwi (dr_step) != count_in_bytes)
+      if (groupsize != count
+         && !DR_IS_READ (dr))
         {
-          if (DR_IS_READ (dr))
-            {
-              slp_impossible = true;
-              /* There is a gap after the last load in the group. This gap is a
-                 difference between the groupsize and the number of elements.
-                When there is no gap, this difference should be 0.  */
-              GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
-            }
-          else
-            {
-              if (dump_enabled_p ())
-                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                 "interleaved store with gaps\n");
-              return false;
-            }
-        }
-
-      /* Check that STEP is a multiple of type size.  */
-      if (dr_step != 0
-         && (dr_step % type_size) != 0)
-        {
-          if (dump_enabled_p ())
-            {
-              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                               "step is not a multiple of type size: step ");
-              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
-              dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
-              dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
-                                 TYPE_SIZE_UNIT (scalar_type));
-              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
-            }
-          return false;
-        }
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "interleaved store with gaps\n");
+         return false;
+       }
 
-      if (groupsize == 0)
-        groupsize = count;
+      /* If there is a gap after the last load in the group, it is the
+        difference between the groupsize and the last accessed
+        element.
+        When there is no gap, this difference should be 0.  */
+      GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
 
       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
       if (dump_enabled_p ())
-        dump_printf_loc (MSG_NOTE, vect_location,
-                         "Detected interleaving of size %d\n", (int)groupsize);
+       {
+         dump_printf_loc (MSG_NOTE, vect_location,
+                          "Detected interleaving ");
+         if (DR_IS_READ (dr))
+           dump_printf (MSG_NOTE, "load ");
+         else
+           dump_printf (MSG_NOTE, "store ");
+         dump_printf (MSG_NOTE, "of size %u starting with ",
+                      (unsigned)groupsize);
+         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
+         if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
+           dump_printf_loc (MSG_NOTE, vect_location,
+                            "There is a gap of %u elements after the group\n",
+                            GROUP_GAP (vinfo_for_stmt (stmt)));
+       }
 
       /* SLP: create an SLP data structure for every interleaving group of
         stores for further analysis in vect_analyse_slp.  */
@@ -2220,30 +2388,37 @@ vect_analyze_group_access (struct data_reference *dr)
           if (bb_vinfo)
             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
         }
+    }
 
-      /* There is a gap in the end of the group.  */
-      if (groupsize - last_accessed_element > 0 && loop_vinfo)
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "Data access with gaps requires scalar "
-                            "epilogue loop\n");
-          if (loop->inner)
-            {
-              if (dump_enabled_p ())
-                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                 "Peeling for outer loop is not supported\n");
-              return false;
-            }
+  return true;
+}
+
+/* Analyze groups of accesses: check that DR belongs to a group of
+   accesses of legal size, step, etc.  Detect gaps, single element
+   interleaving, and other special cases. Set grouped access info.
+   Collect groups of strided stores for further use in SLP analysis.  */
 
-          LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
+static bool
+vect_analyze_group_access (struct data_reference *dr)
+{
+  if (!vect_analyze_group_access_1 (dr))
+    {
+      /* Dissolve the group if present.  */
+      gimple *next;
+      gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
+      while (stmt)
+       {
+         stmt_vec_info vinfo = vinfo_for_stmt (stmt);
+         next = GROUP_NEXT_ELEMENT (vinfo);
+         GROUP_FIRST_ELEMENT (vinfo) = NULL;
+         GROUP_NEXT_ELEMENT (vinfo) = NULL;
+         stmt = next;
        }
+      return false;
     }
-
   return true;
 }
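
As a minimal sketch (not taken from this patch), the two loops below show the access patterns the group analysis above distinguishes: a load group with a gap, which is kept and recorded in GROUP_GAP, and a store group with a gap, which is rejected and then dissolved by the wrapper. The function names and shapes are illustrative assumptions only.

    /* Illustrative loops only, not code from this change; assumes 4-byte int.  */

    void
    load_group_with_gap (int *restrict out, const int *restrict in, int n)
    {
      for (int i = 0; i < n; i++)
        /* in[3*i] and in[3*i+2] form a group of size 3 with only 2 accessed
           elements; the missing in[3*i+1] is a gap, which is allowed for
           loads and recorded in GROUP_GAP.  */
        out[i] = in[3 * i] + in[3 * i + 2];
    }

    void
    store_group_with_gap (int *restrict out, int n)
    {
      for (int i = 0; i < n; i++)
        {
          /* out[3*i] and out[3*i+1] leave a gap at out[3*i+2]; interleaved
             stores with gaps are rejected, so the group is dissolved.  */
          out[3 * i] = i;
          out[3 * i + 1] = i;
        }
    }
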
 
-
 /* Analyze the access pattern of the data-reference DR.
    In case of non-consecutive accesses call vect_analyze_group_access() to
    analyze groups of accesses.  */
@@ -2253,7 +2428,7 @@ vect_analyze_data_ref_access (struct data_reference *dr)
 {
   tree step = DR_STEP (dr);
   tree scalar_type = TREE_TYPE (DR_REF (dr));
-  gimple stmt = DR_STMT (dr);
+  gimple *stmt = DR_STMT (dr);
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   struct loop *loop = NULL;
@@ -2269,18 +2444,22 @@ vect_analyze_data_ref_access (struct data_reference *dr)
       return false;
     }
 
-  /* Allow invariant loads in not nested loops.  */
+  /* Allow loads with zero step in inner-loop vectorization.  */
   if (loop_vinfo && integer_zerop (step))
     {
       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
-      if (nested_in_vect_loop_p (loop, stmt))
+      if (!nested_in_vect_loop_p (loop, stmt))
+       return DR_IS_READ (dr);
+      /* Allow references with zero step for outer loops marked
+        with pragma omp simd only - it guarantees absence of
+        loop-carried dependencies between inner loop iterations.  */
+      if (!loop->force_vectorize)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "zero step in inner loop of nest\n");
          return false;
        }
-      return DR_IS_READ (dr);
     }
 
   if (loop && nested_in_vect_loop_p (loop, stmt))
@@ -2296,10 +2475,7 @@ vect_analyze_data_ref_access (struct data_reference *dr)
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "zero step in outer loop.\n");
-         if (DR_IS_READ (dr))
-           return true;
-         else
-           return false;
+         return DR_IS_READ (dr);
        }
     }
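
A sketch of the zero-step case handled above (illustrative code, not from the patch): an invariant read inside the vectorized loop has DR_STEP zero and is accepted, whereas an invariant store would not be; in a loop nest the zero-step reference is only allowed when the outer loop is marked force_vectorize, e.g. via #pragma omp simd.

    /* Illustrative only.  */
    void
    invariant_load (int *restrict a, const int *restrict b, int n)
    {
      for (int i = 0; i < n; i++)
        a[i] = b[0] + i;   /* the load of b[0] has zero step; reads are OK */
    }
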
 
@@ -2325,9 +2501,12 @@ vect_analyze_data_ref_access (struct data_reference *dr)
       return false;
     }
 
+
   /* Assume this is a DR handled by non-constant strided load case.  */
   if (TREE_CODE (step) != INTEGER_CST)
-    return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
+    return (STMT_VINFO_STRIDED_P (stmt_info)
+           && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
+               || vect_analyze_group_access (dr)));
 
   /* Not consecutive access - check if it's a part of interleaving group.  */
   return vect_analyze_group_access (dr);
@@ -2353,6 +2532,8 @@ compare_tree (tree t1, tree t2)
   if (t2 == NULL)
     return 1;
 
+  STRIP_NOPS (t1);
+  STRIP_NOPS (t2);
 
   if (TREE_CODE (t1) != TREE_CODE (t2))
     return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
@@ -2422,6 +2603,12 @@ dr_group_sort_cmp (const void *dra_, const void *drb_)
   if (dra == drb)
     return 0;
 
+  /* DRs in different loops never belong to the same group.  */
+  loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
+  loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
+  if (loopa != loopb)
+    return loopa->num < loopb->num ? -1 : 1;
+
   /* Ordering of DRs according to base.  */
   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
     {
@@ -2477,21 +2664,16 @@ dr_group_sort_cmp (const void *dra_, const void *drb_)
    FORNOW: handle only arrays and pointer accesses.  */
 
 bool
-vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
+vect_analyze_data_ref_accesses (vec_info *vinfo)
 {
   unsigned int i;
-  vec<data_reference_p> datarefs;
+  vec<data_reference_p> datarefs = vinfo->datarefs;
   struct data_reference *dr;
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
                      "=== vect_analyze_data_ref_accesses ===\n");
 
-  if (loop_vinfo)
-    datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
-  else
-    datarefs = BB_VINFO_DATAREFS (bb_vinfo);
-
   if (datarefs.is_empty ())
     return true;
 
@@ -2499,8 +2681,7 @@ vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
      linear.  Don't modify the original vector's order, it is needed for
      determining what dependencies are reversed.  */
   vec<data_reference_p> datarefs_copy = datarefs.copy ();
-  qsort (datarefs_copy.address (), datarefs_copy.length (),
-        sizeof (data_reference_p), dr_group_sort_cmp);
+  datarefs_copy.qsort (dr_group_sort_cmp);
 
   /* Build the interleaving chains.  */
   for (i = 0; i < datarefs_copy.length () - 1;)
@@ -2519,23 +2700,33 @@ vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
             matters we can push those to a worklist and re-iterate
             over them.  Then we can just skip ahead to the next DR here.  */
 
+         /* DRs in a different loop should not be put into the same
+            interleaving group.  */
+         if (gimple_bb (DR_STMT (dra))->loop_father
+             != gimple_bb (DR_STMT (drb))->loop_father)
+           break;
+
          /* Check that the data-refs have same first location (except init)
-            and they are both either store or load (not load and store).  */
+            and they are both either store or load (not load and store,
+            not masked loads or stores).  */
          if (DR_IS_READ (dra) != DR_IS_READ (drb)
              || !operand_equal_p (DR_BASE_ADDRESS (dra),
                                   DR_BASE_ADDRESS (drb), 0)
-             || !dr_equal_offsets_p (dra, drb))
+             || !dr_equal_offsets_p (dra, drb)
+             || !gimple_assign_single_p (DR_STMT (dra))
+             || !gimple_assign_single_p (DR_STMT (drb)))
            break;
 
-         /* Check that the data-refs have the same constant size and step.  */
+         /* Check that the data-refs have the same constant size.  */
          tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
          tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
          if (!tree_fits_uhwi_p (sza)
              || !tree_fits_uhwi_p (szb)
-             || !tree_int_cst_equal (sza, szb)
-             || !tree_fits_shwi_p (DR_STEP (dra))
-             || !tree_fits_shwi_p (DR_STEP (drb))
-             || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
+             || !tree_int_cst_equal (sza, szb))
+           break;
+
+         /* Check that the data-refs have the same step.  */
+         if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
            break;
 
          /* Do not place the same access in the interleaving chain twice.  */
@@ -2556,19 +2747,37 @@ vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
          /* If init_b == init_a + the size of the type * k, we have an
             interleaving, and DRA is accessed before DRB.  */
          HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
-         if ((init_b - init_a) % type_size_a != 0)
+         if (type_size_a == 0
+             || (init_b - init_a) % type_size_a != 0)
            break;
 
-         /* The step (if not zero) is greater than the difference between
-            data-refs' inits.  This splits groups into suitable sizes.  */
-         HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
-         if (step != 0 && step <= (init_b - init_a))
+         /* If we have a store, the accesses are adjacent.  This splits
+            groups into chunks we support (we don't support vectorization
+            of stores with gaps).  */
+         if (!DR_IS_READ (dra)
+             && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
+                                            (DR_INIT (datarefs_copy[i-1]))
+                 != type_size_a))
            break;
 
+         /* If the step (when constant and nonzero) is greater than the
+            difference between the data-refs' inits, this splits groups
+            into suitable sizes.  */
+         if (tree_fits_shwi_p (DR_STEP (dra)))
+           {
+             HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
+             if (step != 0 && step <= (init_b - init_a))
+               break;
+           }
+
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Detected interleaving ");
+             if (DR_IS_READ (dra))
+               dump_printf (MSG_NOTE, "load ");
+             else
+               dump_printf (MSG_NOTE, "store ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
              dump_printf (MSG_NOTE,  " and ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
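
A worked instance of the arithmetic above (illustrative numbers, assuming a 4-byte element type): for two loads a[3*i] and a[3*i+2] in the same loop, DR_INIT is 0 and 8 and DR_STEP is 12 for both.

    /* Illustrative values, not from the patch:
         type_size_a      = 4
         init_b - init_a  = 8 - 0 = 8,  8 % 4 == 0  -> possible interleaving
         step             = 12,         12 > 8      -> no group split
       so the two loads are chained into one interleaving group; the gap at
       a[3*i + 1] is handled later by the group-access analysis.  */
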
@@ -2595,7 +2804,7 @@ vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized: complicated access pattern.\n");
 
-        if (bb_vinfo)
+        if (is_a <bb_vec_info> (vinfo))
           {
             /* Mark the statement as not vectorizable.  */
             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
@@ -2670,14 +2879,6 @@ comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
   return 0;
 }
 
-template <class T> static void
-swap (T& a, T& b)
-{
-  T c (a);
-  a = b;
-  b = c;
-}
-
 /* Function vect_vfa_segment_size.
 
    Create an expression that computes the size of segment
@@ -2776,9 +2977,9 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
     {
       struct data_reference *dr_a, *dr_b;
-      gimple dr_group_first_a, dr_group_first_b;
+      gimple *dr_group_first_a, *dr_group_first_b;
       tree segment_length_a, segment_length_b;
-      gimple stmt_a, stmt_b;
+      gimple *stmt_a, *stmt_b;
 
       dr_a = DDR_A (ddr);
       stmt_a = DR_STMT (DDR_A (ddr));
@@ -2810,7 +3011,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
           dr_with_seg_len (dr_b, segment_length_b));
 
       if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
-       swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
+       std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
 
       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
     }
@@ -2860,8 +3061,8 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
             and DR_A1 and DR_A2 are two consecutive memrefs.  */
          if (*dr_a1 == *dr_a2)
            {
-             swap (dr_a1, dr_b1);
-             swap (dr_a2, dr_b2);
+             std::swap (dr_a1, dr_b1);
+             std::swap (dr_a2, dr_b2);
            }
 
          if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
@@ -2891,15 +3092,13 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
 
             */
 
-         HOST_WIDE_INT
-         min_seg_len_b = (TREE_CODE (dr_b1->seg_len) == INTEGER_CST) ?
-                            TREE_INT_CST_LOW (dr_b1->seg_len) :
-                            vect_factor;
+         HOST_WIDE_INT  min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
+                                         ? tree_to_shwi (dr_b1->seg_len)
+                                         : vect_factor);
 
          if (diff <= min_seg_len_b
-             || (TREE_CODE (dr_a1->seg_len) == INTEGER_CST
-                 && diff - (HOST_WIDE_INT) TREE_INT_CST_LOW (dr_a1->seg_len) <
-                    min_seg_len_b))
+             || (tree_fits_shwi_p (dr_a1->seg_len)
+                 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
            {
              if (dump_enabled_p ())
                {
@@ -2936,12 +3135,12 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
   return true;
 }
 
-/* Check whether a non-affine read in stmt is suitable for gather load
-   and if so, return a builtin decl for that operation.  */
+/* Check whether a non-affine read or write in stmt is suitable for gather load
+   or scatter store and if so, return a builtin decl for that operation.  */
 
 tree
-vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
-                  tree *offp, int *scalep)
+vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
+                          tree *offp, int *scalep)
 {
   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
@@ -2949,8 +3148,8 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
   tree offtype = NULL_TREE;
   tree decl, base, off;
-  enum machine_mode pmode;
-  int punsignedp, pvolatilep;
+  machine_mode pmode;
+  int punsignedp, reversep, pvolatilep = 0;
 
   base = DR_REF (dr);
   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
@@ -2964,13 +3163,13 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
       && integer_zerop (TREE_OPERAND (base, 1))
       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
     {
-      gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
+      gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
       if (is_gimple_assign (def_stmt)
          && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
        base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
     }
 
-  /* The gather builtins need address of the form
+  /* The gather and scatter builtins need address of the form
      loop_invariant + vector * {1, 2, 4, 8}
      or
      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
@@ -2982,9 +3181,9 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
     vectorized.  The following code attempts to find such a preexisting
      SSA_NAME OFF and put the loop invariants into a tree BASE
      that can be gimplified before the loop.  */
-  base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
-                             &pmode, &punsignedp, &pvolatilep, false);
-  gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
+  base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
+                             &punsignedp, &reversep, &pvolatilep, false);
+  gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
 
   if (TREE_CODE (base) == MEM_REF)
     {
@@ -2992,8 +3191,8 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
        {
          if (off == NULL_TREE)
            {
-             double_int moff = mem_ref_offset (base);
-             off = double_int_to_tree (sizetype, moff);
+             offset_int moff = mem_ref_offset (base);
+             off = wide_int_to_tree (sizetype, moff);
            }
          else
            off = size_binop (PLUS_EXPR, off,
@@ -3038,7 +3237,7 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
 
       if (TREE_CODE (off) == SSA_NAME)
        {
-         gimple def_stmt = SSA_NAME_DEF_STMT (off);
+         gimple *def_stmt = SSA_NAME_DEF_STMT (off);
 
          if (expr_invariant_in_loop_p (loop, off))
            return NULL_TREE;
@@ -3133,8 +3332,13 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
   if (offtype == NULL_TREE)
     offtype = TREE_TYPE (off);
 
-  decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
-                                          offtype, scale);
+  if (DR_IS_READ (dr))
+    decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
+                                            offtype, scale);
+  else
+    decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
+                                             offtype, scale);
+
   if (decl == NULL_TREE)
     return NULL_TREE;
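
For illustration (code not taken from the patch), here are loops whose memory accesses match the loop_invariant + vector * {1, 2, 4, 8} form described above; whether they are actually vectorized this way depends on the target providing the builtin_gather / builtin_scatter hooks.

    /* Illustrative only; assumes 4-byte elements.  */
    void
    gather_read (float *restrict out, const float *restrict base,
                 const int *restrict idx, int n)
    {
      for (int i = 0; i < n; i++)
        out[i] = base[idx[i]];     /* gather:  base + idx[i] * 4 */
    }

    void
    scatter_write (float *restrict base, const float *restrict in,
                   const int *restrict idx, int n)
    {
      for (int i = 0; i < n; i++)
        base[idx[i]] = in[i];      /* scatter: base + idx[i] * 4 */
    }
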
 
@@ -3163,121 +3367,30 @@ vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
 */
 
 bool
-vect_analyze_data_refs (loop_vec_info loop_vinfo,
-                       bb_vec_info bb_vinfo,
-                       int *min_vf)
+vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
 {
   struct loop *loop = NULL;
-  basic_block bb = NULL;
   unsigned int i;
-  vec<data_reference_p> datarefs;
   struct data_reference *dr;
   tree scalar_type;
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_NOTE, vect_location,
-                     "=== vect_analyze_data_refs ===\n");
-
-  if (loop_vinfo)
-    {
-      basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
-
-      loop = LOOP_VINFO_LOOP (loop_vinfo);
-      datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
-      if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
-       {
-         if (dump_enabled_p ())
-           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                            "not vectorized: loop contains function calls"
-                            " or data references that cannot be analyzed\n");
-         return false;
-       }
+                    "=== vect_analyze_data_refs ===\n");
 
-      for (i = 0; i < loop->num_nodes; i++)
-       {
-         gimple_stmt_iterator gsi;
-
-         for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
-           {
-             gimple stmt = gsi_stmt (gsi);
-             if (!find_data_references_in_stmt (loop, stmt, &datarefs))
-               {
-                 if (is_gimple_call (stmt) && loop->safelen)
-                   {
-                     tree fndecl = gimple_call_fndecl (stmt), op;
-                     if (fndecl != NULL_TREE)
-                       {
-                         struct cgraph_node *node = cgraph_get_node (fndecl);
-                         if (node != NULL && node->simd_clones != NULL)
-                           {
-                             unsigned int j, n = gimple_call_num_args (stmt);
-                             for (j = 0; j < n; j++)
-                               {
-                                 op = gimple_call_arg (stmt, j);
-                                 if (DECL_P (op)
-                                     || (REFERENCE_CLASS_P (op)
-                                         && get_base_address (op)))
-                                   break;
-                               }
-                             op = gimple_call_lhs (stmt);
-                             /* Ignore #pragma omp declare simd functions
-                                if they don't have data references in the
-                                call stmt itself.  */
-                             if (j == n
-                                 && !(op
-                                      && (DECL_P (op)
-                                          || (REFERENCE_CLASS_P (op)
-                                              && get_base_address (op)))))
-                               continue;
-                           }
-                       }
-                   }
-                 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
-                 if (dump_enabled_p ())
-                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                                    "not vectorized: loop contains function "
-                                    "calls or data references that cannot "
-                                    "be analyzed\n");
-                 return false;
-               }
-           }
-       }
-
-      LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
-    }
-  else
-    {
-      gimple_stmt_iterator gsi;
-
-      bb = BB_VINFO_BB (bb_vinfo);
-      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
-       {
-         gimple stmt = gsi_stmt (gsi);
-         if (!find_data_references_in_stmt (NULL, stmt,
-                                            &BB_VINFO_DATAREFS (bb_vinfo)))
-           {
-             /* Mark the rest of the basic-block as unvectorizable.  */
-             for (; !gsi_end_p (gsi); gsi_next (&gsi))
-               {
-                 stmt = gsi_stmt (gsi);
-                 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
-               }
-             break;
-           }
-       }
-
-      datarefs = BB_VINFO_DATAREFS (bb_vinfo);
-    }
+  if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
+    loop = LOOP_VINFO_LOOP (loop_vinfo);
 
   /* Go through the data-refs, check that the analysis succeeded.  Update
      pointer from stmt_vec_info struct to DR and vectype.  */
 
+  vec<data_reference_p> datarefs = vinfo->datarefs;
   FOR_EACH_VEC_ELT (datarefs, i, dr)
     {
-      gimple stmt;
+      gimple *stmt;
       stmt_vec_info stmt_info;
       tree base, offset, init;
-      bool gather = false;
+      enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
       bool simd_lane_access = false;
       int vf;
 
@@ -3316,18 +3429,22 @@ again:
            = DR_IS_READ (dr)
              && !TREE_THIS_VOLATILE (DR_REF (dr))
              && targetm.vectorize.builtin_gather != NULL;
+         bool maybe_scatter
+           = DR_IS_WRITE (dr)
+             && !TREE_THIS_VOLATILE (DR_REF (dr))
+             && targetm.vectorize.builtin_scatter != NULL;
          bool maybe_simd_lane_access
-           = loop_vinfo && loop->simduid;
+           = is_a <loop_vec_info> (vinfo) && loop->simduid;
 
-         /* If target supports vector gather loads, or if this might be
-            a SIMD lane access, see if they can't be used.  */
-         if (loop_vinfo
-             && (maybe_gather || maybe_simd_lane_access)
+         /* If target supports vector gather loads or scatter stores, or if
+            this might be a SIMD lane access, see if they can't be used.  */
+         if (is_a <loop_vec_info> (vinfo)
+             && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
              && !nested_in_vect_loop_p (loop, stmt))
            {
              struct data_reference *newdr
                = create_data_ref (NULL, loop_containing_stmt (stmt),
-                                  DR_REF (dr), stmt, true);
+                                  DR_REF (dr), stmt, maybe_scatter ? false : true);
              gcc_assert (newdr != NULL && DR_REF (newdr));
              if (DR_BASE_ADDRESS (newdr)
                  && DR_OFFSET (newdr)
@@ -3353,7 +3470,7 @@ again:
                            off = TREE_OPERAND (off, 0);
                          if (TREE_CODE (off) == SSA_NAME)
                            {
-                             gimple def = SSA_NAME_DEF_STMT (off);
+                             gimple *def = SSA_NAME_DEF_STMT (off);
                              tree reft = TREE_TYPE (DR_REF (newdr));
                              if (is_gimple_call (def)
                                  && gimple_call_internal_p (def)
@@ -3380,17 +3497,20 @@ again:
                            }
                        }
                    }
-                 if (!simd_lane_access && maybe_gather)
+                 if (!simd_lane_access && (maybe_gather || maybe_scatter))
                    {
                      dr = newdr;
-                     gather = true;
+                     if (maybe_gather)
+                       gatherscatter = GATHER;
+                     else
+                       gatherscatter = SCATTER;
                    }
                }
-             if (!gather && !simd_lane_access)
+             if (gatherscatter == SG_NONE && !simd_lane_access)
                free_data_ref (newdr);
            }
 
-         if (!gather && !simd_lane_access)
+         if (gatherscatter == SG_NONE && !simd_lane_access)
            {
              if (dump_enabled_p ())
                {
@@ -3401,7 +3521,7 @@ again:
                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
 
-             if (bb_vinfo)
+             if (is_a <bb_vec_info> (vinfo))
                break;
 
              return false;
@@ -3415,10 +3535,10 @@ again:
                              "not vectorized: base addr of dr is a "
                              "constant\n");
 
-          if (bb_vinfo)
+          if (is_a <bb_vec_info> (vinfo))
            break;
 
-         if (gather || simd_lane_access)
+         if (gatherscatter != SG_NONE || simd_lane_access)
            free_data_ref (dr);
          return false;
         }
@@ -3433,7 +3553,7 @@ again:
               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
             }
 
-          if (bb_vinfo)
+          if (is_a <bb_vec_info> (vinfo))
            break;
 
           return false;
@@ -3450,10 +3570,10 @@ again:
               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
             }
 
-          if (bb_vinfo)
+          if (is_a <bb_vec_info> (vinfo))
            break;
 
-         if (gather || simd_lane_access)
+         if (gatherscatter != SG_NONE || simd_lane_access)
            free_data_ref (dr);
           return false;
         }
@@ -3470,10 +3590,10 @@ again:
               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
             }
 
-          if (bb_vinfo)
+          if (is_a <bb_vec_info> (vinfo))
            break;
 
-         if (gather || simd_lane_access)
+         if (gatherscatter != SG_NONE || simd_lane_access)
            free_data_ref (dr);
           return false;
        }
@@ -3495,10 +3615,10 @@ again:
              dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
            }
 
-         if (bb_vinfo)
+         if (is_a <bb_vec_info> (vinfo))
            break;
 
-         if (gather || simd_lane_access)
+         if (gatherscatter != SG_NONE || simd_lane_access)
            free_data_ref (dr);
          return false;
        }
@@ -3516,8 +3636,8 @@ again:
          tree outer_step, outer_base, outer_init;
          HOST_WIDE_INT pbitsize, pbitpos;
          tree poffset;
-         enum machine_mode pmode;
-         int punsignedp, pvolatilep;
+         machine_mode pmode;
+         int punsignedp, preversep, pvolatilep;
          affine_iv base_iv, offset_iv;
          tree dinit;
 
@@ -3536,7 +3656,8 @@ again:
            }
 
          outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
-                         &poffset, &pmode, &punsignedp, &pvolatilep, false);
+                                           &poffset, &pmode, &punsignedp,
+                                           &preversep, &pvolatilep, false);
          gcc_assert (outer_base != NULL_TREE);
 
          if (pbitpos % BITS_PER_UNIT != 0)
@@ -3547,6 +3668,14 @@ again:
              return false;
            }
 
+         if (preversep)
+           {
+             if (dump_enabled_p ())
+               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                "failed: reverse storage order.\n");
+             return false;
+           }
+
          outer_base = build_fold_addr_expr (outer_base);
          if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
                           &base_iv, false))
@@ -3633,10 +3762,10 @@ again:
               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
             }
 
-          if (bb_vinfo)
+          if (is_a <bb_vec_info> (vinfo))
            break;
 
-         if (gather || simd_lane_access)
+         if (gatherscatter != SG_NONE || simd_lane_access)
            free_data_ref (dr);
           return false;
         }
@@ -3666,13 +3795,18 @@ again:
               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
             }
 
-          if (bb_vinfo)
-           break;
+          if (is_a <bb_vec_info> (vinfo))
+           {
+             /* No vector type is fine, the ref can still participate
+                in dependence analysis, we just can't vectorize it.  */
+             STMT_VINFO_VECTORIZABLE (stmt_info) = false;
+             continue;
+           }
 
-         if (gather || simd_lane_access)
+         if (gatherscatter != SG_NONE || simd_lane_access)
            {
              STMT_VINFO_DATA_REF (stmt_info) = NULL;
-             if (gather)
+             if (gatherscatter != SG_NONE)
                free_data_ref (dr);
            }
          return false;
@@ -3696,37 +3830,38 @@ again:
       if (vf > *min_vf)
        *min_vf = vf;
 
-      if (gather)
+      if (gatherscatter != SG_NONE)
        {
          tree off;
-
-         gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
-         if (gather
-             && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
-           gather = false;
-         if (!gather)
+         if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
+                                         NULL, &off, NULL)
+             || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
            {
              STMT_VINFO_DATA_REF (stmt_info) = NULL;
              free_data_ref (dr);
              if (dump_enabled_p ())
                {
-                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, 
-                                   "not vectorized: not suitable for gather "
-                                   "load ");
+                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                  (gatherscatter == GATHER) ?
+                                  "not vectorized: not suitable for gather "
+                                  "load " :
+                                  "not vectorized: not suitable for scatter "
+                                  "store ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }
 
+         free_data_ref (datarefs[i]);
          datarefs[i] = dr;
-         STMT_VINFO_GATHER_P (stmt_info) = true;
+         STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
        }
-      else if (loop_vinfo
+
+      else if (is_a <loop_vec_info> (vinfo)
               && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
        {
-         if (nested_in_vect_loop_p (loop, stmt)
-             || !DR_IS_READ (dr))
+         if (nested_in_vect_loop_p (loop, stmt))
            {
              if (dump_enabled_p ())
                {
@@ -3738,7 +3873,7 @@ again:
                }
              return false;
            }
-         STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
+         STMT_VINFO_STRIDED_P (stmt_info) = true;
        }
     }
 
@@ -3748,7 +3883,7 @@ again:
      avoids spending useless time in analyzing their dependence.  */
   if (i != datarefs.length ())
     {
-      gcc_assert (bb_vinfo != NULL);
+      gcc_assert (is_a <bb_vec_info> (vinfo));
       for (unsigned j = i; j < datarefs.length (); ++j)
        {
          data_reference_p dr = datarefs[j];
@@ -3783,6 +3918,9 @@ vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
   case vect_scalar_var:
     prefix = "stmp";
     break;
+  case vect_mask_var:
+    prefix = "mask";
+    break;
   case vect_pointer_var:
     prefix = "vectp";
     break;
@@ -3802,6 +3940,55 @@ vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
   return new_vect_var;
 }
 
+/* Like vect_get_new_vect_var but return an SSA name.  */
+
+tree
+vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
+{
+  const char *prefix;
+  tree new_vect_var;
+
+  switch (var_kind)
+  {
+  case vect_simple_var:
+    prefix = "vect";
+    break;
+  case vect_scalar_var:
+    prefix = "stmp";
+    break;
+  case vect_pointer_var:
+    prefix = "vectp";
+    break;
+  default:
+    gcc_unreachable ();
+  }
+
+  if (name)
+    {
+      char* tmp = concat (prefix, "_", name, NULL);
+      new_vect_var = make_temp_ssa_name (type, NULL, tmp);
+      free (tmp);
+    }
+  else
+    new_vect_var = make_temp_ssa_name (type, NULL, prefix);
+
+  return new_vect_var;
+}
+
+/* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */
+
+static void
+vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
+                                 stmt_vec_info stmt_info)
+{
+  duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
+  unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
+  int misalign = DR_MISALIGNMENT (dr);
+  if (misalign == -1)
+    mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
+  else
+    set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
+}
 
 /* Function vect_create_addr_base_for_vector_ref.
 
@@ -3825,6 +4012,9 @@ vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
            is as follows:
            if LOOP=i_loop:     &in             (relative to i_loop)
            if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
+   BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
+           initial address.  Unlike OFFSET, which is the number of elements to
+           be added, BYTE_OFFSET is measured in bytes.
 
    Output:
    1. Return an SSA_NAME whose value is the address of the memory location of
@@ -3835,10 +4025,11 @@ vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
    FORNOW: We are only handling array accesses with step 1.  */
 
 tree
-vect_create_addr_base_for_vector_ref (gimple stmt,
+vect_create_addr_base_for_vector_ref (gimple *stmt,
                                      gimple_seq *new_stmt_list,
                                      tree offset,
-                                     struct loop *loop)
+                                     struct loop *loop,
+                                     tree byte_offset)
 {
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
@@ -3891,6 +4082,12 @@ vect_create_addr_base_for_vector_ref (gimple stmt,
       base_offset = fold_build2 (PLUS_EXPR, sizetype,
                                 base_offset, offset);
     }
+  if (byte_offset)
+    {
+      byte_offset = fold_convert (sizetype, byte_offset);
+      base_offset = fold_build2 (PLUS_EXPR, sizetype,
+                                base_offset, byte_offset);
+    }
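
A small worked example of the OFFSET / BYTE_OFFSET distinction (illustrative values, not from the patch), assuming a 4-byte element type and init = 2:

    /* OFFSET is counted in elements, BYTE_OFFSET in bytes:
         OFFSET = 3       ->  initial_address = &a[2 + 3]   (advances 12 bytes)
         BYTE_OFFSET = 3  ->  initial_address = &a[2] + 3   (advances 3 bytes)  */
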
 
   /* base + base_offset */
   if (loop_vinfo)
@@ -3903,16 +4100,16 @@ vect_create_addr_base_for_vector_ref (gimple stmt,
     }
 
   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
-  addr_base = fold_convert (vect_ptr_type, addr_base);
   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
-  addr_base = force_gimple_operand (addr_base, &seq, false, dest);
+  addr_base = force_gimple_operand (addr_base, &seq, true, dest);
   gimple_seq_add_seq (new_stmt_list, seq);
 
   if (DR_PTR_INFO (dr)
-      && TREE_CODE (addr_base) == SSA_NAME)
+      && TREE_CODE (addr_base) == SSA_NAME
+      && !SSA_NAME_PTR_INFO (addr_base))
     {
-      duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
-      if (offset)
+      vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
+      if (offset || byte_offset)
        mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
     }
 
@@ -3948,6 +4145,10 @@ vect_create_addr_base_for_vector_ref (gimple stmt,
    5. BSI: location where the new stmts are to be placed if there is no loop
    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
         pointing to the initial address.
+   7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
+       to the initial address accessed by the data-ref in STMT.  This is
+       similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
+       in bytes.
 
    Output:
    1. Declare a new ptr to vector_type, and have it point to the base of the
@@ -3961,6 +4162,8 @@ vect_create_addr_base_for_vector_ref (gimple stmt,
          initial_address = &a[init];
       if OFFSET is supplied:
          initial_address = &a[init + OFFSET];
+      if BYTE_OFFSET is supplied:
+        initial_address = &a[init] + BYTE_OFFSET;
 
       Return the initial_address in INITIAL_ADDRESS.
 
@@ -3975,10 +4178,10 @@ vect_create_addr_base_for_vector_ref (gimple stmt,
    4. Return the pointer.  */
 
 tree
-vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
+vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
                          tree offset, tree *initial_address,
-                         gimple_stmt_iterator *gsi, gimple *ptr_incr,
-                         bool only_init, bool *inv_p)
+                         gimple_stmt_iterator *gsi, gimple **ptr_incr,
+                         bool only_init, bool *inv_p, tree byte_offset)
 {
   const char *base_name;
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
@@ -3989,7 +4192,6 @@ vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
   tree aggr_ptr_type;
   tree aggr_ptr;
   tree new_temp;
-  gimple vec_stmt;
   gimple_seq new_stmt_list = NULL;
   edge pe = NULL;
   basic_block new_bb;
@@ -3999,7 +4201,7 @@ vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
   gimple_stmt_iterator incr_gsi;
   bool insert_after;
   tree indx_before_incr, indx_after_incr;
-  gimple incr;
+  gimple *incr;
   tree step;
   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
 
@@ -4067,7 +4269,7 @@ vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
   /* Likewise for any of the data references in the stmt group.  */
   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
     {
-      gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
+      gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
       do
        {
          stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
@@ -4121,10 +4323,10 @@ vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
   /* (2) Calculate the initial address of the aggregate-pointer, and set
      the aggregate-pointer to point to it before the loop.  */
 
-  /* Create: (&(base[init_val+offset]) in the loop preheader.  */
+  /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
 
   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
-                                                   offset, loop);
+                                                  offset, loop, byte_offset);
   if (new_stmt_list)
     {
       if (pe)
@@ -4137,28 +4339,7 @@ vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
     }
 
   *initial_address = new_temp;
-
-  /* Create: p = (aggr_type *) initial_base  */
-  if (TREE_CODE (new_temp) != SSA_NAME
-      || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
-    {
-      vec_stmt = gimple_build_assign (aggr_ptr,
-                                     fold_convert (aggr_ptr_type, new_temp));
-      aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
-      /* Copy the points-to information if it exists. */
-      if (DR_PTR_INFO (dr))
-       duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
-      gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
-      if (pe)
-       {
-         new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
-         gcc_assert (!new_bb);
-       }
-      else
-       gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
-    }
-  else
-    aggr_ptr_init = new_temp;
+  aggr_ptr_init = new_temp;
 
   /* (3) Handle the updating of the aggregate-pointer inside the loop.
      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
@@ -4185,13 +4366,13 @@ vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
                 aggr_ptr, loop, &incr_gsi, insert_after,
                 &indx_before_incr, &indx_after_incr);
       incr = gsi_stmt (incr_gsi);
-      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
+      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
 
       /* Copy the points-to information if it exists. */
       if (DR_PTR_INFO (dr))
        {
-         duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
-         duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
+         vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
+         vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
        }
       if (ptr_incr)
        *ptr_incr = incr;
@@ -4215,13 +4396,13 @@ vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
                 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
                 &indx_after_incr);
       incr = gsi_stmt (incr_gsi);
-      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
+      set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
 
       /* Copy the points-to information if it exists. */
       if (DR_PTR_INFO (dr))
        {
-         duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
-         duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
+         vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
+         vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
        }
       if (ptr_incr)
        *ptr_incr = incr;
@@ -4268,14 +4449,14 @@ vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
 */
 
 tree
-bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
-                gimple stmt, tree bump)
+bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
+                gimple *stmt, tree bump)
 {
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
   tree update = TYPE_SIZE_UNIT (vectype);
-  gimple incr_stmt;
+  gassign *incr_stmt;
   ssa_op_iter iter;
   use_operand_p use_p;
   tree new_dataref_ptr;
@@ -4283,9 +4464,12 @@ bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
   if (bump)
     update = bump;
 
-  new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
-  incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
-                                           dataref_ptr, update);
+  if (TREE_CODE (dataref_ptr) == SSA_NAME)
+    new_dataref_ptr = copy_ssa_name (dataref_ptr);
+  else
+    new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
+  incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
+                                  dataref_ptr, update);
   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
 
   /* Copy the points-to information if it exists. */
@@ -4326,16 +4510,20 @@ vect_create_destination_var (tree scalar_dest, tree vectype)
   tree type;
   enum vect_var_kind kind;
 
-  kind = vectype ? vect_simple_var : vect_scalar_var;
+  kind = vectype
+    ? VECTOR_BOOLEAN_TYPE_P (vectype)
+    ? vect_mask_var
+    : vect_simple_var
+    : vect_scalar_var;
   type = vectype ? vectype : TREE_TYPE (scalar_dest);
 
   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
 
   name = get_name (scalar_dest);
   if (name)
-    asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
+    new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
   else
-    asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
+    new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
   vec_dest = vect_get_new_vect_var (type, kind, new_name);
   free (new_name);
 
@@ -4350,15 +4538,16 @@ vect_create_destination_var (tree scalar_dest, tree vectype)
 bool
 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
 {
-  enum machine_mode mode = TYPE_MODE (vectype);
+  machine_mode mode = TYPE_MODE (vectype);
 
-  /* vect_permute_store_chain requires the group size to be a power of two.  */
-  if (exact_log2 (count) == -1)
+  /* vect_permute_store_chain requires the group size to be equal to 3 or
+     be a power of two.  */
+  if (count != 3 && exact_log2 (count) == -1)
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                         "the size of the group of accesses"
-                         " is not a power of 2\n");
+                        "the size of the group of accesses"
+                        " is not a power of 2 or not equal to 3\n");
       return false;
     }
 
@@ -4367,23 +4556,76 @@ vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
     {
       unsigned int i, nelt = GET_MODE_NUNITS (mode);
       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
-      for (i = 0; i < nelt / 2; i++)
+
+      if (count == 3)
        {
-         sel[i * 2] = i;
-         sel[i * 2 + 1] = i + nelt;
+         unsigned int j0 = 0, j1 = 0, j2 = 0;
+         unsigned int i, j;
+
+         for (j = 0; j < 3; j++)
+           {
+             int nelt0 = ((3 - j) * nelt) % 3;
+             int nelt1 = ((3 - j) * nelt + 1) % 3;
+             int nelt2 = ((3 - j) * nelt + 2) % 3;
+             for (i = 0; i < nelt; i++)
+               {
+                 if (3 * i + nelt0 < nelt)
+                   sel[3 * i + nelt0] = j0++;
+                 if (3 * i + nelt1 < nelt)
+                   sel[3 * i + nelt1] = nelt + j1++;
+                 if (3 * i + nelt2 < nelt)
+                   sel[3 * i + nelt2] = 0;
+               }
+             if (!can_vec_perm_p (mode, false, sel))
+               {
+                 if (dump_enabled_p ())
+                   dump_printf (MSG_MISSED_OPTIMIZATION,
+                                "permutation op not supported by target.\n");
+                 return false;
+               }
+
+             for (i = 0; i < nelt; i++)
+               {
+                 if (3 * i + nelt0 < nelt)
+                   sel[3 * i + nelt0] = 3 * i + nelt0;
+                 if (3 * i + nelt1 < nelt)
+                   sel[3 * i + nelt1] = 3 * i + nelt1;
+                 if (3 * i + nelt2 < nelt)
+                   sel[3 * i + nelt2] = nelt + j2++;
+               }
+             if (!can_vec_perm_p (mode, false, sel))
+               {
+                 if (dump_enabled_p ())
+                   dump_printf (MSG_MISSED_OPTIMIZATION,
+                                "permutation op not supported by target.\n");
+                 return false;
+               }
+           }
+         return true;
        }
-      if (can_vec_perm_p (mode, false, sel))
+      else
        {
-         for (i = 0; i < nelt; i++)
-           sel[i] += nelt / 2;
-         if (can_vec_perm_p (mode, false, sel))
-           return true;
+         /* If length is not equal to 3 then only power of 2 is supported.  */
+         gcc_assert (exact_log2 (count) != -1);
+
+         for (i = 0; i < nelt / 2; i++)
+           {
+             sel[i * 2] = i;
+             sel[i * 2 + 1] = i + nelt;
+           }
+           if (can_vec_perm_p (mode, false, sel))
+             {
+               for (i = 0; i < nelt; i++)
+                 sel[i] += nelt / 2;
+               if (can_vec_perm_p (mode, false, sel))
+                 return true;
+             }
        }
     }
 
   if (dump_enabled_p ())
     dump_printf (MSG_MISSED_OPTIMIZATION,
-                 "interleave op not supported by target.\n");
+                "permutation op not supported by target.\n");
   return false;
 }
 
@@ -4403,9 +4645,9 @@ vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
 /* Function vect_permute_store_chain.
 
    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
-   a power of 2, generate interleave_high/low stmts to reorder the data
-   correctly for the stores.  Return the final references for stores in
-   RESULT_CHAIN.
+   a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
+   the data correctly for the stores.  Return the final references for stores
+   in RESULT_CHAIN.
 
    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
    The input is 4 vectors each containing 8 elements.  We assign a number to
@@ -4464,15 +4706,17 @@ vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
 void
 vect_permute_store_chain (vec<tree> dr_chain,
                          unsigned int length,
-                         gimple stmt,
+                         gimple *stmt,
                          gimple_stmt_iterator *gsi,
                          vec<tree> *result_chain)
 {
   tree vect1, vect2, high, low;
-  gimple perm_stmt;
+  gimple *perm_stmt;
   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
   tree perm_mask_low, perm_mask_high;
-  unsigned int i, n;
+  tree data_ref;
+  tree perm3_mask_low, perm3_mask_high;
+  unsigned int i, n, log_length = exact_log2 (length);
   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
 
@@ -4480,47 +4724,108 @@ vect_permute_store_chain (vec<tree> dr_chain,
   memcpy (result_chain->address (), dr_chain.address (),
          length * sizeof (tree));
 
-  for (i = 0, n = nelt / 2; i < n; i++)
+  if (length == 3)
     {
-      sel[i * 2] = i;
-      sel[i * 2 + 1] = i + nelt;
-    }
-  perm_mask_high = vect_gen_perm_mask (vectype, sel);
-  gcc_assert (perm_mask_high != NULL);
+      unsigned int j0 = 0, j1 = 0, j2 = 0;
 
-  for (i = 0; i < nelt; i++)
-    sel[i] += nelt / 2;
-  perm_mask_low = vect_gen_perm_mask (vectype, sel);
-  gcc_assert (perm_mask_low != NULL);
+      for (j = 0; j < 3; j++)
+        {
+         int nelt0 = ((3 - j) * nelt) % 3;
+         int nelt1 = ((3 - j) * nelt + 1) % 3;
+         int nelt2 = ((3 - j) * nelt + 2) % 3;
 
-  for (i = 0, n = exact_log2 (length); i < n; i++)
-    {
-      for (j = 0; j < length/2; j++)
-       {
-         vect1 = dr_chain[j];
-         vect2 = dr_chain[j+length/2];
+         for (i = 0; i < nelt; i++)
+           {
+             if (3 * i + nelt0 < nelt)
+               sel[3 * i + nelt0] = j0++;
+             if (3 * i + nelt1 < nelt)
+               sel[3 * i + nelt1] = nelt + j1++;
+             if (3 * i + nelt2 < nelt)
+               sel[3 * i + nelt2] = 0;
+           }
+         perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
+
+         for (i = 0; i < nelt; i++)
+           {
+             if (3 * i + nelt0 < nelt)
+               sel[3 * i + nelt0] = 3 * i + nelt0;
+             if (3 * i + nelt1 < nelt)
+               sel[3 * i + nelt1] = 3 * i + nelt1;
+             if (3 * i + nelt2 < nelt)
+               sel[3 * i + nelt2] = nelt + j2++;
+           }
+         perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
+
+         vect1 = dr_chain[0];
+         vect2 = dr_chain[1];
 
          /* Create interleaving stmt:
-            high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}>  */
-         high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
-         perm_stmt
-           = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
-                                           vect1, vect2, perm_mask_high);
+            low = VEC_PERM_EXPR <vect1, vect2,
+                                 {j, nelt, *, j + 1, nelt + j + 1, *,
+                                  j + 2, nelt + j + 2, *, ...}>  */
+         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
+         perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
+                                          vect2, perm3_mask_low);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-         (*result_chain)[2*j] = high;
 
+         vect1 = data_ref;
+         vect2 = dr_chain[2];
          /* Create interleaving stmt:
-            low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
-                                                nelt*3/2+1, ...}>  */
-         low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
-         perm_stmt
-           = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
-                                           vect1, vect2, perm_mask_low);
+            low = VEC_PERM_EXPR <vect1, vect2,
+                                 {0, 1, nelt + j, 3, 4, nelt + j + 1,
+                                  6, 7, nelt + j + 2, ...}>  */
+         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
+         perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
+                                          vect2, perm3_mask_high);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-         (*result_chain)[2*j+1] = low;
+         (*result_chain)[j] = data_ref;
+       }
+    }
+  else
+    {
+      /* If length is not equal to 3 then only power of 2 is supported.  */
+      gcc_assert (exact_log2 (length) != -1);
+
+      for (i = 0, n = nelt / 2; i < n; i++)
+       {
+         sel[i * 2] = i;
+         sel[i * 2 + 1] = i + nelt;
        }
-      memcpy (dr_chain.address (), result_chain->address (),
-             length * sizeof (tree));
+       perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
+
+       for (i = 0; i < nelt; i++)
+         sel[i] += nelt / 2;
+       perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
+
+       for (i = 0, n = log_length; i < n; i++)
+         {
+           for (j = 0; j < length/2; j++)
+             {
+               vect1 = dr_chain[j];
+               vect2 = dr_chain[j+length/2];
+
+               /* Create interleaving stmt:
+                  high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
+                                                       ...}>  */
+               high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
+               perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
+                                                vect2, perm_mask_high);
+               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+               (*result_chain)[2*j] = high;
+
+               /* Create interleaving stmt:
+                  low = VEC_PERM_EXPR <vect1, vect2,
+                                       {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
+                                        ...}>  */
+               low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
+               perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
+                                                vect2, perm_mask_low);
+               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+               (*result_chain)[2*j+1] = low;
+             }
+           memcpy (dr_chain.address (), result_chain->address (),
+                   length * sizeof (tree));
+         }
     }
 }
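
To make the length == 3 path above concrete, here is a worked example for a 4-element vector type (illustrative; the don't-care lanes are written as '_' here, although the code fills them with index 0):

    /* Inputs A = {a0,a1,a2,a3}, B = {b0,b1,b2,b3}, C = {c0,c1,c2,c3}.
       Iteration j = 0:
         perm3_mask_low  = {0, 4, _, 1}  ->  VEC_PERM (A, B)   = {a0, b0, _, a1}
         perm3_mask_high = {0, 1, 4, 3}  ->  VEC_PERM (low, C) = {a0, b0, c0, a1}
       The j = 1 and j = 2 iterations produce {b1, c1, a2, b2} and
       {c2, a3, b3, c3}, i.e. the full 3-way interleave
       a0 b0 c0 a1 b1 c1 a2 b2 c2 a3 b3 c3 ready to be stored.  */
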
 
@@ -4576,7 +4881,7 @@ vect_permute_store_chain (vec<tree> dr_chain,
    Return value - the result of the loop-header phi node.  */
 
 tree
-vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
+vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
                         tree *realignment_token,
                        enum dr_alignment_support alignment_support_scheme,
                        tree init_addr,
@@ -4590,14 +4895,13 @@ vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
   edge pe = NULL;
   tree scalar_dest = gimple_assign_lhs (stmt);
   tree vec_dest;
-  gimple inc;
+  gimple *inc;
   tree ptr;
   tree data_ref;
-  gimple new_stmt;
   basic_block new_bb;
   tree msq_init = NULL_TREE;
   tree new_temp;
-  gimple phi_stmt;
+  gphi *phi_stmt;
   tree msq = NULL_TREE;
   gimple_seq stmts = NULL;
   bool inv_p;
@@ -4688,15 +4992,19 @@ vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
   if (alignment_support_scheme == dr_explicit_realign_optimized)
     {
       /* Create msq_init = *(floor(p1)) in the loop preheader  */
+      gassign *new_stmt;
 
       gcc_assert (!compute_in_loop);
       vec_dest = vect_create_destination_var (scalar_dest, vectype);
       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
                                      NULL_TREE, &init_addr, NULL, &inc,
                                      true, &inv_p);
-      new_temp = copy_ssa_name (ptr, NULL);
-      new_stmt = gimple_build_assign_with_ops
-                  (BIT_AND_EXPR, new_temp, ptr,
+      if (TREE_CODE (ptr) == SSA_NAME)
+       new_temp = copy_ssa_name (ptr);
+      else
+       new_temp = make_ssa_name (TREE_TYPE (ptr));
+      new_stmt = gimple_build_assign
+                  (new_temp, BIT_AND_EXPR, ptr,
                    build_int_cst (TREE_TYPE (ptr),
                                   -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
@@ -4724,6 +5032,7 @@ vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
 
   if (targetm.vectorize.builtin_mask_for_load)
     {
+      gcall *new_stmt;
       tree builtin_decl;
 
       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
@@ -4781,7 +5090,7 @@ vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
 
   pe = loop_preheader_edge (containing_loop);
   vec_dest = vect_create_destination_var (scalar_dest, vectype);
-  msq = make_ssa_name (vec_dest, NULL);
+  msq = make_ssa_name (vec_dest);
   phi_stmt = create_phi_node (msq, containing_loop->header);
   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
 
@@ -4797,38 +5106,78 @@ vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
 bool
 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
 {
-  enum machine_mode mode = TYPE_MODE (vectype);
+  machine_mode mode = TYPE_MODE (vectype);
 
-  /* vect_permute_load_chain requires the group size to be a power of two.  */
-  if (exact_log2 (count) == -1)
+  /* vect_permute_load_chain requires the group size to be equal to 3 or
+     be a power of two.  */
+  if (count != 3 && exact_log2 (count) == -1)
     {
       if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                         "the size of the group of accesses"
-                         " is not a power of 2\n");
+                        "the size of the group of accesses"
+                        " is neither a power of 2 nor equal to 3\n");
       return false;
     }
 
   /* Check that the permutation is supported.  */
   if (VECTOR_MODE_P (mode))
     {
-      unsigned int i, nelt = GET_MODE_NUNITS (mode);
+      unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
 
-      for (i = 0; i < nelt; i++)
-       sel[i] = i * 2;
-      if (can_vec_perm_p (mode, false, sel))
+      if (count == 3)
        {
+         unsigned int k;
+         for (k = 0; k < 3; k++)
+           {
+             for (i = 0; i < nelt; i++)
+               if (3 * i + k < 2 * nelt)
+                 sel[i] = 3 * i + k;
+               else
+                 sel[i] = 0;
+             if (!can_vec_perm_p (mode, false, sel))
+               {
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "shuffle of 3 loads is not supported by"
+                                    " target\n");
+                 return false;
+               }
+             for (i = 0, j = 0; i < nelt; i++)
+               if (3 * i + k < 2 * nelt)
+                 sel[i] = i;
+               else
+                 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
+             if (!can_vec_perm_p (mode, false, sel))
+               {
+                 if (dump_enabled_p ())
+                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                                    "shuffle of 3 loads is not supported by"
+                                    " target\n");
+                 return false;
+               }
+           }
+         return true;
+       }
+      else
+       {
+         /* If count is not equal to 3, then only a power of 2 is supported.  */
+         gcc_assert (exact_log2 (count) != -1);
          for (i = 0; i < nelt; i++)
-           sel[i] = i * 2 + 1;
+           sel[i] = i * 2;
          if (can_vec_perm_p (mode, false, sel))
-           return true;
-       }
+           {
+             for (i = 0; i < nelt; i++)
+               sel[i] = i * 2 + 1;
+             if (can_vec_perm_p (mode, false, sel))
+               return true;
+           }
+        }
     }
 
   if (dump_enabled_p ())
     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
-                     "extract even/odd not supported by target\n");
+                    "extract even/odd not supported by target\n");
   return false;
 }
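
To make the count == 3 case above more concrete, the following standalone sketch (not GCC internals; nelt == 8 is an assumption for illustration) prints the two selectors whose support is queried for each k.  The first gathers every third element, starting at offset k, from the first two vectors; the second keeps what was gathered and fills the tail from the third vector:

#include <stdio.h>

#define NELT 8

int
main (void)
{
  unsigned int sel[NELT];

  for (unsigned int k = 0; k < 3; k++)
    {
      unsigned int j = 0;

      /* First selector: elements k, 3 + k, 6 + k, ... that live in the
         first two vectors; out-of-range slots are set to index 0.  */
      for (unsigned int i = 0; i < NELT; i++)
        sel[i] = 3 * i + k < 2 * NELT ? 3 * i + k : 0;
      printf ("k=%u gather:", k);
      for (unsigned int i = 0; i < NELT; i++)
        printf (" %2u", sel[i]);
      printf ("\n");

      /* Second selector: keep the gathered prefix and pull the remaining
         elements of the group from the third vector.  */
      for (unsigned int i = 0; i < NELT; i++)
        sel[i] = 3 * i + k < 2 * NELT ? i : NELT + ((NELT + k) % 3) + 3 * j++;
      printf ("k=%u merge: ", k);
      for (unsigned int i = 0; i < NELT; i++)
        printf (" %2u", sel[i]);
      printf ("\n");
    }
  return 0;
}
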
 
@@ -4846,8 +5195,9 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
 /* Function vect_permute_load_chain.
 
    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
-   a power of 2, generate extract_even/odd stmts to reorder the input data
-   correctly.  Return the final references for loads in RESULT_CHAIN.
+   a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
+   the input data correctly.  Return the final references for loads in
+   RESULT_CHAIN.
 
    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
    The input is 4 vectors each containing 8 elements. We assign a number to each
@@ -4922,13 +5272,14 @@ vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
 static void
 vect_permute_load_chain (vec<tree> dr_chain,
                         unsigned int length,
-                        gimple stmt,
+                        gimple *stmt,
                         gimple_stmt_iterator *gsi,
                         vec<tree> *result_chain)
 {
   tree data_ref, first_vect, second_vect;
   tree perm_mask_even, perm_mask_odd;
-  gimple perm_stmt;
+  tree perm3_mask_low, perm3_mask_high;
+  gimple *perm_stmt;
   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
   unsigned int i, j, log_length = exact_log2 (length);
   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
@@ -4938,45 +5289,426 @@ vect_permute_load_chain (vec<tree> dr_chain,
   memcpy (result_chain->address (), dr_chain.address (),
          length * sizeof (tree));
 
-  for (i = 0; i < nelt; ++i)
-    sel[i] = i * 2;
-  perm_mask_even = vect_gen_perm_mask (vectype, sel);
-  gcc_assert (perm_mask_even != NULL);
+  if (length == 3)
+    {
+      unsigned int k;
+
+      for (k = 0; k < 3; k++)
+       {
+         for (i = 0; i < nelt; i++)
+           if (3 * i + k < 2 * nelt)
+             sel[i] = 3 * i + k;
+           else
+             sel[i] = 0;
+         perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
+
+         for (i = 0, j = 0; i < nelt; i++)
+           if (3 * i + k < 2 * nelt)
+             sel[i] = i;
+           else
+             sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
+
+         perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
+
+         first_vect = dr_chain[0];
+         second_vect = dr_chain[1];
+
+         /* Create interleaving stmt (low part of):
+            low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
+                                                            ...}>  */
+         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
+         perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
+                                          second_vect, perm3_mask_low);
+         vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+
+         /* Create interleaving stmt (high part of):
+            high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
+                                                             ...}>  */
+         first_vect = data_ref;
+         second_vect = dr_chain[2];
+         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
+         perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
+                                          second_vect, perm3_mask_high);
+         vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+         (*result_chain)[k] = data_ref;
+       }
+    }
+  else
+    {
+      /* If length is not equal to 3, then only a power of 2 is supported.  */
+      gcc_assert (exact_log2 (length) != -1);
+
+      for (i = 0; i < nelt; ++i)
+       sel[i] = i * 2;
+      perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
+
+      for (i = 0; i < nelt; ++i)
+       sel[i] = i * 2 + 1;
+      perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
+
+      for (i = 0; i < log_length; i++)
+       {
+         for (j = 0; j < length; j += 2)
+           {
+             first_vect = dr_chain[j];
+             second_vect = dr_chain[j+1];
+
+             /* data_ref = permute_even (first_data_ref, second_data_ref);  */
+             data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
+             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+                                              first_vect, second_vect,
+                                              perm_mask_even);
+             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+             (*result_chain)[j/2] = data_ref;
+
+             /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
+             data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
+             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+                                              first_vect, second_vect,
+                                              perm_mask_odd);
+             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+             (*result_chain)[j/2+length/2] = data_ref;
+           }
+         memcpy (dr_chain.address (), result_chain->address (),
+                 length * sizeof (tree));
+       }
+    }
+}
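
A standalone sketch of the power-of-2 path of vect_permute_load_chain above (not GCC internals; LENGTH == 4 and 8-element vectors are assumptions for illustration).  Element values are their memory positions, so the output shows each vector ending up with one interleaved stream, {0 4 8 ...}, {1 5 9 ...}, and so on:

#include <stdio.h>
#include <string.h>

#define NELT 8
#define LENGTH 4

/* even takes elements 0, 2, 4, ... of the concatenation <x, y>,
   odd takes elements 1, 3, 5, ...  */
static void
perm_even_odd (const int *x, const int *y, int *even, int *odd)
{
  for (int i = 0; i < NELT / 2; i++)
    {
      even[i] = x[2 * i];
      even[NELT / 2 + i] = y[2 * i];
      odd[i] = x[2 * i + 1];
      odd[NELT / 2 + i] = y[2 * i + 1];
    }
}

int
main (void)
{
  int dr[LENGTH][NELT], res[LENGTH][NELT];

  for (int v = 0; v < LENGTH; v++)
    for (int i = 0; i < NELT; i++)
      dr[v][i] = v * NELT + i;          /* memory order 0..31 */

  for (int pass = 0; pass < 2; pass++)  /* log2 (LENGTH) passes */
    {
      for (int j = 0; j < LENGTH; j += 2)
        perm_even_odd (dr[j], dr[j + 1], res[j / 2], res[j / 2 + LENGTH / 2]);
      memcpy (dr, res, sizeof dr);
    }

  for (int v = 0; v < LENGTH; v++)
    {
      for (int i = 0; i < NELT; i++)
        printf ("%2d ", res[v][i]);
      printf ("\n");
    }
  return 0;
}
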
+
+/* Function vect_shift_permute_load_chain.
+
+   Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
+   sequence of stmts to reorder the input data accordingly.
+   Return the final references for loads in RESULT_CHAIN.
+   Return true if successful, false otherwise.
+
+   E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
+   The input is 3 vectors each containing 8 elements.  We assign a
+   number to each element, the input sequence is:
+
+   1st vec:   0  1  2  3  4  5  6  7
+   2nd vec:   8  9 10 11 12 13 14 15
+   3rd vec:  16 17 18 19 20 21 22 23
+
+   The output sequence should be:
+
+   1st vec:  0 3 6  9 12 15 18 21
+   2nd vec:  1 4 7 10 13 16 19 22
+   3rd vec:  2 5 8 11 14 17 20 23
+
+   We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
+
+   First we shuffle all 3 vectors to get the correct element order:
+
+   1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
+   2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
+   3rd vec:  (16 19 22) (17 20 23) (18 21)
+
+   Next we unite and shift the vectors 3 times:
+
+   1st step:
+     shift right by 6 the concatenation of:
+     "1st vec" and  "2nd vec"
+       ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
+     "2nd vec" and  "3rd vec"
+       ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
+     "3rd vec" and  "1st vec"
+       (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
+                            | New vectors                   |
+
+     So that now new vectors are:
+
+     1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
+     2nd vec:  (10 13) (16 19 22) (17 20 23)
+     3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
+
+   2nd step:
+     shift right by 5 the concatenation of:
+     "1st vec" and  "3rd vec"
+       ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
+     "2nd vec" and  "1st vec"
+       (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
+     "3rd vec" and  "2nd vec"
+       (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
+                         | New vectors                   |
+
+     So that now new vectors are:
+
+     1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
+     2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
+     3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
+
+   3rd step:
+     shift right by 5 the concatenation of:
+     "1st vec" and  "1st vec"
+       ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
+     shift right by 3 the concatenation of:
+     "2nd vec" and  "2nd vec"
+               (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
+                         | New vectors                   |
+
+     So that now all vectors are READY:
+     1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
+     2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
+     3rd vec:  ( 1  4  7) (10 13) (16 19 22)
+
+   This algorithm is faster than the one in vect_permute_load_chain if:
+     1.  "shift of a concatenation" is faster than a general permutation.
+        This is usually so.
+     2.  The TARGET machine can't execute vector instructions in parallel.
+        This is because each step of the algorithm depends on the previous
+        one, whereas the algorithm in vect_permute_load_chain is much more
+        parallel.
+
+   The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
+*/
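
The reordering described in the comment above can be checked with a small standalone scalar simulation (not GCC internals; nelt == 8 is an assumption matching the example).  It applies the nelt == 8 instances of the masks that the function below generates and ends with the three strided streams {0 3 6 ...}, {1 4 7 ...}, {2 5 8 ...}:

#include <stdio.h>

#define NELT 8

/* VEC_PERM_EXPR model: index < NELT picks from op0, otherwise from op1.  */
static void
vec_perm (const int *op0, const int *op1, const int *sel, int *out)
{
  for (int i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? op0[sel[i]] : op1[sel[i] - NELT];
}

int
main (void)
{
  /* Masks as generated in vect_shift_permute_load_chain for nelt == 8.  */
  const int perm3_mask[NELT]  = { 0, 3, 6, 1, 4, 7, 2, 5 };
  const int shift1_mask[NELT] = { 6, 7, 8, 9, 10, 11, 12, 13 };
  const int shift2_mask[NELT] = { 5, 6, 7, 8, 9, 10, 11, 12 };
  const int shift3_mask[NELT] = { 3, 4, 5, 6, 7, 8, 9, 10 };
  const int shift4_mask[NELT] = { 5, 6, 7, 8, 9, 10, 11, 12 };

  int dr[3][NELT], vect[3][NELT], vect_shift[3][NELT], result[3][NELT];

  for (int v = 0; v < 3; v++)
    for (int i = 0; i < NELT; i++)
      dr[v][i] = v * NELT + i;                  /* memory order 0..23 */

  /* Step 0: shuffle each vector into its stride-3 groups.  */
  for (int k = 0; k < 3; k++)
    vec_perm (dr[k], dr[k], perm3_mask, vect[k]);

  /* Step 1: shift the concatenation of neighbouring vectors right by 6.  */
  for (int k = 0; k < 3; k++)
    vec_perm (vect[k % 3], vect[(k + 1) % 3], shift1_mask, vect_shift[k]);

  /* Step 2: shift right by 5, pairing the vectors as the function does.  */
  for (int k = 0; k < 3; k++)
    vec_perm (vect_shift[(4 - k) % 3], vect_shift[(3 - k) % 3],
              shift2_mask, vect[k]);

  /* Step 3: final per-vector shifts; one vector is already in place.  */
  for (int i = 0; i < NELT; i++)
    result[3 - (NELT % 3)][i] = vect[2][i];
  vec_perm (vect[0], vect[0], shift3_mask, result[NELT % 3]);
  vec_perm (vect[1], vect[1], shift4_mask, result[0]);

  for (int v = 0; v < 3; v++)
    {
      for (int i = 0; i < NELT; i++)
        printf ("%2d ", result[v][i]);
      printf ("\n");
    }
  return 0;
}
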
+
+static bool
+vect_shift_permute_load_chain (vec<tree> dr_chain,
+                              unsigned int length,
+                              gimple *stmt,
+                              gimple_stmt_iterator *gsi,
+                              vec<tree> *result_chain)
+{
+  tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
+  tree perm2_mask1, perm2_mask2, perm3_mask;
+  tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
+  gimple *perm_stmt;
+
+  tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
+  unsigned int i;
+  unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
+  unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
+  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
+
+  result_chain->quick_grow (length);
+  memcpy (result_chain->address (), dr_chain.address (),
+         length * sizeof (tree));
+
+  if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
+    {
+      unsigned int j, log_length = exact_log2 (length);
+      for (i = 0; i < nelt / 2; ++i)
+       sel[i] = i * 2;
+      for (i = 0; i < nelt / 2; ++i)
+       sel[nelt / 2 + i] = i * 2 + 1;
+      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "shuffle of 2 fields structure is not "
+                            "supported by target\n");
+         return false;
+       }
+      perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
 
-  for (i = 0; i < nelt; ++i)
-    sel[i] = i * 2 + 1;
-  perm_mask_odd = vect_gen_perm_mask (vectype, sel);
-  gcc_assert (perm_mask_odd != NULL);
+      for (i = 0; i < nelt / 2; ++i)
+       sel[i] = i * 2 + 1;
+      for (i = 0; i < nelt / 2; ++i)
+       sel[nelt / 2 + i] = i * 2;
+      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "shuffle of 2 fields structure is not "
+                            "supported by target\n");
+         return false;
+       }
+      perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
 
-  for (i = 0; i < log_length; i++)
+      /* Generating permutation constant to shift all elements.
+        For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
+      for (i = 0; i < nelt; i++)
+       sel[i] = nelt / 2 + i;
+      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "shift permutation is not supported by target\n");
+         return false;
+       }
+      shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
+
+      /* Generating permutation constant to select vector from 2.
+        For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
+      for (i = 0; i < nelt / 2; i++)
+       sel[i] = i;
+      for (i = nelt / 2; i < nelt; i++)
+       sel[i] = nelt + i;
+      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "select is not supported by target\n");
+         return false;
+       }
+      select_mask = vect_gen_perm_mask_checked (vectype, sel);
+
+      for (i = 0; i < log_length; i++)
+       {
+         for (j = 0; j < length; j += 2)
+           {
+             first_vect = dr_chain[j];
+             second_vect = dr_chain[j + 1];
+
+             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
+             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+                                              first_vect, first_vect,
+                                              perm2_mask1);
+             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+             vect[0] = data_ref;
+
+             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
+             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+                                              second_vect, second_vect,
+                                              perm2_mask2);
+             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+             vect[1] = data_ref;
+
+             data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
+             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+                                              vect[0], vect[1], shift1_mask);
+             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+             (*result_chain)[j/2 + length/2] = data_ref;
+
+             data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
+             perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+                                              vect[0], vect[1], select_mask);
+             vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+             (*result_chain)[j/2] = data_ref;
+           }
+         memcpy (dr_chain.address (), result_chain->address (),
+                 length * sizeof (tree));
+       }
+      return true;
+    }
+  if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
     {
-      for (j = 0; j < length; j += 2)
+      unsigned int k = 0, l = 0;
+
+      /* Generating permutation constant to get all elements in right order.
+        For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
+      for (i = 0; i < nelt; i++)
+       {
+         if (3 * k + (l % 3) >= nelt)
+           {
+             k = 0;
+             l += (3 - (nelt % 3));
+           }
+         sel[i] = 3 * k + (l % 3);
+         k++;
+       }
+      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
        {
-         first_vect = dr_chain[j];
-         second_vect = dr_chain[j+1];
-
-         /* data_ref = permute_even (first_data_ref, second_data_ref);  */
-         data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
-         perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
-                                                   first_vect, second_vect,
-                                                   perm_mask_even);
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "shuffle of 3 fields structure is not "
+                            "supported by target\n");
+         return false;
+       }
+      perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
+
+      /* Generating permutation constant to shift all elements.
+        For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
+      for (i = 0; i < nelt; i++)
+       sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
+      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "shift permutation is not supported by target\n");
+         return false;
+       }
+      shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
+
+      /* Generating permutation constant to shift all elements.
+        For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
+      for (i = 0; i < nelt; i++)
+       sel[i] = 2 * (nelt / 3) + 1 + i;
+      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "shift permutation is not supported by target\n");
+         return false;
+       }
+      shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
+
+      /* Generating permutation constant to shift all elements.
+        For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
+      for (i = 0; i < nelt; i++)
+       sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
+      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "shift permutation is not supported by target\n");
+         return false;
+       }
+      shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
+
+      /* Generating permutation constant to shift all elements.
+        For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
+      for (i = 0; i < nelt; i++)
+       sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
+      if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
+       {
+         if (dump_enabled_p ())
+           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
+                            "shift permutation is not supported by target\n");
+         return false;
+       }
+      shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
+
+      for (k = 0; k < 3; k++)
+       {
+         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
+         perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+                                          dr_chain[k], dr_chain[k],
+                                          perm3_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-         (*result_chain)[j/2] = data_ref;
+         vect[k] = data_ref;
+       }
+
+      for (k = 0; k < 3; k++)
+       {
+         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
+         perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+                                          vect[k % 3], vect[(k + 1) % 3],
+                                          shift1_mask);
+         vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+         vect_shift[k] = data_ref;
+       }
 
-         /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
-         data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
-         perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
-                                                   first_vect, second_vect,
-                                                   perm_mask_odd);
+      for (k = 0; k < 3; k++)
+       {
+         data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
+         perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
+                                          vect_shift[(4 - k) % 3],
+                                          vect_shift[(3 - k) % 3],
+                                          shift2_mask);
          vect_finish_stmt_generation (stmt, perm_stmt, gsi);
-         (*result_chain)[j/2+length/2] = data_ref;
+         vect[k] = data_ref;
        }
-      memcpy (dr_chain.address (), result_chain->address (),
-             length * sizeof (tree));
+
+      (*result_chain)[3 - (nelt % 3)] = vect[2];
+
+      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
+      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
+                                      vect[0], shift3_mask);
+      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+      (*result_chain)[nelt % 3] = data_ref;
+
+      data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
+      perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
+                                      vect[1], shift4_mask);
+      vect_finish_stmt_generation (stmt, perm_stmt, gsi);
+      (*result_chain)[0] = data_ref;
+      return true;
     }
+  return false;
 }
 
-
 /* Function vect_transform_grouped_load.
 
    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
@@ -4985,16 +5717,26 @@ vect_permute_load_chain (vec<tree> dr_chain,
 */
 
 void
-vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
+vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
                             gimple_stmt_iterator *gsi)
 {
+  machine_mode mode;
   vec<tree> result_chain = vNULL;
 
   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
      vectors, that are ready for vector computation.  */
   result_chain.create (size);
-  vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
+
+  /* If the reassociation width for the vector type is 2 or greater, the
+     target machine can execute 2 or more vector instructions in parallel.
+     Otherwise try to reorder the load group using
+     vect_shift_permute_load_chain.  */
+  mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
+  if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
+      || exact_log2 (size) != -1
+      || !vect_shift_permute_load_chain (dr_chain, size, stmt,
+                                        gsi, &result_chain))
+    vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
   vect_record_grouped_load_vectors (stmt, result_chain);
   result_chain.release ();
 }
@@ -5004,10 +5746,10 @@ vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
    for each vector to the associated scalar statement.  */
 
 void
-vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
+vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
 {
-  gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
-  gimple next_stmt, new_stmt;
+  gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
+  gimple *next_stmt, *new_stmt;
   unsigned int i, gap_count;
   tree tmp_data_ref;
 
@@ -5046,9 +5788,9 @@ vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
             {
               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
                 {
-                 gimple prev_stmt =
+                 gimple *prev_stmt =
                    STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
-                 gimple rel_stmt =
+                 gimple *rel_stmt =
                    STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
                  while (rel_stmt)
                    {
@@ -5084,30 +5826,8 @@ vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
   if (TREE_CODE (decl) != VAR_DECL)
     return false;
 
-  /* We cannot change alignment of common or external symbols as another
-     translation unit may contain a definition with lower alignment.  
-     The rules of common symbol linking mean that the definition
-     will override the common symbol.  The same is true for constant
-     pool entries which may be shared and are not properly merged
-     by LTO.  */
-  if (DECL_EXTERNAL (decl)
-      || DECL_COMMON (decl)
-      || DECL_IN_CONSTANT_POOL (decl))
-    return false;
-
-  if (TREE_ASM_WRITTEN (decl))
-    return false;
-
-  /* Do not override the alignment as specified by the ABI when the used
-     attribute is set.  */
-  if (DECL_PRESERVE_P (decl))
-    return false;
-
-  /* Do not override explicit alignment set by the user when an explicit
-     section name is also used.  This is a common idiom used by many
-     software projects.  */
-  if (DECL_SECTION_NAME (decl) != NULL_TREE
-      && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
+  if (decl_in_symtab_p (decl)
+      && !symtab_node::get (decl)->can_increase_alignment_p ())
     return false;
 
   if (TREE_STATIC (decl))
@@ -5127,10 +5847,10 @@ enum dr_alignment_support
 vect_supportable_dr_alignment (struct data_reference *dr,
                                bool check_aligned_accesses)
 {
-  gimple stmt = DR_STMT (dr);
+  gimple *stmt = DR_STMT (dr);
   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
-  enum machine_mode mode = TYPE_MODE (vectype);
+  machine_mode mode = TYPE_MODE (vectype);
   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
   struct loop *vect_loop = NULL;
   bool nested_in_vect_loop = false;