tree-vect-data-refs.c (vect_grouped_store_supported): Fix typo "permutaion".
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "memmodel.h"
32 #include "tm_p.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "cgraph.h"
36 #include "dumpfile.h"
37 #include "alias.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "tree-eh.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop.h"
47 #include "cfgloop.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "expr.h"
51 #include "builtins.h"
52 #include "params.h"
53 #include "tree-cfg.h"
54 #include "tree-hash-traits.h"
55 #include "vec-perm-indices.h"
56 #include "internal-fn.h"
57
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
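/* For example, querying "vec_load_lanes" with COUNT == 3 and a 16-byte
   vector type asks the target for an array mode that can hold three such
   vectors (48 bytes, or an integer mode of that size as a fallback) and
   then checks that the convert optab has a handler for that
   (array mode, vector mode) pair.  */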
60
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
64 {
65 machine_mode mode, array_mode;
66 bool limit_p;
67
68 mode = TYPE_MODE (vectype);
69 if (!targetm.array_mode (mode, count).exists (&array_mode))
70 {
71 poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
72 limit_p = !targetm.array_mode_supported_p (mode, count);
73 if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
74 {
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[%wu]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
80 }
81 }
82
83 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
84 {
85 if (dump_enabled_p ())
86 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
87 "cannot use %s<%s><%s>\n", name,
88 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
89 return false;
90 }
91
92 if (dump_enabled_p ())
93 dump_printf_loc (MSG_NOTE, vect_location,
94 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
95 GET_MODE_NAME (mode));
96
97 return true;
98 }
99
100
101 /* Return the smallest scalar part of STMT_INFO.
102 This is used to determine the vectype of the stmt. We generally set the
103 vectype according to the type of the result (lhs). For stmts whose
104 result-type is different than the type of the arguments (e.g., demotion,
105 promotion), vectype will be reset appropriately (later). Note that we have
106 to visit the smallest datatype in this function, because that determines the
107 VF. If the smallest datatype in the loop is present only as the rhs of a
108 promotion operation - we'd miss it.
109 Such a case, where a variable of this datatype does not appear in the lhs
110 anywhere in the loop, can only occur if it's an invariant: e.g.:
111 'int_x = (int) short_inv', which we'd expect to have been optimized away by
112 invariant motion. However, we cannot rely on invariant motion to always
113 take invariants out of the loop, and so in the case of promotion we also
114 have to check the rhs.
115 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
116 types. */
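/* For example, given
     short_x = ...;            (loaded in the loop)
     int_y = (int) short_x;
   the smallest scalar type is 'short', so the vectorization factor is
   chosen from the number of shorts per vector rather than the number of
   ints.  */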
117
118 tree
119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
120 HOST_WIDE_INT *lhs_size_unit,
121 HOST_WIDE_INT *rhs_size_unit)
122 {
123 tree scalar_type = gimple_expr_type (stmt_info->stmt);
124 HOST_WIDE_INT lhs, rhs;
125
126 /* During the analysis phase, this function is called on arbitrary
127 statements that might not have scalar results. */
128 if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
129 return scalar_type;
130
131 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
132
133 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
134 if (assign
135 && (gimple_assign_cast_p (assign)
136 || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
137 || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
138 || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
139 || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
140 || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
141 {
142 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
143
144 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
145 if (rhs < lhs)
146 scalar_type = rhs_type;
147 }
148
149 *lhs_size_unit = lhs;
150 *rhs_size_unit = rhs;
151 return scalar_type;
152 }
153
154
155 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
156 tested at run-time. Return TRUE if DDR was successfully inserted.
157 Return false if versioning is not supported. */
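/* For instance, for two references based on pointers p and q the recorded
   DDR later leads to a versioning test of roughly the form
     if (p + segment_len <= q || q + segment_len <= p)
       ... vectorized loop ...
     else
       ... scalar fallback ...
   where segment_len covers the bytes accessed by the vector loop.  */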
158
159 static bool
160 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
161 {
162 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
163
164 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
165 return false;
166
167 if (!runtime_alias_check_p (ddr, loop,
168 optimize_loop_nest_for_speed_p (loop)))
169 return false;
170
171 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
172 return true;
173 }
174
175 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero. */
176
177 static void
178 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
179 {
180 vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
181 for (unsigned int i = 0; i < checks.length(); ++i)
182 if (checks[i] == value)
183 return;
184
185 if (dump_enabled_p ())
186 {
187 dump_printf_loc (MSG_NOTE, vect_location, "need run-time check that ");
188 dump_generic_expr (MSG_NOTE, TDF_SLIM, value);
189 dump_printf (MSG_NOTE, " is nonzero\n");
190 }
191 LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
192 }
193
194 /* Return true if we know that the order of vectorized DR_INFO_A and
195 vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
196 DR_INFO_B. At least one of the accesses is a write. */
197
198 static bool
199 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
200 {
201 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
202 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
203
204 /* Single statements are always kept in their original order. */
205 if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
206 && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
207 return true;
208
209 /* STMT_A and STMT_B belong to overlapping groups. All loads in a
210 group are emitted at the position of the first scalar load and all
211 stores in a group are emitted at the position of the last scalar store.
212 Thus writes will happen no earlier than their current position
213 (but could happen later) while reads will happen no later than their
214 current position (but could happen earlier). Reordering is therefore
215 only possible if the first access is a write. */
216 stmtinfo_a = vect_orig_stmt (stmtinfo_a);
217 stmtinfo_b = vect_orig_stmt (stmtinfo_b);
218 stmt_vec_info earlier_stmt_info = get_earlier_stmt (stmtinfo_a, stmtinfo_b);
219 return !DR_IS_WRITE (STMT_VINFO_DATA_REF (earlier_stmt_info));
220 }
221
222 /* A subroutine of vect_analyze_data_ref_dependence. Handle
223 DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
224 distances. These distances are conservatively correct but they don't
225 reflect a guaranteed dependence.
226
227 Return true if this function does all the work necessary to avoid
228 an alias or false if the caller should use the dependence distances
229 to limit the vectorization factor in the usual way. LOOP_DEPTH is
230 the depth of the loop described by LOOP_VINFO and the other arguments
231 are as for vect_analyze_data_ref_dependence. */
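/* For example, if the only known dependence distance is 4, limiting the
   VF to 4 would be conservatively safe, but when the two bases cannot in
   fact overlap a run-time alias check lets us keep a larger VF, so the
   check is preferred whenever it is available.  */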
232
233 static bool
234 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
235 loop_vec_info loop_vinfo,
236 int loop_depth, unsigned int *max_vf)
237 {
238 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
239 lambda_vector dist_v;
240 unsigned int i;
241 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
242 {
243 int dist = dist_v[loop_depth];
244 if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
245 {
246 /* If the user asserted safelen >= DIST consecutive iterations
247 can be executed concurrently, assume independence.
248
249 ??? An alternative would be to add the alias check even
250 in this case, and vectorize the fallback loop with the
251 maximum VF set to safelen. However, if the user has
252 explicitly given a length, it's less likely that that
253 would be a win. */
254 if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
255 {
256 if ((unsigned int) loop->safelen < *max_vf)
257 *max_vf = loop->safelen;
258 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
259 continue;
260 }
261
262 /* For dependence distances of 2 or more, we have the option
263 of limiting VF or checking for an alias at runtime.
264 Prefer to check at runtime if we can, to avoid limiting
265 the VF unnecessarily when the bases are in fact independent.
266
267 Note that the alias checks will be removed if the VF ends up
268 being small enough. */
269 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
270 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
271 return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
272 && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
273 && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
274 }
275 }
276 return true;
277 }
278
279
280 /* Function vect_analyze_data_ref_dependence.
281
282 Return TRUE if there (might) exist a dependence between a memory-reference
283 DRA and a memory-reference DRB.  If the dependence can instead be
284 checked at run-time by versioning for alias, return FALSE.  Adjust *MAX_VF according to
285 the data dependence. */
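/* For example, the loop
     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;
   has a dependence distance of 3, so at most 3 consecutive iterations can
   be executed as one vector iteration and *MAX_VF is capped at 3.  */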
286
287 static bool
288 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
289 loop_vec_info loop_vinfo,
290 unsigned int *max_vf)
291 {
292 unsigned int i;
293 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
294 struct data_reference *dra = DDR_A (ddr);
295 struct data_reference *drb = DDR_B (ddr);
296 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
297 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
298 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
299 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
300 lambda_vector dist_v;
301 unsigned int loop_depth;
302
303 /* In loop analysis all data references should be vectorizable. */
304 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
305 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
306 gcc_unreachable ();
307
308 /* Independent data accesses. */
309 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
310 return false;
311
312 if (dra == drb
313 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
314 return false;
315
316 /* We do not have to consider dependences between accesses that belong
317 to the same group, unless the stride could be smaller than the
318 group size. */
319 if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
320 && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
321 == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
322 && !STMT_VINFO_STRIDED_P (stmtinfo_a))
323 return false;
324
325 /* Even if we have an anti-dependence then, as the vectorized loop covers at
326 least two scalar iterations, there is always also a true dependence.
327 As the vectorizer does not re-order loads and stores we can ignore
328 the anti-dependence if TBAA can disambiguate both DRs similar to the
329 case with known negative distance anti-dependences (positive
330 distance anti-dependences would violate TBAA constraints). */
331 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
332 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
333 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
334 get_alias_set (DR_REF (drb))))
335 return false;
336
337 /* Unknown data dependence. */
338 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
339 {
340 /* If user asserted safelen consecutive iterations can be
341 executed concurrently, assume independence. */
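/* E.g. '#pragma omp simd safelen(8)' sets loop->safelen to 8, asserting
   that up to 8 consecutive iterations may run concurrently, so a VF of
   up to 8 is usable even though independence cannot be proved here.  */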
342 if (loop->safelen >= 2)
343 {
344 if ((unsigned int) loop->safelen < *max_vf)
345 *max_vf = loop->safelen;
346 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
347 return false;
348 }
349
350 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
351 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
352 {
353 if (dump_enabled_p ())
354 {
355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
356 "versioning for alias not supported for: "
357 "can't determine dependence between ");
358 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
359 DR_REF (dra));
360 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
361 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
362 DR_REF (drb));
363 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
364 }
365 return true;
366 }
367
368 if (dump_enabled_p ())
369 {
370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
371 "versioning for alias required: "
372 "can't determine dependence between ");
373 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
374 DR_REF (dra));
375 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
376 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
377 DR_REF (drb));
378 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
379 }
380
381 /* Add to list of ddrs that need to be tested at run-time. */
382 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
383 }
384
385 /* Known data dependence. */
386 if (DDR_NUM_DIST_VECTS (ddr) == 0)
387 {
388 /* If user asserted safelen consecutive iterations can be
389 executed concurrently, assume independence. */
390 if (loop->safelen >= 2)
391 {
392 if ((unsigned int) loop->safelen < *max_vf)
393 *max_vf = loop->safelen;
394 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
395 return false;
396 }
397
398 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
399 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
400 {
401 if (dump_enabled_p ())
402 {
403 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
404 "versioning for alias not supported for: "
405 "bad dist vector for ");
406 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
407 DR_REF (dra));
408 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
409 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
410 DR_REF (drb));
411 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
412 }
413 return true;
414 }
415
416 if (dump_enabled_p ())
417 {
418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
419 "versioning for alias required: "
420 "bad dist vector for ");
421 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
422 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
423 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
424 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
425 }
426 /* Add to list of ddrs that need to be tested at run-time. */
427 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
428 }
429
430 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
431
432 if (DDR_COULD_BE_INDEPENDENT_P (ddr)
433 && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
434 loop_depth, max_vf))
435 return false;
436
437 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
438 {
439 int dist = dist_v[loop_depth];
440
441 if (dump_enabled_p ())
442 dump_printf_loc (MSG_NOTE, vect_location,
443 "dependence distance = %d.\n", dist);
444
445 if (dist == 0)
446 {
447 if (dump_enabled_p ())
448 {
449 dump_printf_loc (MSG_NOTE, vect_location,
450 "dependence distance == 0 between ");
451 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
452 dump_printf (MSG_NOTE, " and ");
453 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
454 dump_printf (MSG_NOTE, "\n");
455 }
456
457 /* When we perform grouped accesses and perform implicit CSE
458 by detecting equal accesses and doing disambiguation with
459 runtime alias tests like for
460 .. = a[i];
461 .. = a[i+1];
462 a[i] = ..;
463 a[i+1] = ..;
464 *p = ..;
465 .. = a[i];
466 .. = a[i+1];
467 where we will end up loading { a[i], a[i+1] } once, make
468 sure that inserting group loads before the first load and
469 stores after the last store will do the right thing.
470 Similar for groups like
471 a[i] = ...;
472 ... = a[i];
473 a[i+1] = ...;
474 where loads from the group interleave with the store. */
475 if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
476 {
477 if (dump_enabled_p ())
478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
479 "READ_WRITE dependence in interleaving.\n");
480 return true;
481 }
482
483 if (loop->safelen < 2)
484 {
485 tree indicator = dr_zero_step_indicator (dra);
486 if (!indicator || integer_zerop (indicator))
487 {
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
490 "access also has a zero step\n");
491 return true;
492 }
493 else if (TREE_CODE (indicator) != INTEGER_CST)
494 vect_check_nonzero_value (loop_vinfo, indicator);
495 }
496 continue;
497 }
498
499 if (dist > 0 && DDR_REVERSED_P (ddr))
500 {
501 /* If DDR_REVERSED_P the order of the data-refs in DDR was
502 reversed (to make distance vector positive), and the actual
503 distance is negative. */
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
506 "dependence distance negative.\n");
507 /* Record a negative dependence distance to later limit the
508 amount of stmt copying / unrolling we can perform.
509 Only need to handle read-after-write dependence. */
510 if (DR_IS_READ (drb)
511 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
512 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
513 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
514 continue;
515 }
516
517 unsigned int abs_dist = abs (dist);
518 if (abs_dist >= 2 && abs_dist < *max_vf)
519 {
520 /* The dependence distance requires reduction of the maximal
521 vectorization factor. */
522 *max_vf = abs (dist);
523 if (dump_enabled_p ())
524 dump_printf_loc (MSG_NOTE, vect_location,
525 "adjusting maximal vectorization factor to %i\n",
526 *max_vf);
527 }
528
529 if (abs_dist >= *max_vf)
530 {
531 /* Dependence distance does not create dependence, as far as
532 vectorization is concerned, in this case. */
533 if (dump_enabled_p ())
534 dump_printf_loc (MSG_NOTE, vect_location,
535 "dependence distance >= VF.\n");
536 continue;
537 }
538
539 if (dump_enabled_p ())
540 {
541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
542 "not vectorized, possible dependence "
543 "between data-refs ");
544 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
545 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
546 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
547 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
548 }
549
550 return true;
551 }
552
553 return false;
554 }
555
556 /* Function vect_analyze_data_ref_dependences.
557
558 Examine all the data references in the loop, and make sure there do not
559 exist any data dependences between them. Set *MAX_VF according to
560 the maximum vectorization factor the data dependences allow. */
561
562 bool
563 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
564 unsigned int *max_vf)
565 {
566 unsigned int i;
567 struct data_dependence_relation *ddr;
568
569 DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
570
571 if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
572 {
573 LOOP_VINFO_DDRS (loop_vinfo)
574 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
575 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
576 /* We need read-read dependences to compute
577 STMT_VINFO_SAME_ALIGN_REFS. */
578 bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
579 &LOOP_VINFO_DDRS (loop_vinfo),
580 LOOP_VINFO_LOOP_NEST (loop_vinfo),
581 true);
582 gcc_assert (res);
583 }
584
585 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
586
587 /* For epilogues we either have no aliases or alias versioning
588 was applied to original loop. Therefore we may just get max_vf
589 using VF of original loop. */
590 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
591 *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
592 else
593 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
594 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
595 return false;
596
597 return true;
598 }
599
600
601 /* Function vect_slp_analyze_data_ref_dependence.
602
603 Return TRUE if there (might) exist a dependence between a memory-reference
604 DRA and a memory-reference DRB for VINFO.  Return FALSE if the accesses
605 are known to be independent.  Unlike the loop analysis, no run-time
606 alias versioning is attempted here.  */
607
608 static bool
609 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
610 struct data_dependence_relation *ddr)
611 {
612 struct data_reference *dra = DDR_A (ddr);
613 struct data_reference *drb = DDR_B (ddr);
614 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
615 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
616
617 /* We need to check dependences of statements marked as unvectorizable
618 as well, they still can prohibit vectorization. */
619
620 /* Independent data accesses. */
621 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
622 return false;
623
624 if (dra == drb)
625 return false;
626
627 /* Read-read is OK. */
628 if (DR_IS_READ (dra) && DR_IS_READ (drb))
629 return false;
630
631 /* If dra and drb are part of the same interleaving chain consider
632 them independent. */
633 if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
634 && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
635 == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
636 return false;
637
638 /* Unknown data dependence. */
639 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
640 {
641 if (dump_enabled_p ())
642 {
643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
644 "can't determine dependence between ");
645 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
646 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
647 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
648 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
649 }
650 }
651 else if (dump_enabled_p ())
652 {
653 dump_printf_loc (MSG_NOTE, vect_location,
654 "determined dependence between ");
655 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
656 dump_printf (MSG_NOTE, " and ");
657 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
658 dump_printf (MSG_NOTE, "\n");
659 }
660
661 return true;
662 }
663
664
665 /* Analyze dependences involved in the transform of SLP NODE. STORES
666 contains the vector of scalar stores of this instance if we are
667 disambiguating the loads. */
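/* For example, for an SLP store group { a[0] = x; a[1] = y; } every
   statement with a virtual operand between each scalar store and the last
   store of the group is checked, because the vectorized store is emitted
   at the position of the last scalar store and must not be moved across a
   conflicting access.  */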
668
669 static bool
670 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
671 vec<stmt_vec_info> stores,
672 stmt_vec_info last_store_info)
673 {
674 /* This walks over all stmts involved in the SLP load/store done
675 in NODE verifying we can sink them up to the last stmt in the
676 group. */
677 stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
678 vec_info *vinfo = last_access_info->vinfo;
679 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
680 {
681 stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
682 if (access_info == last_access_info)
683 continue;
684 data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
685 ao_ref ref;
686 bool ref_initialized_p = false;
687 for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
688 gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
689 {
690 gimple *stmt = gsi_stmt (gsi);
691 if (! gimple_vuse (stmt)
692 || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
693 continue;
694
695 /* If we couldn't record a (single) data reference for this
696 stmt we have to resort to the alias oracle. */
697 stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
698 data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
699 if (!dr_b)
700 {
701 /* We are moving a store or sinking a load - this means
702 we cannot use TBAA for disambiguation. */
703 if (!ref_initialized_p)
704 ao_ref_init (&ref, DR_REF (dr_a));
705 if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
706 || ref_maybe_used_by_stmt_p (stmt, &ref, false))
707 return false;
708 continue;
709 }
710
711 bool dependent = false;
712 /* If we run into a store of this same instance (we've just
713 marked those) then delay dependence checking until we run
714 into the last store because this is where it will have
715 been sunk to (and we verify if we can do that as well). */
716 if (gimple_visited_p (stmt))
717 {
718 if (stmt_info != last_store_info)
719 continue;
720 unsigned i;
721 stmt_vec_info store_info;
722 FOR_EACH_VEC_ELT (stores, i, store_info)
723 {
724 data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
725 ddr_p ddr = initialize_data_dependence_relation
726 (dr_a, store_dr, vNULL);
727 dependent
728 = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
729 free_dependence_relation (ddr);
730 if (dependent)
731 break;
732 }
733 }
734 else
735 {
736 ddr_p ddr = initialize_data_dependence_relation (dr_a,
737 dr_b, vNULL);
738 dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
739 free_dependence_relation (ddr);
740 }
741 if (dependent)
742 return false;
743 }
744 }
745 return true;
746 }
747
748
749 /* Function vect_slp_analyze_instance_dependence.
750 
751 Examine all the data references in the SLP instance, and make sure there
752 do not exist any data dependences between them that would prevent the
753 instance from being vectorized.  */
754
755 bool
756 vect_slp_analyze_instance_dependence (slp_instance instance)
757 {
758 DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
759
760 /* The stores of this instance are at the root of the SLP tree. */
761 slp_tree store = SLP_INSTANCE_TREE (instance);
762 if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
763 store = NULL;
764
765 /* Verify we can sink stores to the vectorized stmt insert location. */
766 stmt_vec_info last_store_info = NULL;
767 if (store)
768 {
769 if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
770 return false;
771
772 /* Mark stores in this instance and remember the last one. */
773 last_store_info = vect_find_last_scalar_stmt_in_slp (store);
774 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
775 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
776 }
777
778 bool res = true;
779
780 /* Verify we can sink loads to the vectorized stmt insert location,
781 special-casing stores of this instance. */
782 slp_tree load;
783 unsigned int i;
784 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
785 if (! vect_slp_analyze_node_dependences (instance, load,
786 store
787 ? SLP_TREE_SCALAR_STMTS (store)
788 : vNULL, last_store_info))
789 {
790 res = false;
791 break;
792 }
793
794 /* Unset the visited flag. */
795 if (store)
796 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
797 gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
798
799 return res;
800 }
801
802 /* Record the base alignment guarantee given by DRB, which occurs
803 in STMT_INFO. */
804
805 static void
806 vect_record_base_alignment (stmt_vec_info stmt_info,
807 innermost_loop_behavior *drb)
808 {
809 vec_info *vinfo = stmt_info->vinfo;
810 bool existed;
811 innermost_loop_behavior *&entry
812 = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
813 if (!existed || entry->base_alignment < drb->base_alignment)
814 {
815 entry = drb;
816 if (dump_enabled_p ())
817 {
818 dump_printf_loc (MSG_NOTE, vect_location,
819 "recording new base alignment for ");
820 dump_generic_expr (MSG_NOTE, TDF_SLIM, drb->base_address);
821 dump_printf (MSG_NOTE, "\n");
822 dump_printf_loc (MSG_NOTE, vect_location,
823 " alignment: %d\n", drb->base_alignment);
824 dump_printf_loc (MSG_NOTE, vect_location,
825 " misalignment: %d\n", drb->base_misalignment);
826 dump_printf_loc (MSG_NOTE, vect_location,
827 " based on: ");
828 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
829 }
830 }
831 }
832
833 /* If the region we're going to vectorize is reached, all unconditional
834 data references occur at least once. We can therefore pool the base
835 alignment guarantees from each unconditional reference. Do this by
836 going through all the data references in VINFO and checking whether
837 the containing statement makes the reference unconditionally. If so,
838 record the alignment of the base address in VINFO so that it can be
839 used for all other references with the same base. */
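/* For example, if the region unconditionally executes '*p = ...' and that
   reference guarantees a 32-byte aligned base, other references sharing
   the same base address expression can reuse the 32-byte guarantee even
   when their own statements only imply a weaker one.  */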
840
841 void
842 vect_record_base_alignments (vec_info *vinfo)
843 {
844 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
845 struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
846 data_reference *dr;
847 unsigned int i;
848 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
849 {
850 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
851 stmt_vec_info stmt_info = dr_info->stmt;
852 if (!DR_IS_CONDITIONAL_IN_STMT (dr)
853 && STMT_VINFO_VECTORIZABLE (stmt_info)
854 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
855 {
856 vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
857
858 /* If DR is nested in the loop that is being vectorized, we can also
859 record the alignment of the base wrt the outer loop. */
860 if (loop && nested_in_vect_loop_p (loop, stmt_info))
861 vect_record_base_alignment
862 (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
863 }
864 }
865 }
866
867 /* Return the target alignment for the vectorized form of DR_INFO. */
868
869 static unsigned int
870 vect_calculate_target_alignment (dr_vec_info *dr_info)
871 {
872 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
873 return targetm.vectorize.preferred_vector_alignment (vectype);
874 }
875
876 /* Function vect_compute_data_ref_alignment
877
878 Compute the misalignment of the data reference DR_INFO.
879
880 Output:
881 1. DR_MISALIGNMENT (DR_INFO) is defined.
882
883 FOR NOW: No analysis is actually performed. Misalignment is calculated
884 only for trivial cases. TODO. */
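/* As a concrete example, a reference whose pooled base misalignment is 4,
   whose constant init offset is 12 and whose target alignment is 16 bytes
   gets DR_MISALIGNMENT == (4 + 12) % 16 == 0, i.e. it is recorded as
   aligned.  */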
885
886 static void
887 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
888 {
889 stmt_vec_info stmt_info = dr_info->stmt;
890 vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
891 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
892 struct loop *loop = NULL;
893 tree ref = DR_REF (dr_info->dr);
894 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
895
896 if (dump_enabled_p ())
897 dump_printf_loc (MSG_NOTE, vect_location,
898 "vect_compute_data_ref_alignment:\n");
899
900 if (loop_vinfo)
901 loop = LOOP_VINFO_LOOP (loop_vinfo);
902
903 /* Initialize misalignment to unknown. */
904 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
905
906 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
907 return;
908
909 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
910 bool step_preserves_misalignment_p;
911
912 unsigned HOST_WIDE_INT vector_alignment
913 = vect_calculate_target_alignment (dr_info) / BITS_PER_UNIT;
914 DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
915
916 /* No step for BB vectorization. */
917 if (!loop)
918 {
919 gcc_assert (integer_zerop (drb->step));
920 step_preserves_misalignment_p = true;
921 }
922
923 /* In case the dataref is in an inner-loop of the loop that is being
924 vectorized (LOOP), we use the base and misalignment information
925 relative to the outer-loop (LOOP). This is ok only if the misalignment
926 stays the same throughout the execution of the inner-loop, which is why
927 we have to check that the stride of the dataref in the inner-loop is a
928 multiple of the vector alignment. */
929 else if (nested_in_vect_loop_p (loop, stmt_info))
930 {
931 step_preserves_misalignment_p
932 = (DR_STEP_ALIGNMENT (dr_info->dr) % vector_alignment) == 0;
933
934 if (dump_enabled_p ())
935 {
936 if (step_preserves_misalignment_p)
937 dump_printf_loc (MSG_NOTE, vect_location,
938 "inner step divides the vector alignment.\n");
939 else
940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
941 "inner step doesn't divide the vector"
942 " alignment.\n");
943 }
944 }
945
946 /* Similarly we can only use base and misalignment information relative to
947 an innermost loop if the misalignment stays the same throughout the
948 execution of the loop. As above, this is the case if the stride of the
949 dataref times the vectorization factor is a multiple of the alignment. */
950 else
951 {
952 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
953 step_preserves_misalignment_p
954 = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vector_alignment);
955
956 if (!step_preserves_misalignment_p && dump_enabled_p ())
957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
958 "step doesn't divide the vector alignment.\n");
959 }
960
961 unsigned int base_alignment = drb->base_alignment;
962 unsigned int base_misalignment = drb->base_misalignment;
963
964 /* Calculate the maximum of the pooled base address alignment and the
965 alignment that we can compute for DR itself. */
966 innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
967 if (entry && base_alignment < (*entry)->base_alignment)
968 {
969 base_alignment = (*entry)->base_alignment;
970 base_misalignment = (*entry)->base_misalignment;
971 }
972
973 if (drb->offset_alignment < vector_alignment
974 || !step_preserves_misalignment_p
975 /* We need to know whether the step wrt the vectorized loop is
976 negative when computing the starting misalignment below. */
977 || TREE_CODE (drb->step) != INTEGER_CST)
978 {
979 if (dump_enabled_p ())
980 {
981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
982 "Unknown alignment for access: ");
983 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
984 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
985 }
986 return;
987 }
988
989 if (base_alignment < vector_alignment)
990 {
991 unsigned int max_alignment;
992 tree base = get_base_for_alignment (drb->base_address, &max_alignment);
993 if (max_alignment < vector_alignment
994 || !vect_can_force_dr_alignment_p (base,
995 vector_alignment * BITS_PER_UNIT))
996 {
997 if (dump_enabled_p ())
998 {
999 dump_printf_loc (MSG_NOTE, vect_location,
1000 "can't force alignment of ref: ");
1001 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1002 dump_printf (MSG_NOTE, "\n");
1003 }
1004 return;
1005 }
1006
1007 /* Force the alignment of the decl.
1008 NOTE: This is the only change to the code we make during
1009 the analysis phase, before deciding to vectorize the loop. */
1010 if (dump_enabled_p ())
1011 {
1012 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
1013 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
1014 dump_printf (MSG_NOTE, "\n");
1015 }
1016
1017 dr_info->base_decl = base;
1018 dr_info->base_misaligned = true;
1019 base_misalignment = 0;
1020 }
1021 poly_int64 misalignment
1022 = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1023
1024 /* If this is a backward running DR then first access in the larger
1025 vectype actually is N-1 elements before the address in the DR.
1026 Adjust misalign accordingly. */
1027 if (tree_int_cst_sgn (drb->step) < 0)
1028 /* PLUS because STEP is negative. */
1029 misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1030 * TREE_INT_CST_LOW (drb->step));
1031
1032 unsigned int const_misalignment;
1033 if (!known_misalignment (misalignment, vector_alignment,
1034 &const_misalignment))
1035 {
1036 if (dump_enabled_p ())
1037 {
1038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1039 "Non-constant misalignment for access: ");
1040 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1041 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1042 }
1043 return;
1044 }
1045
1046 SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1047
1048 if (dump_enabled_p ())
1049 {
1050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1051 "misalign = %d bytes of ref ",
1052 DR_MISALIGNMENT (dr_info));
1053 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1054 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1055 }
1056
1057 return;
1058 }
1059
1060 /* Function vect_update_misalignment_for_peel.
1061 Sets DR_INFO's misalignment
1062 - to 0 if it has the same alignment as DR_PEEL_INFO,
1063 - to the misalignment computed using NPEEL if DR_INFO's misalignment is known,
1064 - to -1 (unknown) otherwise.
1065
1066 DR_INFO - the data reference whose misalignment is to be adjusted.
1067 DR_PEEL_INFO - the data reference whose misalignment is being made
1068 zero in the vector loop by the peel.
1069 NPEEL - the number of iterations in the peel loop if the misalignment
1070 of DR_PEEL_INFO is known at compile time. */
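/* For example, peeling NPEEL == 2 iterations for a reference with element
   size 4, known misalignment 8 and target alignment 16 gives
   (8 + 2 * 4) & (16 - 1) == 0, so the reference becomes aligned.  */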
1071
1072 static void
1073 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1074 dr_vec_info *dr_peel_info, int npeel)
1075 {
1076 unsigned int i;
1077 vec<dr_p> same_aligned_drs;
1078 struct data_reference *current_dr;
1079 int dr_size = vect_get_scalar_dr_size (dr_info);
1080 int dr_peel_size = vect_get_scalar_dr_size (dr_peel_info);
1081 stmt_vec_info stmt_info = dr_info->stmt;
1082 stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1083
1084 /* For interleaved data accesses the step in the loop must be multiplied by
1085 the size of the interleaving group. */
1086 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1087 dr_size *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
1088 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1089 dr_peel_size *= DR_GROUP_SIZE (peel_stmt_info);
1090
1091 /* It can be assumed that the data refs with the same alignment as dr_peel
1092 are aligned in the vector loop. */
1093 same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1094 FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1095 {
1096 if (current_dr != dr_info->dr)
1097 continue;
1098 gcc_assert (!known_alignment_for_access_p (dr_info)
1099 || !known_alignment_for_access_p (dr_peel_info)
1100 || (DR_MISALIGNMENT (dr_info) / dr_size
1101 == DR_MISALIGNMENT (dr_peel_info) / dr_peel_size));
1102 SET_DR_MISALIGNMENT (dr_info, 0);
1103 return;
1104 }
1105
1106 if (known_alignment_for_access_p (dr_info)
1107 && known_alignment_for_access_p (dr_peel_info))
1108 {
1109 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1110 size_zero_node) < 0;
1111 int misal = DR_MISALIGNMENT (dr_info);
1112 misal += negative ? -npeel * dr_size : npeel * dr_size;
1113 misal &= DR_TARGET_ALIGNMENT (dr_info) - 1;
1114 SET_DR_MISALIGNMENT (dr_info, misal);
1115 return;
1116 }
1117
1118 if (dump_enabled_p ())
1119 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1120 "to unknown (-1).\n");
1121 SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1122 }
1123
1124
1125 /* Function verify_data_ref_alignment
1126
1127 Return TRUE if DR_INFO can be handled with respect to alignment. */
1128
1129 static bool
1130 verify_data_ref_alignment (dr_vec_info *dr_info)
1131 {
1132 enum dr_alignment_support supportable_dr_alignment
1133 = vect_supportable_dr_alignment (dr_info, false);
1134 if (!supportable_dr_alignment)
1135 {
1136 if (dump_enabled_p ())
1137 {
1138 if (DR_IS_READ (dr_info->dr))
1139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1140 "not vectorized: unsupported unaligned load.");
1141 else
1142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1143 "not vectorized: unsupported unaligned "
1144 "store.");
1145
1146 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1147 DR_REF (dr_info->dr));
1148 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1149 }
1150 return false;
1151 }
1152
1153 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1154 dump_printf_loc (MSG_NOTE, vect_location,
1155 "Vectorizing an unaligned access.\n");
1156
1157 return true;
1158 }
1159
1160 /* Function vect_verify_datarefs_alignment
1161
1162 Return TRUE if all data references in the loop can be
1163 handled with respect to alignment. */
1164
1165 bool
1166 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1167 {
1168 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1169 struct data_reference *dr;
1170 unsigned int i;
1171
1172 FOR_EACH_VEC_ELT (datarefs, i, dr)
1173 {
1174 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1175 stmt_vec_info stmt_info = dr_info->stmt;
1176
1177 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1178 continue;
1179
1180 /* For interleaving, only the alignment of the first access matters. */
1181 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1182 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1183 continue;
1184
1185 /* Strided accesses perform only component accesses, alignment is
1186 irrelevant for them. */
1187 if (STMT_VINFO_STRIDED_P (stmt_info)
1188 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1189 continue;
1190
1191 if (! verify_data_ref_alignment (dr_info))
1192 return false;
1193 }
1194
1195 return true;
1196 }
1197
1198 /* Given a memory reference EXP, return whether its alignment is less
1199 than its size. */
1200
1201 static bool
1202 not_size_aligned (tree exp)
1203 {
1204 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1205 return true;
1206
1207 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1208 > get_object_alignment (exp));
1209 }
1210
1211 /* Function vector_alignment_reachable_p
1212
1213 Return true if vector alignment for DR_INFO is reachable by peeling
1214 a few loop iterations. Return false otherwise. */
1215
1216 static bool
1217 vector_alignment_reachable_p (dr_vec_info *dr_info)
1218 {
1219 stmt_vec_info stmt_info = dr_info->stmt;
1220 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1221
1222 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1223 {
1224 /* For interleaved access we peel only if the number of iterations in
1225 the prolog loop ({VF - misalignment}) is a multiple of the
1226 interleaving group size. */
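/* E.g. with four-element vectors, a misalignment of one element and a
   group size of 2, the prolog would need 4 - 1 == 3 scalar iterations,
   which is not a multiple of 2, so peeling cannot align this group.  */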
1227 int elem_size, mis_in_elements;
1228
1229 /* FORNOW: handle only known alignment. */
1230 if (!known_alignment_for_access_p (dr_info))
1231 return false;
1232
1233 poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1234 poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1235 elem_size = vector_element_size (vector_size, nelements);
1236 mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1237
1238 if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1239 return false;
1240 }
1241
1242 /* If misalignment is known at the compile time then allow peeling
1243 only if natural alignment is reachable through peeling. */
1244 if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1245 {
1246 HOST_WIDE_INT elmsize =
1247 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1248 if (dump_enabled_p ())
1249 {
1250 dump_printf_loc (MSG_NOTE, vect_location,
1251 "data size = %wd. misalignment = %d.\n", elmsize,
1252 DR_MISALIGNMENT (dr_info));
1253 }
1254 if (DR_MISALIGNMENT (dr_info) % elmsize)
1255 {
1256 if (dump_enabled_p ())
1257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258 "data size does not divide the misalignment.\n");
1259 return false;
1260 }
1261 }
1262
1263 if (!known_alignment_for_access_p (dr_info))
1264 {
1265 tree type = TREE_TYPE (DR_REF (dr_info->dr));
1266 bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1267 if (dump_enabled_p ())
1268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1269 "Unknown misalignment, %snaturally aligned\n",
1270 is_packed ? "not " : "");
1271 return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1272 }
1273
1274 return true;
1275 }
1276
1277
1278 /* Calculate the cost of the memory access represented by DR_INFO. */
1279
1280 static void
1281 vect_get_data_access_cost (dr_vec_info *dr_info,
1282 unsigned int *inside_cost,
1283 unsigned int *outside_cost,
1284 stmt_vector_for_cost *body_cost_vec,
1285 stmt_vector_for_cost *prologue_cost_vec)
1286 {
1287 stmt_vec_info stmt_info = dr_info->stmt;
1288 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1289 int ncopies;
1290
1291 if (PURE_SLP_STMT (stmt_info))
1292 ncopies = 1;
1293 else
1294 ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1295
1296 if (DR_IS_READ (dr_info->dr))
1297 vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1298 prologue_cost_vec, body_cost_vec, false);
1299 else
1300 vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1301
1302 if (dump_enabled_p ())
1303 dump_printf_loc (MSG_NOTE, vect_location,
1304 "vect_get_data_access_cost: inside_cost = %d, "
1305 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1306 }
1307
1308
1309 typedef struct _vect_peel_info
1310 {
1311 dr_vec_info *dr_info;
1312 int npeel;
1313 unsigned int count;
1314 } *vect_peel_info;
1315
1316 typedef struct _vect_peel_extended_info
1317 {
1318 struct _vect_peel_info peel_info;
1319 unsigned int inside_cost;
1320 unsigned int outside_cost;
1321 } *vect_peel_extended_info;
1322
1323
1324 /* Peeling hashtable helpers. */
1325
1326 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1327 {
1328 static inline hashval_t hash (const _vect_peel_info *);
1329 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1330 };
1331
1332 inline hashval_t
1333 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1334 {
1335 return (hashval_t) peel_info->npeel;
1336 }
1337
1338 inline bool
1339 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1340 {
1341 return (a->npeel == b->npeel);
1342 }
1343
1344
1345 /* Insert DR_INFO into peeling hash table with NPEEL as key. */
1346
1347 static void
1348 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1349 loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1350 int npeel)
1351 {
1352 struct _vect_peel_info elem, *slot;
1353 _vect_peel_info **new_slot;
1354 bool supportable_dr_alignment
1355 = vect_supportable_dr_alignment (dr_info, true);
1356
1357 elem.npeel = npeel;
1358 slot = peeling_htab->find (&elem);
1359 if (slot)
1360 slot->count++;
1361 else
1362 {
1363 slot = XNEW (struct _vect_peel_info);
1364 slot->npeel = npeel;
1365 slot->dr_info = dr_info;
1366 slot->count = 1;
1367 new_slot = peeling_htab->find_slot (slot, INSERT);
1368 *new_slot = slot;
1369 }
1370
1371 if (!supportable_dr_alignment
1372 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1373 slot->count += VECT_MAX_COST;
1374 }
1375
1376
1377 /* Traverse peeling hash table to find peeling option that aligns maximum
1378 number of data accesses. */
1379
1380 int
1381 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1382 _vect_peel_extended_info *max)
1383 {
1384 vect_peel_info elem = *slot;
1385
1386 if (elem->count > max->peel_info.count
1387 || (elem->count == max->peel_info.count
1388 && max->peel_info.npeel > elem->npeel))
1389 {
1390 max->peel_info.npeel = elem->npeel;
1391 max->peel_info.count = elem->count;
1392 max->peel_info.dr_info = elem->dr_info;
1393 }
1394
1395 return 1;
1396 }
1397
1398 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1399 data access costs for all data refs. If UNKNOWN_MISALIGNMENT is true,
1400 we assume DR0_INFO's misalignment will be zero after peeling. */
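/* The costs are gathered by temporarily giving each data reference the
   misalignment it would have after the peel (zero for DR0_INFO itself
   when UNKNOWN_MISALIGNMENT), querying the access cost, and then
   restoring the real misalignment, so nothing is changed permanently.  */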
1401
1402 static void
1403 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1404 dr_vec_info *dr0_info,
1405 unsigned int *inside_cost,
1406 unsigned int *outside_cost,
1407 stmt_vector_for_cost *body_cost_vec,
1408 stmt_vector_for_cost *prologue_cost_vec,
1409 unsigned int npeel,
1410 bool unknown_misalignment)
1411 {
1412 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1413 unsigned i;
1414 data_reference *dr;
1415
1416 FOR_EACH_VEC_ELT (datarefs, i, dr)
1417 {
1418 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1419 stmt_vec_info stmt_info = dr_info->stmt;
1420 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1421 continue;
1422
1423 /* For interleaving, only the alignment of the first access
1424 matters. */
1425 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1426 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1427 continue;
1428
1429 /* Strided accesses perform only component accesses, alignment is
1430 irrelevant for them. */
1431 if (STMT_VINFO_STRIDED_P (stmt_info)
1432 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1433 continue;
1434
1435 int save_misalignment;
1436 save_misalignment = DR_MISALIGNMENT (dr_info);
1437 if (npeel == 0)
1438 ;
1439 else if (unknown_misalignment && dr_info == dr0_info)
1440 SET_DR_MISALIGNMENT (dr_info, 0);
1441 else
1442 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1443 vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1444 body_cost_vec, prologue_cost_vec);
1445 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1446 }
1447 }
1448
1449 /* Traverse peeling hash table and calculate cost for each peeling option.
1450 Find the one with the lowest cost. */
1451
1452 int
1453 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1454 _vect_peel_extended_info *min)
1455 {
1456 vect_peel_info elem = *slot;
1457 int dummy;
1458 unsigned int inside_cost = 0, outside_cost = 0;
1459 stmt_vec_info stmt_info = elem->dr_info->stmt;
1460 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1461 stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1462 epilogue_cost_vec;
1463
1464 prologue_cost_vec.create (2);
1465 body_cost_vec.create (2);
1466 epilogue_cost_vec.create (2);
1467
1468 vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1469 &outside_cost, &body_cost_vec,
1470 &prologue_cost_vec, elem->npeel, false);
1471
1472 body_cost_vec.release ();
1473
1474 outside_cost += vect_get_known_peeling_cost
1475 (loop_vinfo, elem->npeel, &dummy,
1476 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1477 &prologue_cost_vec, &epilogue_cost_vec);
1478
1479 /* Prologue and epilogue costs are added to the target model later.
1480 These costs depend only on the scalar iteration cost, the
1481 number of peeling iterations finally chosen, and the number of
1482 misaligned statements. So discard the information found here. */
1483 prologue_cost_vec.release ();
1484 epilogue_cost_vec.release ();
1485
1486 if (inside_cost < min->inside_cost
1487 || (inside_cost == min->inside_cost
1488 && outside_cost < min->outside_cost))
1489 {
1490 min->inside_cost = inside_cost;
1491 min->outside_cost = outside_cost;
1492 min->peel_info.dr_info = elem->dr_info;
1493 min->peel_info.npeel = elem->npeel;
1494 min->peel_info.count = elem->count;
1495 }
1496
1497 return 1;
1498 }
1499
1500
1501 /* Choose best peeling option by traversing peeling hash table and either
1502 choosing an option with the lowest cost (if cost model is enabled) or the
1503 option that aligns as many accesses as possible. */
1504
1505 static struct _vect_peel_extended_info
1506 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1507 loop_vec_info loop_vinfo)
1508 {
1509 struct _vect_peel_extended_info res;
1510
1511 res.peel_info.dr_info = NULL;
1512
1513 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1514 {
1515 res.inside_cost = INT_MAX;
1516 res.outside_cost = INT_MAX;
1517 peeling_htab->traverse <_vect_peel_extended_info *,
1518 vect_peeling_hash_get_lowest_cost> (&res);
1519 }
1520 else
1521 {
1522 res.peel_info.count = 0;
1523 peeling_htab->traverse <_vect_peel_extended_info *,
1524 vect_peeling_hash_get_most_frequent> (&res);
1525 res.inside_cost = 0;
1526 res.outside_cost = 0;
1527 }
1528
1529 return res;
1530 }
1531
1532 /* Return true if peeling NPEEL iterations for DR0_INFO leaves all data refs with a supportable alignment. */
1533
1534 static bool
1535 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1536 unsigned npeel)
1537 {
1538 unsigned i;
1539 struct data_reference *dr = NULL;
1540 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1541 enum dr_alignment_support supportable_dr_alignment;
1542
1543 /* Ensure that all data refs can be vectorized after the peel. */
1544 FOR_EACH_VEC_ELT (datarefs, i, dr)
1545 {
1546 int save_misalignment;
1547
1548 if (dr == dr0_info->dr)
1549 continue;
1550
1551 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1552 stmt_vec_info stmt_info = dr_info->stmt;
1553 /* For interleaving, only the alignment of the first access
1554 matters. */
1555 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1556 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1557 continue;
1558
1559 /* Strided accesses perform only component accesses, alignment is
1560 irrelevant for them. */
1561 if (STMT_VINFO_STRIDED_P (stmt_info)
1562 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1563 continue;
1564
1565 save_misalignment = DR_MISALIGNMENT (dr_info);
1566 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1567 supportable_dr_alignment
1568 = vect_supportable_dr_alignment (dr_info, false);
1569 SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1570
1571 if (!supportable_dr_alignment)
1572 return false;
1573 }
1574
1575 return true;
1576 }
1577
1578 /* Function vect_enhance_data_refs_alignment
1579
1580 This pass will use loop versioning and loop peeling in order to enhance
1581 the alignment of data references in the loop.
1582
1583 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1584 original loop is to be vectorized. Any other loops that are created by
1585 the transformations performed in this pass - are not supposed to be
1586 vectorized. This restriction will be relaxed.
1587
1588 This pass will require a cost model to guide it whether to apply peeling
1589 or versioning or a combination of the two. For example, the scheme that
1590 intel uses when given a loop with several memory accesses, is as follows:
1591 choose one memory access ('p') which alignment you want to force by doing
1592 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1593 other accesses are not necessarily aligned, or (2) use loop versioning to
1594 generate one loop in which all accesses are aligned, and another loop in
1595 which only 'p' is necessarily aligned.
1596
1597 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1598 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1599 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1600
1601 Devising a cost model is the most critical aspect of this work. It will
1602 guide us on which access to peel for, whether to use loop versioning, how
1603 many versions to create, etc. The cost model will probably consist of
1604 generic considerations as well as target specific considerations (on
1605 powerpc for example, misaligned stores are more painful than misaligned
1606 loads).
1607
1608 Here are the general steps involved in alignment enhancements:
1609
1610 -- original loop, before alignment analysis:
1611 for (i=0; i<N; i++){
1612 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1613 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1614 }
1615
1616 -- After vect_compute_data_refs_alignment:
1617 for (i=0; i<N; i++){
1618 x = q[i]; # DR_MISALIGNMENT(q) = 3
1619 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1620 }
1621
1622 -- Possibility 1: we do loop versioning:
1623 if (p is aligned) {
1624 for (i=0; i<N; i++){ # loop 1A
1625 x = q[i]; # DR_MISALIGNMENT(q) = 3
1626 p[i] = y; # DR_MISALIGNMENT(p) = 0
1627 }
1628 }
1629 else {
1630 for (i=0; i<N; i++){ # loop 1B
1631 x = q[i]; # DR_MISALIGNMENT(q) = 3
1632 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1633 }
1634 }
1635
1636 -- Possibility 2: we do loop peeling:
1637 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1638 x = q[i];
1639 p[i] = y;
1640 }
1641 for (i = 3; i < N; i++){ # loop 2A
1642 x = q[i]; # DR_MISALIGNMENT(q) = 0
1643 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1644 }
1645
1646 -- Possibility 3: combination of loop peeling and versioning:
1647 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1648 x = q[i];
1649 p[i] = y;
1650 }
1651 if (p is aligned) {
1652 for (i = 3; i<N; i++){ # loop 3A
1653 x = q[i]; # DR_MISALIGNMENT(q) = 0
1654 p[i] = y; # DR_MISALIGNMENT(p) = 0
1655 }
1656 }
1657 else {
1658 for (i = 3; i<N; i++){ # loop 3B
1659 x = q[i]; # DR_MISALIGNMENT(q) = 0
1660 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1661 }
1662 }
1663
1664 These loops are later passed to loop_transform to be vectorized. The
1665 vectorizer will use the alignment information to guide the transformation
1666 (whether to generate regular loads/stores, or with special handling for
1667 misalignment). */
1668
1669 bool
1670 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1671 {
1672 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1673 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1674 enum dr_alignment_support supportable_dr_alignment;
1675 dr_vec_info *first_store = NULL;
1676 dr_vec_info *dr0_info = NULL;
1677 struct data_reference *dr;
1678 unsigned int i, j;
1679 bool do_peeling = false;
1680 bool do_versioning = false;
1681 bool stat;
1682 unsigned int npeel = 0;
1683 bool one_misalignment_known = false;
1684 bool one_misalignment_unknown = false;
1685 bool one_dr_unsupportable = false;
1686 dr_vec_info *unsupportable_dr_info = NULL;
1687 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1688 unsigned possible_npeel_number = 1;
1689 tree vectype;
1690 unsigned int mis, same_align_drs_max = 0;
1691 hash_table<peel_info_hasher> peeling_htab (1);
1692
1693 DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1694
1695 /* Reset data so we can safely be called multiple times. */
1696 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1697 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1698
1699 /* While cost model enhancements are expected in the future, the high level
1700 view of the code at this time is as follows:
1701
1702 A) If there is a misaligned access then see if peeling to align
1703 this access can make all data references satisfy
1704 vect_supportable_dr_alignment. If so, update data structures
1705 as needed and return true.
1706
1707 B) If peeling wasn't possible and there is a data reference with an
1708 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1709 then see if loop versioning checks can be used to make all data
1710 references satisfy vect_supportable_dr_alignment. If so, update
1711 data structures as needed and return true.
1712
1713 C) If neither peeling nor versioning were successful then return false if
1714 any data reference does not satisfy vect_supportable_dr_alignment.
1715
1716 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1717
1718 Note, Possibility 3 above (which is peeling and versioning together) is not
1719 being done at this time. */
1720
1721 /* (1) Peeling to force alignment. */
1722
1723 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1724 Considerations:
1725 + How many accesses will become aligned due to the peeling
1726 - How many accesses will become unaligned due to the peeling,
1727 and the cost of misaligned accesses.
1728 - The cost of peeling (the extra runtime checks, the increase
1729 in code size). */
1730
1731 FOR_EACH_VEC_ELT (datarefs, i, dr)
1732 {
1733 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1734 stmt_vec_info stmt_info = dr_info->stmt;
1735
1736 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1737 continue;
1738
1739 /* For interleaving, only the alignment of the first access
1740 matters. */
1741 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1742 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1743 continue;
1744
1745 /* For scatter-gather or invariant accesses there is nothing
1746 to enhance. */
1747 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1748 || integer_zerop (DR_STEP (dr)))
1749 continue;
1750
1751 /* Strided accesses perform only component accesses; alignment is
1752 irrelevant for them. */
1753 if (STMT_VINFO_STRIDED_P (stmt_info)
1754 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1755 continue;
1756
1757 supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1758 do_peeling = vector_alignment_reachable_p (dr_info);
1759 if (do_peeling)
1760 {
1761 if (known_alignment_for_access_p (dr_info))
1762 {
1763 unsigned int npeel_tmp = 0;
1764 bool negative = tree_int_cst_compare (DR_STEP (dr),
1765 size_zero_node) < 0;
1766
1767 vectype = STMT_VINFO_VECTYPE (stmt_info);
1768 unsigned int target_align = DR_TARGET_ALIGNMENT (dr_info);
1769 unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1770 mis = (negative
1771 ? DR_MISALIGNMENT (dr_info)
1772 : -DR_MISALIGNMENT (dr_info));
1773 if (DR_MISALIGNMENT (dr_info) != 0)
1774 npeel_tmp = (mis & (target_align - 1)) / dr_size;
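/* Illustrative arithmetic (numbers chosen for exposition only): for a
   positive-step access with DR_MISALIGNMENT = 4, TARGET_ALIGN = 16 and
   DR_SIZE = 4, MIS is (unsigned int) -4, so
   NPEEL_TMP = ((unsigned) -4 & 15) / 4 = 12 / 4 = 3; peeling three
   scalar iterations advances the access by 12 bytes and makes it
   16-byte aligned.  */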
1775
1776 /* For multiple types, it is possible that the bigger type access
1777 will have more than one peeling option. E.g., a loop with two
1778 types: one of size (vector size / 4), and the other one of
1779 size (vector size / 8). The vectorization factor will be 8. If both
1780 accesses are misaligned by 3, the first one needs one scalar
1781 iteration to be aligned, and the second one needs 5. But the
1782 first one will be aligned also by peeling 5 scalar
1783 iterations, and in that case both accesses will be aligned.
1784 Hence, in addition to the immediate peeling amount, we also want
1785 to try adding a full vector's worth of elements, as long as we
1786 don't exceed the vectorization factor.
1787 We do this automatically for the cost model, since we calculate
1788 the cost of every peeling option. */
1789 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1790 {
1791 poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1792 ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1793 possible_npeel_number
1794 = vect_get_num_vectors (nscalars, vectype);
1795
1796 /* NPEEL_TMP is 0 when there is no misalignment, but in that
1797 case also allow peeling NELEMENTS. */
1798 if (DR_MISALIGNMENT (dr_info) == 0)
1799 possible_npeel_number++;
1800 }
1801
1802 /* Save info about DR in the hash table. Also include peeling
1803 amounts according to the explanation above. */
1804 for (j = 0; j < possible_npeel_number; j++)
1805 {
1806 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1807 dr_info, npeel_tmp);
1808 npeel_tmp += target_align / dr_size;
1809 }
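/* Continuing the illustrative example above: with NPEEL_TMP = 3,
   TARGET_ALIGN / DR_SIZE = 4 and POSSIBLE_NPEEL_NUMBER = 2, the
   candidate peeling amounts recorded in the hash table would be
   3 and 7; both make this access aligned, and the best candidate
   is chosen from the hash table later.  */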
1810
1811 one_misalignment_known = true;
1812 }
1813 else
1814 {
1815 /* If we don't know any misalignment values, we prefer
1816 peeling for the data-ref that has the maximum number of data-refs
1817 with the same alignment, unless the target prefers to align
1818 stores over loads. */
1819 unsigned same_align_drs
1820 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1821 if (!dr0_info
1822 || same_align_drs_max < same_align_drs)
1823 {
1824 same_align_drs_max = same_align_drs;
1825 dr0_info = dr_info;
1826 }
1827 /* For data-refs with the same number of related
1828 accesses prefer the one where the misalign
1829 computation will be invariant in the outermost loop. */
1830 else if (same_align_drs_max == same_align_drs)
1831 {
1832 struct loop *ivloop0, *ivloop;
1833 ivloop0 = outermost_invariant_loop_for_expr
1834 (loop, DR_BASE_ADDRESS (dr0_info->dr));
1835 ivloop = outermost_invariant_loop_for_expr
1836 (loop, DR_BASE_ADDRESS (dr));
1837 if ((ivloop && !ivloop0)
1838 || (ivloop && ivloop0
1839 && flow_loop_nested_p (ivloop, ivloop0)))
1840 dr0_info = dr_info;
1841 }
1842
1843 one_misalignment_unknown = true;
1844
1845 /* Check for data refs with unsupportable alignment that
1846 can be peeled. */
1847 if (!supportable_dr_alignment)
1848 {
1849 one_dr_unsupportable = true;
1850 unsupportable_dr_info = dr_info;
1851 }
1852
1853 if (!first_store && DR_IS_WRITE (dr))
1854 first_store = dr_info;
1855 }
1856 }
1857 else
1858 {
1859 if (!aligned_access_p (dr_info))
1860 {
1861 if (dump_enabled_p ())
1862 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1863 "vector alignment may not be reachable\n");
1864 break;
1865 }
1866 }
1867 }
1868
1869 /* Check if we can possibly peel the loop. */
1870 if (!vect_can_advance_ivs_p (loop_vinfo)
1871 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1872 || loop->inner)
1873 do_peeling = false;
1874
1875 struct _vect_peel_extended_info peel_for_known_alignment;
1876 struct _vect_peel_extended_info peel_for_unknown_alignment;
1877 struct _vect_peel_extended_info best_peel;
1878
1879 peel_for_unknown_alignment.inside_cost = INT_MAX;
1880 peel_for_unknown_alignment.outside_cost = INT_MAX;
1881 peel_for_unknown_alignment.peel_info.count = 0;
1882
1883 if (do_peeling
1884 && one_misalignment_unknown)
1885 {
1886 /* Check whether the target prefers stores over loads, i.e., whether
1887 misaligned stores are more expensive than misaligned loads (taking
1888 drs with the same alignment into account). */
1889 unsigned int load_inside_cost = 0;
1890 unsigned int load_outside_cost = 0;
1891 unsigned int store_inside_cost = 0;
1892 unsigned int store_outside_cost = 0;
1893 unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1894
1895 stmt_vector_for_cost dummy;
1896 dummy.create (2);
1897 vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1898 &load_inside_cost,
1899 &load_outside_cost,
1900 &dummy, &dummy, estimated_npeels, true);
1901 dummy.release ();
1902
1903 if (first_store)
1904 {
1905 dummy.create (2);
1906 vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1907 &store_inside_cost,
1908 &store_outside_cost,
1909 &dummy, &dummy,
1910 estimated_npeels, true);
1911 dummy.release ();
1912 }
1913 else
1914 {
1915 store_inside_cost = INT_MAX;
1916 store_outside_cost = INT_MAX;
1917 }
1918
1919 if (load_inside_cost > store_inside_cost
1920 || (load_inside_cost == store_inside_cost
1921 && load_outside_cost > store_outside_cost))
1922 {
1923 dr0_info = first_store;
1924 peel_for_unknown_alignment.inside_cost = store_inside_cost;
1925 peel_for_unknown_alignment.outside_cost = store_outside_cost;
1926 }
1927 else
1928 {
1929 peel_for_unknown_alignment.inside_cost = load_inside_cost;
1930 peel_for_unknown_alignment.outside_cost = load_outside_cost;
1931 }
1932
1933 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1934 prologue_cost_vec.create (2);
1935 epilogue_cost_vec.create (2);
1936
1937 int dummy2;
1938 peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1939 (loop_vinfo, estimated_npeels, &dummy2,
1940 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1941 &prologue_cost_vec, &epilogue_cost_vec);
1942
1943 prologue_cost_vec.release ();
1944 epilogue_cost_vec.release ();
1945
1946 peel_for_unknown_alignment.peel_info.count = 1
1947 + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1948 }
1949
1950 peel_for_unknown_alignment.peel_info.npeel = 0;
1951 peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1952
1953 best_peel = peel_for_unknown_alignment;
1954
1955 peel_for_known_alignment.inside_cost = INT_MAX;
1956 peel_for_known_alignment.outside_cost = INT_MAX;
1957 peel_for_known_alignment.peel_info.count = 0;
1958 peel_for_known_alignment.peel_info.dr_info = NULL;
1959
1960 if (do_peeling && one_misalignment_known)
1961 {
1962 /* Peeling is possible, but no data access requires alignment in order
1963 to be supported. So we try to choose the best possible peeling from
1964 the hash table. */
1965 peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1966 (&peeling_htab, loop_vinfo);
1967 }
1968
1969 /* Compare costs of peeling for known and unknown alignment. */
1970 if (peel_for_known_alignment.peel_info.dr_info != NULL
1971 && peel_for_unknown_alignment.inside_cost
1972 >= peel_for_known_alignment.inside_cost)
1973 {
1974 best_peel = peel_for_known_alignment;
1975
1976 /* If the best peeling for known alignment has NPEEL == 0, perform no
1977 peeling at all except if there is an unsupportable dr that we can
1978 align. */
1979 if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1980 do_peeling = false;
1981 }
1982
1983 /* If there is an unsupportable data ref, prefer this over all choices so far
1984 since we'd have to discard a chosen peeling except when it accidentally
1985 aligned the unsupportable data ref. */
1986 if (one_dr_unsupportable)
1987 dr0_info = unsupportable_dr_info;
1988 else if (do_peeling)
1989 {
1990 /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1991 TODO: Use nopeel_outside_cost or get rid of it? */
1992 unsigned nopeel_inside_cost = 0;
1993 unsigned nopeel_outside_cost = 0;
1994
1995 stmt_vector_for_cost dummy;
1996 dummy.create (2);
1997 vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1998 &nopeel_outside_cost, &dummy, &dummy,
1999 0, false);
2000 dummy.release ();
2001
2002 /* Add epilogue costs. As we do not peel for alignment here, no prologue
2003 costs will be recorded. */
2004 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2005 prologue_cost_vec.create (2);
2006 epilogue_cost_vec.create (2);
2007
2008 int dummy2;
2009 nopeel_outside_cost += vect_get_known_peeling_cost
2010 (loop_vinfo, 0, &dummy2,
2011 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2012 &prologue_cost_vec, &epilogue_cost_vec);
2013
2014 prologue_cost_vec.release ();
2015 epilogue_cost_vec.release ();
2016
2017 npeel = best_peel.peel_info.npeel;
2018 dr0_info = best_peel.peel_info.dr_info;
2019
2020 /* If not peeling at all is no more expensive than the best peeling we
2021 have so far, don't perform any peeling. */
2022 if (nopeel_inside_cost <= best_peel.inside_cost)
2023 do_peeling = false;
2024 }
2025
2026 if (do_peeling)
2027 {
2028 stmt_vec_info stmt_info = dr0_info->stmt;
2029 vectype = STMT_VINFO_VECTYPE (stmt_info);
2030
2031 if (known_alignment_for_access_p (dr0_info))
2032 {
2033 bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2034 size_zero_node) < 0;
2035 if (!npeel)
2036 {
2037 /* Since it's known at compile time, compute the number of
2038 iterations in the peeled loop (the peeling factor) for use in
2039 updating DR_MISALIGNMENT values. The peeling factor is the
2040 vectorization factor minus the misalignment as an element
2041 count. */
2042 mis = (negative
2043 ? DR_MISALIGNMENT (dr0_info)
2044 : -DR_MISALIGNMENT (dr0_info));
2045 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
2046 npeel = ((mis & (target_align - 1))
2047 / vect_get_scalar_dr_size (dr0_info));
2048 }
2049
2050 /* For interleaved data accesses every iteration accesses all the
2051 members of the group, so we divide the number of iterations
2052 by the group size. */
2053 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2054 npeel /= DR_GROUP_SIZE (stmt_info);
2055
2056 if (dump_enabled_p ())
2057 dump_printf_loc (MSG_NOTE, vect_location,
2058 "Try peeling by %d\n", npeel);
2059 }
2060
2061 /* Ensure that all datarefs can be vectorized after the peel. */
2062 if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2063 do_peeling = false;
2064
2065 /* Check if all datarefs are supportable and log. */
2066 if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2067 {
2068 stat = vect_verify_datarefs_alignment (loop_vinfo);
2069 if (!stat)
2070 do_peeling = false;
2071 else
2072 return stat;
2073 }
2074
2075 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
2076 if (do_peeling)
2077 {
2078 unsigned max_allowed_peel
2079 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2080 if (max_allowed_peel != (unsigned)-1)
2081 {
2082 unsigned max_peel = npeel;
2083 if (max_peel == 0)
2084 {
2085 unsigned int target_align = DR_TARGET_ALIGNMENT (dr0_info);
2086 max_peel = (target_align
2087 / vect_get_scalar_dr_size (dr0_info) - 1);
2088 }
2089 if (max_peel > max_allowed_peel)
2090 {
2091 do_peeling = false;
2092 if (dump_enabled_p ())
2093 dump_printf_loc (MSG_NOTE, vect_location,
2094 "Disable peeling, max peels reached: %d\n", max_peel);
2095 }
2096 }
2097 }
2098
2099 /* Cost model #2 - if peeling may result in a remaining loop not
2100 iterating enough to be vectorized then do not peel. Since this
2101 is a cost heuristic rather than a correctness decision, use the
2102 most likely runtime value for variable vectorization factors. */
2103 if (do_peeling
2104 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2105 {
2106 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2107 unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2108 if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2109 < assumed_vf + max_peel)
2110 do_peeling = false;
2111 }
2112
2113 if (do_peeling)
2114 {
2115 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2116 If the misalignment of DR_i is identical to that of dr0 then set
2117 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
2118 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2119 by the peeling factor times the element size of DR_i (MOD the
2120 vectorization factor times the size). Otherwise, the
2121 misalignment of DR_i must be set to unknown. */
2122 FOR_EACH_VEC_ELT (datarefs, i, dr)
2123 if (dr != dr0_info->dr)
2124 {
2125 /* Strided accesses perform only component accesses; alignment
2126 is irrelevant for them. */
2127 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2128 stmt_info = dr_info->stmt;
2129 if (STMT_VINFO_STRIDED_P (stmt_info)
2130 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2131 continue;
2132
2133 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2134 }
2135
2136 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2137 if (npeel)
2138 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2139 else
2140 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2141 = DR_MISALIGNMENT (dr0_info);
2142 SET_DR_MISALIGNMENT (dr0_info, 0);
2143 if (dump_enabled_p ())
2144 {
2145 dump_printf_loc (MSG_NOTE, vect_location,
2146 "Alignment of access forced using peeling.\n");
2147 dump_printf_loc (MSG_NOTE, vect_location,
2148 "Peeling for alignment will be applied.\n");
2149 }
2150
2151 /* The inside-loop cost will be accounted for in vectorizable_load
2152 and vectorizable_store correctly with adjusted alignments.
2153 Drop the body_cst_vec on the floor here. */
2154 stat = vect_verify_datarefs_alignment (loop_vinfo);
2155 gcc_assert (stat);
2156 return stat;
2157 }
2158 }
2159
2160 /* (2) Versioning to force alignment. */
2161
2162 /* Try versioning if:
2163 1) the loop is optimized for speed, and
2164 2) there is at least one unsupported misaligned data ref with an unknown
2165 misalignment, and
2166 3) all misaligned data refs with a known misalignment are supported, and
2167 4) the number of runtime alignment checks is within reason. */
2168
2169 do_versioning =
2170 optimize_loop_nest_for_speed_p (loop)
2171 && (!loop->inner); /* FORNOW */
2172
2173 if (do_versioning)
2174 {
2175 FOR_EACH_VEC_ELT (datarefs, i, dr)
2176 {
2177 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2178 stmt_vec_info stmt_info = dr_info->stmt;
2179
2180 /* For interleaving, only the alignment of the first access
2181 matters. */
2182 if (aligned_access_p (dr_info)
2183 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2184 && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2185 continue;
2186
2187 if (STMT_VINFO_STRIDED_P (stmt_info))
2188 {
2189 /* Strided loads perform only component accesses; alignment is
2190 irrelevant for them. */
2191 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2192 continue;
2193 do_versioning = false;
2194 break;
2195 }
2196
2197 supportable_dr_alignment
2198 = vect_supportable_dr_alignment (dr_info, false);
2199
2200 if (!supportable_dr_alignment)
2201 {
2202 int mask;
2203 tree vectype;
2204
2205 if (known_alignment_for_access_p (dr_info)
2206 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2207 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2208 {
2209 do_versioning = false;
2210 break;
2211 }
2212
2213 vectype = STMT_VINFO_VECTYPE (stmt_info);
2214 gcc_assert (vectype);
2215
2216 /* At present we don't support versioning for alignment
2217 with variable VF, since there's no guarantee that the
2218 VF is a power of two. We could relax this if we added
2219 a way of enforcing a power-of-two size. */
2220 unsigned HOST_WIDE_INT size;
2221 if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2222 {
2223 do_versioning = false;
2224 break;
2225 }
2226
2227 /* The rightmost bits of an aligned address must be zeros.
2228 Construct the mask needed for this test. For example,
2229 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2230 mask must be 15 = 0xf. */
2231 mask = size - 1;
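/* For instance (illustrative only), with a 16-byte vector the
   versioning test that eventually guards the vectorized loop is
   conceptually equivalent to checking ((uintptr_t) p & 15) == 0 for
   each address recorded in LOOP_VINFO_MAY_MISALIGN_STMTS; the
   generated code folds these tests together using MASK.  */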
2232
2233 /* FORNOW: use the same mask to test all potentially unaligned
2234 references in the loop. The vectorizer currently supports
2235 a single vector size, see the reference to
2236 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2237 vectorization factor is computed. */
2238 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2239 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2240 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2241 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2242 }
2243 }
2244
2245 /* Versioning requires at least one misaligned data reference. */
2246 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2247 do_versioning = false;
2248 else if (!do_versioning)
2249 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2250 }
2251
2252 if (do_versioning)
2253 {
2254 vec<stmt_vec_info> may_misalign_stmts
2255 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2256 stmt_vec_info stmt_info;
2257
2258 /* It can now be assumed that the data references in the statements
2259 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2260 of the loop being vectorized. */
2261 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2262 {
2263 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2264 SET_DR_MISALIGNMENT (dr_info, 0);
2265 if (dump_enabled_p ())
2266 dump_printf_loc (MSG_NOTE, vect_location,
2267 "Alignment of access forced using versioning.\n");
2268 }
2269
2270 if (dump_enabled_p ())
2271 dump_printf_loc (MSG_NOTE, vect_location,
2272 "Versioning for alignment will be applied.\n");
2273
2274 /* Peeling and versioning can't be done together at this time. */
2275 gcc_assert (! (do_peeling && do_versioning));
2276
2277 stat = vect_verify_datarefs_alignment (loop_vinfo);
2278 gcc_assert (stat);
2279 return stat;
2280 }
2281
2282 /* This point is reached if neither peeling nor versioning is being done. */
2283 gcc_assert (! (do_peeling || do_versioning));
2284
2285 stat = vect_verify_datarefs_alignment (loop_vinfo);
2286 return stat;
2287 }
2288
2289
2290 /* Function vect_find_same_alignment_drs.
2291
2292 Update group and alignment relations in VINFO according to the chosen
2293 vectorization factor. */
2294
2295 static void
2296 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2297 {
2298 struct data_reference *dra = DDR_A (ddr);
2299 struct data_reference *drb = DDR_B (ddr);
2300 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2301 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2302 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2303 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2304
2305 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2306 return;
2307
2308 if (dra == drb)
2309 return;
2310
2311 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2312 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2313 return;
2314
2315 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2316 || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2317 || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2318 return;
2319
2320 /* Two references with distance zero have the same alignment. */
2321 poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2322 - wi::to_poly_offset (DR_INIT (drb)));
2323 if (maybe_ne (diff, 0))
2324 {
2325 /* Get the wider of the two alignments. */
2326 unsigned int align_a = (vect_calculate_target_alignment (dr_info_a)
2327 / BITS_PER_UNIT);
2328 unsigned int align_b = (vect_calculate_target_alignment (dr_info_b)
2329 / BITS_PER_UNIT);
2330 unsigned int max_align = MAX (align_a, align_b);
2331
2332 /* Require the gap to be a multiple of the larger vector alignment. */
2333 if (!multiple_p (diff, max_align))
2334 return;
2335 }
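/* Illustrative example (assuming a 16-byte target alignment for both
   references): accesses a[i] and a[i+4] to a 4-byte type differ by
   16 bytes, a multiple of that alignment, so they are recorded below
   as having the same alignment; a[i] and a[i+1] differ by only 4
   bytes and are not recorded.  */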
2336
2337 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2338 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2339 if (dump_enabled_p ())
2340 {
2341 dump_printf_loc (MSG_NOTE, vect_location,
2342 "accesses have the same alignment: ");
2343 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2344 dump_printf (MSG_NOTE, " and ");
2345 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2346 dump_printf (MSG_NOTE, "\n");
2347 }
2348 }
2349
2350
2351 /* Function vect_analyze_data_refs_alignment
2352
2353 Analyze the alignment of the data-references in the loop.
2354 Return FALSE if a data reference is found that cannot be vectorized. */
2355
2356 bool
2357 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2358 {
2359 DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2360
2361 /* Mark groups of data references with same alignment using
2362 data dependence information. */
2363 vec<ddr_p> ddrs = vinfo->shared->ddrs;
2364 struct data_dependence_relation *ddr;
2365 unsigned int i;
2366
2367 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2368 vect_find_same_alignment_drs (vinfo, ddr);
2369
2370 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2371 struct data_reference *dr;
2372
2373 vect_record_base_alignments (vinfo);
2374 FOR_EACH_VEC_ELT (datarefs, i, dr)
2375 {
2376 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2377 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2378 vect_compute_data_ref_alignment (dr_info);
2379 }
2380
2381 return true;
2382 }
2383
2384
2385 /* Analyze alignment of DRs of stmts in NODE. */
2386
2387 static bool
2388 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2389 {
2390 /* We vectorize from the first scalar stmt in the node unless
2391 the node is permuted, in which case we start from the first
2392 element in the group. */
2393 stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2394 dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2395 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2396 first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2397
2398 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2399 vect_compute_data_ref_alignment (dr_info);
2400 /* For creating the data-ref pointer we need alignment of the
2401 first element anyway. */
2402 if (dr_info != first_dr_info)
2403 vect_compute_data_ref_alignment (first_dr_info);
2404 if (! verify_data_ref_alignment (dr_info))
2405 {
2406 if (dump_enabled_p ())
2407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2408 "not vectorized: bad data alignment in basic "
2409 "block.\n");
2410 return false;
2411 }
2412
2413 return true;
2414 }
2415
2416 /* Function vect_slp_analyze_and_verify_instance_alignment
2417
2418 Analyze the alignment of the data-references in the SLP instance.
2419 Return FALSE if a data reference is found that cannot be vectorized. */
2420
2421 bool
2422 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2423 {
2424 DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2425
2426 slp_tree node;
2427 unsigned i;
2428 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2429 if (! vect_slp_analyze_and_verify_node_alignment (node))
2430 return false;
2431
2432 node = SLP_INSTANCE_TREE (instance);
2433 if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2434 && ! vect_slp_analyze_and_verify_node_alignment
2435 (SLP_INSTANCE_TREE (instance)))
2436 return false;
2437
2438 return true;
2439 }
2440
2441
2442 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2443 accesses of legal size, step, etc. Detect gaps, single element
2444 interleaving, and other special cases. Set grouped access info.
2445 Collect groups of strided stores for further use in SLP analysis.
2446 Worker for vect_analyze_group_access. */
2447
2448 static bool
2449 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2450 {
2451 data_reference *dr = dr_info->dr;
2452 tree step = DR_STEP (dr);
2453 tree scalar_type = TREE_TYPE (DR_REF (dr));
2454 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2455 stmt_vec_info stmt_info = dr_info->stmt;
2456 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2457 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2458 HOST_WIDE_INT dr_step = -1;
2459 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2460 bool slp_impossible = false;
2461
2462 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2463 size of the interleaving group (including gaps). */
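/* For example (illustrative), a load of a[3*i] in a loop over i, where
   a is an array of 4-byte ints, has DR_STEP 12 and hence GROUPSIZE 3:
   the interleaving group covers elements 3*i, 3*i+1 and 3*i+2 even if
   only some of them are actually accessed (the rest are gaps).  */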
2464 if (tree_fits_shwi_p (step))
2465 {
2466 dr_step = tree_to_shwi (step);
2467 /* Check that STEP is a multiple of type size. Otherwise there is
2468 a non-element-sized gap at the end of the group which we
2469 cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2470 ??? As we can handle non-constant step fine here we should
2471 simply remove uses of DR_GROUP_GAP between the last and first
2472 element and instead rely on DR_STEP. DR_GROUP_SIZE then would
2473 simply not include that gap. */
2474 if ((dr_step % type_size) != 0)
2475 {
2476 if (dump_enabled_p ())
2477 {
2478 dump_printf_loc (MSG_NOTE, vect_location,
2479 "Step ");
2480 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2481 dump_printf (MSG_NOTE,
2482 " is not a multiple of the element size for ");
2483 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2484 dump_printf (MSG_NOTE, "\n");
2485 }
2486 return false;
2487 }
2488 groupsize = absu_hwi (dr_step) / type_size;
2489 }
2490 else
2491 groupsize = 0;
2492
2493 /* A non-consecutive access is possible only if it is part of interleaving. */
2494 if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2495 {
2496 /* Check whether this DR is part of an interleaving group, and is a single
2497 element of the group that is accessed in the loop. */
2498
2499 /* Gaps are supported only for loads. STEP must be a multiple of the type
2500 size. */
2501 if (DR_IS_READ (dr)
2502 && (dr_step % type_size) == 0
2503 && groupsize > 0)
2504 {
2505 DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2506 DR_GROUP_SIZE (stmt_info) = groupsize;
2507 DR_GROUP_GAP (stmt_info) = groupsize - 1;
2508 if (dump_enabled_p ())
2509 {
2510 dump_printf_loc (MSG_NOTE, vect_location,
2511 "Detected single element interleaving ");
2512 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2513 dump_printf (MSG_NOTE, " step ");
2514 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2515 dump_printf (MSG_NOTE, "\n");
2516 }
2517
2518 return true;
2519 }
2520
2521 if (dump_enabled_p ())
2522 {
2523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2524 "not consecutive access ");
2525 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2526 stmt_info->stmt, 0);
2527 }
2528
2529 if (bb_vinfo)
2530 {
2531 /* Mark the statement as unvectorizable. */
2532 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2533 return true;
2534 }
2535
2536 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2537 STMT_VINFO_STRIDED_P (stmt_info) = true;
2538 return true;
2539 }
2540
2541 if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2542 {
2543 /* First stmt in the interleaving chain. Check the chain. */
2544 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2545 struct data_reference *data_ref = dr;
2546 unsigned int count = 1;
2547 tree prev_init = DR_INIT (data_ref);
2548 stmt_vec_info prev = stmt_info;
2549 HOST_WIDE_INT diff, gaps = 0;
2550
2551 /* By construction, all group members have INTEGER_CST DR_INITs. */
2552 while (next)
2553 {
2554 /* Skip same data-refs. If two or more stmts share the same
2555 data-ref (supported only for loads), we vectorize only the first
2556 stmt, and the rest get their vectorized loads from the first
2557 one. */
2558 if (!tree_int_cst_compare (DR_INIT (data_ref),
2559 DR_INIT (STMT_VINFO_DATA_REF (next))))
2560 {
2561 if (DR_IS_WRITE (data_ref))
2562 {
2563 if (dump_enabled_p ())
2564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2565 "Two store stmts share the same dr.\n");
2566 return false;
2567 }
2568
2569 if (dump_enabled_p ())
2570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2571 "Two or more load stmts share the same dr.\n");
2572
2573 /* For load use the same data-ref load. */
2574 DR_GROUP_SAME_DR_STMT (next) = prev;
2575
2576 prev = next;
2577 next = DR_GROUP_NEXT_ELEMENT (next);
2578 continue;
2579 }
2580
2581 prev = next;
2582 data_ref = STMT_VINFO_DATA_REF (next);
2583
2584 /* All group members have the same STEP by construction. */
2585 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2586
2587 /* Check that the distance between two accesses is equal to the type
2588 size. Otherwise, we have gaps. */
2589 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2590 - TREE_INT_CST_LOW (prev_init)) / type_size;
2591 if (diff != 1)
2592 {
2593 /* FORNOW: SLP of accesses with gaps is not supported. */
2594 slp_impossible = true;
2595 if (DR_IS_WRITE (data_ref))
2596 {
2597 if (dump_enabled_p ())
2598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2599 "interleaved store with gaps\n");
2600 return false;
2601 }
2602
2603 gaps += diff - 1;
2604 }
2605
2606 last_accessed_element += diff;
2607
2608 /* Store the gap from the previous member of the group. If there is no
2609 gap in the access, DR_GROUP_GAP is always 1. */
2610 DR_GROUP_GAP (next) = diff;
2611
2612 prev_init = DR_INIT (data_ref);
2613 next = DR_GROUP_NEXT_ELEMENT (next);
2614 /* Count the number of data-refs in the chain. */
2615 count++;
2616 }
2617
2618 if (groupsize == 0)
2619 groupsize = count + gaps;
2620
2621 /* This could be UINT_MAX but as we are generating code in a very
2622 inefficient way we have to cap earlier. See PR78699 for example. */
2623 if (groupsize > 4096)
2624 {
2625 if (dump_enabled_p ())
2626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2627 "group is too large\n");
2628 return false;
2629 }
2630
2631 /* Check that the size of the interleaving is equal to count for stores,
2632 i.e., that there are no gaps. */
2633 if (groupsize != count
2634 && !DR_IS_READ (dr))
2635 {
2636 groupsize = count;
2637 STMT_VINFO_STRIDED_P (stmt_info) = true;
2638 }
2639
2640 /* If there is a gap after the last load in the group it is the
2641 difference between the groupsize and the last accessed
2642 element.
2643 When there is no gap, this difference should be 0. */
2644 DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2645
2646 DR_GROUP_SIZE (stmt_info) = groupsize;
2647 if (dump_enabled_p ())
2648 {
2649 dump_printf_loc (MSG_NOTE, vect_location,
2650 "Detected interleaving ");
2651 if (DR_IS_READ (dr))
2652 dump_printf (MSG_NOTE, "load ");
2653 else if (STMT_VINFO_STRIDED_P (stmt_info))
2654 dump_printf (MSG_NOTE, "strided store ");
2655 else
2656 dump_printf (MSG_NOTE, "store ");
2657 dump_printf (MSG_NOTE, "of size %u starting with ",
2658 (unsigned)groupsize);
2659 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
2660 if (DR_GROUP_GAP (stmt_info) != 0)
2661 dump_printf_loc (MSG_NOTE, vect_location,
2662 "There is a gap of %u elements after the group\n",
2663 DR_GROUP_GAP (stmt_info));
2664 }
2665
2666 /* SLP: create an SLP data structure for every interleaving group of
2667 stores for further analysis in vect_analyze_slp. */
2668 if (DR_IS_WRITE (dr) && !slp_impossible)
2669 {
2670 if (loop_vinfo)
2671 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2672 if (bb_vinfo)
2673 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2674 }
2675 }
2676
2677 return true;
2678 }
2679
2680 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2681 accesses of legal size, step, etc. Detect gaps, single element
2682 interleaving, and other special cases. Set grouped access info.
2683 Collect groups of strided stores for further use in SLP analysis. */
2684
2685 static bool
2686 vect_analyze_group_access (dr_vec_info *dr_info)
2687 {
2688 if (!vect_analyze_group_access_1 (dr_info))
2689 {
2690 /* Dissolve the group if present. */
2691 stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2692 while (stmt_info)
2693 {
2694 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2695 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2696 DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2697 stmt_info = next;
2698 }
2699 return false;
2700 }
2701 return true;
2702 }
2703
2704 /* Analyze the access pattern of the data-reference DR_INFO.
2705 In case of non-consecutive accesses call vect_analyze_group_access() to
2706 analyze groups of accesses. */
2707
2708 static bool
2709 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2710 {
2711 data_reference *dr = dr_info->dr;
2712 tree step = DR_STEP (dr);
2713 tree scalar_type = TREE_TYPE (DR_REF (dr));
2714 stmt_vec_info stmt_info = dr_info->stmt;
2715 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2716 struct loop *loop = NULL;
2717
2718 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2719 return true;
2720
2721 if (loop_vinfo)
2722 loop = LOOP_VINFO_LOOP (loop_vinfo);
2723
2724 if (loop_vinfo && !step)
2725 {
2726 if (dump_enabled_p ())
2727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2728 "bad data-ref access in loop\n");
2729 return false;
2730 }
2731
2732 /* Allow loads with zero step in inner-loop vectorization. */
2733 if (loop_vinfo && integer_zerop (step))
2734 {
2735 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2736 if (!nested_in_vect_loop_p (loop, stmt_info))
2737 return DR_IS_READ (dr);
2738 /* Allow references with zero step for outer loops marked
2739 with pragma omp simd only - it guarantees absence of
2740 loop-carried dependencies between inner loop iterations. */
2741 if (loop->safelen < 2)
2742 {
2743 if (dump_enabled_p ())
2744 dump_printf_loc (MSG_NOTE, vect_location,
2745 "zero step in inner loop of nest\n");
2746 return false;
2747 }
2748 }
2749
2750 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2751 {
2752 /* Interleaved accesses are not yet supported within outer-loop
2753 vectorization for references in the inner-loop. */
2754 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2755
2756 /* For the rest of the analysis we use the outer-loop step. */
2757 step = STMT_VINFO_DR_STEP (stmt_info);
2758 if (integer_zerop (step))
2759 {
2760 if (dump_enabled_p ())
2761 dump_printf_loc (MSG_NOTE, vect_location,
2762 "zero step in outer loop.\n");
2763 return DR_IS_READ (dr);
2764 }
2765 }
2766
2767 /* Consecutive? */
2768 if (TREE_CODE (step) == INTEGER_CST)
2769 {
2770 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2771 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2772 || (dr_step < 0
2773 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2774 {
2775 /* Mark that it is not interleaving. */
2776 DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2777 return true;
2778 }
2779 }
2780
2781 if (loop && nested_in_vect_loop_p (loop, stmt_info))
2782 {
2783 if (dump_enabled_p ())
2784 dump_printf_loc (MSG_NOTE, vect_location,
2785 "grouped access in outer loop.\n");
2786 return false;
2787 }
2788
2789
2790 /* Assume this is a DR handled by the non-constant strided load case. */
2791 if (TREE_CODE (step) != INTEGER_CST)
2792 return (STMT_VINFO_STRIDED_P (stmt_info)
2793 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2794 || vect_analyze_group_access (dr_info)));
2795
2796 /* Not a consecutive access - check whether it is part of an interleaving group. */
2797 return vect_analyze_group_access (dr_info);
2798 }
2799
2800 /* Compare two data-references DRA and DRB in order to sort them into
2801 chunks suitable for grouping. */
2802
2803 static int
2804 dr_group_sort_cmp (const void *dra_, const void *drb_)
2805 {
2806 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2807 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2808 int cmp;
2809
2810 /* Stabilize sort. */
2811 if (dra == drb)
2812 return 0;
2813
2814 /* DRs in different loops never belong to the same group. */
2815 loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2816 loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2817 if (loopa != loopb)
2818 return loopa->num < loopb->num ? -1 : 1;
2819
2820 /* Ordering of DRs according to base. */
2821 cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2822 DR_BASE_ADDRESS (drb));
2823 if (cmp != 0)
2824 return cmp;
2825
2826 /* And according to DR_OFFSET. */
2827 cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2828 if (cmp != 0)
2829 return cmp;
2830
2831 /* Put reads before writes. */
2832 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2833 return DR_IS_READ (dra) ? -1 : 1;
2834
2835 /* Then sort by access size. */
2836 cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2837 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2838 if (cmp != 0)
2839 return cmp;
2840
2841 /* And by step. */
2842 cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2843 if (cmp != 0)
2844 return cmp;
2845
2846 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2847 cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2848 if (cmp == 0)
2849 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2850 return cmp;
2851 }
2852
2853 /* If OP is the result of a conversion, return the unconverted value,
2854 otherwise return null. */
2855
2856 static tree
2857 strip_conversion (tree op)
2858 {
2859 if (TREE_CODE (op) != SSA_NAME)
2860 return NULL_TREE;
2861 gimple *stmt = SSA_NAME_DEF_STMT (op);
2862 if (!is_gimple_assign (stmt)
2863 || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2864 return NULL_TREE;
2865 return gimple_assign_rhs1 (stmt);
2866 }
2867
2868 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2869 and STMT2_INFO being in a single group. */
2870
2871 static bool
2872 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info)
2873 {
2874 if (gimple_assign_single_p (stmt1_info->stmt))
2875 return gimple_assign_single_p (stmt2_info->stmt);
2876
2877 gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2878 if (call1 && gimple_call_internal_p (call1))
2879 {
2880 /* Check for two masked loads or two masked stores. */
2881 gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2882 if (!call2 || !gimple_call_internal_p (call2))
2883 return false;
2884 internal_fn ifn = gimple_call_internal_fn (call1);
2885 if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2886 return false;
2887 if (ifn != gimple_call_internal_fn (call2))
2888 return false;
2889
2890 /* Check that the masks are the same. Cope with casts of masks,
2891 like those created by build_mask_conversion. */
2892 tree mask1 = gimple_call_arg (call1, 2);
2893 tree mask2 = gimple_call_arg (call2, 2);
2894 if (!operand_equal_p (mask1, mask2, 0))
2895 {
2896 mask1 = strip_conversion (mask1);
2897 if (!mask1)
2898 return false;
2899 mask2 = strip_conversion (mask2);
2900 if (!mask2)
2901 return false;
2902 if (!operand_equal_p (mask1, mask2, 0))
2903 return false;
2904 }
2905 return true;
2906 }
2907
2908 return false;
2909 }
2910
2911 /* Function vect_analyze_data_ref_accesses.
2912
2913 Analyze the access pattern of all the data references in the loop.
2914
2915 FORNOW: the only access pattern that is considered vectorizable is a
2916 simple step 1 (consecutive) access.
2917
2918 FORNOW: handle only arrays and pointer accesses. */
2919
2920 bool
2921 vect_analyze_data_ref_accesses (vec_info *vinfo)
2922 {
2923 unsigned int i;
2924 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2925 struct data_reference *dr;
2926
2927 DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2928
2929 if (datarefs.is_empty ())
2930 return true;
2931
2932 /* Sort the array of datarefs to make building the interleaving chains
2933 linear. Don't modify the original vector's order; it is needed for
2934 determining what dependencies are reversed. */
2935 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2936 datarefs_copy.qsort (dr_group_sort_cmp);
2937
2938 /* Build the interleaving chains. */
2939 for (i = 0; i < datarefs_copy.length () - 1;)
2940 {
2941 data_reference_p dra = datarefs_copy[i];
2942 dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2943 stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2944 stmt_vec_info lastinfo = NULL;
2945 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2946 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2947 {
2948 ++i;
2949 continue;
2950 }
2951 for (i = i + 1; i < datarefs_copy.length (); ++i)
2952 {
2953 data_reference_p drb = datarefs_copy[i];
2954 dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2955 stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2956 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2957 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2958 break;
2959
2960 /* ??? Imperfect sorting (non-compatible types, non-modulo
2961 accesses, same accesses) can lead to a group being artificially
2962 split here as we don't just skip over those. If it really
2963 matters we can push those to a worklist and re-iterate
2964 over them. Then we can just skip ahead to the next DR here. */
2965
2966 /* DRs in different loops should not be put into the same
2967 interleaving group. */
2968 if (gimple_bb (DR_STMT (dra))->loop_father
2969 != gimple_bb (DR_STMT (drb))->loop_father)
2970 break;
2971
2972 /* Check that the data-refs have the same first location (except init)
2973 and that they are both either stores or loads (not a load and a
2974 store, and with matching masks for masked accesses). */
2975 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2976 || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2977 DR_BASE_ADDRESS (drb)) != 0
2978 || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2979 || !can_group_stmts_p (stmtinfo_a, stmtinfo_b))
2980 break;
2981
2982 /* Check that the data-refs have the same constant size. */
2983 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2984 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2985 if (!tree_fits_uhwi_p (sza)
2986 || !tree_fits_uhwi_p (szb)
2987 || !tree_int_cst_equal (sza, szb))
2988 break;
2989
2990 /* Check that the data-refs have the same step. */
2991 if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2992 break;
2993
2994 /* Check the types are compatible.
2995 ??? We don't distinguish this during sorting. */
2996 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2997 TREE_TYPE (DR_REF (drb))))
2998 break;
2999
3000 /* Check that the DR_INITs are compile-time constants. */
3001 if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3002 || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3003 break;
3004
3005 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
3006 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3007 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3008 HOST_WIDE_INT init_prev
3009 = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3010 gcc_assert (init_a <= init_b
3011 && init_a <= init_prev
3012 && init_prev <= init_b);
3013
3014 /* Do not place the same access in the interleaving chain twice. */
3015 if (init_b == init_prev)
3016 {
3017 gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3018 < gimple_uid (DR_STMT (drb)));
3019 /* ??? For now we simply "drop" the later reference which is
3020 otherwise the same rather than finishing off this group.
3021 In the end we'd want to re-process duplicates forming
3022 multiple groups from the refs, likely by just collecting
3023 all candidates (including duplicates and split points
3024 below) in a vector and then processing them together. */
3025 continue;
3026 }
3027
3028 /* If init_b == init_a + the size of the type * k, we have an
3029 interleaving, and DRA is accessed before DRB. */
3030 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3031 if (type_size_a == 0
3032 || (init_b - init_a) % type_size_a != 0)
3033 break;
3034
3035 /* If we have a store, the accesses must be adjacent. This splits
3036 groups into chunks we support (we don't support vectorization
3037 of stores with gaps). */
3038 if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3039 break;
3040
3041 /* If the step (when constant and nonzero) is greater than the
3042 difference between the data-refs' inits, this splits groups into
3043 suitable sizes. */
3044 if (tree_fits_shwi_p (DR_STEP (dra)))
3045 {
3046 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3047 if (step != 0 && step <= (init_b - init_a))
3048 break;
3049 }
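/* Illustrative example: stores to a[2*i] and a[2*i+1], with a an array
   of 4-byte ints, share base, offset and step (8 bytes); their DR_INITs
   are 0 and 4, the difference is a multiple of the 4-byte element size,
   the accesses are adjacent, and the step (8) is greater than the
   difference (4), so the two statements are linked into one
   interleaving group below.  */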
3050
3051 if (dump_enabled_p ())
3052 {
3053 dump_printf_loc (MSG_NOTE, vect_location,
3054 "Detected interleaving ");
3055 if (DR_IS_READ (dra))
3056 dump_printf (MSG_NOTE, "load ");
3057 else
3058 dump_printf (MSG_NOTE, "store ");
3059 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
3060 dump_printf (MSG_NOTE, " and ");
3061 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
3062 dump_printf (MSG_NOTE, "\n");
3063 }
3064
3065 /* Link the found element into the group list. */
3066 if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3067 {
3068 DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3069 lastinfo = stmtinfo_a;
3070 }
3071 DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3072 DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3073 lastinfo = stmtinfo_b;
3074 }
3075 }
3076
3077 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3078 {
3079 dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3080 if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3081 && !vect_analyze_data_ref_access (dr_info))
3082 {
3083 if (dump_enabled_p ())
3084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3085 "not vectorized: complicated access pattern.\n");
3086
3087 if (is_a <bb_vec_info> (vinfo))
3088 {
3089 /* Mark the statement as not vectorizable. */
3090 STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3091 continue;
3092 }
3093 else
3094 {
3095 datarefs_copy.release ();
3096 return false;
3097 }
3098 }
3099 }
3100
3101 datarefs_copy.release ();
3102 return true;
3103 }
3104
3105 /* Function vect_vfa_segment_size.
3106
3107 Input:
3108 DR_INFO: The data reference.
3109 LENGTH_FACTOR: segment length to consider.
3110
3111 Return a value suitable for the dr_with_seg_len::seg_len field.
3112 This is the "distance travelled" by the pointer from the first
3113 iteration in the segment to the last. Note that it does not include
3114 the size of the access; in effect it only describes the first byte. */
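/* For example (illustrative numbers): a contiguous access to 4-byte
   elements has DR_STEP 4; with LENGTH_FACTOR 8 the segment length is
   (8 - 1) * 4 = 28 bytes, the distance from the start of the first
   access in the segment to the start of the last.  */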
3115
3116 static tree
3117 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3118 {
3119 length_factor = size_binop (MINUS_EXPR,
3120 fold_convert (sizetype, length_factor),
3121 size_one_node);
3122 return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3123 length_factor);
3124 }
3125
3126 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3127 gives the worst-case number of bytes covered by the segment. */
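/* For instance (illustrative), for the first statement of a group of
   three 4-byte accesses with no gap, the access size is 3 * 4 = 12
   bytes; adding that to the 28-byte segment length of the example
   above gives a 40-byte range to be checked for overlap.  */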
3128
3129 static unsigned HOST_WIDE_INT
3130 vect_vfa_access_size (dr_vec_info *dr_info)
3131 {
3132 stmt_vec_info stmt_vinfo = dr_info->stmt;
3133 tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3134 unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3135 unsigned HOST_WIDE_INT access_size = ref_size;
3136 if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3137 {
3138 gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3139 access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3140 }
3141 if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3142 && (vect_supportable_dr_alignment (dr_info, false)
3143 == dr_explicit_realign_optimized))
3144 {
3145 /* We might access a full vector's worth. */
3146 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3147 access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3148 }
3149 return access_size;
3150 }
3151
3152 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3153 describes. */
3154
3155 static unsigned int
3156 vect_vfa_align (dr_vec_info *dr_info)
3157 {
3158 return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3159 }
3160
3161 /* Function vect_no_alias_p.
3162
3163 Given data references A and B with equal base and offset, see whether
3164 the alias relation can be decided at compilation time. Return 1 if
3165 it can and the references alias, 0 if it can and the references do
3166 not alias, and -1 if we cannot decide at compile time. SEGMENT_LENGTH_A,
3167 SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3168 of dr_with_seg_len::{seg_len,access_size} for A and B. */
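/* Illustrative example with positive steps: if A and B have DR_INITs
   0 and 32, segment lengths of 16 bytes and access sizes of 4 bytes,
   the byte ranges checked are [0, 20) and [32, 52), which cannot
   overlap, so the function returns 0; with DR_INITs 0 and 8 the
   ranges [0, 20) and [8, 28) always overlap and it returns 1.  */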
3169
3170 static int
3171 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3172 tree segment_length_a, tree segment_length_b,
3173 unsigned HOST_WIDE_INT access_size_a,
3174 unsigned HOST_WIDE_INT access_size_b)
3175 {
3176 poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3177 poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3178 poly_uint64 const_length_a;
3179 poly_uint64 const_length_b;
3180
3181 /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3182 bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3183 [a, a+12) */
3184 if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3185 {
3186 const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3187 offset_a = (offset_a + access_size_a) - const_length_a;
3188 }
3189 else
3190 const_length_a = tree_to_poly_uint64 (segment_length_a);
3191 if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3192 {
3193 const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3194 offset_b = (offset_b + access_size_b) - const_length_b;
3195 }
3196 else
3197 const_length_b = tree_to_poly_uint64 (segment_length_b);
3198
3199 const_length_a += access_size_a;
3200 const_length_b += access_size_b;
3201
3202 if (ranges_known_overlap_p (offset_a, const_length_a,
3203 offset_b, const_length_b))
3204 return 1;
3205
3206 if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3207 offset_b, const_length_b))
3208 return 0;
3209
3210 return -1;
3211 }
3212
3213 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3214 in DDR is >= VF. */
3215
3216 static bool
3217 dependence_distance_ge_vf (data_dependence_relation *ddr,
3218 unsigned int loop_depth, poly_uint64 vf)
3219 {
3220 if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3221 || DDR_NUM_DIST_VECTS (ddr) == 0)
3222 return false;
3223
3224 /* If the dependence is exact, we should have limited the VF instead. */
3225 gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3226
3227 unsigned int i;
3228 lambda_vector dist_v;
3229 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3230 {
3231 HOST_WIDE_INT dist = dist_v[loop_depth];
3232 if (dist != 0
3233 && !(dist > 0 && DDR_REVERSED_P (ddr))
3234 && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3235 return false;
3236 }
3237
3238 if (dump_enabled_p ())
3239 {
3240 dump_printf_loc (MSG_NOTE, vect_location,
3241 "dependence distance between ");
3242 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
3243 dump_printf (MSG_NOTE, " and ");
3244 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
3245 dump_printf (MSG_NOTE, " is >= VF\n");
3246 }
3247
3248 return true;
3249 }
3250
3251 /* Dump LOWER_BOUND using flags DUMP_KIND. Dumps are known to be enabled. */
3252
3253 static void
3254 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3255 {
3256 dump_printf (dump_kind, "%s (", lower_bound.unsigned_p ? "unsigned" : "abs");
3257 dump_generic_expr (dump_kind, TDF_SLIM, lower_bound.expr);
3258 dump_printf (dump_kind, ") >= ");
3259 dump_dec (dump_kind, lower_bound.min_value);
3260 }
3261
3262 /* Record that the vectorized loop requires the vec_lower_bound described
3263 by EXPR, UNSIGNED_P and MIN_VALUE. */
3264
3265 static void
3266 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3267 poly_uint64 min_value)
3268 {
3269 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3270 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3271 if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3272 {
3273 unsigned_p &= lower_bounds[i].unsigned_p;
3274 min_value = upper_bound (lower_bounds[i].min_value, min_value);
3275 if (lower_bounds[i].unsigned_p != unsigned_p
3276 || maybe_lt (lower_bounds[i].min_value, min_value))
3277 {
3278 lower_bounds[i].unsigned_p = unsigned_p;
3279 lower_bounds[i].min_value = min_value;
3280 if (dump_enabled_p ())
3281 {
3282 dump_printf_loc (MSG_NOTE, vect_location,
3283 "updating run-time check to ");
3284 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3285 dump_printf (MSG_NOTE, "\n");
3286 }
3287 }
3288 return;
3289 }
3290
3291 vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3292 if (dump_enabled_p ())
3293 {
3294 dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3295 dump_lower_bound (MSG_NOTE, lower_bound);
3296 dump_printf (MSG_NOTE, "\n");
3297 }
3298 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3299 }
3300
3301 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3302 will span fewer than GAP bytes. */
3303
3304 static bool
3305 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3306 poly_int64 gap)
3307 {
3308 stmt_vec_info stmt_info = dr_info->stmt;
3309 HOST_WIDE_INT count
3310 = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3311 if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3312 count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3313 return (estimated_poly_value (gap)
3314 <= count * vect_get_scalar_dr_size (dr_info));
3315 }
3316
3317 /* Return true if we know that there is no alias between DR_INFO_A and
3318 DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3319 When returning true, set *LOWER_BOUND_OUT to this N. */
3320
3321 static bool
3322 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3323 poly_uint64 *lower_bound_out)
3324 {
3325 /* Check that there is a constant gap of known sign between DR_A
3326 and DR_B. */
3327 data_reference *dr_a = dr_info_a->dr;
3328 data_reference *dr_b = dr_info_b->dr;
3329 poly_int64 init_a, init_b;
3330 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3331 || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3332 || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3333 || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3334 || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3335 || !ordered_p (init_a, init_b))
3336 return false;
3337
3338 /* Sort DR_A and DR_B by the address they access. */
3339 if (maybe_lt (init_b, init_a))
3340 {
3341 std::swap (init_a, init_b);
3342 std::swap (dr_info_a, dr_info_b);
3343 std::swap (dr_a, dr_b);
3344 }
3345
3346 /* If the two accesses could be dependent within a scalar iteration,
3347 make sure that we'd retain their order. */
3348 if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3349 && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3350 return false;
3351
3352 /* There is no alias if abs (DR_STEP) is greater than or equal to
3353 the bytes spanned by the combination of the two accesses. */
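  /* Worked example (hypothetical values): if DR_A starts at byte offset 0,
     DR_B starts at byte offset 4 and DR_B accesses 4 bytes, the bound below
     is 4 + 4 - 0 = 8, so any |DR_STEP| >= 8 keeps the iterations apart.  */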
3354 *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3355 return true;
3356 }
3357
3358 /* Function vect_prune_runtime_alias_test_list.
3359
3360 Prune a list of ddrs to be tested at run-time by versioning for alias.
3361 Merge several alias checks into one if possible.
3362 Return FALSE if the resulting list of ddrs is longer than allowed by
3363 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
3364
3365 bool
3366 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3367 {
3368 typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3369 hash_set <tree_pair_hash> compared_objects;
3370
3371 vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3372 vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3373 = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3374 vec<vec_object_pair> &check_unequal_addrs
3375 = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3376 poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3377 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3378
3379 ddr_p ddr;
3380 unsigned int i;
3381 tree length_factor;
3382
3383 DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3384
3385 /* Step values are irrelevant for aliasing if the number of vector
3386 iterations is equal to the number of scalar iterations (which can
3387 happen for fully-SLP loops). */
3388 bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3389
3390 if (!ignore_step_p)
3391 {
3392 /* Convert the checks for nonzero steps into bound tests. */
3393 tree value;
3394 FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3395 vect_check_lower_bound (loop_vinfo, value, true, 1);
3396 }
3397
3398 if (may_alias_ddrs.is_empty ())
3399 return true;
3400
3401 comp_alias_ddrs.create (may_alias_ddrs.length ());
3402
3403 unsigned int loop_depth
3404 = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3405 LOOP_VINFO_LOOP_NEST (loop_vinfo));
3406
3407 /* First, we collect all data ref pairs for aliasing checks. */
3408 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3409 {
3410 int comp_res;
3411 poly_uint64 lower_bound;
3412 tree segment_length_a, segment_length_b;
3413 unsigned HOST_WIDE_INT access_size_a, access_size_b;
3414 unsigned int align_a, align_b;
3415
3416 /* Ignore the alias if the VF we chose ended up being no greater
3417 than the dependence distance. */
3418 if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3419 continue;
3420
3421 if (DDR_OBJECT_A (ddr))
3422 {
3423 vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3424 if (!compared_objects.add (new_pair))
3425 {
3426 if (dump_enabled_p ())
3427 {
3428 dump_printf_loc (MSG_NOTE, vect_location, "checking that ");
3429 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.first);
3430 dump_printf (MSG_NOTE, " and ");
3431 dump_generic_expr (MSG_NOTE, TDF_SLIM, new_pair.second);
3432 dump_printf (MSG_NOTE, " have different addresses\n");
3433 }
3434 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3435 }
3436 continue;
3437 }
3438
3439 dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3440 stmt_vec_info stmt_info_a = dr_info_a->stmt;
3441
3442 dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3443 stmt_vec_info stmt_info_b = dr_info_b->stmt;
3444
3445 /* Skip the pair if inter-iteration dependencies are irrelevant
3446 and intra-iteration dependencies are guaranteed to be honored. */
3447 if (ignore_step_p
3448 && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3449 || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3450 &lower_bound)))
3451 {
3452 if (dump_enabled_p ())
3453 {
3454 dump_printf_loc (MSG_NOTE, vect_location,
3455 "no need for alias check between ");
3456 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3457 dump_printf (MSG_NOTE, " and ");
3458 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3459 dump_printf (MSG_NOTE, " when VF is 1\n");
3460 }
3461 continue;
3462 }
3463
3464 /* See whether we can handle the alias using a bounds check on
3465 the step, and whether that's likely to be the best approach.
3466 (It might not be, for example, if the minimum step is much larger
3467 than the number of bytes handled by one vector iteration.) */
3468 if (!ignore_step_p
3469 && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3470 && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3471 &lower_bound)
3472 && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3473 || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3474 {
3475 bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3476 if (dump_enabled_p ())
3477 {
3478 dump_printf_loc (MSG_NOTE, vect_location, "no alias between ");
3479 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3480 dump_printf (MSG_NOTE, " and ");
3481 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3482 dump_printf (MSG_NOTE, " when the step ");
3483 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_STEP (dr_info_a->dr));
3484 dump_printf (MSG_NOTE, " is outside ");
3485 if (unsigned_p)
3486 dump_printf (MSG_NOTE, "[0");
3487 else
3488 {
3489 dump_printf (MSG_NOTE, "(");
3490 dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3491 }
3492 dump_printf (MSG_NOTE, ", ");
3493 dump_dec (MSG_NOTE, lower_bound);
3494 dump_printf (MSG_NOTE, ")\n");
3495 }
3496 vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3497 unsigned_p, lower_bound);
3498 continue;
3499 }
3500
3501 stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3502 if (dr_group_first_a)
3503 {
3504 stmt_info_a = dr_group_first_a;
3505 dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3506 }
3507
3508 stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3509 if (dr_group_first_b)
3510 {
3511 stmt_info_b = dr_group_first_b;
3512 dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3513 }
3514
3515 if (ignore_step_p)
3516 {
3517 segment_length_a = size_zero_node;
3518 segment_length_b = size_zero_node;
3519 }
3520 else
3521 {
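	  /* A note on the choice below: when the two references advance by
	     the same step their relative distance is fixed, so the segments
	     only need to cover one vector iteration (VF scalar iterations);
	     when the steps differ the references drift, so the segments must
	     cover the whole scalar iteration count.  */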
3522 if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3523 DR_STEP (dr_info_b->dr), 0))
3524 length_factor = scalar_loop_iters;
3525 else
3526 length_factor = size_int (vect_factor);
3527 segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3528 segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3529 }
3530 access_size_a = vect_vfa_access_size (dr_info_a);
3531 access_size_b = vect_vfa_access_size (dr_info_b);
3532 align_a = vect_vfa_align (dr_info_a);
3533 align_b = vect_vfa_align (dr_info_b);
3534
3535 comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3536 DR_BASE_ADDRESS (dr_info_b->dr));
3537 if (comp_res == 0)
3538 comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3539 DR_OFFSET (dr_info_b->dr));
3540
3541 /* See whether the alias is known at compilation time. */
3542 if (comp_res == 0
3543 && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3544 && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3545 && poly_int_tree_p (segment_length_a)
3546 && poly_int_tree_p (segment_length_b))
3547 {
3548 int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3549 segment_length_a,
3550 segment_length_b,
3551 access_size_a,
3552 access_size_b);
3553 if (res >= 0 && dump_enabled_p ())
3554 {
3555 dump_printf_loc (MSG_NOTE, vect_location,
3556 "can tell at compile time that ");
3557 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_a->dr));
3558 dump_printf (MSG_NOTE, " and ");
3559 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr_info_b->dr));
3560 if (res == 0)
3561 dump_printf (MSG_NOTE, " do not alias\n");
3562 else
3563 dump_printf (MSG_NOTE, " alias\n");
3564 }
3565
3566 if (res == 0)
3567 continue;
3568
3569 if (res == 1)
3570 {
3571 if (dump_enabled_p ())
3572 dump_printf_loc (MSG_NOTE, vect_location,
3573 "not vectorized: compilation time alias.\n");
3574 return false;
3575 }
3576 }
3577
3578 dr_with_seg_len_pair_t dr_with_seg_len_pair
3579 (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3580 access_size_a, align_a),
3581 dr_with_seg_len (dr_info_b->dr, segment_length_b,
3582 access_size_b, align_b));
3583
3584 /* Canonicalize pairs by sorting the two DR members. */
3585 if (comp_res > 0)
3586 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3587
3588 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3589 }
3590
3591 prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3592
3593 unsigned int count = (comp_alias_ddrs.length ()
3594 + check_unequal_addrs.length ());
3595
3596 dump_printf_loc (MSG_NOTE, vect_location,
3597 "improved number of alias checks from %d to %d\n",
3598 may_alias_ddrs.length (), count);
3599 if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3600 {
3601 if (dump_enabled_p ())
3602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3603 "number of versioning for alias "
3604 "run-time tests exceeds %d "
3605 "(--param vect-max-version-for-alias-checks)\n",
3606 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3607 return false;
3608 }
3609
3610 return true;
3611 }
3612
3613 /* Check whether we can use an internal function for a gather load
3614 or scatter store. READ_P is true for loads and false for stores.
3615 MASKED_P is true if the load or store is conditional. MEMORY_TYPE is
3616 the type of the memory elements being loaded or stored. OFFSET_BITS
3617 is the number of bits in each scalar offset and OFFSET_SIGN is the
3618 sign of the offset. SCALE is the amount by which the offset should
3619 be multiplied *after* it has been converted to address width.
3620
3621 Return true if the function is supported, storing the function
3622 id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT. */
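/* For instance (illustrative only): a masked gather of 32-bit elements with
   32-bit signed offsets and SCALE 4 queries the target for
   IFN_MASK_GATHER_LOAD support on VECTYPE with that offset sign and scale.  */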
3623
3624 bool
3625 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3626 tree memory_type, unsigned int offset_bits,
3627 signop offset_sign, int scale,
3628 internal_fn *ifn_out, tree *element_type_out)
3629 {
3630 unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3631 unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3632 if (offset_bits > element_bits)
3633 /* Internal functions require the offset to be the same width as
3634 the vector elements. We can extend narrower offsets, but it isn't
3635 safe to truncate wider offsets. */
3636 return false;
3637
3638 if (element_bits != memory_bits)
3639 /* For now the vector elements must be the same width as the
3640 memory elements. */
3641 return false;
3642
3643 /* Work out which function we need. */
3644 internal_fn ifn;
3645 if (read_p)
3646 ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3647 else
3648 ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3649
3650 /* Test whether the target supports this combination. */
3651 if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3652 offset_sign, scale))
3653 return false;
3654
3655 *ifn_out = ifn;
3656 *element_type_out = TREE_TYPE (vectype);
3657 return true;
3658 }
3659
3660 /* STMT_INFO is a call to an internal gather load or scatter store function.
3661 Describe the operation in INFO. */
3662
3663 static void
3664 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3665 gather_scatter_info *info)
3666 {
3667 gcall *call = as_a <gcall *> (stmt_info->stmt);
3668 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3669 data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3670
3671 info->ifn = gimple_call_internal_fn (call);
3672 info->decl = NULL_TREE;
3673 info->base = gimple_call_arg (call, 0);
3674 info->offset = gimple_call_arg (call, 1);
3675 info->offset_dt = vect_unknown_def_type;
3676 info->offset_vectype = NULL_TREE;
3677 info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3678 info->element_type = TREE_TYPE (vectype);
3679 info->memory_type = TREE_TYPE (DR_REF (dr));
3680 }
3681
3682 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3683 gather load or scatter store. Describe the operation in *INFO if so. */
3684
3685 bool
3686 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3687 gather_scatter_info *info)
3688 {
3689 HOST_WIDE_INT scale = 1;
3690 poly_int64 pbitpos, pbitsize;
3691 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3692 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3693 tree offtype = NULL_TREE;
3694 tree decl = NULL_TREE, base, off;
3695 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3696 tree memory_type = TREE_TYPE (DR_REF (dr));
3697 machine_mode pmode;
3698 int punsignedp, reversep, pvolatilep = 0;
3699 internal_fn ifn;
3700 tree element_type;
3701 bool masked_p = false;
3702
3703 /* See whether this is already a call to a gather/scatter internal function.
3704 If not, see whether it's a masked load or store. */
3705 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3706 if (call && gimple_call_internal_p (call))
3707 {
3708 ifn = gimple_call_internal_fn (call);
3709 if (internal_gather_scatter_fn_p (ifn))
3710 {
3711 vect_describe_gather_scatter_call (stmt_info, info);
3712 return true;
3713 }
3714 masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3715 }
3716
3717 /* True if we should aim to use internal functions rather than
3718 built-in functions. */
3719 bool use_ifn_p = (DR_IS_READ (dr)
3720 ? supports_vec_gather_load_p ()
3721 : supports_vec_scatter_store_p ());
3722
3723 base = DR_REF (dr);
3724 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3725 see if we can use the def stmt of the address. */
3726 if (masked_p
3727 && TREE_CODE (base) == MEM_REF
3728 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3729 && integer_zerop (TREE_OPERAND (base, 1))
3730 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3731 {
3732 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3733 if (is_gimple_assign (def_stmt)
3734 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3735 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3736 }
3737
3738 /* The gather and scatter builtins need address of the form
3739 loop_invariant + vector * {1, 2, 4, 8}
3740 or
3741 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3742 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3743 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3744 multiplications and additions in it. To get a vector, we need
3745 a single SSA_NAME that will be defined in the loop and will
3746 contain everything that is not loop invariant and that can be
3747 vectorized. The following code attempts to find such a preexisting
3748 SSA_NAME OFF and put the loop invariants into a tree BASE
3749 that can be gimplified before the loop. */
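  /* As a made-up illustration, a scalar loop such as

	for (i = 0; i < n; i++)
	  sum += data[idx[i]];

     references data[idx[i]]: the loop-invariant &data[0] ends up in BASE,
     the SSA_NAME holding idx[i] becomes OFF and the element size becomes
     the scale.  */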
3750 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3751 &punsignedp, &reversep, &pvolatilep);
3752 if (reversep)
3753 return false;
3754
3755 poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3756
3757 if (TREE_CODE (base) == MEM_REF)
3758 {
3759 if (!integer_zerop (TREE_OPERAND (base, 1)))
3760 {
3761 if (off == NULL_TREE)
3762 off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3763 else
3764 off = size_binop (PLUS_EXPR, off,
3765 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3766 }
3767 base = TREE_OPERAND (base, 0);
3768 }
3769 else
3770 base = build_fold_addr_expr (base);
3771
3772 if (off == NULL_TREE)
3773 off = size_zero_node;
3774
3775 /* If base is not loop invariant, then: if off is 0, start with just the
3776 constant offset in the loop invariant BASE and continue with base as
3777 OFF; otherwise give up. We could handle that case by gimplifying the
3778 addition of base + off into some SSA_NAME and using that as off, but
3779 for now punt. */
3780 if (!expr_invariant_in_loop_p (loop, base))
3781 {
3782 if (!integer_zerop (off))
3783 return false;
3784 off = base;
3785 base = size_int (pbytepos);
3786 }
3787 /* Otherwise put base + constant offset into the loop invariant BASE
3788 and continue with OFF. */
3789 else
3790 {
3791 base = fold_convert (sizetype, base);
3792 base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3793 }
3794
3795 /* OFF at this point may be either a SSA_NAME or some tree expression
3796 from get_inner_reference. Try to peel off loop invariants from it
3797 into BASE as long as possible. */
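  /* For instance (hypothetical tree): if OFF is (sizetype) i_5 * 4 + 16,
     the invariant 16 is folded into BASE, the multiplication sets SCALE
     to 4 and i_5 is what remains as the vectorizable offset.  */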
3798 STRIP_NOPS (off);
3799 while (offtype == NULL_TREE)
3800 {
3801 enum tree_code code;
3802 tree op0, op1, add = NULL_TREE;
3803
3804 if (TREE_CODE (off) == SSA_NAME)
3805 {
3806 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3807
3808 if (expr_invariant_in_loop_p (loop, off))
3809 return false;
3810
3811 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3812 break;
3813
3814 op0 = gimple_assign_rhs1 (def_stmt);
3815 code = gimple_assign_rhs_code (def_stmt);
3816 op1 = gimple_assign_rhs2 (def_stmt);
3817 }
3818 else
3819 {
3820 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3821 return false;
3822 code = TREE_CODE (off);
3823 extract_ops_from_tree (off, &code, &op0, &op1);
3824 }
3825 switch (code)
3826 {
3827 case POINTER_PLUS_EXPR:
3828 case PLUS_EXPR:
3829 if (expr_invariant_in_loop_p (loop, op0))
3830 {
3831 add = op0;
3832 off = op1;
3833 do_add:
3834 add = fold_convert (sizetype, add);
3835 if (scale != 1)
3836 add = size_binop (MULT_EXPR, add, size_int (scale));
3837 base = size_binop (PLUS_EXPR, base, add);
3838 continue;
3839 }
3840 if (expr_invariant_in_loop_p (loop, op1))
3841 {
3842 add = op1;
3843 off = op0;
3844 goto do_add;
3845 }
3846 break;
3847 case MINUS_EXPR:
3848 if (expr_invariant_in_loop_p (loop, op1))
3849 {
3850 add = fold_convert (sizetype, op1);
3851 add = size_binop (MINUS_EXPR, size_zero_node, add);
3852 off = op0;
3853 goto do_add;
3854 }
3855 break;
3856 case MULT_EXPR:
3857 if (scale == 1 && tree_fits_shwi_p (op1))
3858 {
3859 int new_scale = tree_to_shwi (op1);
3860 /* Only treat this as a scaling operation if the target
3861 supports it. */
3862 if (use_ifn_p
3863 && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3864 vectype, memory_type, 1,
3865 TYPE_SIGN (TREE_TYPE (op0)),
3866 new_scale, &ifn,
3867 &element_type))
3868 break;
3869 scale = new_scale;
3870 off = op0;
3871 continue;
3872 }
3873 break;
3874 case SSA_NAME:
3875 off = op0;
3876 continue;
3877 CASE_CONVERT:
3878 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3879 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3880 break;
3881 if (TYPE_PRECISION (TREE_TYPE (op0))
3882 == TYPE_PRECISION (TREE_TYPE (off)))
3883 {
3884 off = op0;
3885 continue;
3886 }
3887
3888 /* The internal functions need the offset to be the same width
3889 as the elements of VECTYPE. Don't include operations that
3890 cast the offset from that width to a different width. */
3891 if (use_ifn_p
3892 && (int_size_in_bytes (TREE_TYPE (vectype))
3893 == int_size_in_bytes (TREE_TYPE (off))))
3894 break;
3895
3896 if (TYPE_PRECISION (TREE_TYPE (op0))
3897 < TYPE_PRECISION (TREE_TYPE (off)))
3898 {
3899 off = op0;
3900 offtype = TREE_TYPE (off);
3901 STRIP_NOPS (off);
3902 continue;
3903 }
3904 break;
3905 default:
3906 break;
3907 }
3908 break;
3909 }
3910
3911 /* If at the end OFF still isn't a SSA_NAME or isn't
3912 defined in the loop, punt. */
3913 if (TREE_CODE (off) != SSA_NAME
3914 || expr_invariant_in_loop_p (loop, off))
3915 return false;
3916
3917 if (offtype == NULL_TREE)
3918 offtype = TREE_TYPE (off);
3919
3920 if (use_ifn_p)
3921 {
3922 if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3923 memory_type, TYPE_PRECISION (offtype),
3924 TYPE_SIGN (offtype), scale, &ifn,
3925 &element_type))
3926 return false;
3927 }
3928 else
3929 {
3930 if (DR_IS_READ (dr))
3931 {
3932 if (targetm.vectorize.builtin_gather)
3933 decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3934 }
3935 else
3936 {
3937 if (targetm.vectorize.builtin_scatter)
3938 decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3939 }
3940
3941 if (!decl)
3942 return false;
3943
3944 ifn = IFN_LAST;
3945 element_type = TREE_TYPE (vectype);
3946 }
3947
3948 info->ifn = ifn;
3949 info->decl = decl;
3950 info->base = base;
3951 info->offset = off;
3952 info->offset_dt = vect_unknown_def_type;
3953 info->offset_vectype = NULL_TREE;
3954 info->scale = scale;
3955 info->element_type = element_type;
3956 info->memory_type = memory_type;
3957 return true;
3958 }
3959
3960 /* Find the data references in STMT, analyze them with respect to LOOP and
3961 append them to DATAREFS. Return false if datarefs in this stmt cannot
3962 be handled. */
3963
3964 bool
3965 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
3966 vec<data_reference_p> *datarefs)
3967 {
3968 /* We can ignore clobbers for dataref analysis - they are removed during
3969 loop vectorization and BB vectorization checks dependences with a
3970 stmt walk. */
3971 if (gimple_clobber_p (stmt))
3972 return true;
3973
3974 if (gimple_has_volatile_ops (stmt))
3975 {
3976 if (dump_enabled_p ())
3977 {
3978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3979 "not vectorized: volatile type ");
3980 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3981 }
3982 return false;
3983 }
3984
3985 if (stmt_can_throw_internal (stmt))
3986 {
3987 if (dump_enabled_p ())
3988 {
3989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3990 "not vectorized: statement can throw an "
3991 "exception ");
3992 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3993 }
3994 return false;
3995 }
3996
3997 auto_vec<data_reference_p, 2> refs;
3998 if (!find_data_references_in_stmt (loop, stmt, &refs))
3999 return false;
4000
4001 if (refs.is_empty ())
4002 return true;
4003
4004 if (refs.length () > 1)
4005 {
4006 if (dump_enabled_p ())
4007 {
4008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4009 "not vectorized: more than one data ref "
4010 "in stmt: ");
4011 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4012 }
4013 return false;
4014 }
4015
4016 if (gcall *call = dyn_cast <gcall *> (stmt))
4017 if (!gimple_call_internal_p (call)
4018 || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4019 && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4020 {
4021 if (dump_enabled_p ())
4022 {
4023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4024 "not vectorized: dr in a call ");
4025 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4026 }
4027 return false;
4028 }
4029
4030 data_reference_p dr = refs.pop ();
4031 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4032 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4033 {
4034 if (dump_enabled_p ())
4035 {
4036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4037 "not vectorized: statement is bitfield "
4038 "access ");
4039 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
4040 }
4041 return false;
4042 }
4043
4044 if (DR_BASE_ADDRESS (dr)
4045 && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4046 {
4047 if (dump_enabled_p ())
4048 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4049 "not vectorized: base addr of dr is a "
4050 "constant\n");
4051 return false;
4052 }
4053
4054 /* Check whether this may be a SIMD lane access and adjust the
4055 DR to make it easier for us to handle it. */
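  /* This is intended to catch the per-lane arrays that OMP SIMD lowering
     creates for privatized variables, where the access is roughly
     array[GOMP_SIMD_LANE (simduid)]: the offset is lane * element-size and
     the step within the innermost containing loop is zero.  */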
4056 if (loop
4057 && loop->simduid
4058 && (!DR_BASE_ADDRESS (dr)
4059 || !DR_OFFSET (dr)
4060 || !DR_INIT (dr)
4061 || !DR_STEP (dr)))
4062 {
4063 struct data_reference *newdr
4064 = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4065 DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4066 if (DR_BASE_ADDRESS (newdr)
4067 && DR_OFFSET (newdr)
4068 && DR_INIT (newdr)
4069 && DR_STEP (newdr)
4070 && integer_zerop (DR_STEP (newdr)))
4071 {
4072 tree off = DR_OFFSET (newdr);
4073 STRIP_NOPS (off);
4074 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4075 && TREE_CODE (off) == MULT_EXPR
4076 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4077 {
4078 tree step = TREE_OPERAND (off, 1);
4079 off = TREE_OPERAND (off, 0);
4080 STRIP_NOPS (off);
4081 if (CONVERT_EXPR_P (off)
4082 && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4083 < TYPE_PRECISION (TREE_TYPE (off))))
4084 off = TREE_OPERAND (off, 0);
4085 if (TREE_CODE (off) == SSA_NAME)
4086 {
4087 gimple *def = SSA_NAME_DEF_STMT (off);
4088 tree reft = TREE_TYPE (DR_REF (newdr));
4089 if (is_gimple_call (def)
4090 && gimple_call_internal_p (def)
4091 && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4092 {
4093 tree arg = gimple_call_arg (def, 0);
4094 gcc_assert (TREE_CODE (arg) == SSA_NAME);
4095 arg = SSA_NAME_VAR (arg);
4096 if (arg == loop->simduid
4097 /* For now. */
4098 && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4099 {
4100 DR_OFFSET (newdr) = ssize_int (0);
4101 DR_STEP (newdr) = step;
4102 DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4103 DR_STEP_ALIGNMENT (newdr)
4104 = highest_pow2_factor (step);
4105 /* Mark as simd-lane access. */
4106 newdr->aux = (void *)-1;
4107 free_data_ref (dr);
4108 datarefs->safe_push (newdr);
4109 return true;
4110 }
4111 }
4112 }
4113 }
4114 }
4115 free_data_ref (newdr);
4116 }
4117
4118 datarefs->safe_push (dr);
4119 return true;
4120 }
4121
4122 /* Function vect_analyze_data_refs.
4123
4124 Find all the data references in the loop or basic block.
4125
4126 The general structure of the analysis of data refs in the vectorizer is as
4127 follows:
4128 1- vect_analyze_data_refs(loop/bb): call
4129 compute_data_dependences_for_loop/bb to find and analyze all data-refs
4130 in the loop/bb and their dependences.
4131 2- vect_analyze_dependences(): apply dependence testing using ddrs.
4132 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4133 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4134
4135 */
4136
4137 bool
4138 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4139 {
4140 struct loop *loop = NULL;
4141 unsigned int i;
4142 struct data_reference *dr;
4143 tree scalar_type;
4144
4145 DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4146
4147 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4148 loop = LOOP_VINFO_LOOP (loop_vinfo);
4149
4150 /* Go through the data-refs, check that the analysis succeeded. Update
4151 pointer from stmt_vec_info struct to DR and vectype. */
4152
4153 vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4154 FOR_EACH_VEC_ELT (datarefs, i, dr)
4155 {
4156 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4157 poly_uint64 vf;
4158
4159 gcc_assert (DR_REF (dr));
4160 stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4161 gcc_assert (!stmt_info->dr_aux.dr);
4162 stmt_info->dr_aux.dr = dr;
4163 stmt_info->dr_aux.stmt = stmt_info;
4164
4165 /* Check that analysis of the data-ref succeeded. */
4166 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4167 || !DR_STEP (dr))
4168 {
4169 bool maybe_gather
4170 = DR_IS_READ (dr)
4171 && !TREE_THIS_VOLATILE (DR_REF (dr))
4172 && (targetm.vectorize.builtin_gather != NULL
4173 || supports_vec_gather_load_p ());
4174 bool maybe_scatter
4175 = DR_IS_WRITE (dr)
4176 && !TREE_THIS_VOLATILE (DR_REF (dr))
4177 && (targetm.vectorize.builtin_scatter != NULL
4178 || supports_vec_scatter_store_p ());
4179
4180 /* If target supports vector gather loads or scatter stores,
4181 see whether they can be used. */
4182 if (is_a <loop_vec_info> (vinfo)
4183 && !nested_in_vect_loop_p (loop, stmt_info))
4184 {
4185 if (maybe_gather || maybe_scatter)
4186 {
4187 if (maybe_gather)
4188 gatherscatter = GATHER;
4189 else
4190 gatherscatter = SCATTER;
4191 }
4192 }
4193
4194 if (gatherscatter == SG_NONE)
4195 {
4196 if (dump_enabled_p ())
4197 {
4198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4199 "not vectorized: data ref analysis "
4200 "failed ");
4201 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4202 stmt_info->stmt, 0);
4203 }
4204 if (is_a <bb_vec_info> (vinfo))
4205 {
4206 /* In BB vectorization the ref can still participate
4207 in dependence analysis, we just can't vectorize it. */
4208 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4209 continue;
4210 }
4211 return false;
4212 }
4213 }
4214
4215 /* See if this was detected as SIMD lane access. */
4216 if (dr->aux == (void *)-1)
4217 {
4218 if (nested_in_vect_loop_p (loop, stmt_info))
4219 {
4220 if (dump_enabled_p ())
4221 {
4222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4223 "not vectorized: data ref analysis "
4224 "failed ");
4225 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4226 stmt_info->stmt, 0);
4227 }
4228 return false;
4229 }
4230 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4231 }
4232
4233 tree base = get_base_address (DR_REF (dr));
4234 if (base && VAR_P (base) && DECL_NONALIASED (base))
4235 {
4236 if (dump_enabled_p ())
4237 {
4238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4239 "not vectorized: base object not addressable "
4240 "for stmt: ");
4241 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4242 stmt_info->stmt, 0);
4243 }
4244 if (is_a <bb_vec_info> (vinfo))
4245 {
4246 /* In BB vectorization the ref can still participate
4247 in dependence analysis, we just can't vectorize it. */
4248 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4249 continue;
4250 }
4251 return false;
4252 }
4253
4254 if (is_a <loop_vec_info> (vinfo)
4255 && DR_STEP (dr)
4256 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4257 {
4258 if (nested_in_vect_loop_p (loop, stmt_info))
4259 {
4260 if (dump_enabled_p ())
4261 {
4262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4263 "not vectorized: not suitable for strided "
4264 "load ");
4265 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4266 stmt_info->stmt, 0);
4267 }
4268 return false;
4269 }
4270 STMT_VINFO_STRIDED_P (stmt_info) = true;
4271 }
4272
4273 /* Update DR field in stmt_vec_info struct. */
4274
4275 /* If the dataref is in an inner-loop of the loop that is considered for
4276 vectorization, we also want to analyze the access relative to
4277 the outer-loop (DR contains information only relative to the
4278 inner-most enclosing loop). We do that by building a reference to the
4279 first location accessed by the inner-loop, and analyze it relative to
4280 the outer-loop. */
4281 if (loop && nested_in_vect_loop_p (loop, stmt_info))
4282 {
4283 /* Build a reference to the first location accessed by the
4284 inner loop: *(BASE + INIT + OFFSET). By construction,
4285 this address must be invariant in the inner loop, so we
4286 can consider it as being used in the outer loop. */
4287 tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4288 tree offset = unshare_expr (DR_OFFSET (dr));
4289 tree init = unshare_expr (DR_INIT (dr));
4290 tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4291 init, offset);
4292 tree init_addr = fold_build_pointer_plus (base, init_offset);
4293 tree init_ref = build_fold_indirect_ref (init_addr);
4294
4295 if (dump_enabled_p ())
4296 {
4297 dump_printf_loc (MSG_NOTE, vect_location,
4298 "analyze in outer loop: ");
4299 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_ref);
4300 dump_printf (MSG_NOTE, "\n");
4301 }
4302
4303 if (!dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4304 init_ref, loop))
4305 /* dr_analyze_innermost already explained the failure. */
4306 return false;
4307
4308 if (dump_enabled_p ())
4309 {
4310 dump_printf_loc (MSG_NOTE, vect_location,
4311 "\touter base_address: ");
4312 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4313 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4314 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
4315 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4316 STMT_VINFO_DR_OFFSET (stmt_info));
4317 dump_printf (MSG_NOTE,
4318 "\n\touter constant offset from base address: ");
4319 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4320 STMT_VINFO_DR_INIT (stmt_info));
4321 dump_printf (MSG_NOTE, "\n\touter step: ");
4322 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4323 STMT_VINFO_DR_STEP (stmt_info));
4324 dump_printf (MSG_NOTE, "\n\touter base alignment: %d\n",
4325 STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info));
4326 dump_printf (MSG_NOTE, "\n\touter base misalignment: %d\n",
4327 STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info));
4328 dump_printf (MSG_NOTE, "\n\touter offset alignment: %d\n",
4329 STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info));
4330 dump_printf (MSG_NOTE, "\n\touter step alignment: %d\n",
4331 STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4332 }
4333 }
4334
4335 /* Set vectype for STMT. */
4336 scalar_type = TREE_TYPE (DR_REF (dr));
4337 STMT_VINFO_VECTYPE (stmt_info)
4338 = get_vectype_for_scalar_type (scalar_type);
4339 if (!STMT_VINFO_VECTYPE (stmt_info))
4340 {
4341 if (dump_enabled_p ())
4342 {
4343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4344 "not vectorized: no vectype for stmt: ");
4345 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4346 stmt_info->stmt, 0);
4347 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4348 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4349 scalar_type);
4350 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4351 }
4352
4353 if (is_a <bb_vec_info> (vinfo))
4354 {
4355 /* No vector type is fine, the ref can still participate
4356 in dependence analysis, we just can't vectorize it. */
4357 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4358 continue;
4359 }
4360 return false;
4361 }
4362 else
4363 {
4364 if (dump_enabled_p ())
4365 {
4366 dump_printf_loc (MSG_NOTE, vect_location,
4367 "got vectype for stmt: ");
4368 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
4369 dump_generic_expr (MSG_NOTE, TDF_SLIM,
4370 STMT_VINFO_VECTYPE (stmt_info));
4371 dump_printf (MSG_NOTE, "\n");
4372 }
4373 }
4374
4375 /* Adjust the minimal vectorization factor according to the
4376 vector type. */
4377 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4378 *min_vf = upper_bound (*min_vf, vf);
4379
4380 if (gatherscatter != SG_NONE)
4381 {
4382 gather_scatter_info gs_info;
4383 if (!vect_check_gather_scatter (stmt_info,
4384 as_a <loop_vec_info> (vinfo),
4385 &gs_info)
4386 || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4387 {
4388 if (dump_enabled_p ())
4389 {
4390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4391 (gatherscatter == GATHER) ?
4392 "not vectorized: not suitable for gather "
4393 "load " :
4394 "not vectorized: not suitable for scatter "
4395 "store ");
4396 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
4397 stmt_info->stmt, 0);
4398 }
4399 return false;
4400 }
4401 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4402 }
4403 }
4404
4405 /* We used to stop processing and prune the list here. Verify we no
4406 longer need to. */
4407 gcc_assert (i == datarefs.length ());
4408
4409 return true;
4410 }
4411
4412
4413 /* Function vect_get_new_vect_var.
4414
4415 Returns a name for a new variable.  The current naming scheme prefixes
4416 vectorizer-generated variable names with "vect_", "stmp_", "mask_" or
4417 "vectp_" (depending on the value of VAR_KIND) and appends NAME if
4418 provided. */
4419
4420 tree
4421 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4422 {
4423 const char *prefix;
4424 tree new_vect_var;
4425
4426 switch (var_kind)
4427 {
4428 case vect_simple_var:
4429 prefix = "vect";
4430 break;
4431 case vect_scalar_var:
4432 prefix = "stmp";
4433 break;
4434 case vect_mask_var:
4435 prefix = "mask";
4436 break;
4437 case vect_pointer_var:
4438 prefix = "vectp";
4439 break;
4440 default:
4441 gcc_unreachable ();
4442 }
4443
4444 if (name)
4445 {
4446 char* tmp = concat (prefix, "_", name, NULL);
4447 new_vect_var = create_tmp_reg (type, tmp);
4448 free (tmp);
4449 }
4450 else
4451 new_vect_var = create_tmp_reg (type, prefix);
4452
4453 return new_vect_var;
4454 }
4455
4456 /* Like vect_get_new_vect_var but return an SSA name. */
4457
4458 tree
4459 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4460 {
4461 const char *prefix;
4462 tree new_vect_var;
4463
4464 switch (var_kind)
4465 {
4466 case vect_simple_var:
4467 prefix = "vect";
4468 break;
4469 case vect_scalar_var:
4470 prefix = "stmp";
4471 break;
4472 case vect_pointer_var:
4473 prefix = "vectp";
4474 break;
4475 default:
4476 gcc_unreachable ();
4477 }
4478
4479 if (name)
4480 {
4481 char* tmp = concat (prefix, "_", name, NULL);
4482 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4483 free (tmp);
4484 }
4485 else
4486 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4487
4488 return new_vect_var;
4489 }
4490
4491 /* Duplicate ptr info and set alignment/misalignment on NAME from DR_INFO. */
4492
4493 static void
4494 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4495 {
4496 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4497 int misalign = DR_MISALIGNMENT (dr_info);
4498 if (misalign == DR_MISALIGNMENT_UNKNOWN)
4499 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4500 else
4501 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4502 DR_TARGET_ALIGNMENT (dr_info), misalign);
4503 }
4504
4505 /* Function vect_create_addr_base_for_vector_ref.
4506
4507 Create an expression that computes the address of the first memory location
4508 that will be accessed for a data reference.
4509
4510 Input:
4511 STMT_INFO: The statement containing the data reference.
4512 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4513 OFFSET: Optional. If supplied, it is added to the initial address.
4514 LOOP: Specify the loop-nest relative to which the address should be computed.
4515 For example, when the dataref is in an inner-loop nested in an
4516 outer-loop that is now being vectorized, LOOP can be either the
4517 outer-loop, or the inner-loop. The first memory location accessed
4518 by the following dataref ('in' points to short):
4519
4520 for (i=0; i<N; i++)
4521 for (j=0; j<M; j++)
4522 s += in[i+j]
4523
4524 is as follows:
4525 if LOOP=i_loop: &in (relative to i_loop)
4526 if LOOP=j_loop: &in+i*2B (relative to j_loop)
4527 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
4528 initial address. Unlike OFFSET, which is the number of elements to
4529 be added, BYTE_OFFSET is measured in bytes.
4530
4531 Output:
4532 1. Return an SSA_NAME whose value is the address of the memory location of
4533 the first vector of the data reference.
4534 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4535 these statement(s) which define the returned SSA_NAME.
4536
4537 FORNOW: We are only handling array accesses with step 1. */
4538
4539 tree
4540 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4541 gimple_seq *new_stmt_list,
4542 tree offset,
4543 tree byte_offset)
4544 {
4545 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4546 struct data_reference *dr = dr_info->dr;
4547 const char *base_name;
4548 tree addr_base;
4549 tree dest;
4550 gimple_seq seq = NULL;
4551 tree vect_ptr_type;
4552 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4553 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4554 innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4555
4556 tree data_ref_base = unshare_expr (drb->base_address);
4557 tree base_offset = unshare_expr (drb->offset);
4558 tree init = unshare_expr (drb->init);
4559
4560 if (loop_vinfo)
4561 base_name = get_name (data_ref_base);
4562 else
4563 {
4564 base_offset = ssize_int (0);
4565 init = ssize_int (0);
4566 base_name = get_name (DR_REF (dr));
4567 }
4568
4569 /* Create base_offset */
4570 base_offset = size_binop (PLUS_EXPR,
4571 fold_convert (sizetype, base_offset),
4572 fold_convert (sizetype, init));
4573
4574 if (offset)
4575 {
4576 offset = fold_build2 (MULT_EXPR, sizetype,
4577 fold_convert (sizetype, offset), step);
4578 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4579 base_offset, offset);
4580 }
4581 if (byte_offset)
4582 {
4583 byte_offset = fold_convert (sizetype, byte_offset);
4584 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4585 base_offset, byte_offset);
4586 }
4587
4588 /* base + base_offset */
4589 if (loop_vinfo)
4590 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4591 else
4592 {
4593 addr_base = build1 (ADDR_EXPR,
4594 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4595 unshare_expr (DR_REF (dr)));
4596 }
4597
4598 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4599 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4600 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4601 gimple_seq_add_seq (new_stmt_list, seq);
4602
4603 if (DR_PTR_INFO (dr)
4604 && TREE_CODE (addr_base) == SSA_NAME
4605 && !SSA_NAME_PTR_INFO (addr_base))
4606 {
4607 vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4608 if (offset || byte_offset)
4609 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4610 }
4611
4612 if (dump_enabled_p ())
4613 {
4614 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4615 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4616 dump_printf (MSG_NOTE, "\n");
4617 }
4618
4619 return addr_base;
4620 }
4621
4622
4623 /* Function vect_create_data_ref_ptr.
4624
4625 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4626 location accessed in the loop by STMT_INFO, along with the def-use update
4627 chain to appropriately advance the pointer through the loop iterations.
4628 Also set aliasing information for the pointer. This pointer is used by
4629 the callers to this function to create a memory reference expression for
4630 vector load/store access.
4631
4632 Input:
4633 1. STMT_INFO: a stmt that references memory. Expected to be of the form
4634 GIMPLE_ASSIGN <name, data-ref> or
4635 GIMPLE_ASSIGN <data-ref, name>.
4636 2. AGGR_TYPE: the type of the reference, which should be either a vector
4637 or an array.
4638 3. AT_LOOP: the loop where the vector memref is to be created.
4639 4. OFFSET (optional): an offset to be added to the initial address accessed
4640 by the data-ref in STMT_INFO.
4641 5. BSI: location where the new stmts are to be placed if there is no loop
4642 6. ONLY_INIT: indicate whether ap is to be updated in the loop, or remain
4643 pointing to the initial address.
4644 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4645 to the initial address accessed by the data-ref in STMT_INFO. This is
4646 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4647 in bytes.
4648 8. IV_STEP (optional, defaults to NULL): the amount that should be added
4649 to the IV during each iteration of the loop. NULL says to move
4650 by one copy of AGGR_TYPE up or down, depending on the step of the
4651 data reference.
4652
4653 Output:
4654 1. Declare a new ptr to vector_type, and have it point to the base of the
4655 data reference (initial address accessed by the data reference).
4656 For example, for vector of type V8HI, the following code is generated:
4657
4658 v8hi *ap;
4659 ap = (v8hi *)initial_address;
4660
4661 if OFFSET is not supplied:
4662 initial_address = &a[init];
4663 if OFFSET is supplied:
4664 initial_address = &a[init + OFFSET];
4665 if BYTE_OFFSET is supplied:
4666 initial_address = &a[init] + BYTE_OFFSET;
4667
4668 Return the initial_address in INITIAL_ADDRESS.
4669
4670 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4671 update the pointer in each iteration of the loop.
4672
4673 Return the increment stmt that updates the pointer in PTR_INCR.
4674
4675 3. Return the pointer. */
4676
4677 tree
4678 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4679 struct loop *at_loop, tree offset,
4680 tree *initial_address, gimple_stmt_iterator *gsi,
4681 gimple **ptr_incr, bool only_init,
4682 tree byte_offset, tree iv_step)
4683 {
4684 const char *base_name;
4685 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4686 struct loop *loop = NULL;
4687 bool nested_in_vect_loop = false;
4688 struct loop *containing_loop = NULL;
4689 tree aggr_ptr_type;
4690 tree aggr_ptr;
4691 tree new_temp;
4692 gimple_seq new_stmt_list = NULL;
4693 edge pe = NULL;
4694 basic_block new_bb;
4695 tree aggr_ptr_init;
4696 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4697 struct data_reference *dr = dr_info->dr;
4698 tree aptr;
4699 gimple_stmt_iterator incr_gsi;
4700 bool insert_after;
4701 tree indx_before_incr, indx_after_incr;
4702 gimple *incr;
4703 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4704
4705 gcc_assert (iv_step != NULL_TREE
4706 || TREE_CODE (aggr_type) == ARRAY_TYPE
4707 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4708
4709 if (loop_vinfo)
4710 {
4711 loop = LOOP_VINFO_LOOP (loop_vinfo);
4712 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4713 containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4714 pe = loop_preheader_edge (loop);
4715 }
4716 else
4717 {
4718 gcc_assert (bb_vinfo);
4719 only_init = true;
4720 *ptr_incr = NULL;
4721 }
4722
4723 /* Create an expression for the first address accessed by this load
4724 in LOOP. */
4725 base_name = get_name (DR_BASE_ADDRESS (dr));
4726
4727 if (dump_enabled_p ())
4728 {
4729 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4730 dump_printf_loc (MSG_NOTE, vect_location,
4731 "create %s-pointer variable to type: ",
4732 get_tree_code_name (TREE_CODE (aggr_type)));
4733 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4734 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4735 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4736 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4737 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4738 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4739 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4740 else
4741 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4742 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4743 dump_printf (MSG_NOTE, "\n");
4744 }
4745
4746 /* (1) Create the new aggregate-pointer variable.
4747 Vector and array types inherit the alias set of their component
4748 type by default so we need to use a ref-all pointer if the data
4749 reference does not conflict with the created aggregated data
4750 reference because it is not addressable. */
4751 bool need_ref_all = false;
4752 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4753 get_alias_set (DR_REF (dr))))
4754 need_ref_all = true;
4755 /* Likewise for any of the data references in the stmt group. */
4756 else if (DR_GROUP_SIZE (stmt_info) > 1)
4757 {
4758 stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4759 do
4760 {
4761 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4762 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4763 get_alias_set (DR_REF (sdr))))
4764 {
4765 need_ref_all = true;
4766 break;
4767 }
4768 sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4769 }
4770 while (sinfo);
4771 }
4772 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4773 need_ref_all);
4774 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4775
4776
4777 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4778 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4779 def-use update cycles for the pointer: one relative to the outer-loop
4780 (LOOP), which is what steps (3) and (4) below do. The other is relative
4781 to the inner-loop (which is the inner-most loop containing the dataref),
4782 and this is done be step (5) below.
4783
4784 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4785 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4786 redundant. Steps (3),(4) create the following:
4787
4788 vp0 = &base_addr;
4789 LOOP: vp1 = phi(vp0,vp2)
4790 ...
4791 ...
4792 vp2 = vp1 + step
4793 goto LOOP
4794
4795 If there is an inner-loop nested in loop, then step (5) will also be
4796 applied, and an additional update in the inner-loop will be created:
4797
4798 vp0 = &base_addr;
4799 LOOP: vp1 = phi(vp0,vp2)
4800 ...
4801 inner: vp3 = phi(vp1,vp4)
4802 vp4 = vp3 + inner_step
4803 if () goto inner
4804 ...
4805 vp2 = vp1 + step
4806 if () goto LOOP */
4807
4808 /* (2) Calculate the initial address of the aggregate-pointer, and set
4809 the aggregate-pointer to point to it before the loop. */
4810
4811 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4812
4813 new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4814 offset, byte_offset);
4815 if (new_stmt_list)
4816 {
4817 if (pe)
4818 {
4819 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4820 gcc_assert (!new_bb);
4821 }
4822 else
4823 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4824 }
4825
4826 *initial_address = new_temp;
4827 aggr_ptr_init = new_temp;
4828
4829 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4830 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4831 inner-loop nested in LOOP (during outer-loop vectorization). */
4832
4833 /* No update in loop is required. */
4834 if (only_init && (!loop_vinfo || at_loop == loop))
4835 aptr = aggr_ptr_init;
4836 else
4837 {
4838 /* Accesses to invariant addresses should be handled specially
4839 by the caller. */
4840 tree step = vect_dr_behavior (dr_info)->step;
4841 gcc_assert (!integer_zerop (step));
4842
4843 if (iv_step == NULL_TREE)
4844 {
4845 /* The step of the aggregate pointer is the type size,
4846 negated for downward accesses. */
4847 iv_step = TYPE_SIZE_UNIT (aggr_type);
4848 if (tree_int_cst_sgn (step) == -1)
4849 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4850 }
4851
4852 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4853
4854 create_iv (aggr_ptr_init,
4855 fold_convert (aggr_ptr_type, iv_step),
4856 aggr_ptr, loop, &incr_gsi, insert_after,
4857 &indx_before_incr, &indx_after_incr);
4858 incr = gsi_stmt (incr_gsi);
4859 loop_vinfo->add_stmt (incr);
4860
4861 /* Copy the points-to information if it exists. */
4862 if (DR_PTR_INFO (dr))
4863 {
4864 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4865 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4866 }
4867 if (ptr_incr)
4868 *ptr_incr = incr;
4869
4870 aptr = indx_before_incr;
4871 }
4872
4873 if (!nested_in_vect_loop || only_init)
4874 return aptr;
4875
4876
4877 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4878 nested in LOOP, if exists. */
4879
4880 gcc_assert (nested_in_vect_loop);
4881 if (!only_init)
4882 {
4883 standard_iv_increment_position (containing_loop, &incr_gsi,
4884 &insert_after);
4885 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4886 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4887 &indx_after_incr);
4888 incr = gsi_stmt (incr_gsi);
4889 loop_vinfo->add_stmt (incr);
4890
4891 /* Copy the points-to information if it exists. */
4892 if (DR_PTR_INFO (dr))
4893 {
4894 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4895 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4896 }
4897 if (ptr_incr)
4898 *ptr_incr = incr;
4899
4900 return indx_before_incr;
4901 }
4902 else
4903 gcc_unreachable ();
4904 }
4905
4906
4907 /* Function bump_vector_ptr
4908
4909 Increment a pointer (to a vector type) by vector-size. If requested,
4910 i.e. if PTR-INCR is given, then also connect the new increment stmt
4911 to the existing def-use update-chain of the pointer, by modifying
4912 the PTR_INCR as illustrated below:
4913
4914 The pointer def-use update-chain before this function:
4915 DATAREF_PTR = phi (p_0, p_2)
4916 ....
4917 PTR_INCR: p_2 = DATAREF_PTR + step
4918
4919 The pointer def-use update-chain after this function:
4920 DATAREF_PTR = phi (p_0, p_2)
4921 ....
4922 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4923 ....
4924 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4925
4926 Input:
4927 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4928 in the loop.
4929 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4930 the loop. The increment amount across iterations is expected
4931 to be vector_size.
4932 BSI - location where the new update stmt is to be placed.
4933 STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4934 BUMP - optional. The offset by which to bump the pointer. If not given,
4935 the offset is assumed to be vector_size.
4936
4937 Output: Return NEW_DATAREF_PTR as illustrated above.
4938
4939 */
4940
4941 tree
4942 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4943 stmt_vec_info stmt_info, tree bump)
4944 {
4945 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4946 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4947 tree update = TYPE_SIZE_UNIT (vectype);
4948 gassign *incr_stmt;
4949 ssa_op_iter iter;
4950 use_operand_p use_p;
4951 tree new_dataref_ptr;
4952
4953 if (bump)
4954 update = bump;
4955
4956 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4957 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4958 else
4959 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4960 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4961 dataref_ptr, update);
4962 vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4963
4964 /* Copy the points-to information if it exists. */
4965 if (DR_PTR_INFO (dr))
4966 {
4967 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4968 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4969 }
4970
4971 if (!ptr_incr)
4972 return new_dataref_ptr;
4973
4974 /* Update the vector-pointer's cross-iteration increment. */
4975 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4976 {
4977 tree use = USE_FROM_PTR (use_p);
4978
4979 if (use == dataref_ptr)
4980 SET_USE (use_p, new_dataref_ptr);
4981 else
4982 gcc_assert (operand_equal_p (use, update, 0));
4983 }
4984
4985 return new_dataref_ptr;
4986 }
4987
4988
4989 /* Copy memory reference info such as base/clique from the SRC reference
4990 to the DEST MEM_REF. */
4991
4992 void
4993 vect_copy_ref_info (tree dest, tree src)
4994 {
4995 if (TREE_CODE (dest) != MEM_REF)
4996 return;
4997
4998 tree src_base = src;
4999 while (handled_component_p (src_base))
5000 src_base = TREE_OPERAND (src_base, 0);
5001 if (TREE_CODE (src_base) != MEM_REF
5002 && TREE_CODE (src_base) != TARGET_MEM_REF)
5003 return;
5004
5005 MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
5006 MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
5007 }
5008
5009
5010 /* Function vect_create_destination_var.
5011
5012 Create a new temporary of type VECTYPE. */
5013
5014 tree
5015 vect_create_destination_var (tree scalar_dest, tree vectype)
5016 {
5017 tree vec_dest;
5018 const char *name;
5019 char *new_name;
5020 tree type;
5021 enum vect_var_kind kind;
5022
5023 kind = vectype
5024 ? VECTOR_BOOLEAN_TYPE_P (vectype)
5025 ? vect_mask_var
5026 : vect_simple_var
5027 : vect_scalar_var;
5028 type = vectype ? vectype : TREE_TYPE (scalar_dest);
5029
5030 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
5031
5032 name = get_name (scalar_dest);
5033 if (name)
5034 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
5035 else
5036 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
5037 vec_dest = vect_get_new_vect_var (type, kind, new_name);
5038 free (new_name);
5039
5040 return vec_dest;
5041 }
5042
5043 /* Function vect_grouped_store_supported.
5044
5045 Returns TRUE if interleave high and interleave low permutations
5046 are supported, and FALSE otherwise. */
5047
5048 bool
5049 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5050 {
5051 machine_mode mode = TYPE_MODE (vectype);
5052
5053 /* vect_permute_store_chain requires the group size to be equal to 3 or
5054 be a power of two. */
5055 if (count != 3 && exact_log2 (count) == -1)
5056 {
5057 if (dump_enabled_p ())
5058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5059 "the size of the group of accesses"
5060 					 " is not a power of 2 or not equal to 3\n");
5061 return false;
5062 }
5063
5064 /* Check that the permutation is supported. */
5065 if (VECTOR_MODE_P (mode))
5066 {
5067 unsigned int i;
5068 if (count == 3)
5069 {
5070 unsigned int j0 = 0, j1 = 0, j2 = 0;
5071 unsigned int i, j;
5072
5073 unsigned int nelt;
5074 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5075 {
5076 if (dump_enabled_p ())
5077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5078 "cannot handle groups of 3 stores for"
5079 " variable-length vectors\n");
5080 return false;
5081 }
5082
5083 vec_perm_builder sel (nelt, nelt, 1);
5084 sel.quick_grow (nelt);
5085 vec_perm_indices indices;
5086 for (j = 0; j < 3; j++)
5087 {
5088 int nelt0 = ((3 - j) * nelt) % 3;
5089 int nelt1 = ((3 - j) * nelt + 1) % 3;
5090 int nelt2 = ((3 - j) * nelt + 2) % 3;
5091 for (i = 0; i < nelt; i++)
5092 {
5093 if (3 * i + nelt0 < nelt)
5094 sel[3 * i + nelt0] = j0++;
5095 if (3 * i + nelt1 < nelt)
5096 sel[3 * i + nelt1] = nelt + j1++;
5097 if (3 * i + nelt2 < nelt)
5098 sel[3 * i + nelt2] = 0;
5099 }
5100 indices.new_vector (sel, 2, nelt);
5101 if (!can_vec_perm_const_p (mode, indices))
5102 {
5103 if (dump_enabled_p ())
5104 dump_printf (MSG_MISSED_OPTIMIZATION,
5105 "permutation op not supported by target.\n");
5106 return false;
5107 }
5108
5109 for (i = 0; i < nelt; i++)
5110 {
5111 if (3 * i + nelt0 < nelt)
5112 sel[3 * i + nelt0] = 3 * i + nelt0;
5113 if (3 * i + nelt1 < nelt)
5114 sel[3 * i + nelt1] = 3 * i + nelt1;
5115 if (3 * i + nelt2 < nelt)
5116 sel[3 * i + nelt2] = nelt + j2++;
5117 }
5118 indices.new_vector (sel, 2, nelt);
5119 if (!can_vec_perm_const_p (mode, indices))
5120 {
5121 if (dump_enabled_p ())
5122 dump_printf (MSG_MISSED_OPTIMIZATION,
5123 "permutation op not supported by target.\n");
5124 return false;
5125 }
5126 }
5127 return true;
5128 }
5129 else
5130 {
5131 /* If length is not equal to 3 then only power of 2 is supported. */
5132 gcc_assert (pow2p_hwi (count));
5133 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5134
5135 /* The encoding has 2 interleaved stepped patterns. */
5136 vec_perm_builder sel (nelt, 2, 3);
5137 sel.quick_grow (6);
5138 for (i = 0; i < 3; i++)
5139 {
5140 sel[i * 2] = i;
5141 sel[i * 2 + 1] = i + nelt;
5142 }
5143 vec_perm_indices indices (sel, 2, nelt);
5144 if (can_vec_perm_const_p (mode, indices))
5145 {
5146 for (i = 0; i < 6; i++)
5147 sel[i] += exact_div (nelt, 2);
5148 indices.new_vector (sel, 2, nelt);
5149 if (can_vec_perm_const_p (mode, indices))
5150 return true;
5151 }
5152 }
5153 }
5154
5155 if (dump_enabled_p ())
5156 dump_printf (MSG_MISSED_OPTIMIZATION,
5157 "permutation op not supported by target.\n");
5158 return false;
5159 }
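
/* For illustration only: with a hypothetical 8-element vector mode the two
   encoded selectors tested above for the power-of-two case expand to

     { 0, 8, 1, 9, 2, 10, 3, 11 }    interleave high
     { 4, 12, 5, 13, 6, 14, 7, 15 }  interleave low

   i.e. the permutations that vect_permute_store_chain emits below for a
   power-of-two group.  */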
5160
5161
5162 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5163 type VECTYPE. MASKED_P says whether the masked form is needed. */
5164
5165 bool
5166 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5167 bool masked_p)
5168 {
5169 if (masked_p)
5170 return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5171 vec_mask_store_lanes_optab,
5172 vectype, count);
5173 else
5174 return vect_lanes_optab_supported_p ("vec_store_lanes",
5175 vec_store_lanes_optab,
5176 vectype, count);
5177 }
5178
5179
5180 /* Function vect_permute_store_chain.
5181
5182 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5183 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5184 the data correctly for the stores. Return the final references for stores
5185 in RESULT_CHAIN.
5186
5187 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5188 The input is 4 vectors each containing 8 elements. We assign a number to
5189 each element, the input sequence is:
5190
5191 1st vec: 0 1 2 3 4 5 6 7
5192 2nd vec: 8 9 10 11 12 13 14 15
5193 3rd vec: 16 17 18 19 20 21 22 23
5194 4th vec: 24 25 26 27 28 29 30 31
5195
5196 The output sequence should be:
5197
5198 1st vec: 0 8 16 24 1 9 17 25
5199 2nd vec: 2 10 18 26 3 11 19 27
5200    3rd vec:   4 12 20 28  5 13 21 29
5201 4th vec: 6 14 22 30 7 15 23 31
5202
5203 i.e., we interleave the contents of the four vectors in their order.
5204
5205 We use interleave_high/low instructions to create such output. The input of
5206 each interleave_high/low operation is two vectors:
5207 1st vec 2nd vec
5208 0 1 2 3 4 5 6 7
5209 the even elements of the result vector are obtained left-to-right from the
5210 high/low elements of the first vector. The odd elements of the result are
5211 obtained left-to-right from the high/low elements of the second vector.
5212 The output of interleave_high will be: 0 4 1 5
5213 and of interleave_low: 2 6 3 7
5214
5215
5216 The permutation is done in log LENGTH stages. In each stage interleave_high
5217 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5218 where the first argument is taken from the first half of DR_CHAIN and the
5219    second argument from its second half.
5220 In our example,
5221
5222 I1: interleave_high (1st vec, 3rd vec)
5223 I2: interleave_low (1st vec, 3rd vec)
5224 I3: interleave_high (2nd vec, 4th vec)
5225 I4: interleave_low (2nd vec, 4th vec)
5226
5227 The output for the first stage is:
5228
5229 I1: 0 16 1 17 2 18 3 19
5230 I2: 4 20 5 21 6 22 7 23
5231 I3: 8 24 9 25 10 26 11 27
5232 I4: 12 28 13 29 14 30 15 31
5233
5234 The output of the second stage, i.e. the final result is:
5235
5236 I1: 0 8 16 24 1 9 17 25
5237 I2: 2 10 18 26 3 11 19 27
5238    I3:  4 12 20 28  5 13 21 29
5239 I4: 6 14 22 30 7 15 23 31. */
5240
5241 void
5242 vect_permute_store_chain (vec<tree> dr_chain,
5243 unsigned int length,
5244 stmt_vec_info stmt_info,
5245 gimple_stmt_iterator *gsi,
5246 vec<tree> *result_chain)
5247 {
5248 tree vect1, vect2, high, low;
5249 gimple *perm_stmt;
5250 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5251 tree perm_mask_low, perm_mask_high;
5252 tree data_ref;
5253 tree perm3_mask_low, perm3_mask_high;
5254 unsigned int i, j, n, log_length = exact_log2 (length);
5255
5256 result_chain->quick_grow (length);
5257 memcpy (result_chain->address (), dr_chain.address (),
5258 length * sizeof (tree));
5259
5260 if (length == 3)
5261 {
5262 /* vect_grouped_store_supported ensures that this is constant. */
5263 unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5264 unsigned int j0 = 0, j1 = 0, j2 = 0;
5265
5266 vec_perm_builder sel (nelt, nelt, 1);
5267 sel.quick_grow (nelt);
5268 vec_perm_indices indices;
5269 for (j = 0; j < 3; j++)
5270 {
5271 int nelt0 = ((3 - j) * nelt) % 3;
5272 int nelt1 = ((3 - j) * nelt + 1) % 3;
5273 int nelt2 = ((3 - j) * nelt + 2) % 3;
5274
5275 for (i = 0; i < nelt; i++)
5276 {
5277 if (3 * i + nelt0 < nelt)
5278 sel[3 * i + nelt0] = j0++;
5279 if (3 * i + nelt1 < nelt)
5280 sel[3 * i + nelt1] = nelt + j1++;
5281 if (3 * i + nelt2 < nelt)
5282 sel[3 * i + nelt2] = 0;
5283 }
5284 indices.new_vector (sel, 2, nelt);
5285 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5286
5287 for (i = 0; i < nelt; i++)
5288 {
5289 if (3 * i + nelt0 < nelt)
5290 sel[3 * i + nelt0] = 3 * i + nelt0;
5291 if (3 * i + nelt1 < nelt)
5292 sel[3 * i + nelt1] = 3 * i + nelt1;
5293 if (3 * i + nelt2 < nelt)
5294 sel[3 * i + nelt2] = nelt + j2++;
5295 }
5296 indices.new_vector (sel, 2, nelt);
5297 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5298
5299 vect1 = dr_chain[0];
5300 vect2 = dr_chain[1];
5301
5302 /* Create interleaving stmt:
5303 low = VEC_PERM_EXPR <vect1, vect2,
5304 {j, nelt, *, j + 1, nelt + j + 1, *,
5305 j + 2, nelt + j + 2, *, ...}> */
5306 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5307 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5308 vect2, perm3_mask_low);
5309 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5310
5311 vect1 = data_ref;
5312 vect2 = dr_chain[2];
5313 /* Create interleaving stmt:
5314 low = VEC_PERM_EXPR <vect1, vect2,
5315 {0, 1, nelt + j, 3, 4, nelt + j + 1,
5316 6, 7, nelt + j + 2, ...}> */
5317 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5318 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5319 vect2, perm3_mask_high);
5320 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5321 (*result_chain)[j] = data_ref;
5322 }
5323 }
5324 else
5325 {
5326 /* If length is not equal to 3 then only power of 2 is supported. */
5327 gcc_assert (pow2p_hwi (length));
5328
5329 /* The encoding has 2 interleaved stepped patterns. */
5330 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5331 vec_perm_builder sel (nelt, 2, 3);
5332 sel.quick_grow (6);
5333 for (i = 0; i < 3; i++)
5334 {
5335 sel[i * 2] = i;
5336 sel[i * 2 + 1] = i + nelt;
5337 }
5338 vec_perm_indices indices (sel, 2, nelt);
5339 perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5340
5341 for (i = 0; i < 6; i++)
5342 sel[i] += exact_div (nelt, 2);
5343 indices.new_vector (sel, 2, nelt);
5344 perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5345
5346 for (i = 0, n = log_length; i < n; i++)
5347 {
5348 for (j = 0; j < length/2; j++)
5349 {
5350 vect1 = dr_chain[j];
5351 vect2 = dr_chain[j+length/2];
5352
5353 /* Create interleaving stmt:
5354 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5355 ...}> */
5356 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5357 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5358 vect2, perm_mask_high);
5359 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5360 (*result_chain)[2*j] = high;
5361
5362 /* Create interleaving stmt:
5363 low = VEC_PERM_EXPR <vect1, vect2,
5364 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5365 ...}> */
5366 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5367 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5368 vect2, perm_mask_low);
5369 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5370 (*result_chain)[2*j+1] = low;
5371 }
5372 memcpy (dr_chain.address (), result_chain->address (),
5373 length * sizeof (tree));
5374 }
5375 }
5376 }
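
/* As a rough sketch (SSA version numbers invented for illustration), for
   LENGTH == 2 and a hypothetical 8-element vector chain {v0, v1} the function
   above generates

     vect_inter_high_1 = VEC_PERM_EXPR <v0, v1, { 0, 8, 1, 9, 2, 10, 3, 11 }>;
     vect_inter_low_2 = VEC_PERM_EXPR <v0, v1, { 4, 12, 5, 13, 6, 14, 7, 15 }>;

   and RESULT_CHAIN becomes { vect_inter_high_1, vect_inter_low_2 }.  */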
5377
5378 /* Function vect_setup_realignment
5379
5380 This function is called when vectorizing an unaligned load using
5381 the dr_explicit_realign[_optimized] scheme.
5382 This function generates the following code at the loop prolog:
5383
5384 p = initial_addr;
5385 x msq_init = *(floor(p)); # prolog load
5386 realignment_token = call target_builtin;
5387 loop:
5388 x msq = phi (msq_init, ---)
5389
5390 The stmts marked with x are generated only for the case of
5391 dr_explicit_realign_optimized.
5392
5393 The code above sets up a new (vector) pointer, pointing to the first
5394 location accessed by STMT_INFO, and a "floor-aligned" load using that
5395 pointer. It also generates code to compute the "realignment-token"
5396 (if the relevant target hook was defined), and creates a phi-node at the
5397 loop-header bb whose arguments are the result of the prolog-load (created
5398 by this function) and the result of a load that takes place in the loop
5399 (to be created by the caller to this function).
5400
5401 For the case of dr_explicit_realign_optimized:
5402 The caller to this function uses the phi-result (msq) to create the
5403 realignment code inside the loop, and sets up the missing phi argument,
5404 as follows:
5405 loop:
5406 msq = phi (msq_init, lsq)
5407 lsq = *(floor(p')); # load in loop
5408 result = realign_load (msq, lsq, realignment_token);
5409
5410 For the case of dr_explicit_realign:
5411 loop:
5412 msq = *(floor(p)); # load in loop
5413 p' = p + (VS-1);
5414 lsq = *(floor(p')); # load in loop
5415 result = realign_load (msq, lsq, realignment_token);
5416
5417 Input:
5418 STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5419 a memory location that may be unaligned.
5420 BSI - place where new code is to be inserted.
5421 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5422 is used.
5423
5424 Output:
5425 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5426 target hook, if defined.
5427 Return value - the result of the loop-header phi node. */
5428
5429 tree
5430 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5431 tree *realignment_token,
5432 enum dr_alignment_support alignment_support_scheme,
5433 tree init_addr,
5434 struct loop **at_loop)
5435 {
5436 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5437 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5438 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5439 struct data_reference *dr = dr_info->dr;
5440 struct loop *loop = NULL;
5441 edge pe = NULL;
5442 tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5443 tree vec_dest;
5444 gimple *inc;
5445 tree ptr;
5446 tree data_ref;
5447 basic_block new_bb;
5448 tree msq_init = NULL_TREE;
5449 tree new_temp;
5450 gphi *phi_stmt;
5451 tree msq = NULL_TREE;
5452 gimple_seq stmts = NULL;
5453 bool compute_in_loop = false;
5454 bool nested_in_vect_loop = false;
5455 struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5456 struct loop *loop_for_initial_load = NULL;
5457
5458 if (loop_vinfo)
5459 {
5460 loop = LOOP_VINFO_LOOP (loop_vinfo);
5461 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5462 }
5463
5464 gcc_assert (alignment_support_scheme == dr_explicit_realign
5465 || alignment_support_scheme == dr_explicit_realign_optimized);
5466
5467 /* We need to generate three things:
5468 1. the misalignment computation
5469 2. the extra vector load (for the optimized realignment scheme).
5470 3. the phi node for the two vectors from which the realignment is
5471 done (for the optimized realignment scheme). */
5472
5473 /* 1. Determine where to generate the misalignment computation.
5474
5475 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5476 calculation will be generated by this function, outside the loop (in the
5477 preheader). Otherwise, INIT_ADDR had already been computed for us by the
5478 caller, inside the loop.
5479
5480 Background: If the misalignment remains fixed throughout the iterations of
5481 the loop, then both realignment schemes are applicable, and also the
5482 misalignment computation can be done outside LOOP. This is because we are
5483 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5484 are a multiple of VS (the Vector Size), and therefore the misalignment in
5485 different vectorized LOOP iterations is always the same.
5486 The problem arises only if the memory access is in an inner-loop nested
5487 inside LOOP, which is now being vectorized using outer-loop vectorization.
5488 This is the only case when the misalignment of the memory access may not
5489 remain fixed throughout the iterations of the inner-loop (as explained in
5490 detail in vect_supportable_dr_alignment). In this case, not only is the
5491 optimized realignment scheme not applicable, but also the misalignment
5492 computation (and generation of the realignment token that is passed to
5493 REALIGN_LOAD) have to be done inside the loop.
5494
5495 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5496 or not, which in turn determines if the misalignment is computed inside
5497 the inner-loop, or outside LOOP. */
5498
5499 if (init_addr != NULL_TREE || !loop_vinfo)
5500 {
5501 compute_in_loop = true;
5502 gcc_assert (alignment_support_scheme == dr_explicit_realign);
5503 }
5504
5505
5506 /* 2. Determine where to generate the extra vector load.
5507
5508 For the optimized realignment scheme, instead of generating two vector
5509 loads in each iteration, we generate a single extra vector load in the
5510 preheader of the loop, and in each iteration reuse the result of the
5511 vector load from the previous iteration. In case the memory access is in
5512 an inner-loop nested inside LOOP, which is now being vectorized using
5513 outer-loop vectorization, we need to determine whether this initial vector
5514 load should be generated at the preheader of the inner-loop, or can be
5515 generated at the preheader of LOOP. If the memory access has no evolution
5516 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5517 to be generated inside LOOP (in the preheader of the inner-loop). */
5518
5519 if (nested_in_vect_loop)
5520 {
5521 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5522 bool invariant_in_outerloop =
5523 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5524 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5525 }
5526 else
5527 loop_for_initial_load = loop;
5528 if (at_loop)
5529 *at_loop = loop_for_initial_load;
5530
5531 if (loop_for_initial_load)
5532 pe = loop_preheader_edge (loop_for_initial_load);
5533
5534 /* 3. For the case of the optimized realignment, create the first vector
5535 load at the loop preheader. */
5536
5537 if (alignment_support_scheme == dr_explicit_realign_optimized)
5538 {
5539 /* Create msq_init = *(floor(p1)) in the loop preheader */
5540 gassign *new_stmt;
5541
5542 gcc_assert (!compute_in_loop);
5543 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5544 ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5545 loop_for_initial_load, NULL_TREE,
5546 &init_addr, NULL, &inc, true);
5547 if (TREE_CODE (ptr) == SSA_NAME)
5548 new_temp = copy_ssa_name (ptr);
5549 else
5550 new_temp = make_ssa_name (TREE_TYPE (ptr));
5551 unsigned int align = DR_TARGET_ALIGNMENT (dr_info);
5552 new_stmt = gimple_build_assign
5553 (new_temp, BIT_AND_EXPR, ptr,
5554 build_int_cst (TREE_TYPE (ptr), -(HOST_WIDE_INT) align));
5555 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5556 gcc_assert (!new_bb);
5557 data_ref
5558 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5559 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5560 vect_copy_ref_info (data_ref, DR_REF (dr));
5561 new_stmt = gimple_build_assign (vec_dest, data_ref);
5562 new_temp = make_ssa_name (vec_dest, new_stmt);
5563 gimple_assign_set_lhs (new_stmt, new_temp);
5564 if (pe)
5565 {
5566 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5567 gcc_assert (!new_bb);
5568 }
5569 else
5570 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5571
5572 msq_init = gimple_assign_lhs (new_stmt);
5573 }
5574
5575 /* 4. Create realignment token using a target builtin, if available.
5576 It is done either inside the containing loop, or before LOOP (as
5577 determined above). */
5578
5579 if (targetm.vectorize.builtin_mask_for_load)
5580 {
5581 gcall *new_stmt;
5582 tree builtin_decl;
5583
5584       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
5585 if (!init_addr)
5586 {
5587 /* Generate the INIT_ADDR computation outside LOOP. */
5588 init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5589 NULL_TREE);
5590 if (loop)
5591 {
5592 pe = loop_preheader_edge (loop);
5593 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5594 gcc_assert (!new_bb);
5595 }
5596 else
5597 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5598 }
5599
5600 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5601 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5602 vec_dest =
5603 vect_create_destination_var (scalar_dest,
5604 gimple_call_return_type (new_stmt));
5605 new_temp = make_ssa_name (vec_dest, new_stmt);
5606 gimple_call_set_lhs (new_stmt, new_temp);
5607
5608 if (compute_in_loop)
5609 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5610 else
5611 {
5612 /* Generate the misalignment computation outside LOOP. */
5613 pe = loop_preheader_edge (loop);
5614 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5615 gcc_assert (!new_bb);
5616 }
5617
5618 *realignment_token = gimple_call_lhs (new_stmt);
5619
5620 /* The result of the CALL_EXPR to this builtin is determined from
5621 the value of the parameter and no global variables are touched
5622 which makes the builtin a "const" function. Requiring the
5623 builtin to have the "const" attribute makes it unnecessary
5624 to call mark_call_clobbered. */
5625 gcc_assert (TREE_READONLY (builtin_decl));
5626 }
5627
5628 if (alignment_support_scheme == dr_explicit_realign)
5629 return msq;
5630
5631 gcc_assert (!compute_in_loop);
5632 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5633
5634
5635 /* 5. Create msq = phi <msq_init, lsq> in loop */
5636
5637 pe = loop_preheader_edge (containing_loop);
5638 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5639 msq = make_ssa_name (vec_dest);
5640 phi_stmt = create_phi_node (msq, containing_loop->header);
5641 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5642
5643 return msq;
5644 }
5645
5646
5647 /* Function vect_grouped_load_supported.
5648
5649 COUNT is the size of the load group (the number of statements plus the
5650 number of gaps). SINGLE_ELEMENT_P is true if there is actually
5651 only one statement, with a gap of COUNT - 1.
5652
5653 Returns true if a suitable permute exists. */
5654
5655 bool
5656 vect_grouped_load_supported (tree vectype, bool single_element_p,
5657 unsigned HOST_WIDE_INT count)
5658 {
5659 machine_mode mode = TYPE_MODE (vectype);
5660
5661 /* If this is single-element interleaving with an element distance
5662      that leaves unused vector loads around, punt - we at least create
5663 very sub-optimal code in that case (and blow up memory,
5664 see PR65518). */
5665 if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5666 {
5667 if (dump_enabled_p ())
5668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5669 "single-element interleaving not supported "
5670 			 "for non-adjacent vector loads\n");
5671 return false;
5672 }
5673
5674 /* vect_permute_load_chain requires the group size to be equal to 3 or
5675 be a power of two. */
5676 if (count != 3 && exact_log2 (count) == -1)
5677 {
5678 if (dump_enabled_p ())
5679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5680 "the size of the group of accesses"
5681 " is not a power of 2 or not equal to 3\n");
5682 return false;
5683 }
5684
5685 /* Check that the permutation is supported. */
5686 if (VECTOR_MODE_P (mode))
5687 {
5688 unsigned int i, j;
5689 if (count == 3)
5690 {
5691 unsigned int nelt;
5692 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5693 {
5694 if (dump_enabled_p ())
5695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5696 "cannot handle groups of 3 loads for"
5697 " variable-length vectors\n");
5698 return false;
5699 }
5700
5701 vec_perm_builder sel (nelt, nelt, 1);
5702 sel.quick_grow (nelt);
5703 vec_perm_indices indices;
5704 unsigned int k;
5705 for (k = 0; k < 3; k++)
5706 {
5707 for (i = 0; i < nelt; i++)
5708 if (3 * i + k < 2 * nelt)
5709 sel[i] = 3 * i + k;
5710 else
5711 sel[i] = 0;
5712 indices.new_vector (sel, 2, nelt);
5713 if (!can_vec_perm_const_p (mode, indices))
5714 {
5715 if (dump_enabled_p ())
5716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5717 "shuffle of 3 loads is not supported by"
5718 " target\n");
5719 return false;
5720 }
5721 for (i = 0, j = 0; i < nelt; i++)
5722 if (3 * i + k < 2 * nelt)
5723 sel[i] = i;
5724 else
5725 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5726 indices.new_vector (sel, 2, nelt);
5727 if (!can_vec_perm_const_p (mode, indices))
5728 {
5729 if (dump_enabled_p ())
5730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5731 "shuffle of 3 loads is not supported by"
5732 " target\n");
5733 return false;
5734 }
5735 }
5736 return true;
5737 }
5738 else
5739 {
5740 /* If length is not equal to 3 then only power of 2 is supported. */
5741 gcc_assert (pow2p_hwi (count));
5742 poly_uint64 nelt = GET_MODE_NUNITS (mode);
5743
5744 /* The encoding has a single stepped pattern. */
5745 vec_perm_builder sel (nelt, 1, 3);
5746 sel.quick_grow (3);
5747 for (i = 0; i < 3; i++)
5748 sel[i] = i * 2;
5749 vec_perm_indices indices (sel, 2, nelt);
5750 if (can_vec_perm_const_p (mode, indices))
5751 {
5752 for (i = 0; i < 3; i++)
5753 sel[i] = i * 2 + 1;
5754 indices.new_vector (sel, 2, nelt);
5755 if (can_vec_perm_const_p (mode, indices))
5756 return true;
5757 }
5758 }
5759 }
5760
5761 if (dump_enabled_p ())
5762 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5763 "extract even/odd not supported by target\n");
5764 return false;
5765 }
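
/* For illustration only: with a hypothetical 8-element vector mode the two
   encoded selectors tested above for the power-of-two case expand to

     { 0, 2, 4, 6, 8, 10, 12, 14 }  extract even
     { 1, 3, 5, 7, 9, 11, 13, 15 }  extract odd

   i.e. the permutations that vect_permute_load_chain emits below.  */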
5766
5767 /* Return TRUE if vec_{mask_}load_lanes is available for COUNT vectors of
5768 type VECTYPE. MASKED_P says whether the masked form is needed. */
5769
5770 bool
5771 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5772 bool masked_p)
5773 {
5774 if (masked_p)
5775 return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5776 vec_mask_load_lanes_optab,
5777 vectype, count);
5778 else
5779 return vect_lanes_optab_supported_p ("vec_load_lanes",
5780 vec_load_lanes_optab,
5781 vectype, count);
5782 }
5783
5784 /* Function vect_permute_load_chain.
5785
5786 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5787 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5788 the input data correctly. Return the final references for loads in
5789 RESULT_CHAIN.
5790
5791 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5792 The input is 4 vectors each containing 8 elements. We assign a number to each
5793 element, the input sequence is:
5794
5795 1st vec: 0 1 2 3 4 5 6 7
5796 2nd vec: 8 9 10 11 12 13 14 15
5797 3rd vec: 16 17 18 19 20 21 22 23
5798 4th vec: 24 25 26 27 28 29 30 31
5799
5800 The output sequence should be:
5801
5802 1st vec: 0 4 8 12 16 20 24 28
5803 2nd vec: 1 5 9 13 17 21 25 29
5804 3rd vec: 2 6 10 14 18 22 26 30
5805 4th vec: 3 7 11 15 19 23 27 31
5806
5807 i.e., the first output vector should contain the first elements of each
5808 interleaving group, etc.
5809
5810 We use extract_even/odd instructions to create such output. The input of
5811 each extract_even/odd operation is two vectors
5812 1st vec 2nd vec
5813 0 1 2 3 4 5 6 7
5814
5815 and the output is the vector of extracted even/odd elements. The output of
5816 extract_even will be: 0 2 4 6
5817 and of extract_odd: 1 3 5 7
5818
5819
5820 The permutation is done in log LENGTH stages. In each stage extract_even
5821 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5822 their order. In our example,
5823
5824 E1: extract_even (1st vec, 2nd vec)
5825 E2: extract_odd (1st vec, 2nd vec)
5826 E3: extract_even (3rd vec, 4th vec)
5827 E4: extract_odd (3rd vec, 4th vec)
5828
5829 The output for the first stage will be:
5830
5831 E1: 0 2 4 6 8 10 12 14
5832 E2: 1 3 5 7 9 11 13 15
5833 E3: 16 18 20 22 24 26 28 30
5834 E4: 17 19 21 23 25 27 29 31
5835
5836 In order to proceed and create the correct sequence for the next stage (or
5837 for the correct output, if the second stage is the last one, as in our
5838 example), we first put the output of extract_even operation and then the
5839 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5840 The input for the second stage is:
5841
5842 1st vec (E1): 0 2 4 6 8 10 12 14
5843 2nd vec (E3): 16 18 20 22 24 26 28 30
5844 3rd vec (E2): 1 3 5 7 9 11 13 15
5845 4th vec (E4): 17 19 21 23 25 27 29 31
5846
5847 The output of the second stage:
5848
5849 E1: 0 4 8 12 16 20 24 28
5850 E2: 2 6 10 14 18 22 26 30
5851 E3: 1 5 9 13 17 21 25 29
5852 E4: 3 7 11 15 19 23 27 31
5853
5854 And RESULT_CHAIN after reordering:
5855
5856 1st vec (E1): 0 4 8 12 16 20 24 28
5857 2nd vec (E3): 1 5 9 13 17 21 25 29
5858 3rd vec (E2): 2 6 10 14 18 22 26 30
5859 4th vec (E4): 3 7 11 15 19 23 27 31. */
5860
5861 static void
5862 vect_permute_load_chain (vec<tree> dr_chain,
5863 unsigned int length,
5864 stmt_vec_info stmt_info,
5865 gimple_stmt_iterator *gsi,
5866 vec<tree> *result_chain)
5867 {
5868 tree data_ref, first_vect, second_vect;
5869 tree perm_mask_even, perm_mask_odd;
5870 tree perm3_mask_low, perm3_mask_high;
5871 gimple *perm_stmt;
5872 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5873 unsigned int i, j, log_length = exact_log2 (length);
5874
5875 result_chain->quick_grow (length);
5876 memcpy (result_chain->address (), dr_chain.address (),
5877 length * sizeof (tree));
5878
5879 if (length == 3)
5880 {
5881 /* vect_grouped_load_supported ensures that this is constant. */
5882 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5883 unsigned int k;
5884
5885 vec_perm_builder sel (nelt, nelt, 1);
5886 sel.quick_grow (nelt);
5887 vec_perm_indices indices;
5888 for (k = 0; k < 3; k++)
5889 {
5890 for (i = 0; i < nelt; i++)
5891 if (3 * i + k < 2 * nelt)
5892 sel[i] = 3 * i + k;
5893 else
5894 sel[i] = 0;
5895 indices.new_vector (sel, 2, nelt);
5896 perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5897
5898 for (i = 0, j = 0; i < nelt; i++)
5899 if (3 * i + k < 2 * nelt)
5900 sel[i] = i;
5901 else
5902 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5903 indices.new_vector (sel, 2, nelt);
5904 perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5905
5906 first_vect = dr_chain[0];
5907 second_vect = dr_chain[1];
5908
5909 /* Create interleaving stmt (low part of):
5910 	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5911 ...}> */
5912 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5913 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5914 second_vect, perm3_mask_low);
5915 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5916
5917 /* Create interleaving stmt (high part of):
5918 	     high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5919 ...}> */
5920 first_vect = data_ref;
5921 second_vect = dr_chain[2];
5922 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5923 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5924 second_vect, perm3_mask_high);
5925 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5926 (*result_chain)[k] = data_ref;
5927 }
5928 }
5929 else
5930 {
5931 /* If length is not equal to 3 then only power of 2 is supported. */
5932 gcc_assert (pow2p_hwi (length));
5933
5934 /* The encoding has a single stepped pattern. */
5935 poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5936 vec_perm_builder sel (nelt, 1, 3);
5937 sel.quick_grow (3);
5938 for (i = 0; i < 3; ++i)
5939 sel[i] = i * 2;
5940 vec_perm_indices indices (sel, 2, nelt);
5941 perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5942
5943 for (i = 0; i < 3; ++i)
5944 sel[i] = i * 2 + 1;
5945 indices.new_vector (sel, 2, nelt);
5946 perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5947
5948 for (i = 0; i < log_length; i++)
5949 {
5950 for (j = 0; j < length; j += 2)
5951 {
5952 first_vect = dr_chain[j];
5953 second_vect = dr_chain[j+1];
5954
5955 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5956 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5957 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5958 first_vect, second_vect,
5959 perm_mask_even);
5960 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5961 (*result_chain)[j/2] = data_ref;
5962
5963 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5964 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5965 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5966 first_vect, second_vect,
5967 perm_mask_odd);
5968 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5969 (*result_chain)[j/2+length/2] = data_ref;
5970 }
5971 memcpy (dr_chain.address (), result_chain->address (),
5972 length * sizeof (tree));
5973 }
5974 }
5975 }
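
/* As a rough sketch (SSA version numbers invented for illustration), for
   LENGTH == 2 and a hypothetical 8-element vector chain {v0, v1} the function
   above generates

     vect_perm_even_1 = VEC_PERM_EXPR <v0, v1, { 0, 2, 4, 6, 8, 10, 12, 14 }>;
     vect_perm_odd_2 = VEC_PERM_EXPR <v0, v1, { 1, 3, 5, 7, 9, 11, 13, 15 }>;

   and RESULT_CHAIN becomes { vect_perm_even_1, vect_perm_odd_2 }.  */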
5976
5977 /* Function vect_shift_permute_load_chain.
5978
5979    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5980    sequence of stmts to reorder the input data accordingly.
5981    Return the final references for loads in RESULT_CHAIN.
5982    Return true if it succeeds, false otherwise.
5983
5984 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5985 The input is 3 vectors each containing 8 elements. We assign a
5986 number to each element, the input sequence is:
5987
5988 1st vec: 0 1 2 3 4 5 6 7
5989 2nd vec: 8 9 10 11 12 13 14 15
5990 3rd vec: 16 17 18 19 20 21 22 23
5991
5992 The output sequence should be:
5993
5994 1st vec: 0 3 6 9 12 15 18 21
5995 2nd vec: 1 4 7 10 13 16 19 22
5996 3rd vec: 2 5 8 11 14 17 20 23
5997
5998 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5999
6000 First we shuffle all 3 vectors to get correct elements order:
6001
6002 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
6003 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
6004 3rd vec: (16 19 22) (17 20 23) (18 21)
6005
6006 Next we unite and shift vector 3 times:
6007
6008 1st step:
6009 shift right by 6 the concatenation of:
6010 "1st vec" and "2nd vec"
6011 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
6012 "2nd vec" and "3rd vec"
6013 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
6014 "3rd vec" and "1st vec"
6015 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
6016 | New vectors |
6017
6018 So that now new vectors are:
6019
6020 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
6021 2nd vec: (10 13) (16 19 22) (17 20 23)
6022 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
6023
6024 2nd step:
6025 shift right by 5 the concatenation of:
6026 "1st vec" and "3rd vec"
6027 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
6028 "2nd vec" and "1st vec"
6029 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
6030 "3rd vec" and "2nd vec"
6031 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
6032 | New vectors |
6033
6034 So that now new vectors are:
6035
6036 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
6037 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
6038 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
6039
6040 3rd step:
6041 shift right by 5 the concatenation of:
6042 "1st vec" and "1st vec"
6043 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
6044 shift right by 3 the concatenation of:
6045 "2nd vec" and "2nd vec"
6046 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
6047 | New vectors |
6048
6049 So that now all vectors are READY:
6050 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
6051 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
6052 3rd vec: ( 1 4 7) (10 13) (16 19 22)
6053
6054    This algorithm is faster than the one in vect_permute_load_chain if:
6055    1. "shift of a concatenation" is faster than general permutation.
6056 This is usually so.
6057 2. The TARGET machine can't execute vector instructions in parallel.
6058       This is because each step of the algorithm depends on the previous one.
6059 The algorithm in vect_permute_load_chain is much more parallel.
6060
6061 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6062 */
6063
6064 static bool
6065 vect_shift_permute_load_chain (vec<tree> dr_chain,
6066 unsigned int length,
6067 stmt_vec_info stmt_info,
6068 gimple_stmt_iterator *gsi,
6069 vec<tree> *result_chain)
6070 {
6071 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6072 tree perm2_mask1, perm2_mask2, perm3_mask;
6073 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6074 gimple *perm_stmt;
6075
6076 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6077 unsigned int i;
6078 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6079
6080 unsigned HOST_WIDE_INT nelt, vf;
6081 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6082 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6083 /* Not supported for variable-length vectors. */
6084 return false;
6085
6086 vec_perm_builder sel (nelt, nelt, 1);
6087 sel.quick_grow (nelt);
6088
6089 result_chain->quick_grow (length);
6090 memcpy (result_chain->address (), dr_chain.address (),
6091 length * sizeof (tree));
6092
6093 if (pow2p_hwi (length) && vf > 4)
6094 {
6095 unsigned int j, log_length = exact_log2 (length);
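      /* Generating permutation constant to gather the even-indexed elements
         into the low half of a vector and the odd-indexed elements into the
         high half.  For vector length 8 it is {0 2 4 6 1 3 5 7}.  */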
6096 for (i = 0; i < nelt / 2; ++i)
6097 sel[i] = i * 2;
6098 for (i = 0; i < nelt / 2; ++i)
6099 sel[nelt / 2 + i] = i * 2 + 1;
6100 vec_perm_indices indices (sel, 2, nelt);
6101 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6102 {
6103 if (dump_enabled_p ())
6104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6105 "shuffle of 2 fields structure is not \
6106 supported by target\n");
6107 return false;
6108 }
6109 perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6110
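      /* The complementary constant: odd-indexed elements into the low half,
         even-indexed elements into the high half.  For vector length 8 it is
         {1 3 5 7 0 2 4 6}.  */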
6111 for (i = 0; i < nelt / 2; ++i)
6112 sel[i] = i * 2 + 1;
6113 for (i = 0; i < nelt / 2; ++i)
6114 sel[nelt / 2 + i] = i * 2;
6115 indices.new_vector (sel, 2, nelt);
6116 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6117 {
6118 if (dump_enabled_p ())
6119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6120 "shuffle of 2 fields structure is not \
6121 supported by target\n");
6122 return false;
6123 }
6124 perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6125
6126 /* Generating permutation constant to shift all elements.
6127 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
6128 for (i = 0; i < nelt; i++)
6129 sel[i] = nelt / 2 + i;
6130 indices.new_vector (sel, 2, nelt);
6131 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6132 {
6133 if (dump_enabled_p ())
6134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6135 "shift permutation is not supported by target\n");
6136 return false;
6137 }
6138 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6139
6140 /* Generating permutation constant to select vector from 2.
6141 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
6142 for (i = 0; i < nelt / 2; i++)
6143 sel[i] = i;
6144 for (i = nelt / 2; i < nelt; i++)
6145 sel[i] = nelt + i;
6146 indices.new_vector (sel, 2, nelt);
6147 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6148 {
6149 if (dump_enabled_p ())
6150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6151 "select is not supported by target\n");
6152 return false;
6153 }
6154 select_mask = vect_gen_perm_mask_checked (vectype, indices);
6155
6156 for (i = 0; i < log_length; i++)
6157 {
6158 for (j = 0; j < length; j += 2)
6159 {
6160 first_vect = dr_chain[j];
6161 second_vect = dr_chain[j + 1];
6162
6163 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6164 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6165 first_vect, first_vect,
6166 perm2_mask1);
6167 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6168 vect[0] = data_ref;
6169
6170 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6171 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6172 second_vect, second_vect,
6173 perm2_mask2);
6174 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6175 vect[1] = data_ref;
6176
6177 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6178 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6179 vect[0], vect[1], shift1_mask);
6180 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6181 (*result_chain)[j/2 + length/2] = data_ref;
6182
6183 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6184 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6185 vect[0], vect[1], select_mask);
6186 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6187 (*result_chain)[j/2] = data_ref;
6188 }
6189 memcpy (dr_chain.address (), result_chain->address (),
6190 length * sizeof (tree));
6191 }
6192 return true;
6193 }
6194 if (length == 3 && vf > 2)
6195 {
6196 unsigned int k = 0, l = 0;
6197
6198       /* Generating permutation constant to get all elements in right order.
6199 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
6200 for (i = 0; i < nelt; i++)
6201 {
6202 if (3 * k + (l % 3) >= nelt)
6203 {
6204 k = 0;
6205 l += (3 - (nelt % 3));
6206 }
6207 sel[i] = 3 * k + (l % 3);
6208 k++;
6209 }
6210 vec_perm_indices indices (sel, 2, nelt);
6211 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6212 {
6213 if (dump_enabled_p ())
6214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6215 "shuffle of 3 fields structure is not \
6216 supported by target\n");
6217 return false;
6218 }
6219 perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6220
6221 /* Generating permutation constant to shift all elements.
6222 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
6223 for (i = 0; i < nelt; i++)
6224 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6225 indices.new_vector (sel, 2, nelt);
6226 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6227 {
6228 if (dump_enabled_p ())
6229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6230 "shift permutation is not supported by target\n");
6231 return false;
6232 }
6233 shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6234
6235 /* Generating permutation constant to shift all elements.
6236 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6237 for (i = 0; i < nelt; i++)
6238 sel[i] = 2 * (nelt / 3) + 1 + i;
6239 indices.new_vector (sel, 2, nelt);
6240 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6241 {
6242 if (dump_enabled_p ())
6243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6244 "shift permutation is not supported by target\n");
6245 return false;
6246 }
6247 shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6248
6249 /* Generating permutation constant to shift all elements.
6250 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
6251 for (i = 0; i < nelt; i++)
6252 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6253 indices.new_vector (sel, 2, nelt);
6254 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6255 {
6256 if (dump_enabled_p ())
6257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6258 "shift permutation is not supported by target\n");
6259 return false;
6260 }
6261 shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6262
6263 /* Generating permutation constant to shift all elements.
6264 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
6265 for (i = 0; i < nelt; i++)
6266 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6267 indices.new_vector (sel, 2, nelt);
6268 if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6269 {
6270 if (dump_enabled_p ())
6271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6272 "shift permutation is not supported by target\n");
6273 return false;
6274 }
6275 shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6276
6277 for (k = 0; k < 3; k++)
6278 {
6279 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6280 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6281 dr_chain[k], dr_chain[k],
6282 perm3_mask);
6283 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6284 vect[k] = data_ref;
6285 }
6286
6287 for (k = 0; k < 3; k++)
6288 {
6289 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6290 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6291 vect[k % 3], vect[(k + 1) % 3],
6292 shift1_mask);
6293 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6294 vect_shift[k] = data_ref;
6295 }
6296
6297 for (k = 0; k < 3; k++)
6298 {
6299 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6300 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6301 vect_shift[(4 - k) % 3],
6302 vect_shift[(3 - k) % 3],
6303 shift2_mask);
6304 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6305 vect[k] = data_ref;
6306 }
6307
6308 (*result_chain)[3 - (nelt % 3)] = vect[2];
6309
6310 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6311 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6312 vect[0], shift3_mask);
6313 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6314 (*result_chain)[nelt % 3] = data_ref;
6315
6316 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6317 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6318 vect[1], shift4_mask);
6319 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6320 (*result_chain)[0] = data_ref;
6321 return true;
6322 }
6323 return false;
6324 }
6325
6326 /* Function vect_transform_grouped_load.
6327
6328 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6329    to perform their permutation and ascribe the resulting vectorized statements to
6330 the scalar statements.
6331 */
6332
6333 void
6334 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6335 int size, gimple_stmt_iterator *gsi)
6336 {
6337 machine_mode mode;
6338 vec<tree> result_chain = vNULL;
6339
6340 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6341      RESULT_CHAIN is the output of vect_permute_load_chain; it contains
6342      permuted vectors that are ready for vector computation.  */
6343 result_chain.create (size);
6344
6345   /* If the reassociation width for the vector type is 2 or greater, the target
6346      machine can execute 2 or more vector instructions in parallel.  Otherwise
6347      try to get a chain for the load group using vect_shift_permute_load_chain.  */
6348 mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6349 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6350 || pow2p_hwi (size)
6351 || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6352 gsi, &result_chain))
6353 vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6354 vect_record_grouped_load_vectors (stmt_info, result_chain);
6355 result_chain.release ();
6356 }
6357
6358 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6359 generated as part of the vectorization of STMT_INFO. Assign the statement
6360 for each vector to the associated scalar statement. */
6361
6362 void
6363 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6364 vec<tree> result_chain)
6365 {
6366 vec_info *vinfo = stmt_info->vinfo;
6367 stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6368 unsigned int i, gap_count;
6369 tree tmp_data_ref;
6370
6371 /* Put a permuted data-ref in the VECTORIZED_STMT field.
6372      Since we scan the chain starting from its first node, their order
6373      corresponds to the order of data-refs in RESULT_CHAIN.  */
6374 stmt_vec_info next_stmt_info = first_stmt_info;
6375 gap_count = 1;
6376 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6377 {
6378 if (!next_stmt_info)
6379 break;
6380
6381 /* Skip the gaps. Loads created for the gaps will be removed by dead
6382 code elimination pass later. No need to check for the first stmt in
6383 the group, since it always exists.
6384 DR_GROUP_GAP is the number of steps in elements from the previous
6385 access (if there is no gap DR_GROUP_GAP is 1). We skip loads that
6386 correspond to the gaps. */
6387 if (next_stmt_info != first_stmt_info
6388 && gap_count < DR_GROUP_GAP (next_stmt_info))
6389 {
6390 gap_count++;
6391 continue;
6392 }
6393
6394 while (next_stmt_info)
6395 {
6396 stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6397 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6398 copies, and we put the new vector statement in the first available
6399 RELATED_STMT. */
6400 if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6401 STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6402 else
6403 {
6404 if (!DR_GROUP_SAME_DR_STMT (next_stmt_info))
6405 {
6406 stmt_vec_info prev_stmt_info
6407 = STMT_VINFO_VEC_STMT (next_stmt_info);
6408 stmt_vec_info rel_stmt_info
6409 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6410 while (rel_stmt_info)
6411 {
6412 prev_stmt_info = rel_stmt_info;
6413 rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6414 }
6415
6416 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6417 }
6418 }
6419
6420 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6421 gap_count = 1;
6422 /* If NEXT_STMT_INFO accesses the same DR as the previous statement,
6423 put the same TMP_DATA_REF as its vectorized statement; otherwise
6424 get the next data-ref from RESULT_CHAIN. */
6425 if (!next_stmt_info || !DR_GROUP_SAME_DR_STMT (next_stmt_info))
6426 break;
6427 }
6428 }
6429 }
6430
6431 /* Function vect_can_force_dr_alignment_p.
6432
6433 Returns whether the alignment of a DECL can be forced to be aligned
6434 on ALIGNMENT bit boundary. */
6435
6436 bool
6437 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
6438 {
6439 if (!VAR_P (decl))
6440 return false;
6441
6442 if (decl_in_symtab_p (decl)
6443 && !symtab_node::get (decl)->can_increase_alignment_p ())
6444 return false;
6445
6446 if (TREE_STATIC (decl))
6447 return (alignment <= MAX_OFILE_ALIGNMENT);
6448 else
6449 return (alignment <= MAX_STACK_ALIGNMENT);
6450 }
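
/* For example, a function-local "int buf[64]" lives on the stack and can only
   be forced up to MAX_STACK_ALIGNMENT, whereas a TREE_STATIC definition is
   limited by MAX_OFILE_ALIGNMENT instead.  */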
6451
6452
6453 /* Return whether the data reference DR_INFO is supported with respect to its
6454 alignment.
6455 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6456    if it is aligned, i.e., check if it is possible to vectorize it with different
6457 alignment. */
6458
6459 enum dr_alignment_support
6460 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6461 bool check_aligned_accesses)
6462 {
6463 data_reference *dr = dr_info->dr;
6464 stmt_vec_info stmt_info = dr_info->stmt;
6465 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6466 machine_mode mode = TYPE_MODE (vectype);
6467 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6468 struct loop *vect_loop = NULL;
6469 bool nested_in_vect_loop = false;
6470
6471 if (aligned_access_p (dr_info) && !check_aligned_accesses)
6472 return dr_aligned;
6473
6474 /* For now assume all conditional loads/stores support unaligned
6475 access without any special code. */
6476 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6477 if (gimple_call_internal_p (stmt)
6478 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6479 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6480 return dr_unaligned_supported;
6481
6482 if (loop_vinfo)
6483 {
6484 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6485 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6486 }
6487
6488 /* Possibly unaligned access. */
6489
6490 /* We can choose between using the implicit realignment scheme (generating
6491 a misaligned_move stmt) and the explicit realignment scheme (generating
6492 aligned loads with a REALIGN_LOAD). There are two variants to the
6493 explicit realignment scheme: optimized, and unoptimized.
6494 We can optimize the realignment only if the step between consecutive
6495 vector loads is equal to the vector size. Since the vector memory
6496 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6497 is guaranteed that the misalignment amount remains the same throughout the
6498 execution of the vectorized loop. Therefore, we can create the
6499 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6500 at the loop preheader.
6501
6502 However, in the case of outer-loop vectorization, when vectorizing a
6503 memory access in the inner-loop nested within the LOOP that is now being
6504 vectorized, while it is guaranteed that the misalignment of the
6505 vectorized memory access will remain the same in different outer-loop
6506    iterations, it is *not* guaranteed that it will remain the same throughout
6507 the execution of the inner-loop. This is because the inner-loop advances
6508 with the original scalar step (and not in steps of VS). If the inner-loop
6509 step happens to be a multiple of VS, then the misalignment remains fixed
6510 and we can use the optimized realignment scheme. For example:
6511
6512 for (i=0; i<N; i++)
6513 for (j=0; j<M; j++)
6514 s += a[i+j];
6515
6516 When vectorizing the i-loop in the above example, the step between
6517 consecutive vector loads is 1, and so the misalignment does not remain
6518 fixed across the execution of the inner-loop, and the realignment cannot
6519 be optimized (as illustrated in the following pseudo vectorized loop):
6520
6521 for (i=0; i<N; i+=4)
6522 for (j=0; j<M; j++){
6523 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6524 // when j is {0,1,2,3,4,5,6,7,...} respectively.
6525 // (assuming that we start from an aligned address).
6526 }
6527
6528 We therefore have to use the unoptimized realignment scheme:
6529
6530 for (i=0; i<N; i+=4)
6531 for (j=k; j<M; j+=4)
6532 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6533 // that the misalignment of the initial address is
6534 // 0).
6535
6536 The loop can then be vectorized as follows:
6537
6538 for (k=0; k<4; k++){
6539 rt = get_realignment_token (&vp[k]);
6540 for (i=0; i<N; i+=4){
6541 v1 = vp[i+k];
6542 for (j=k; j<M; j+=4){
6543 v2 = vp[i+j+VS-1];
6544 va = REALIGN_LOAD <v1,v2,rt>;
6545 vs += va;
6546 v1 = v2;
6547 }
6548 }
6549 } */
6550
6551 if (DR_IS_READ (dr))
6552 {
6553 bool is_packed = false;
6554 tree type = (TREE_TYPE (DR_REF (dr)));
6555
6556 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6557 && (!targetm.vectorize.builtin_mask_for_load
6558 || targetm.vectorize.builtin_mask_for_load ()))
6559 {
6560 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6561
6562 /* If we are doing SLP then the accesses need not have the
6563 same alignment, instead it depends on the SLP group size. */
6564 if (loop_vinfo
6565 && STMT_SLP_TYPE (stmt_info)
6566 && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6567 * (DR_GROUP_SIZE
6568 (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6569 TYPE_VECTOR_SUBPARTS (vectype)))
6570 ;
6571 else if (!loop_vinfo
6572 || (nested_in_vect_loop
6573 && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6574 GET_MODE_SIZE (TYPE_MODE (vectype)))))
6575 return dr_explicit_realign;
6576 else
6577 return dr_explicit_realign_optimized;
6578 }
6579 if (!known_alignment_for_access_p (dr_info))
6580 is_packed = not_size_aligned (DR_REF (dr));
6581
6582 if (targetm.vectorize.support_vector_misalignment
6583 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6584 /* Can't software pipeline the loads, but can at least do them. */
6585 return dr_unaligned_supported;
6586 }
6587 else
6588 {
6589 bool is_packed = false;
6590 tree type = (TREE_TYPE (DR_REF (dr)));
6591
6592 if (!known_alignment_for_access_p (dr_info))
6593 is_packed = not_size_aligned (DR_REF (dr));
6594
6595 if (targetm.vectorize.support_vector_misalignment
6596 (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6597 return dr_unaligned_supported;
6598 }
6599
6600 /* Unsupported. */
6601 return dr_unaligned_unsupported;
6602 }