gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "alias.h"
28 #include "symtab.h"
29 #include "tree.h"
30 #include "fold-const.h"
31 #include "stor-layout.h"
32 #include "tm_p.h"
33 #include "target.h"
34 #include "predict.h"
35 #include "hard-reg-set.h"
36 #include "function.h"
37 #include "dominance.h"
38 #include "cfg.h"
39 #include "basic-block.h"
40 #include "gimple-pretty-print.h"
41 #include "tree-ssa-alias.h"
42 #include "internal-fn.h"
43 #include "tree-eh.h"
44 #include "gimple-expr.h"
45 #include "gimple.h"
46 #include "gimplify.h"
47 #include "gimple-iterator.h"
48 #include "gimplify-me.h"
49 #include "gimple-ssa.h"
50 #include "tree-phinodes.h"
51 #include "ssa-iterators.h"
52 #include "stringpool.h"
53 #include "tree-ssanames.h"
54 #include "tree-ssa-loop-ivopts.h"
55 #include "tree-ssa-loop-manip.h"
56 #include "tree-ssa-loop.h"
57 #include "cfgloop.h"
58 #include "tree-chrec.h"
59 #include "tree-scalar-evolution.h"
60 #include "tree-vectorizer.h"
61 #include "diagnostic-core.h"
62 #include "plugin-api.h"
63 #include "ipa-ref.h"
64 #include "cgraph.h"
65 /* Need to include rtl.h, expr.h, etc. for optabs. */
66 #include "rtl.h"
67 #include "flags.h"
68 #include "insn-config.h"
69 #include "expmed.h"
70 #include "dojump.h"
71 #include "explow.h"
72 #include "calls.h"
73 #include "emit-rtl.h"
74 #include "varasm.h"
75 #include "stmt.h"
76 #include "expr.h"
77 #include "insn-codes.h"
78 #include "optabs.h"
79 #include "builtins.h"
80
81 /* Return true if load- or store-lanes optab OPTAB is implemented for
82 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
83
84 static bool
85 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
86 tree vectype, unsigned HOST_WIDE_INT count)
87 {
88 machine_mode mode, array_mode;
89 bool limit_p;
90
91 mode = TYPE_MODE (vectype);
92 limit_p = !targetm.array_mode_supported_p (mode, count);
93 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
94 MODE_INT, limit_p);
95
96 if (array_mode == BLKmode)
97 {
98 if (dump_enabled_p ())
99 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
100 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
101 GET_MODE_NAME (mode), count);
102 return false;
103 }
104
105 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
106 {
107 if (dump_enabled_p ())
108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
109 "cannot use %s<%s><%s>\n", name,
110 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
111 return false;
112 }
113
114 if (dump_enabled_p ())
115 dump_printf_loc (MSG_NOTE, vect_location,
116 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
117 GET_MODE_NAME (mode));
118
119 return true;
120 }
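
/* Worked example (illustrative numbers): with 128-bit V4SI vectors and
   COUNT == 3, the code above asks mode_for_size for an integer mode of
   3 * 128 = 384 bits to describe the lane array.  If no such mode exists
   (and the target does not advertise one via array_mode_supported_p),
   BLKmode comes back and load/store-lanes cannot be used; otherwise the
   optab is usable only if convert_optab_handler reports an instruction
   pattern for the <array_mode, V4SImode> pair.  */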
121
122
123 /* Return the smallest scalar part of STMT.
124 This is used to determine the vectype of the stmt. We generally set the
125 vectype according to the type of the result (lhs). For stmts whose
126 result-type is different from the type of the arguments (e.g., demotion,
127 promotion), vectype will be reset appropriately (later). Note that we have
128 to visit the smallest datatype in this function, because that determines the
129 VF. If the smallest datatype in the loop is present only as the rhs of a
130 promotion operation, we would miss it if we only looked at the lhs.
131 Such a case, where a variable of this datatype does not appear in the lhs
132 anywhere in the loop, can only occur if it's an invariant: e.g.:
133 'int_x = (int) short_inv', which we'd expect to have been optimized away by
134 invariant motion. However, we cannot rely on invariant motion to always
135 take invariants out of the loop, and so in the case of promotion we also
136 have to check the rhs.
137 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
138 types. */
139
140 tree
141 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
142 HOST_WIDE_INT *rhs_size_unit)
143 {
144 tree scalar_type = gimple_expr_type (stmt);
145 HOST_WIDE_INT lhs, rhs;
146
147 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
148
149 if (is_gimple_assign (stmt)
150 && (gimple_assign_cast_p (stmt)
151 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
152 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
153 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
154 {
155 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
156
157 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
158 if (rhs < lhs)
159 scalar_type = rhs_type;
160 }
161
162 *lhs_size_unit = lhs;
163 *rhs_size_unit = rhs;
164 return scalar_type;
165 }
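
/* Worked example (hypothetical statement): for 'int_x = (int) short_y' the
   lhs type is 4 bytes and the rhs type is 2 bytes, so the function returns
   the short type and sets *LHS_SIZE_UNIT to 4 and *RHS_SIZE_UNIT to 2.
   With 16-byte vectors the vectorization factor is then driven by the
   2-byte elements (VF 8 rather than 4).  */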
166
167
168 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
169 tested at run-time. Return TRUE if DDR was successfully inserted.
170 Return false if versioning is not supported. */
171
172 static bool
173 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
174 {
175 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
176
177 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
178 return false;
179
180 if (dump_enabled_p ())
181 {
182 dump_printf_loc (MSG_NOTE, vect_location,
183 "mark for run-time aliasing test between ");
184 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
185 dump_printf (MSG_NOTE, " and ");
186 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
187 dump_printf (MSG_NOTE, "\n");
188 }
189
190 if (optimize_loop_nest_for_size_p (loop))
191 {
192 if (dump_enabled_p ())
193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
194 "versioning not supported when optimizing"
195 " for size.\n");
196 return false;
197 }
198
199 /* FORNOW: We don't support versioning with outer-loop vectorization. */
200 if (loop->inner)
201 {
202 if (dump_enabled_p ())
203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
204 "versioning not yet supported for outer-loops.\n");
205 return false;
206 }
207
208 /* FORNOW: We don't support creating runtime alias tests for non-constant
209 step. */
210 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
211 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
212 {
213 if (dump_enabled_p ())
214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
215 "versioning not yet supported for non-constant "
216 "step\n");
217 return false;
218 }
219
220 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
221 return true;
222 }
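
/* Sketch of how the recorded DDRs are used later (simplified): each entry in
   LOOP_VINFO_MAY_ALIAS_DDRS contributes a run-time check of roughly the form

       if (addr_a + segment_len_a <= addr_b
           || addr_b + segment_len_b <= addr_a)
         ... vectorized loop ...
       else
         ... scalar loop ...

   which is why the number of such checks is capped by
   PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS and why a constant DR_STEP is
   needed to compute the segment lengths.  */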
223
224
225 /* Function vect_analyze_data_ref_dependence.
226
227 Return TRUE if there (might) exist a dependence between a memory-reference
228 DRA and a memory-reference DRB. If the dependence can instead be checked
229 at run-time via versioning for alias, return FALSE. Adjust *MAX_VF
230 according to the data dependence. */
231
232 static bool
233 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
234 loop_vec_info loop_vinfo, int *max_vf)
235 {
236 unsigned int i;
237 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
238 struct data_reference *dra = DDR_A (ddr);
239 struct data_reference *drb = DDR_B (ddr);
240 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
241 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
242 lambda_vector dist_v;
243 unsigned int loop_depth;
244
245 /* In loop analysis all data references should be vectorizable. */
246 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
247 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
248 gcc_unreachable ();
249
250 /* Independent data accesses. */
251 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
252 return false;
253
254 if (dra == drb
255 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
256 return false;
257
258 /* Even if we have an anti-dependence then, as the vectorized loop covers at
259 least two scalar iterations, there is always also a true dependence.
260 As the vectorizer does not re-order loads and stores we can ignore
261 the anti-dependence if TBAA can disambiguate both DRs similar to the
262 case with known negative distance anti-dependences (positive
263 distance anti-dependences would violate TBAA constraints). */
264 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
265 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
266 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
267 get_alias_set (DR_REF (drb))))
268 return false;
269
270 /* Unknown data dependence. */
271 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
272 {
273 /* If user asserted safelen consecutive iterations can be
274 executed concurrently, assume independence. */
275 if (loop->safelen >= 2)
276 {
277 if (loop->safelen < *max_vf)
278 *max_vf = loop->safelen;
279 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
280 return false;
281 }
282
283 if (STMT_VINFO_GATHER_P (stmtinfo_a)
284 || STMT_VINFO_GATHER_P (stmtinfo_b))
285 {
286 if (dump_enabled_p ())
287 {
288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
289 "versioning for alias not supported for: "
290 "can't determine dependence between ");
291 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
292 DR_REF (dra));
293 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
294 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
295 DR_REF (drb));
296 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
297 }
298 return true;
299 }
300
301 if (dump_enabled_p ())
302 {
303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
304 "versioning for alias required: "
305 "can't determine dependence between ");
306 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
307 DR_REF (dra));
308 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
309 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
310 DR_REF (drb));
311 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
312 }
313
314 /* Add to list of ddrs that need to be tested at run-time. */
315 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
316 }
317
318 /* Known data dependence. */
319 if (DDR_NUM_DIST_VECTS (ddr) == 0)
320 {
321 /* If user asserted safelen consecutive iterations can be
322 executed concurrently, assume independence. */
323 if (loop->safelen >= 2)
324 {
325 if (loop->safelen < *max_vf)
326 *max_vf = loop->safelen;
327 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
328 return false;
329 }
330
331 if (STMT_VINFO_GATHER_P (stmtinfo_a)
332 || STMT_VINFO_GATHER_P (stmtinfo_b))
333 {
334 if (dump_enabled_p ())
335 {
336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
337 "versioning for alias not supported for: "
338 "bad dist vector for ");
339 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
340 DR_REF (dra));
341 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
342 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
343 DR_REF (drb));
344 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
345 }
346 return true;
347 }
348
349 if (dump_enabled_p ())
350 {
351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
352 "versioning for alias required: "
353 "bad dist vector for ");
354 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
355 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
356 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
357 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
358 }
359 /* Add to list of ddrs that need to be tested at run-time. */
360 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
361 }
362
363 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
364 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
365 {
366 int dist = dist_v[loop_depth];
367
368 if (dump_enabled_p ())
369 dump_printf_loc (MSG_NOTE, vect_location,
370 "dependence distance = %d.\n", dist);
371
372 if (dist == 0)
373 {
374 if (dump_enabled_p ())
375 {
376 dump_printf_loc (MSG_NOTE, vect_location,
377 "dependence distance == 0 between ");
378 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
379 dump_printf (MSG_NOTE, " and ");
380 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
381 dump_printf (MSG_NOTE, "\n");
382 }
383
384 /* When we perform grouped accesses and perform implicit CSE
385 by detecting equal accesses and doing disambiguation with
386 runtime alias tests like for
387 .. = a[i];
388 .. = a[i+1];
389 a[i] = ..;
390 a[i+1] = ..;
391 *p = ..;
392 .. = a[i];
393 .. = a[i+1];
394 where we will end up loading { a[i], a[i+1] } once, make
395 sure that inserting group loads before the first load and
396 stores after the last store will do the right thing.
397 Similar for groups like
398 a[i] = ...;
399 ... = a[i];
400 a[i+1] = ...;
401 where loads from the group interleave with the store. */
402 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
403 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
404 {
405 gimple earlier_stmt;
406 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
407 if (DR_IS_WRITE
408 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
409 {
410 if (dump_enabled_p ())
411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
412 "READ_WRITE dependence in interleaving."
413 "\n");
414 return true;
415 }
416 }
417
418 continue;
419 }
420
421 if (dist > 0 && DDR_REVERSED_P (ddr))
422 {
423 /* If DDR_REVERSED_P the order of the data-refs in DDR was
424 reversed (to make distance vector positive), and the actual
425 distance is negative. */
426 if (dump_enabled_p ())
427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
428 "dependence distance negative.\n");
429 /* Record a negative dependence distance to later limit the
430 amount of stmt copying / unrolling we can perform.
431 Only need to handle read-after-write dependence. */
432 if (DR_IS_READ (drb)
433 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
434 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
435 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
436 continue;
437 }
438
439 if (abs (dist) >= 2
440 && abs (dist) < *max_vf)
441 {
442 /* The dependence distance requires reduction of the maximal
443 vectorization factor. */
444 *max_vf = abs (dist);
445 if (dump_enabled_p ())
446 dump_printf_loc (MSG_NOTE, vect_location,
447 "adjusting maximal vectorization factor to %i\n",
448 *max_vf);
449 }
450
451 if (abs (dist) >= *max_vf)
452 {
453 /* Dependence distance does not create dependence, as far as
454 vectorization is concerned, in this case. */
455 if (dump_enabled_p ())
456 dump_printf_loc (MSG_NOTE, vect_location,
457 "dependence distance >= VF.\n");
458 continue;
459 }
460
461 if (dump_enabled_p ())
462 {
463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
464 "not vectorized, possible dependence "
465 "between data-refs ");
466 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
467 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
468 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
469 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
470 }
471
472 return true;
473 }
474
475 return false;
476 }
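
/* Worked example (illustrative numbers): a flow dependence with distance 3 in
   this loop (the value stored in iteration i is read again in iteration
   i + 3) reaches the checks above with dist == 3.  Distance 0 is handled
   separately, and abs (dist) >= *MAX_VF needs no further action, but a
   positive, non-reversed distance of 3 is >= 2 and below a typical initial
   *MAX_VF, so *MAX_VF is lowered to 3: at most three scalar iterations may
   then be combined into one vector iteration.  */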
477
478 /* Function vect_analyze_data_ref_dependences.
479
480 Examine all the data references in the loop, and make sure there do not
481 exist any data dependences between them. Set *MAX_VF according to
482 the maximum vectorization factor the data dependences allow. */
483
484 bool
485 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
486 {
487 unsigned int i;
488 struct data_dependence_relation *ddr;
489
490 if (dump_enabled_p ())
491 dump_printf_loc (MSG_NOTE, vect_location,
492 "=== vect_analyze_data_ref_dependences ===\n");
493
494 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
495 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
496 &LOOP_VINFO_DDRS (loop_vinfo),
497 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
498 return false;
499
500 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
501 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
502 return false;
503
504 return true;
505 }
506
507
508 /* Function vect_slp_analyze_data_ref_dependence.
509
510 Return TRUE if there (might) exist a dependence between a memory-reference
511 DRA and a memory-reference DRB that prohibits basic-block (SLP)
512 vectorization. Unlike the loop variant there is no run-time alias
513 versioning and no vectorization factor to adjust. */
514
515 static bool
516 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
517 {
518 struct data_reference *dra = DDR_A (ddr);
519 struct data_reference *drb = DDR_B (ddr);
520
521 /* We need to check dependences of statements marked as unvectorizable
522 as well, they still can prohibit vectorization. */
523
524 /* Independent data accesses. */
525 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
526 return false;
527
528 if (dra == drb)
529 return false;
530
531 /* Read-read is OK. */
532 if (DR_IS_READ (dra) && DR_IS_READ (drb))
533 return false;
534
535 /* If dra and drb are part of the same interleaving chain consider
536 them independent. */
537 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
538 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
539 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
540 return false;
541
542 /* Unknown data dependence. */
543 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
544 {
545 if (dump_enabled_p ())
546 {
547 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
548 "can't determine dependence between ");
549 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
550 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
551 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
552 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
553 }
554 }
555 else if (dump_enabled_p ())
556 {
557 dump_printf_loc (MSG_NOTE, vect_location,
558 "determined dependence between ");
559 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
560 dump_printf (MSG_NOTE, " and ");
561 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
562 dump_printf (MSG_NOTE, "\n");
563 }
564
565 /* We do not vectorize basic blocks with write-write dependencies. */
566 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
567 return true;
568
569 /* If we have a read-write dependence, check that the load comes before the
570 store. When we vectorize basic blocks, a vector load can only be placed
571 before the corresponding scalar load, and a vector store only after its
572 corresponding scalar store. So the order of the accesses is preserved if
573 the load is before the store. */
574 gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
575 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
576 {
577 /* That only holds for load-store pairs taking part in vectorization. */
578 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
579 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
580 return false;
581 }
582
583 return true;
584 }
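
/* Illustrative basic-block cases for the rules above (hypothetical
   statements):

       x = a[i];  a[i] = y;    OK: the load comes before the store.
       a[i] = y;  x = a[i];    rejected: the store comes first.
       a[i] = x;  a[i] = y;    rejected: write-write dependence.  */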
585
586
587 /* Function vect_slp_analyze_data_ref_dependences.
588
589 Examine all the data references in the basic-block, and make sure there
590 do not exist any data dependences between them. Unlike the loop variant
591 there is no vectorization factor to adjust. */
592
593 bool
594 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
595 {
596 struct data_dependence_relation *ddr;
597 unsigned int i;
598
599 if (dump_enabled_p ())
600 dump_printf_loc (MSG_NOTE, vect_location,
601 "=== vect_slp_analyze_data_ref_dependences ===\n");
602
603 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
604 &BB_VINFO_DDRS (bb_vinfo),
605 vNULL, true))
606 return false;
607
608 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
609 if (vect_slp_analyze_data_ref_dependence (ddr))
610 return false;
611
612 return true;
613 }
614
615
616 /* Function vect_compute_data_ref_alignment
617
618 Compute the misalignment of the data reference DR.
619
620 Output:
621 1. If during the misalignment computation it is found that the data reference
622 cannot be vectorized then false is returned.
623 2. DR_MISALIGNMENT (DR) is defined.
624
625 FOR NOW: No analysis is actually performed. Misalignment is calculated
626 only for trivial cases. TODO. */
627
628 static bool
629 vect_compute_data_ref_alignment (struct data_reference *dr)
630 {
631 gimple stmt = DR_STMT (dr);
632 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
633 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
634 struct loop *loop = NULL;
635 tree ref = DR_REF (dr);
636 tree vectype;
637 tree base, base_addr;
638 bool base_aligned;
639 tree misalign = NULL_TREE;
640 tree aligned_to;
641 unsigned HOST_WIDE_INT alignment;
642
643 if (dump_enabled_p ())
644 dump_printf_loc (MSG_NOTE, vect_location,
645 "vect_compute_data_ref_alignment:\n");
646
647 if (loop_vinfo)
648 loop = LOOP_VINFO_LOOP (loop_vinfo);
649
650 /* Initialize misalignment to unknown. */
651 SET_DR_MISALIGNMENT (dr, -1);
652
653 /* Strided accesses perform only component accesses, misalignment information
654 is irrelevant for them. */
655 if (STMT_VINFO_STRIDED_P (stmt_info)
656 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
657 return true;
658
659 if (tree_fits_shwi_p (DR_STEP (dr)))
660 misalign = DR_INIT (dr);
661 aligned_to = DR_ALIGNED_TO (dr);
662 base_addr = DR_BASE_ADDRESS (dr);
663 vectype = STMT_VINFO_VECTYPE (stmt_info);
664
665 /* In case the dataref is in an inner-loop of the loop that is being
666 vectorized (LOOP), we use the base and misalignment information
667 relative to the outer-loop (LOOP). This is ok only if the misalignment
668 stays the same throughout the execution of the inner-loop, which is why
669 we have to check that the stride of the dataref in the inner-loop is
670 evenly divisible by the vector size. */
671 if (loop && nested_in_vect_loop_p (loop, stmt))
672 {
673 tree step = DR_STEP (dr);
674
675 if (tree_fits_shwi_p (step)
676 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
677 {
678 if (dump_enabled_p ())
679 dump_printf_loc (MSG_NOTE, vect_location,
680 "inner step divides the vector-size.\n");
681 misalign = STMT_VINFO_DR_INIT (stmt_info);
682 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
683 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
684 }
685 else
686 {
687 if (dump_enabled_p ())
688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
689 "inner step doesn't divide the vector-size.\n");
690 misalign = NULL_TREE;
691 }
692 }
693
694 /* Similarly we can only use base and misalignment information relative to
695 an innermost loop if the misalignment stays the same throughout the
696 execution of the loop. As above, this is the case if the stride of the
697 dataref times the vectorization factor is evenly divisible by the vector size. */
698 else
699 {
700 tree step = DR_STEP (dr);
701 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
702
703 if (tree_fits_shwi_p (step)
704 && ((tree_to_shwi (step) * vf)
705 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
706 {
707 if (dump_enabled_p ())
708 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
709 "step doesn't divide the vector-size.\n");
710 misalign = NULL_TREE;
711 }
712 }
713
714 alignment = TYPE_ALIGN_UNIT (vectype);
715
716 if ((compare_tree_int (aligned_to, alignment) < 0)
717 || !misalign)
718 {
719 if (dump_enabled_p ())
720 {
721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
722 "Unknown alignment for access: ");
723 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
724 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
725 }
726 return true;
727 }
728
729 /* To look at alignment of the base we have to preserve an inner MEM_REF
730 as that carries alignment information of the actual access. */
731 base = ref;
732 while (handled_component_p (base))
733 base = TREE_OPERAND (base, 0);
734 if (TREE_CODE (base) == MEM_REF)
735 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
736 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
737
738 if (get_object_alignment (base) >= TYPE_ALIGN (vectype))
739 base_aligned = true;
740 else
741 base_aligned = false;
742
743 if (!base_aligned)
744 {
745 /* Strip an inner MEM_REF to a bare decl if possible. */
746 if (TREE_CODE (base) == MEM_REF
747 && integer_zerop (TREE_OPERAND (base, 1))
748 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
749 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
750
751 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
752 {
753 if (dump_enabled_p ())
754 {
755 dump_printf_loc (MSG_NOTE, vect_location,
756 "can't force alignment of ref: ");
757 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
758 dump_printf (MSG_NOTE, "\n");
759 }
760 return true;
761 }
762
763 /* Force the alignment of the decl.
764 NOTE: This is the only change to the code we make during
765 the analysis phase, before deciding to vectorize the loop. */
766 if (dump_enabled_p ())
767 {
768 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
769 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
770 dump_printf (MSG_NOTE, "\n");
771 }
772
773 ((dataref_aux *)dr->aux)->base_decl = base;
774 ((dataref_aux *)dr->aux)->base_misaligned = true;
775 }
776
777 /* If this is a backward running DR then the first access in the larger
778 vectype is actually N-1 elements before the address in the DR.
779 Adjust misalign accordingly. */
780 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
781 {
782 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
783 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
784 otherwise we wouldn't be here. */
785 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
786 /* PLUS because DR_STEP was negative. */
787 misalign = size_binop (PLUS_EXPR, misalign, offset);
788 }
789
790 SET_DR_MISALIGNMENT (dr,
791 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
792
793 if (dump_enabled_p ())
794 {
795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
796 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
797 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
798 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
799 }
800
801 return true;
802 }
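
/* Worked example (illustrative numbers): for a 16-byte V4SI vectype with a
   16-byte aligned DR_BASE_ADDRESS and DR_INIT == 4 (e.g. an access like
   a[i + 1] with 4-byte elements), the computation above yields
   DR_MISALIGNMENT == 4.  For a backward-running access with DR_STEP == -4
   the first vector element lies 3 * 4 = 12 bytes below that address, so the
   adjustment gives (4 - 12) mod 16 == 8 instead.  */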
803
804
805 /* Function vect_compute_data_refs_alignment
806
807 Compute the misalignment of data references in the loop.
808 Return FALSE if a data reference is found that cannot be vectorized. */
809
810 static bool
811 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
812 bb_vec_info bb_vinfo)
813 {
814 vec<data_reference_p> datarefs;
815 struct data_reference *dr;
816 unsigned int i;
817
818 if (loop_vinfo)
819 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
820 else
821 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
822
823 FOR_EACH_VEC_ELT (datarefs, i, dr)
824 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
825 && !vect_compute_data_ref_alignment (dr))
826 {
827 if (bb_vinfo)
828 {
829 /* Mark unsupported statement as unvectorizable. */
830 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
831 continue;
832 }
833 else
834 return false;
835 }
836
837 return true;
838 }
839
840
841 /* Function vect_update_misalignment_for_peel
842
843 DR - the data reference whose misalignment is to be adjusted.
844 DR_PEEL - the data reference whose misalignment is being made
845 zero in the vector loop by the peel.
846 NPEEL - the number of iterations in the peel loop if the misalignment
847 of DR_PEEL is known at compile time. */
848
849 static void
850 vect_update_misalignment_for_peel (struct data_reference *dr,
851 struct data_reference *dr_peel, int npeel)
852 {
853 unsigned int i;
854 vec<dr_p> same_align_drs;
855 struct data_reference *current_dr;
856 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
857 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
858 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
859 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
860
861 /* For interleaved data accesses the step in the loop must be multiplied by
862 the size of the interleaving group. */
863 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
864 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
865 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
866 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
867
868 /* It can be assumed that the data refs with the same alignment as dr_peel
869 are aligned in the vector loop. */
870 same_align_drs
871 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
872 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
873 {
874 if (current_dr != dr)
875 continue;
876 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
877 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
878 SET_DR_MISALIGNMENT (dr, 0);
879 return;
880 }
881
882 if (known_alignment_for_access_p (dr)
883 && known_alignment_for_access_p (dr_peel))
884 {
885 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
886 int misal = DR_MISALIGNMENT (dr);
887 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
888 misal += negative ? -npeel * dr_size : npeel * dr_size;
889 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
890 SET_DR_MISALIGNMENT (dr, misal);
891 return;
892 }
893
894 if (dump_enabled_p ())
895 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
896 SET_DR_MISALIGNMENT (dr, -1);
897 }
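
/* Worked example (illustrative numbers): suppose DR_PEEL has 4-byte elements
   and misalignment 8 against a 16-byte vectype, so NPEEL == 2 scalar
   iterations are peeled.  A forward DR with 4-byte elements, known
   misalignment 4 and the same 16-byte vectype then ends up with
   (4 + 2 * 4) & (16 - 1) == 12, while a DR whose misalignment was unknown
   simply remains at -1.  */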
898
899
900 /* Function vect_verify_datarefs_alignment
901
902 Return TRUE if all data references in the loop can be
903 handled with respect to alignment. */
904
905 bool
906 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
907 {
908 vec<data_reference_p> datarefs;
909 struct data_reference *dr;
910 enum dr_alignment_support supportable_dr_alignment;
911 unsigned int i;
912
913 if (loop_vinfo)
914 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
915 else
916 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
917
918 FOR_EACH_VEC_ELT (datarefs, i, dr)
919 {
920 gimple stmt = DR_STMT (dr);
921 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
922
923 if (!STMT_VINFO_RELEVANT_P (stmt_info))
924 continue;
925
926 /* For interleaving, only the alignment of the first access matters.
927 Skip statements marked as not vectorizable. */
928 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
929 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
930 || !STMT_VINFO_VECTORIZABLE (stmt_info))
931 continue;
932
933 /* Strided accesses perform only component accesses, alignment is
934 irrelevant for them. */
935 if (STMT_VINFO_STRIDED_P (stmt_info)
936 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
937 continue;
938
939 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
940 if (!supportable_dr_alignment)
941 {
942 if (dump_enabled_p ())
943 {
944 if (DR_IS_READ (dr))
945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
946 "not vectorized: unsupported unaligned load.");
947 else
948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
949 "not vectorized: unsupported unaligned "
950 "store.");
951
952 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
953 DR_REF (dr));
954 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
955 }
956 return false;
957 }
958 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
959 dump_printf_loc (MSG_NOTE, vect_location,
960 "Vectorizing an unaligned access.\n");
961 }
962 return true;
963 }
964
965 /* Given a memory reference EXP return whether its alignment is less
966 than its size. */
967
968 static bool
969 not_size_aligned (tree exp)
970 {
971 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
972 return true;
973
974 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
975 > get_object_alignment (exp));
976 }
977
978 /* Function vector_alignment_reachable_p
979
980 Return true if vector alignment for DR is reachable by peeling
981 a few loop iterations. Return false otherwise. */
982
983 static bool
984 vector_alignment_reachable_p (struct data_reference *dr)
985 {
986 gimple stmt = DR_STMT (dr);
987 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
988 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
989
990 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
991 {
992 /* For interleaved accesses we peel only if the number of iterations in
993 the prologue loop (VF - misalignment) is a multiple of the
994 number of interleaved accesses. */
995 int elem_size, mis_in_elements;
996 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
997
998 /* FORNOW: handle only known alignment. */
999 if (!known_alignment_for_access_p (dr))
1000 return false;
1001
1002 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1003 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1004
1005 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1006 return false;
1007 }
1008
1009 /* If misalignment is known at the compile time then allow peeling
1010 only if natural alignment is reachable through peeling. */
1011 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1012 {
1013 HOST_WIDE_INT elmsize =
1014 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1015 if (dump_enabled_p ())
1016 {
1017 dump_printf_loc (MSG_NOTE, vect_location,
1018 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1019 dump_printf (MSG_NOTE,
1020 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1021 }
1022 if (DR_MISALIGNMENT (dr) % elmsize)
1023 {
1024 if (dump_enabled_p ())
1025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1026 "data size does not divide the misalignment.\n");
1027 return false;
1028 }
1029 }
1030
1031 if (!known_alignment_for_access_p (dr))
1032 {
1033 tree type = TREE_TYPE (DR_REF (dr));
1034 bool is_packed = not_size_aligned (DR_REF (dr));
1035 if (dump_enabled_p ())
1036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1037 "Unknown misalignment, is_packed = %d\n",is_packed);
1038 if ((TYPE_USER_ALIGN (type) && !is_packed)
1039 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1040 return true;
1041 else
1042 return false;
1043 }
1044
1045 return true;
1046 }
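
/* Worked example (illustrative numbers): with a 16-byte V4SI vectype and a
   known misalignment of 8 bytes, 8 % 4 == 0, so peeling two scalar
   iterations reaches alignment and the function returns true.  For a
   grouped access with GROUP_SIZE == 3 the same numbers fail the
   interleaving test because (4 - 2) % 3 != 0, i.e. the required prologue is
   not a whole number of groups.  */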
1047
1048
1049 /* Calculate the cost of the memory access represented by DR. */
1050
1051 static void
1052 vect_get_data_access_cost (struct data_reference *dr,
1053 unsigned int *inside_cost,
1054 unsigned int *outside_cost,
1055 stmt_vector_for_cost *body_cost_vec)
1056 {
1057 gimple stmt = DR_STMT (dr);
1058 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1059 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1060 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1061 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1062 int ncopies = vf / nunits;
1063
1064 if (DR_IS_READ (dr))
1065 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1066 NULL, body_cost_vec, false);
1067 else
1068 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1069
1070 if (dump_enabled_p ())
1071 dump_printf_loc (MSG_NOTE, vect_location,
1072 "vect_get_data_access_cost: inside_cost = %d, "
1073 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1074 }
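
/* Illustrative example: with a vectorization factor of 8 and a V4SI data
   reference (nunits == 4), ncopies == 8 / 4 == 2, so the accumulated cost
   corresponds to two vector memory operations covering the eight scalar
   accesses of this reference per vector iteration.  */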
1075
1076
1077 /* Insert DR into peeling hash table with NPEEL as key. */
1078
1079 static void
1080 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1081 int npeel)
1082 {
1083 struct _vect_peel_info elem, *slot;
1084 _vect_peel_info **new_slot;
1085 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1086
1087 elem.npeel = npeel;
1088 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1089 if (slot)
1090 slot->count++;
1091 else
1092 {
1093 slot = XNEW (struct _vect_peel_info);
1094 slot->npeel = npeel;
1095 slot->dr = dr;
1096 slot->count = 1;
1097 new_slot
1098 = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1099 *new_slot = slot;
1100 }
1101
1102 if (!supportable_dr_alignment
1103 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1104 slot->count += VECT_MAX_COST;
1105 }
1106
1107
1108 /* Traverse peeling hash table to find peeling option that aligns maximum
1109 number of data accesses. */
1110
1111 int
1112 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1113 _vect_peel_extended_info *max)
1114 {
1115 vect_peel_info elem = *slot;
1116
1117 if (elem->count > max->peel_info.count
1118 || (elem->count == max->peel_info.count
1119 && max->peel_info.npeel > elem->npeel))
1120 {
1121 max->peel_info.npeel = elem->npeel;
1122 max->peel_info.count = elem->count;
1123 max->peel_info.dr = elem->dr;
1124 }
1125
1126 return 1;
1127 }
1128
1129
1130 /* Traverse peeling hash table and calculate cost for each peeling option.
1131 Find the one with the lowest cost. */
1132
1133 int
1134 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1135 _vect_peel_extended_info *min)
1136 {
1137 vect_peel_info elem = *slot;
1138 int save_misalignment, dummy;
1139 unsigned int inside_cost = 0, outside_cost = 0, i;
1140 gimple stmt = DR_STMT (elem->dr);
1141 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1142 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1143 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1144 struct data_reference *dr;
1145 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1146
1147 prologue_cost_vec.create (2);
1148 body_cost_vec.create (2);
1149 epilogue_cost_vec.create (2);
1150
1151 FOR_EACH_VEC_ELT (datarefs, i, dr)
1152 {
1153 stmt = DR_STMT (dr);
1154 stmt_info = vinfo_for_stmt (stmt);
1155 /* For interleaving, only the alignment of the first access
1156 matters. */
1157 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1158 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1159 continue;
1160
1161 save_misalignment = DR_MISALIGNMENT (dr);
1162 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1163 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1164 &body_cost_vec);
1165 SET_DR_MISALIGNMENT (dr, save_misalignment);
1166 }
1167
1168 outside_cost += vect_get_known_peeling_cost
1169 (loop_vinfo, elem->npeel, &dummy,
1170 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1171 &prologue_cost_vec, &epilogue_cost_vec);
1172
1173 /* Prologue and epilogue costs are added to the target model later.
1174 These costs depend only on the scalar iteration cost, the
1175 number of peeling iterations finally chosen, and the number of
1176 misaligned statements. So discard the information found here. */
1177 prologue_cost_vec.release ();
1178 epilogue_cost_vec.release ();
1179
1180 if (inside_cost < min->inside_cost
1181 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1182 {
1183 min->inside_cost = inside_cost;
1184 min->outside_cost = outside_cost;
1185 min->body_cost_vec.release ();
1186 min->body_cost_vec = body_cost_vec;
1187 min->peel_info.dr = elem->dr;
1188 min->peel_info.npeel = elem->npeel;
1189 }
1190 else
1191 body_cost_vec.release ();
1192
1193 return 1;
1194 }
1195
1196
1197 /* Choose best peeling option by traversing peeling hash table and either
1198 choosing an option with the lowest cost (if cost model is enabled) or the
1199 option that aligns as many accesses as possible. */
1200
1201 static struct data_reference *
1202 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1203 unsigned int *npeel,
1204 stmt_vector_for_cost *body_cost_vec)
1205 {
1206 struct _vect_peel_extended_info res;
1207
1208 res.peel_info.dr = NULL;
1209 res.body_cost_vec = stmt_vector_for_cost ();
1210
1211 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1212 {
1213 res.inside_cost = INT_MAX;
1214 res.outside_cost = INT_MAX;
1215 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1216 ->traverse <_vect_peel_extended_info *,
1217 vect_peeling_hash_get_lowest_cost> (&res);
1218 }
1219 else
1220 {
1221 res.peel_info.count = 0;
1222 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1223 ->traverse <_vect_peel_extended_info *,
1224 vect_peeling_hash_get_most_frequent> (&res);
1225 }
1226
1227 *npeel = res.peel_info.npeel;
1228 *body_cost_vec = res.body_cost_vec;
1229 return res.peel_info.dr;
1230 }
1231
1232
1233 /* Function vect_enhance_data_refs_alignment
1234
1235 This pass will use loop versioning and loop peeling in order to enhance
1236 the alignment of data references in the loop.
1237
1238 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1239 original loop is to be vectorized. Any other loops that are created by
1240 the transformations performed in this pass are not supposed to be
1241 vectorized. This restriction will be relaxed.
1242
1243 This pass will require a cost model to guide it whether to apply peeling
1244 or versioning or a combination of the two. For example, the scheme that
1245 Intel uses when given a loop with several memory accesses, is as follows:
1246 choose one memory access ('p') whose alignment you want to force by doing
1247 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1248 other accesses are not necessarily aligned, or (2) use loop versioning to
1249 generate one loop in which all accesses are aligned, and another loop in
1250 which only 'p' is necessarily aligned.
1251
1252 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1253 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1254 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1255
1256 Devising a cost model is the most critical aspect of this work. It will
1257 guide us on which access to peel for, whether to use loop versioning, how
1258 many versions to create, etc. The cost model will probably consist of
1259 generic considerations as well as target specific considerations (on
1260 powerpc for example, misaligned stores are more painful than misaligned
1261 loads).
1262
1263 Here are the general steps involved in alignment enhancements:
1264
1265 -- original loop, before alignment analysis:
1266 for (i=0; i<N; i++){
1267 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1268 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1269 }
1270
1271 -- After vect_compute_data_refs_alignment:
1272 for (i=0; i<N; i++){
1273 x = q[i]; # DR_MISALIGNMENT(q) = 3
1274 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1275 }
1276
1277 -- Possibility 1: we do loop versioning:
1278 if (p is aligned) {
1279 for (i=0; i<N; i++){ # loop 1A
1280 x = q[i]; # DR_MISALIGNMENT(q) = 3
1281 p[i] = y; # DR_MISALIGNMENT(p) = 0
1282 }
1283 }
1284 else {
1285 for (i=0; i<N; i++){ # loop 1B
1286 x = q[i]; # DR_MISALIGNMENT(q) = 3
1287 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1288 }
1289 }
1290
1291 -- Possibility 2: we do loop peeling:
1292 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1293 x = q[i];
1294 p[i] = y;
1295 }
1296 for (i = 3; i < N; i++){ # loop 2A
1297 x = q[i]; # DR_MISALIGNMENT(q) = 0
1298 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1299 }
1300
1301 -- Possibility 3: combination of loop peeling and versioning:
1302 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1303 x = q[i];
1304 p[i] = y;
1305 }
1306 if (p is aligned) {
1307 for (i = 3; i<N; i++){ # loop 3A
1308 x = q[i]; # DR_MISALIGNMENT(q) = 0
1309 p[i] = y; # DR_MISALIGNMENT(p) = 0
1310 }
1311 }
1312 else {
1313 for (i = 3; i<N; i++){ # loop 3B
1314 x = q[i]; # DR_MISALIGNMENT(q) = 0
1315 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1316 }
1317 }
1318
1319 These loops are later passed to loop_transform to be vectorized. The
1320 vectorizer will use the alignment information to guide the transformation
1321 (whether to generate regular loads/stores, or with special handling for
1322 misalignment). */
1323
1324 bool
1325 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1326 {
1327 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1328 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1329 enum dr_alignment_support supportable_dr_alignment;
1330 struct data_reference *dr0 = NULL, *first_store = NULL;
1331 struct data_reference *dr;
1332 unsigned int i, j;
1333 bool do_peeling = false;
1334 bool do_versioning = false;
1335 bool stat;
1336 gimple stmt;
1337 stmt_vec_info stmt_info;
1338 unsigned int npeel = 0;
1339 bool all_misalignments_unknown = true;
1340 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1341 unsigned possible_npeel_number = 1;
1342 tree vectype;
1343 unsigned int nelements, mis, same_align_drs_max = 0;
1344 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1345
1346 if (dump_enabled_p ())
1347 dump_printf_loc (MSG_NOTE, vect_location,
1348 "=== vect_enhance_data_refs_alignment ===\n");
1349
1350 /* While cost model enhancements are expected in the future, the high level
1351 view of the code at this time is as follows:
1352
1353 A) If there is a misaligned access then see if peeling to align
1354 this access can make all data references satisfy
1355 vect_supportable_dr_alignment. If so, update data structures
1356 as needed and return true.
1357
1358 B) If peeling wasn't possible and there is a data reference with an
1359 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1360 then see if loop versioning checks can be used to make all data
1361 references satisfy vect_supportable_dr_alignment. If so, update
1362 data structures as needed and return true.
1363
1364 C) If neither peeling nor versioning were successful then return false if
1365 any data reference does not satisfy vect_supportable_dr_alignment.
1366
1367 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1368
1369 Note, Possibility 3 above (which is peeling and versioning together) is not
1370 being done at this time. */
1371
1372 /* (1) Peeling to force alignment. */
1373
1374 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1375 Considerations:
1376 + How many accesses will become aligned due to the peeling
1377 - How many accesses will become unaligned due to the peeling,
1378 and the cost of misaligned accesses.
1379 - The cost of peeling (the extra runtime checks, the increase
1380 in code size). */
1381
1382 FOR_EACH_VEC_ELT (datarefs, i, dr)
1383 {
1384 stmt = DR_STMT (dr);
1385 stmt_info = vinfo_for_stmt (stmt);
1386
1387 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1388 continue;
1389
1390 /* For interleaving, only the alignment of the first access
1391 matters. */
1392 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1393 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1394 continue;
1395
1396 /* For invariant accesses there is nothing to enhance. */
1397 if (integer_zerop (DR_STEP (dr)))
1398 continue;
1399
1400 /* Strided accesses perform only component accesses, alignment is
1401 irrelevant for them. */
1402 if (STMT_VINFO_STRIDED_P (stmt_info)
1403 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1404 continue;
1405
1406 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1407 do_peeling = vector_alignment_reachable_p (dr);
1408 if (do_peeling)
1409 {
1410 if (known_alignment_for_access_p (dr))
1411 {
1412 unsigned int npeel_tmp;
1413 bool negative = tree_int_cst_compare (DR_STEP (dr),
1414 size_zero_node) < 0;
1415
1416 /* Save info about DR in the hash table. */
1417 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1418 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1419 = new hash_table<peel_info_hasher> (1);
1420
1421 vectype = STMT_VINFO_VECTYPE (stmt_info);
1422 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1423 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1424 TREE_TYPE (DR_REF (dr))));
1425 npeel_tmp = (negative
1426 ? (mis - nelements) : (nelements - mis))
1427 & (nelements - 1);
1428
1429 /* For multiple types, it is possible that the bigger type access
1430 will have more than one peeling option. E.g., a loop with two
1431 types: one of size (vector size / 4), and the other one of
1432 size (vector size / 8). The vectorization factor will be 8. If both
1433 accesses are misaligned by 3, the first one needs one scalar
1434 iteration to be aligned, and the second one needs 5. But the
1435 first one will also be aligned by peeling 5 scalar
1436 iterations, and in that case both accesses will be aligned.
1437 Hence, except for the immediate peeling amount, we also want
1438 to try adding the full vector size, as long as we don't exceed the
1439 vectorization factor.
1440 We do this automatically for the cost model, since we calculate the
1441 cost for every peeling option. */
1442 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1443 {
1444 if (STMT_SLP_TYPE (stmt_info))
1445 possible_npeel_number
1446 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1447 else
1448 possible_npeel_number = vf / nelements;
1449 }
1450
1451 /* Handle the aligned case. We may decide to align some other
1452 access, making DR unaligned. */
1453 if (DR_MISALIGNMENT (dr) == 0)
1454 {
1455 npeel_tmp = 0;
1456 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1457 possible_npeel_number++;
1458 }
1459
1460 for (j = 0; j < possible_npeel_number; j++)
1461 {
1462 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1463 npeel_tmp += nelements;
1464 }
1465
1466 all_misalignments_unknown = false;
1467 /* Data-ref that was chosen for the case that all the
1468 misalignments are unknown is not relevant anymore, since we
1469 have a data-ref with known alignment. */
1470 dr0 = NULL;
1471 }
1472 else
1473 {
1474 /* If we don't know any misalignment values, we prefer
1475 peeling for the data-ref that has the maximum number of data-refs
1476 with the same alignment, unless the target prefers to align
1477 stores over loads. */
1478 if (all_misalignments_unknown)
1479 {
1480 unsigned same_align_drs
1481 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1482 if (!dr0
1483 || same_align_drs_max < same_align_drs)
1484 {
1485 same_align_drs_max = same_align_drs;
1486 dr0 = dr;
1487 }
1488 /* For data-refs with the same number of related
1489 accesses prefer the one where the misalign
1490 computation will be invariant in the outermost loop. */
1491 else if (same_align_drs_max == same_align_drs)
1492 {
1493 struct loop *ivloop0, *ivloop;
1494 ivloop0 = outermost_invariant_loop_for_expr
1495 (loop, DR_BASE_ADDRESS (dr0));
1496 ivloop = outermost_invariant_loop_for_expr
1497 (loop, DR_BASE_ADDRESS (dr));
1498 if ((ivloop && !ivloop0)
1499 || (ivloop && ivloop0
1500 && flow_loop_nested_p (ivloop, ivloop0)))
1501 dr0 = dr;
1502 }
1503
1504 if (!first_store && DR_IS_WRITE (dr))
1505 first_store = dr;
1506 }
1507
1508 /* If there are both known and unknown misaligned accesses in the
1509 loop, we choose peeling amount according to the known
1510 accesses. */
1511 if (!supportable_dr_alignment)
1512 {
1513 dr0 = dr;
1514 if (!first_store && DR_IS_WRITE (dr))
1515 first_store = dr;
1516 }
1517 }
1518 }
1519 else
1520 {
1521 if (!aligned_access_p (dr))
1522 {
1523 if (dump_enabled_p ())
1524 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1525 "vector alignment may not be reachable\n");
1526 break;
1527 }
1528 }
1529 }
1530
1531 /* Check if we can possibly peel the loop. */
1532 if (!vect_can_advance_ivs_p (loop_vinfo)
1533 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1534 do_peeling = false;
1535
1536 if (do_peeling
1537 && all_misalignments_unknown
1538 && vect_supportable_dr_alignment (dr0, false))
1539 {
1540 /* Check if the target requires to prefer stores over loads, i.e., if
1541 misaligned stores are more expensive than misaligned loads (taking
1542 drs with same alignment into account). */
1543 if (first_store && DR_IS_READ (dr0))
1544 {
1545 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1546 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1547 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1548 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1549 stmt_vector_for_cost dummy;
1550 dummy.create (2);
1551
1552 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1553 &dummy);
1554 vect_get_data_access_cost (first_store, &store_inside_cost,
1555 &store_outside_cost, &dummy);
1556
1557 dummy.release ();
1558
1559 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1560 aligning the load DR0). */
1561 load_inside_penalty = store_inside_cost;
1562 load_outside_penalty = store_outside_cost;
1563 for (i = 0;
1564 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1565 DR_STMT (first_store))).iterate (i, &dr);
1566 i++)
1567 if (DR_IS_READ (dr))
1568 {
1569 load_inside_penalty += load_inside_cost;
1570 load_outside_penalty += load_outside_cost;
1571 }
1572 else
1573 {
1574 load_inside_penalty += store_inside_cost;
1575 load_outside_penalty += store_outside_cost;
1576 }
1577
1578 /* Calculate the penalty for leaving DR0 unaligned (by
1579 aligning the FIRST_STORE). */
1580 store_inside_penalty = load_inside_cost;
1581 store_outside_penalty = load_outside_cost;
1582 for (i = 0;
1583 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1584 DR_STMT (dr0))).iterate (i, &dr);
1585 i++)
1586 if (DR_IS_READ (dr))
1587 {
1588 store_inside_penalty += load_inside_cost;
1589 store_outside_penalty += load_outside_cost;
1590 }
1591 else
1592 {
1593 store_inside_penalty += store_inside_cost;
1594 store_outside_penalty += store_outside_cost;
1595 }
1596
1597 if (load_inside_penalty > store_inside_penalty
1598 || (load_inside_penalty == store_inside_penalty
1599 && load_outside_penalty > store_outside_penalty))
1600 dr0 = first_store;
1601 }
1602
1603 /* In case there are only loads with different unknown misalignments, use
1604 peeling only if it may help to align other accesses in the loop or
1605 if it may help improve load bandwidth when we'd end up using
1606 unaligned loads. */
1607 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1608 if (!first_store
1609 && !STMT_VINFO_SAME_ALIGN_REFS (
1610 vinfo_for_stmt (DR_STMT (dr0))).length ()
1611 && (vect_supportable_dr_alignment (dr0, false)
1612 != dr_unaligned_supported
1613 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1614 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1615 do_peeling = false;
1616 }
1617
1618 if (do_peeling && !dr0)
1619 {
1620 /* Peeling is possible, but no data access strictly requires alignment in
1621 order to be vectorizable. So we try to choose the best possible peeling. */
1622
1623 /* We should get here only if there are drs with known misalignment. */
1624 gcc_assert (!all_misalignments_unknown);
1625
1626 /* Choose the best peeling from the hash table. */
1627 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1628 &body_cost_vec);
1629 if (!dr0 || !npeel)
1630 do_peeling = false;
1631 }
1632
1633 if (do_peeling)
1634 {
1635 stmt = DR_STMT (dr0);
1636 stmt_info = vinfo_for_stmt (stmt);
1637 vectype = STMT_VINFO_VECTYPE (stmt_info);
1638 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1639
1640 if (known_alignment_for_access_p (dr0))
1641 {
1642 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1643 size_zero_node) < 0;
1644 if (!npeel)
1645 {
1646 /* Since it's known at compile time, compute the number of
1647 iterations in the peeled loop (the peeling factor) for use in
1648 updating DR_MISALIGNMENT values. The peeling factor is the
1649 vectorization factor minus the misalignment as an element
1650 count. */
1651 mis = DR_MISALIGNMENT (dr0);
1652 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1653 npeel = ((negative ? mis - nelements : nelements - mis)
1654 & (nelements - 1));
1655 }
1656
1657 /* For interleaved data access every iteration accesses all the
1658 members of the group, therefore we divide the number of iterations
1659 by the group size. */
1660 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1661 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1662 npeel /= GROUP_SIZE (stmt_info);
1663
1664 if (dump_enabled_p ())
1665 dump_printf_loc (MSG_NOTE, vect_location,
1666 "Try peeling by %d\n", npeel);
1667 }
1668
1669 /* Ensure that all data refs can be vectorized after the peel. */
1670 FOR_EACH_VEC_ELT (datarefs, i, dr)
1671 {
1672 int save_misalignment;
1673
1674 if (dr == dr0)
1675 continue;
1676
1677 stmt = DR_STMT (dr);
1678 stmt_info = vinfo_for_stmt (stmt);
1679 /* For interleaving, only the alignment of the first access
1680 matters. */
1681 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1682 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1683 continue;
1684
1685 /* Strided accesses perform only component accesses, alignment is
1686 irrelevant for them. */
1687 if (STMT_VINFO_STRIDED_P (stmt_info)
1688 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1689 continue;
1690
1691 save_misalignment = DR_MISALIGNMENT (dr);
1692 vect_update_misalignment_for_peel (dr, dr0, npeel);
1693 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1694 SET_DR_MISALIGNMENT (dr, save_misalignment);
1695
1696 if (!supportable_dr_alignment)
1697 {
1698 do_peeling = false;
1699 break;
1700 }
1701 }
1702
1703 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1704 {
1705 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1706 if (!stat)
1707 do_peeling = false;
1708 else
1709 {
1710 body_cost_vec.release ();
1711 return stat;
1712 }
1713 }
1714
1715 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1716 if (do_peeling)
1717 {
1718 unsigned max_allowed_peel
1719 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1720 if (max_allowed_peel != (unsigned)-1)
1721 {
1722 unsigned max_peel = npeel;
1723 if (max_peel == 0)
1724 {
1725 gimple dr_stmt = DR_STMT (dr0);
1726 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1727 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1728 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1729 }
1730 if (max_peel > max_allowed_peel)
1731 {
1732 do_peeling = false;
1733 if (dump_enabled_p ())
1734 dump_printf_loc (MSG_NOTE, vect_location,
1735 "Disable peeling, max peels reached: %d\n", max_peel);
1736 }
1737 }
1738 }
1739
1740 /* Cost model #2 - if peeling may result in a remaining loop not
1741 iterating enough to be vectorized then do not peel. */
1742 if (do_peeling
1743 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1744 {
1745 unsigned max_peel
1746 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1747 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1748 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1749 do_peeling = false;
1750 }
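/* For example, with a vectorization factor of 4 and npeel == 3, at least
   4 + 3 == 7 iterations must be known; otherwise the loop left after
   peeling could not execute even one vector iteration.  */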
1751
1752 if (do_peeling)
1753 {
1754 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1755 If the misalignment of DR_i is identical to that of dr0 then set
1756 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1757 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1758 by the peeling factor times the element size of DR_i (MOD the
1759 vectorization factor times the size). Otherwise, the
1760 misalignment of DR_i must be set to unknown. */
1761 FOR_EACH_VEC_ELT (datarefs, i, dr)
1762 if (dr != dr0)
1763 vect_update_misalignment_for_peel (dr, dr0, npeel);
1764
1765 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1766 if (npeel)
1767 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1768 else
1769 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1770 = DR_MISALIGNMENT (dr0);
1771 SET_DR_MISALIGNMENT (dr0, 0);
1772 if (dump_enabled_p ())
1773 {
1774 dump_printf_loc (MSG_NOTE, vect_location,
1775 "Alignment of access forced using peeling.\n");
1776 dump_printf_loc (MSG_NOTE, vect_location,
1777 "Peeling for alignment will be applied.\n");
1778 }
1779 /* The inside-loop cost will be accounted for in vectorizable_load
1780 and vectorizable_store correctly with adjusted alignments.
1781 Drop the body_cost_vec on the floor here.  */
1782 body_cost_vec.release ();
1783
1784 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1785 gcc_assert (stat);
1786 return stat;
1787 }
1788 }
1789
1790 body_cost_vec.release ();
1791
1792 /* (2) Versioning to force alignment. */
1793
1794 /* Try versioning if:
1795 1) the loop is optimized for speed, and
1796 2) there is at least one unsupported misaligned data ref with an unknown
1797 misalignment, and
1798 3) all misaligned data refs with a known misalignment are supported, and
1799 4) the number of runtime alignment checks is within reason. */
1800
1801 do_versioning =
1802 optimize_loop_nest_for_speed_p (loop)
1803 && (!loop->inner); /* FORNOW */
1804
1805 if (do_versioning)
1806 {
1807 FOR_EACH_VEC_ELT (datarefs, i, dr)
1808 {
1809 stmt = DR_STMT (dr);
1810 stmt_info = vinfo_for_stmt (stmt);
1811
1812 /* For interleaving, only the alignment of the first access
1813 matters. */
1814 if (aligned_access_p (dr)
1815 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1816 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1817 continue;
1818
1819 if (STMT_VINFO_STRIDED_P (stmt_info))
1820 {
1821 /* Strided loads perform only component accesses, alignment is
1822 irrelevant for them. */
1823 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1824 continue;
1825 do_versioning = false;
1826 break;
1827 }
1828
1829 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1830
1831 if (!supportable_dr_alignment)
1832 {
1833 gimple stmt;
1834 int mask;
1835 tree vectype;
1836
1837 if (known_alignment_for_access_p (dr)
1838 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1839 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1840 {
1841 do_versioning = false;
1842 break;
1843 }
1844
1845 stmt = DR_STMT (dr);
1846 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1847 gcc_assert (vectype);
1848
1849 /* The rightmost bits of an aligned address must be zeros.
1850 Construct the mask needed for this test. For example,
1851 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1852 mask must be 15 = 0xf. */
1853 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
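/* The runtime check emitted for this (see
   vect_create_cond_for_align_checks) essentially ORs together the
   addresses of all recorded statements' data refs and tests the result
   against this mask, so one test covers them all.  */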
1854
1855 /* FORNOW: use the same mask to test all potentially unaligned
1856 references in the loop. The vectorizer currently supports
1857 a single vector size, see the reference to
1858 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1859 vectorization factor is computed. */
1860 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1861 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1862 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1863 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1864 DR_STMT (dr));
1865 }
1866 }
1867
1868 /* Versioning requires at least one misaligned data reference. */
1869 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1870 do_versioning = false;
1871 else if (!do_versioning)
1872 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1873 }
1874
1875 if (do_versioning)
1876 {
1877 vec<gimple> may_misalign_stmts
1878 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1879 gimple stmt;
1880
1881 /* It can now be assumed that the data references in the statements
1882 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1883 of the loop being vectorized. */
1884 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1885 {
1886 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1887 dr = STMT_VINFO_DATA_REF (stmt_info);
1888 SET_DR_MISALIGNMENT (dr, 0);
1889 if (dump_enabled_p ())
1890 dump_printf_loc (MSG_NOTE, vect_location,
1891 "Alignment of access forced using versioning.\n");
1892 }
1893
1894 if (dump_enabled_p ())
1895 dump_printf_loc (MSG_NOTE, vect_location,
1896 "Versioning for alignment will be applied.\n");
1897
1898 /* Peeling and versioning can't be done together at this time. */
1899 gcc_assert (! (do_peeling && do_versioning));
1900
1901 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1902 gcc_assert (stat);
1903 return stat;
1904 }
1905
1906 /* This point is reached if neither peeling nor versioning is being done. */
1907 gcc_assert (! (do_peeling || do_versioning));
1908
1909 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1910 return stat;
1911 }
1912
1913
1914 /* Function vect_find_same_alignment_drs.
1915
1916 Update group and alignment relations according to the chosen
1917 vectorization factor. */
1918
1919 static void
1920 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1921 loop_vec_info loop_vinfo)
1922 {
1923 unsigned int i;
1924 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1925 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1926 struct data_reference *dra = DDR_A (ddr);
1927 struct data_reference *drb = DDR_B (ddr);
1928 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1929 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1930 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1931 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1932 lambda_vector dist_v;
1933 unsigned int loop_depth;
1934
1935 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1936 return;
1937
1938 if (dra == drb)
1939 return;
1940
1941 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1942 return;
1943
1944 /* Loop-based vectorization and known data dependence. */
1945 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1946 return;
1947
1948 /* Data-dependence analysis reports a distance vector of zero
1949 for data-references that overlap only in the first iteration
1950 but have steps of different sign (see PR45764).
1951 So as a sanity check require equal DR_STEP. */
1952 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1953 return;
1954
1955 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1956 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1957 {
1958 int dist = dist_v[loop_depth];
1959
1960 if (dump_enabled_p ())
1961 dump_printf_loc (MSG_NOTE, vect_location,
1962 "dependence distance = %d.\n", dist);
1963
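/* A distance of zero means the two refs access the same location in the
   same iteration; a distance that is a multiple of the vectorization
   factor (together with the equal-step and equal-size checks) means they
   are a whole number of vector iterations apart.  Either way their
   misalignments can be taken to be the same.  */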
1964 /* Same loop iteration. */
1965 if (dist == 0
1966 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1967 {
1968 /* Two references with distance zero have the same alignment. */
1969 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1970 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1971 if (dump_enabled_p ())
1972 {
1973 dump_printf_loc (MSG_NOTE, vect_location,
1974 "accesses have the same alignment.\n");
1975 dump_printf (MSG_NOTE,
1976 "dependence distance modulo vf == 0 between ");
1977 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1978 dump_printf (MSG_NOTE, " and ");
1979 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1980 dump_printf (MSG_NOTE, "\n");
1981 }
1982 }
1983 }
1984 }
1985
1986
1987 /* Function vect_analyze_data_refs_alignment
1988
1989 Analyze the alignment of the data-references in the loop.
1990 Return FALSE if a data reference is found that cannot be vectorized. */
1991
1992 bool
1993 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1994 bb_vec_info bb_vinfo)
1995 {
1996 if (dump_enabled_p ())
1997 dump_printf_loc (MSG_NOTE, vect_location,
1998 "=== vect_analyze_data_refs_alignment ===\n");
1999
2000 /* Mark groups of data references with same alignment using
2001 data dependence information. */
2002 if (loop_vinfo)
2003 {
2004 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2005 struct data_dependence_relation *ddr;
2006 unsigned int i;
2007
2008 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2009 vect_find_same_alignment_drs (ddr, loop_vinfo);
2010 }
2011
2012 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2013 {
2014 if (dump_enabled_p ())
2015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2016 "not vectorized: can't calculate alignment "
2017 "for data ref.\n");
2018 return false;
2019 }
2020
2021 return true;
2022 }
2023
2024
2025 /* Analyze groups of accesses: check that DR belongs to a group of
2026 accesses of legal size, step, etc. Detect gaps, single element
2027 interleaving, and other special cases. Set grouped access info.
2028 Collect groups of strided stores for further use in SLP analysis. */
2029
2030 static bool
2031 vect_analyze_group_access (struct data_reference *dr)
2032 {
2033 tree step = DR_STEP (dr);
2034 tree scalar_type = TREE_TYPE (DR_REF (dr));
2035 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2036 gimple stmt = DR_STMT (dr);
2037 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2038 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2039 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2040 HOST_WIDE_INT dr_step = -1;
2041 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2042 bool slp_impossible = false;
2043 struct loop *loop = NULL;
2044
2045 if (loop_vinfo)
2046 loop = LOOP_VINFO_LOOP (loop_vinfo);
2047
2048 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2049 size of the interleaving group (including gaps). */
2050 if (tree_fits_shwi_p (step))
2051 {
2052 dr_step = tree_to_shwi (step);
2053 groupsize = absu_hwi (dr_step) / type_size;
2054 }
2055 else
2056 groupsize = 0;
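/* For example, a 4-byte element accessed with a 16-byte step gives
   groupsize == 4, whether the group is fully populated or has gaps.  */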
2057
2058 /* A non-consecutive access is possible only if it is part of an interleaving group.  */
2059 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2060 {
2061 /* Check whether this DR is part of an interleaving group, and is a single
2062 element of the group that is accessed in the loop.  */
2063
2064 /* Gaps are supported only for loads. STEP must be a multiple of the type
2065 size. The size of the group must be a power of 2. */
2066 if (DR_IS_READ (dr)
2067 && (dr_step % type_size) == 0
2068 && groupsize > 0
2069 && exact_log2 (groupsize) != -1)
2070 {
2071 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2072 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2073 if (dump_enabled_p ())
2074 {
2075 dump_printf_loc (MSG_NOTE, vect_location,
2076 "Detected single element interleaving ");
2077 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2078 dump_printf (MSG_NOTE, " step ");
2079 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2080 dump_printf (MSG_NOTE, "\n");
2081 }
2082
2083 if (loop_vinfo)
2084 {
2085 if (dump_enabled_p ())
2086 dump_printf_loc (MSG_NOTE, vect_location,
2087 "Data access with gaps requires scalar "
2088 "epilogue loop\n");
2089 if (loop->inner)
2090 {
2091 if (dump_enabled_p ())
2092 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2093 "Peeling for outer loop is not"
2094 " supported\n");
2095 return false;
2096 }
2097
2098 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2099 }
2100
2101 return true;
2102 }
2103
2104 if (dump_enabled_p ())
2105 {
2106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2107 "not consecutive access ");
2108 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2109 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2110 }
2111
2112 if (bb_vinfo)
2113 {
2114 /* Mark the statement as unvectorizable. */
2115 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2116 return true;
2117 }
2118
2119 return false;
2120 }
2121
2122 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2123 {
2124 /* First stmt in the interleaving chain. Check the chain. */
2125 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2126 struct data_reference *data_ref = dr;
2127 unsigned int count = 1;
2128 tree prev_init = DR_INIT (data_ref);
2129 gimple prev = stmt;
2130 HOST_WIDE_INT diff, gaps = 0;
2131
2132 while (next)
2133 {
2134 /* Skip same data-refs.  If two or more stmts share a
2135 data-ref (supported only for loads), we vectorize only the first
2136 stmt, and the rest get their vectorized loads from the first
2137 one.  */
2138 if (!tree_int_cst_compare (DR_INIT (data_ref),
2139 DR_INIT (STMT_VINFO_DATA_REF (
2140 vinfo_for_stmt (next)))))
2141 {
2142 if (DR_IS_WRITE (data_ref))
2143 {
2144 if (dump_enabled_p ())
2145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2146 "Two store stmts share the same dr.\n");
2147 return false;
2148 }
2149
2150 /* For load use the same data-ref load. */
2151 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2152
2153 prev = next;
2154 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2155 continue;
2156 }
2157
2158 prev = next;
2159 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2160
2161 /* All group members have the same STEP by construction. */
2162 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2163
2164 /* Check that the distance between two accesses is equal to the type
2165 size. Otherwise, we have gaps. */
2166 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2167 - TREE_INT_CST_LOW (prev_init)) / type_size;
2168 if (diff != 1)
2169 {
2170 /* FORNOW: SLP of accesses with gaps is not supported. */
2171 slp_impossible = true;
2172 if (DR_IS_WRITE (data_ref))
2173 {
2174 if (dump_enabled_p ())
2175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2176 "interleaved store with gaps\n");
2177 return false;
2178 }
2179
2180 gaps += diff - 1;
2181 }
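/* For example, with 4-byte elements and inits 0 and 8, diff is 2:
   one element of the group is skipped, so gaps becomes 1 and SLP
   of the group is no longer possible.  */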
2182
2183 last_accessed_element += diff;
2184
2185 /* Store the gap from the previous member of the group. If there is no
2186 gap in the access, GROUP_GAP is always 1. */
2187 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2188
2189 prev_init = DR_INIT (data_ref);
2190 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2191 /* Count the number of data-refs in the chain. */
2192 count++;
2193 }
2194
2195 if (groupsize == 0)
2196 groupsize = count + gaps;
2197
2198 /* Check that the size of the interleaving is equal to count for stores,
2199 i.e., that there are no gaps. */
2200 if (groupsize != count
2201 && !DR_IS_READ (dr))
2202 {
2203 if (dump_enabled_p ())
2204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2205 "interleaved store with gaps\n");
2206 return false;
2207 }
2208
2209 /* If there is a gap after the last load in the group it is the
2210 difference between the groupsize and the last accessed
2211 element.
2212 When there is no gap, this difference should be 0. */
2213 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
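/* For example, a group of size 4 whose last accessed element is the
   third one (last_accessed_element == 3) gets a trailing gap of 1,
   which triggers the peeling-for-gaps handling below.  */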
2214
2215 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2216 if (dump_enabled_p ())
2217 {
2218 dump_printf_loc (MSG_NOTE, vect_location,
2219 "Detected interleaving of size %d starting with ",
2220 (int)groupsize);
2221 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2222 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2223 dump_printf_loc (MSG_NOTE, vect_location,
2224 "There is a gap of %d elements after the group\n",
2225 (int)GROUP_GAP (vinfo_for_stmt (stmt)));
2226 }
2227
2228 /* SLP: create an SLP data structure for every interleaving group of
2229 stores for further analysis in vect_analyze_slp.  */
2230 if (DR_IS_WRITE (dr) && !slp_impossible)
2231 {
2232 if (loop_vinfo)
2233 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2234 if (bb_vinfo)
2235 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2236 }
2237
2238 /* If there is a gap at the end of the group or the group size cannot
2239 be made a multiple of the vector element count then we access excess
2240 elements in the last iteration and thus need to peel that off. */
2241 if (loop_vinfo
2242 && (groupsize - last_accessed_element > 0
2243 || exact_log2 (groupsize) == -1))
2244
2245 {
2246 if (dump_enabled_p ())
2247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2248 "Data access with gaps requires scalar "
2249 "epilogue loop\n");
2250 if (loop->inner)
2251 {
2252 if (dump_enabled_p ())
2253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2254 "Peeling for outer loop is not supported\n");
2255 return false;
2256 }
2257
2258 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2259 }
2260 }
2261
2262 return true;
2263 }
2264
2265
2266 /* Analyze the access pattern of the data-reference DR.
2267 In case of non-consecutive accesses call vect_analyze_group_access() to
2268 analyze groups of accesses. */
2269
2270 static bool
2271 vect_analyze_data_ref_access (struct data_reference *dr)
2272 {
2273 tree step = DR_STEP (dr);
2274 tree scalar_type = TREE_TYPE (DR_REF (dr));
2275 gimple stmt = DR_STMT (dr);
2276 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2277 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2278 struct loop *loop = NULL;
2279
2280 if (loop_vinfo)
2281 loop = LOOP_VINFO_LOOP (loop_vinfo);
2282
2283 if (loop_vinfo && !step)
2284 {
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2287 "bad data-ref access in loop\n");
2288 return false;
2289 }
2290
2291 /* Allow loads with zero step in inner-loop vectorization. */
2292 if (loop_vinfo && integer_zerop (step))
2293 {
2294 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2295 if (!nested_in_vect_loop_p (loop, stmt))
2296 return DR_IS_READ (dr);
2297 /* Allow references with zero step only for outer loops marked
2298 with pragma omp simd - the pragma guarantees the absence of
2299 loop-carried dependencies between inner loop iterations.  */
2300 if (!loop->force_vectorize)
2301 {
2302 if (dump_enabled_p ())
2303 dump_printf_loc (MSG_NOTE, vect_location,
2304 "zero step in inner loop of nest\n");
2305 return false;
2306 }
2307 }
2308
2309 if (loop && nested_in_vect_loop_p (loop, stmt))
2310 {
2311 /* Interleaved accesses are not yet supported within outer-loop
2312 vectorization for references in the inner-loop. */
2313 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2314
2315 /* For the rest of the analysis we use the outer-loop step. */
2316 step = STMT_VINFO_DR_STEP (stmt_info);
2317 if (integer_zerop (step))
2318 {
2319 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_NOTE, vect_location,
2321 "zero step in outer loop.\n");
2322 if (DR_IS_READ (dr))
2323 return true;
2324 else
2325 return false;
2326 }
2327 }
2328
2329 /* Consecutive? */
2330 if (TREE_CODE (step) == INTEGER_CST)
2331 {
2332 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2333 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2334 || (dr_step < 0
2335 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2336 {
2337 /* Mark that it is not interleaving. */
2338 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2339 return true;
2340 }
2341 }
2342
2343 if (loop && nested_in_vect_loop_p (loop, stmt))
2344 {
2345 if (dump_enabled_p ())
2346 dump_printf_loc (MSG_NOTE, vect_location,
2347 "grouped access in outer loop.\n");
2348 return false;
2349 }
2350
2351
2352 /* Assume this is a DR handled by the non-constant strided load case.  */
2353 if (TREE_CODE (step) != INTEGER_CST)
2354 return (STMT_VINFO_STRIDED_P (stmt_info)
2355 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2356 || vect_analyze_group_access (dr)));
2357
2358 /* Not a consecutive access - check whether it is part of an interleaving group.  */
2359 return vect_analyze_group_access (dr);
2360 }
2361
2362
2363
2364 /* A helper function used in the comparator function to sort data
2365 references. T1 and T2 are two data references to be compared.
2366 The function returns -1, 0, or 1. */
2367
2368 static int
2369 compare_tree (tree t1, tree t2)
2370 {
2371 int i, cmp;
2372 enum tree_code code;
2373 char tclass;
2374
2375 if (t1 == t2)
2376 return 0;
2377 if (t1 == NULL)
2378 return -1;
2379 if (t2 == NULL)
2380 return 1;
2381
2382
2383 if (TREE_CODE (t1) != TREE_CODE (t2))
2384 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2385
2386 code = TREE_CODE (t1);
2387 switch (code)
2388 {
2389 /* For const values, we can just use hash values for comparisons. */
2390 case INTEGER_CST:
2391 case REAL_CST:
2392 case FIXED_CST:
2393 case STRING_CST:
2394 case COMPLEX_CST:
2395 case VECTOR_CST:
2396 {
2397 hashval_t h1 = iterative_hash_expr (t1, 0);
2398 hashval_t h2 = iterative_hash_expr (t2, 0);
2399 if (h1 != h2)
2400 return h1 < h2 ? -1 : 1;
2401 break;
2402 }
2403
2404 case SSA_NAME:
2405 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2406 if (cmp != 0)
2407 return cmp;
2408
2409 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2410 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2411 break;
2412
2413 default:
2414 tclass = TREE_CODE_CLASS (code);
2415
2416 /* For declarations, we can compare their UIDs.  */
2417 if (tclass == tcc_declaration)
2418 {
2419 if (DECL_UID (t1) != DECL_UID (t2))
2420 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2421 break;
2422 }
2423
2424 /* For expressions with operands, compare their operands recursively. */
2425 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2426 {
2427 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2428 if (cmp != 0)
2429 return cmp;
2430 }
2431 }
2432
2433 return 0;
2434 }
2435
2436
2437 /* Compare two data-references DRA and DRB so that they can be sorted
2438 into chunks suitable for grouping.  */
2439
2440 static int
2441 dr_group_sort_cmp (const void *dra_, const void *drb_)
2442 {
2443 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2444 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2445 int cmp;
2446
2447 /* Stabilize sort. */
2448 if (dra == drb)
2449 return 0;
2450
2451 /* Ordering of DRs according to base. */
2452 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2453 {
2454 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2455 if (cmp != 0)
2456 return cmp;
2457 }
2458
2459 /* And according to DR_OFFSET. */
2460 if (!dr_equal_offsets_p (dra, drb))
2461 {
2462 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2463 if (cmp != 0)
2464 return cmp;
2465 }
2466
2467 /* Put reads before writes. */
2468 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2469 return DR_IS_READ (dra) ? -1 : 1;
2470
2471 /* Then sort by access size.  */
2472 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2473 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2474 {
2475 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2476 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2477 if (cmp != 0)
2478 return cmp;
2479 }
2480
2481 /* And by step.  */
2482 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2483 {
2484 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2485 if (cmp != 0)
2486 return cmp;
2487 }
2488
2489 /* Then sort by DR_INIT.  In case of identical DRs sort by stmt UID.  */
2490 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2491 if (cmp == 0)
2492 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2493 return cmp;
2494 }
2495
2496 /* Function vect_analyze_data_ref_accesses.
2497
2498 Analyze the access pattern of all the data references in the loop.
2499
2500 FORNOW: the only access pattern that is considered vectorizable is a
2501 simple step 1 (consecutive) access.
2502
2503 FORNOW: handle only arrays and pointer accesses. */
2504
2505 bool
2506 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2507 {
2508 unsigned int i;
2509 vec<data_reference_p> datarefs;
2510 struct data_reference *dr;
2511
2512 if (dump_enabled_p ())
2513 dump_printf_loc (MSG_NOTE, vect_location,
2514 "=== vect_analyze_data_ref_accesses ===\n");
2515
2516 if (loop_vinfo)
2517 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2518 else
2519 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2520
2521 if (datarefs.is_empty ())
2522 return true;
2523
2524 /* Sort the array of datarefs to make building the interleaving chains
2525 linear.  Don't modify the original vector's order; it is needed for
2526 determining what dependencies are reversed. */
2527 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2528 datarefs_copy.qsort (dr_group_sort_cmp);
2529
2530 /* Build the interleaving chains. */
2531 for (i = 0; i < datarefs_copy.length () - 1;)
2532 {
2533 data_reference_p dra = datarefs_copy[i];
2534 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2535 stmt_vec_info lastinfo = NULL;
2536 for (i = i + 1; i < datarefs_copy.length (); ++i)
2537 {
2538 data_reference_p drb = datarefs_copy[i];
2539 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2540
2541 /* ??? Imperfect sorting (non-compatible types, non-modulo
2542 accesses, same accesses) can lead to a group being artificially
2543 split here as we don't just skip over those.  If it really
2544 matters we can push those to a worklist and re-iterate
2545 over them.  Then we can just skip ahead to the next DR here.  */
2546
2547 /* Check that the data-refs have the same first location (except init)
2548 and that they are both either stores or loads (not one load and one store,
2549 not masked loads or stores). */
2550 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2551 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2552 DR_BASE_ADDRESS (drb), 0)
2553 || !dr_equal_offsets_p (dra, drb)
2554 || !gimple_assign_single_p (DR_STMT (dra))
2555 || !gimple_assign_single_p (DR_STMT (drb)))
2556 break;
2557
2558 /* Check that the data-refs have the same constant size. */
2559 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2560 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2561 if (!tree_fits_uhwi_p (sza)
2562 || !tree_fits_uhwi_p (szb)
2563 || !tree_int_cst_equal (sza, szb))
2564 break;
2565
2566 /* Check that the data-refs have the same step. */
2567 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2568 break;
2569
2570 /* Do not place the same access in the interleaving chain twice. */
2571 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2572 break;
2573
2574 /* Check the types are compatible.
2575 ??? We don't distinguish this during sorting. */
2576 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2577 TREE_TYPE (DR_REF (drb))))
2578 break;
2579
2580 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2581 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2582 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2583 gcc_assert (init_a < init_b);
2584
2585 /* If init_b == init_a + the size of the type * k, we have an
2586 interleaving, and DRA is accessed before DRB. */
2587 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2588 if ((init_b - init_a) % type_size_a != 0)
2589 break;
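/* For example, 4-byte accesses with inits 0 and 8 differ by a multiple
   of the type size and may belong to one chain (with a gap); an init
   difference of 6 would split the chain here.  */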
2590
2591 /* For a store, require that the accesses are adjacent.  This splits
2592 groups into chunks we support (we don't support vectorization
2593 of stores with gaps).
2594 if (!DR_IS_READ (dra)
2595 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2596 (DR_INIT (datarefs_copy[i-1]))
2597 != type_size_a))
2598 break;
2599
2600 /* If the step (when constant and non-zero) is no greater than the
2601 difference between the data-refs' inits, split the group here;
2602 this keeps the groups at suitable sizes.
2603 if (tree_fits_shwi_p (DR_STEP (dra)))
2604 {
2605 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2606 if (step != 0 && step <= (init_b - init_a))
2607 break;
2608 }
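/* For example, with step == 8 and inits 0 and 8, DRB is simply DRA's
   access in the next iteration rather than an interleaved element, so a
   new group is started.  */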
2609
2610 if (dump_enabled_p ())
2611 {
2612 dump_printf_loc (MSG_NOTE, vect_location,
2613 "Detected interleaving ");
2614 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2615 dump_printf (MSG_NOTE, " and ");
2616 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2617 dump_printf (MSG_NOTE, "\n");
2618 }
2619
2620 /* Link the found element into the group list. */
2621 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2622 {
2623 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2624 lastinfo = stmtinfo_a;
2625 }
2626 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2627 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2628 lastinfo = stmtinfo_b;
2629 }
2630 }
2631
2632 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2633 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2634 && !vect_analyze_data_ref_access (dr))
2635 {
2636 if (dump_enabled_p ())
2637 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2638 "not vectorized: complicated access pattern.\n");
2639
2640 if (bb_vinfo)
2641 {
2642 /* Mark the statement as not vectorizable. */
2643 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2644 continue;
2645 }
2646 else
2647 {
2648 datarefs_copy.release ();
2649 return false;
2650 }
2651 }
2652
2653 datarefs_copy.release ();
2654 return true;
2655 }
2656
2657
2658 /* Operator == between two dr_with_seg_len objects.
2659
2660 This equality operator is used to make sure two data refs
2661 are the same, so that we can consider combining the
2662 aliasing checks of two pairs of data-dependent data
2663 refs.  */
2664
2665 static bool
2666 operator == (const dr_with_seg_len& d1,
2667 const dr_with_seg_len& d2)
2668 {
2669 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2670 DR_BASE_ADDRESS (d2.dr), 0)
2671 && compare_tree (d1.offset, d2.offset) == 0
2672 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2673 }
2674
2675 /* Function comp_dr_with_seg_len_pair.
2676
2677 Comparison function for sorting objects of dr_with_seg_len_pair_t
2678 so that we can combine aliasing checks in one scan. */
2679
2680 static int
2681 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2682 {
2683 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2684 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2685
2686 const dr_with_seg_len &p11 = p1->first,
2687 &p12 = p1->second,
2688 &p21 = p2->first,
2689 &p22 = p2->second;
2690
2691 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2692 if a and c have the same base address and step, and b and d have the same
2693 address and step.  Therefore, if either a and c or b and d do not have the
2694 same address and step, we do not care about the order of the pairs after sorting.  */
2695 int comp_res;
2696
2697 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2698 DR_BASE_ADDRESS (p21.dr))) != 0)
2699 return comp_res;
2700 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2701 DR_BASE_ADDRESS (p22.dr))) != 0)
2702 return comp_res;
2703 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2704 return comp_res;
2705 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2706 return comp_res;
2707 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2708 return comp_res;
2709 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2710 return comp_res;
2711
2712 return 0;
2713 }
2714
2715 /* Function vect_vfa_segment_size.
2716
2717 Create an expression that computes the size of the segment
2718 that will be accessed for a data reference.  The function takes into
2719 account that realignment loads may access one more vector.
2720
2721 Input:
2722 DR: The data reference.
2723 LENGTH_FACTOR: segment length to consider.
2724
2725 Return an expression whose value is the size of segment which will be
2726 accessed by DR. */
2727
2728 static tree
2729 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2730 {
2731 tree segment_length;
2732
2733 if (integer_zerop (DR_STEP (dr)))
2734 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2735 else
2736 segment_length = size_binop (MULT_EXPR,
2737 fold_convert (sizetype, DR_STEP (dr)),
2738 fold_convert (sizetype, length_factor));
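/* For example, a 4-byte access with DR_STEP == 4 and LENGTH_FACTOR equal
   to the vectorization factor yields a segment of 4 * VF bytes.  */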
2739
2740 if (vect_supportable_dr_alignment (dr, false)
2741 == dr_explicit_realign_optimized)
2742 {
2743 tree vector_size = TYPE_SIZE_UNIT
2744 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2745
2746 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2747 }
2748 return segment_length;
2749 }
2750
2751 /* Function vect_prune_runtime_alias_test_list.
2752
2753 Prune a list of ddrs to be tested at run-time by versioning for alias.
2754 Merge several alias checks into one if possible.
2755 Return FALSE if the resulting list of ddrs is longer than allowed by
2756 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2757
2758 bool
2759 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2760 {
2761 vec<ddr_p> may_alias_ddrs =
2762 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2763 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2764 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2765 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2766 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2767
2768 ddr_p ddr;
2769 unsigned int i;
2770 tree length_factor;
2771
2772 if (dump_enabled_p ())
2773 dump_printf_loc (MSG_NOTE, vect_location,
2774 "=== vect_prune_runtime_alias_test_list ===\n");
2775
2776 if (may_alias_ddrs.is_empty ())
2777 return true;
2778
2779 /* Basically, for each pair of dependent data refs store_ptr_0
2780 and load_ptr_0, we create an expression:
2781
2782 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2783 || ((load_ptr_0 + load_segment_length_0) <= store_ptr_0)
2784
2785 for aliasing checks. However, in some cases we can decrease
2786 the number of checks by combining two checks into one. For
2787 example, suppose we have another pair of data refs store_ptr_0
2788 and load_ptr_1, and if the following condition is satisfied:
2789
2790 load_ptr_0 < load_ptr_1 &&
2791 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2792
2793 (this condition means that, in each iteration of the vectorized loop,
2794 the memory accessed through store_ptr_0 cannot lie between the memory
2795 accessed through load_ptr_0 and load_ptr_1.)
2796
2797 we can then use only the following expression to finish the
2798 aliasing checks between store_ptr_0 & load_ptr_0 and
2799 store_ptr_0 & load_ptr_1:
2800
2801 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2802 || ((load_ptr_1 + load_segment_length_1) <= store_ptr_0)
2803
2804 Note that we only consider the case where load_ptr_0 and load_ptr_1
2805 have the same base address.  */
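/* For instance, if load_ptr_1 == load_ptr_0 + 16 and both load segments
   are 16 bytes long, the two load ranges are adjacent, so any store
   segment overlapping either of them also overlaps the combined range
   [load_ptr_0, load_ptr_0 + 32), and a single check against that range
   is sufficient.  */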
2806
2807 comp_alias_ddrs.create (may_alias_ddrs.length ());
2808
2809 /* First, we collect all data ref pairs for aliasing checks. */
2810 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2811 {
2812 struct data_reference *dr_a, *dr_b;
2813 gimple dr_group_first_a, dr_group_first_b;
2814 tree segment_length_a, segment_length_b;
2815 gimple stmt_a, stmt_b;
2816
2817 dr_a = DDR_A (ddr);
2818 stmt_a = DR_STMT (DDR_A (ddr));
2819 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2820 if (dr_group_first_a)
2821 {
2822 stmt_a = dr_group_first_a;
2823 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2824 }
2825
2826 dr_b = DDR_B (ddr);
2827 stmt_b = DR_STMT (DDR_B (ddr));
2828 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2829 if (dr_group_first_b)
2830 {
2831 stmt_b = dr_group_first_b;
2832 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2833 }
2834
2835 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2836 length_factor = scalar_loop_iters;
2837 else
2838 length_factor = size_int (vect_factor);
2839 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2840 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2841
2842 dr_with_seg_len_pair_t dr_with_seg_len_pair
2843 (dr_with_seg_len (dr_a, segment_length_a),
2844 dr_with_seg_len (dr_b, segment_length_b));
2845
2846 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2847 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2848
2849 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2850 }
2851
2852 /* Second, we sort the collected data ref pairs so that we can scan
2853 them once to combine all possible aliasing checks. */
2854 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2855
2856 /* Third, we scan the sorted dr pairs and check if we can combine
2857 alias checks of two neighbouring dr pairs. */
2858 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2859 {
2860 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2861 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2862 *dr_b1 = &comp_alias_ddrs[i-1].second,
2863 *dr_a2 = &comp_alias_ddrs[i].first,
2864 *dr_b2 = &comp_alias_ddrs[i].second;
2865
2866 /* Remove duplicate data ref pairs. */
2867 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2868 {
2869 if (dump_enabled_p ())
2870 {
2871 dump_printf_loc (MSG_NOTE, vect_location,
2872 "found equal ranges ");
2873 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2874 DR_REF (dr_a1->dr));
2875 dump_printf (MSG_NOTE, ", ");
2876 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2877 DR_REF (dr_b1->dr));
2878 dump_printf (MSG_NOTE, " and ");
2879 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2880 DR_REF (dr_a2->dr));
2881 dump_printf (MSG_NOTE, ", ");
2882 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2883 DR_REF (dr_b2->dr));
2884 dump_printf (MSG_NOTE, "\n");
2885 }
2886
2887 comp_alias_ddrs.ordered_remove (i--);
2888 continue;
2889 }
2890
2891 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2892 {
2893 /* We consider the case where DR_B1 and DR_B2 are the same memref,
2894 and DR_A1 and DR_A2 are two consecutive memrefs.  */
2895 if (*dr_a1 == *dr_a2)
2896 {
2897 std::swap (dr_a1, dr_b1);
2898 std::swap (dr_a2, dr_b2);
2899 }
2900
2901 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2902 DR_BASE_ADDRESS (dr_a2->dr),
2903 0)
2904 || !tree_fits_shwi_p (dr_a1->offset)
2905 || !tree_fits_shwi_p (dr_a2->offset))
2906 continue;
2907
2908 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2909 - tree_to_shwi (dr_a1->offset));
2910
2911
2912 /* Now we check if the following condition is satisfied:
2913
2914 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2915
2916 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2917 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2918 have to make a best estimate.  We can get the minimum value
2919 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2920 then either of the following two conditions can guarantee the
2921 one above:
2922
2923 1: DIFF <= MIN_SEG_LEN_B
2924 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2925
2926 */
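/* For example, with offsets 0 and 16 and 16-byte segment lengths,
   DIFF == 16 satisfies condition 1, and the merged check below uses
   dr_a1->seg_len == dr_a2->seg_len + DIFF == 32 bytes.  */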
2927
2928 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2929 ? tree_to_shwi (dr_b1->seg_len)
2930 : vect_factor);
2931
2932 if (diff <= min_seg_len_b
2933 || (tree_fits_shwi_p (dr_a1->seg_len)
2934 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2935 {
2936 if (dump_enabled_p ())
2937 {
2938 dump_printf_loc (MSG_NOTE, vect_location,
2939 "merging ranges for ");
2940 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2941 DR_REF (dr_a1->dr));
2942 dump_printf (MSG_NOTE, ", ");
2943 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2944 DR_REF (dr_b1->dr));
2945 dump_printf (MSG_NOTE, " and ");
2946 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2947 DR_REF (dr_a2->dr));
2948 dump_printf (MSG_NOTE, ", ");
2949 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2950 DR_REF (dr_b2->dr));
2951 dump_printf (MSG_NOTE, "\n");
2952 }
2953
2954 dr_a1->seg_len = size_binop (PLUS_EXPR,
2955 dr_a2->seg_len, size_int (diff));
2956 comp_alias_ddrs.ordered_remove (i--);
2957 }
2958 }
2959 }
2960
2961 dump_printf_loc (MSG_NOTE, vect_location,
2962 "improved number of alias checks from %d to %d\n",
2963 may_alias_ddrs.length (), comp_alias_ddrs.length ());
2964 if ((int) comp_alias_ddrs.length () >
2965 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2966 return false;
2967
2968 return true;
2969 }
2970
2971 /* Check whether a non-affine read in STMT is suitable for a gather load
2972 and if so, return a builtin decl for that operation. */
2973
2974 tree
2975 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2976 tree *offp, int *scalep)
2977 {
2978 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2979 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2980 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2981 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2982 tree offtype = NULL_TREE;
2983 tree decl, base, off;
2984 machine_mode pmode;
2985 int punsignedp, pvolatilep;
2986
2987 base = DR_REF (dr);
2988 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2989 see if we can use the def stmt of the address. */
2990 if (is_gimple_call (stmt)
2991 && gimple_call_internal_p (stmt)
2992 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2993 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2994 && TREE_CODE (base) == MEM_REF
2995 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
2996 && integer_zerop (TREE_OPERAND (base, 1))
2997 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
2998 {
2999 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3000 if (is_gimple_assign (def_stmt)
3001 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3002 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3003 }
3004
3005 /* The gather builtins need an address of the form
3006 loop_invariant + vector * {1, 2, 4, 8}
3007 or
3008 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3009 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3010 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3011 multiplications and additions in it. To get a vector, we need
3012 a single SSA_NAME that will be defined in the loop and will
3013 contain everything that is not loop invariant and that can be
3014 vectorized.  The following code attempts to find such a preexisting
3015 SSA_NAME OFF and put the loop invariants into a tree BASE
3016 that can be gimplified before the loop. */
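/* For example, in a loop computing x[i] = a[b[i]], the code below would
   typically end up with BASE == &a[0] (loop invariant), OFF the SSA_NAME
   holding the (possibly sign-extended) b[i], and SCALE == sizeof (a[0]).  */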
3017 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3018 &pmode, &punsignedp, &pvolatilep, false);
3019 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3020
3021 if (TREE_CODE (base) == MEM_REF)
3022 {
3023 if (!integer_zerop (TREE_OPERAND (base, 1)))
3024 {
3025 if (off == NULL_TREE)
3026 {
3027 offset_int moff = mem_ref_offset (base);
3028 off = wide_int_to_tree (sizetype, moff);
3029 }
3030 else
3031 off = size_binop (PLUS_EXPR, off,
3032 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3033 }
3034 base = TREE_OPERAND (base, 0);
3035 }
3036 else
3037 base = build_fold_addr_expr (base);
3038
3039 if (off == NULL_TREE)
3040 off = size_zero_node;
3041
3042 /* If base is not loop invariant, then if off is 0 we start with just
3043 the constant offset in the loop invariant BASE and continue with base
3044 as OFF; otherwise we give up.
3045 We could handle that case by gimplifying the addition of base + off
3046 into some SSA_NAME and using that as off, but for now punt.  */
3047 if (!expr_invariant_in_loop_p (loop, base))
3048 {
3049 if (!integer_zerop (off))
3050 return NULL_TREE;
3051 off = base;
3052 base = size_int (pbitpos / BITS_PER_UNIT);
3053 }
3054 /* Otherwise put base + constant offset into the loop invariant BASE
3055 and continue with OFF. */
3056 else
3057 {
3058 base = fold_convert (sizetype, base);
3059 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3060 }
3061
3062 /* OFF at this point may be either an SSA_NAME or some tree expression
3063 from get_inner_reference. Try to peel off loop invariants from it
3064 into BASE as long as possible. */
3065 STRIP_NOPS (off);
3066 while (offtype == NULL_TREE)
3067 {
3068 enum tree_code code;
3069 tree op0, op1, add = NULL_TREE;
3070
3071 if (TREE_CODE (off) == SSA_NAME)
3072 {
3073 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3074
3075 if (expr_invariant_in_loop_p (loop, off))
3076 return NULL_TREE;
3077
3078 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3079 break;
3080
3081 op0 = gimple_assign_rhs1 (def_stmt);
3082 code = gimple_assign_rhs_code (def_stmt);
3083 op1 = gimple_assign_rhs2 (def_stmt);
3084 }
3085 else
3086 {
3087 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3088 return NULL_TREE;
3089 code = TREE_CODE (off);
3090 extract_ops_from_tree (off, &code, &op0, &op1);
3091 }
3092 switch (code)
3093 {
3094 case POINTER_PLUS_EXPR:
3095 case PLUS_EXPR:
3096 if (expr_invariant_in_loop_p (loop, op0))
3097 {
3098 add = op0;
3099 off = op1;
3100 do_add:
3101 add = fold_convert (sizetype, add);
3102 if (scale != 1)
3103 add = size_binop (MULT_EXPR, add, size_int (scale));
3104 base = size_binop (PLUS_EXPR, base, add);
3105 continue;
3106 }
3107 if (expr_invariant_in_loop_p (loop, op1))
3108 {
3109 add = op1;
3110 off = op0;
3111 goto do_add;
3112 }
3113 break;
3114 case MINUS_EXPR:
3115 if (expr_invariant_in_loop_p (loop, op1))
3116 {
3117 add = fold_convert (sizetype, op1);
3118 add = size_binop (MINUS_EXPR, size_zero_node, add);
3119 off = op0;
3120 goto do_add;
3121 }
3122 break;
3123 case MULT_EXPR:
3124 if (scale == 1 && tree_fits_shwi_p (op1))
3125 {
3126 scale = tree_to_shwi (op1);
3127 off = op0;
3128 continue;
3129 }
3130 break;
3131 case SSA_NAME:
3132 off = op0;
3133 continue;
3134 CASE_CONVERT:
3135 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3136 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3137 break;
3138 if (TYPE_PRECISION (TREE_TYPE (op0))
3139 == TYPE_PRECISION (TREE_TYPE (off)))
3140 {
3141 off = op0;
3142 continue;
3143 }
3144 if (TYPE_PRECISION (TREE_TYPE (op0))
3145 < TYPE_PRECISION (TREE_TYPE (off)))
3146 {
3147 off = op0;
3148 offtype = TREE_TYPE (off);
3149 STRIP_NOPS (off);
3150 continue;
3151 }
3152 break;
3153 default:
3154 break;
3155 }
3156 break;
3157 }
3158
3159 /* If at the end OFF still isn't a SSA_NAME or isn't
3160 defined in the loop, punt. */
3161 if (TREE_CODE (off) != SSA_NAME
3162 || expr_invariant_in_loop_p (loop, off))
3163 return NULL_TREE;
3164
3165 if (offtype == NULL_TREE)
3166 offtype = TREE_TYPE (off);
3167
3168 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3169 offtype, scale);
3170 if (decl == NULL_TREE)
3171 return NULL_TREE;
3172
3173 if (basep)
3174 *basep = base;
3175 if (offp)
3176 *offp = off;
3177 if (scalep)
3178 *scalep = scale;
3179 return decl;
3180 }
3181
3182 /* Function vect_analyze_data_refs.
3183
3184 Find all the data references in the loop or basic block.
3185
3186 The general structure of the analysis of data refs in the vectorizer is as
3187 follows:
3188 1- vect_analyze_data_refs(loop/bb): call
3189 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3190 in the loop/bb and their dependences.
3191 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3192 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3193 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3194
3195 */
3196
3197 bool
3198 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3199 bb_vec_info bb_vinfo,
3200 int *min_vf, unsigned *n_stmts)
3201 {
3202 struct loop *loop = NULL;
3203 basic_block bb = NULL;
3204 unsigned int i;
3205 vec<data_reference_p> datarefs;
3206 struct data_reference *dr;
3207 tree scalar_type;
3208
3209 if (dump_enabled_p ())
3210 dump_printf_loc (MSG_NOTE, vect_location,
3211 "=== vect_analyze_data_refs ===\n");
3212
3213 if (loop_vinfo)
3214 {
3215 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3216
3217 loop = LOOP_VINFO_LOOP (loop_vinfo);
3218 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3219 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3220 {
3221 if (dump_enabled_p ())
3222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3223 "not vectorized: loop contains function calls"
3224 " or data references that cannot be analyzed\n");
3225 return false;
3226 }
3227
3228 for (i = 0; i < loop->num_nodes; i++)
3229 {
3230 gimple_stmt_iterator gsi;
3231
3232 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3233 {
3234 gimple stmt = gsi_stmt (gsi);
3235 if (is_gimple_debug (stmt))
3236 continue;
3237 ++*n_stmts;
3238 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3239 {
3240 if (is_gimple_call (stmt) && loop->safelen)
3241 {
3242 tree fndecl = gimple_call_fndecl (stmt), op;
3243 if (fndecl != NULL_TREE)
3244 {
3245 struct cgraph_node *node = cgraph_node::get (fndecl);
3246 if (node != NULL && node->simd_clones != NULL)
3247 {
3248 unsigned int j, n = gimple_call_num_args (stmt);
3249 for (j = 0; j < n; j++)
3250 {
3251 op = gimple_call_arg (stmt, j);
3252 if (DECL_P (op)
3253 || (REFERENCE_CLASS_P (op)
3254 && get_base_address (op)))
3255 break;
3256 }
3257 op = gimple_call_lhs (stmt);
3258 /* Ignore #pragma omp declare simd functions
3259 if they don't have data references in the
3260 call stmt itself. */
3261 if (j == n
3262 && !(op
3263 && (DECL_P (op)
3264 || (REFERENCE_CLASS_P (op)
3265 && get_base_address (op)))))
3266 continue;
3267 }
3268 }
3269 }
3270 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3271 if (dump_enabled_p ())
3272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3273 "not vectorized: loop contains function "
3274 "calls or data references that cannot "
3275 "be analyzed\n");
3276 return false;
3277 }
3278 }
3279 }
3280
3281 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3282 }
3283 else
3284 {
3285 gimple_stmt_iterator gsi;
3286
3287 bb = BB_VINFO_BB (bb_vinfo);
3288 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3289 {
3290 gimple stmt = gsi_stmt (gsi);
3291 if (is_gimple_debug (stmt))
3292 continue;
3293 ++*n_stmts;
3294 if (!find_data_references_in_stmt (NULL, stmt,
3295 &BB_VINFO_DATAREFS (bb_vinfo)))
3296 {
3297 /* Mark the rest of the basic-block as unvectorizable. */
3298 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3299 {
3300 stmt = gsi_stmt (gsi);
3301 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3302 }
3303 break;
3304 }
3305 }
3306
3307 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3308 }
3309
3310 /* Go through the data-refs, check that the analysis succeeded. Update
3311 pointer from stmt_vec_info struct to DR and vectype. */
3312
3313 FOR_EACH_VEC_ELT (datarefs, i, dr)
3314 {
3315 gimple stmt;
3316 stmt_vec_info stmt_info;
3317 tree base, offset, init;
3318 bool gather = false;
3319 bool simd_lane_access = false;
3320 int vf;
3321
3322 again:
3323 if (!dr || !DR_REF (dr))
3324 {
3325 if (dump_enabled_p ())
3326 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3327 "not vectorized: unhandled data-ref\n");
3328 return false;
3329 }
3330
3331 stmt = DR_STMT (dr);
3332 stmt_info = vinfo_for_stmt (stmt);
3333
3334 /* Discard clobbers from the dataref vector. We will remove
3335 clobber stmts during vectorization. */
3336 if (gimple_clobber_p (stmt))
3337 {
3338 free_data_ref (dr);
3339 if (i == datarefs.length () - 1)
3340 {
3341 datarefs.pop ();
3342 break;
3343 }
3344 datarefs.ordered_remove (i);
3345 dr = datarefs[i];
3346 goto again;
3347 }
3348
3349 /* Check that analysis of the data-ref succeeded. */
3350 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3351 || !DR_STEP (dr))
3352 {
3353 bool maybe_gather
3354 = DR_IS_READ (dr)
3355 && !TREE_THIS_VOLATILE (DR_REF (dr))
3356 && targetm.vectorize.builtin_gather != NULL;
3357 bool maybe_simd_lane_access
3358 = loop_vinfo && loop->simduid;
3359
3360 /* If the target supports vector gather loads, or if this might be
3361 a SIMD lane access, see whether they can be used.  */
3362 if (loop_vinfo
3363 && (maybe_gather || maybe_simd_lane_access)
3364 && !nested_in_vect_loop_p (loop, stmt))
3365 {
3366 struct data_reference *newdr
3367 = create_data_ref (NULL, loop_containing_stmt (stmt),
3368 DR_REF (dr), stmt, true);
3369 gcc_assert (newdr != NULL && DR_REF (newdr));
3370 if (DR_BASE_ADDRESS (newdr)
3371 && DR_OFFSET (newdr)
3372 && DR_INIT (newdr)
3373 && DR_STEP (newdr)
3374 && integer_zerop (DR_STEP (newdr)))
3375 {
3376 if (maybe_simd_lane_access)
3377 {
3378 tree off = DR_OFFSET (newdr);
3379 STRIP_NOPS (off);
3380 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3381 && TREE_CODE (off) == MULT_EXPR
3382 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3383 {
3384 tree step = TREE_OPERAND (off, 1);
3385 off = TREE_OPERAND (off, 0);
3386 STRIP_NOPS (off);
3387 if (CONVERT_EXPR_P (off)
3388 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3389 0)))
3390 < TYPE_PRECISION (TREE_TYPE (off)))
3391 off = TREE_OPERAND (off, 0);
3392 if (TREE_CODE (off) == SSA_NAME)
3393 {
3394 gimple def = SSA_NAME_DEF_STMT (off);
3395 tree reft = TREE_TYPE (DR_REF (newdr));
3396 if (is_gimple_call (def)
3397 && gimple_call_internal_p (def)
3398 && (gimple_call_internal_fn (def)
3399 == IFN_GOMP_SIMD_LANE))
3400 {
3401 tree arg = gimple_call_arg (def, 0);
3402 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3403 arg = SSA_NAME_VAR (arg);
3404 if (arg == loop->simduid
3405 /* For now. */
3406 && tree_int_cst_equal
3407 (TYPE_SIZE_UNIT (reft),
3408 step))
3409 {
3410 DR_OFFSET (newdr) = ssize_int (0);
3411 DR_STEP (newdr) = step;
3412 DR_ALIGNED_TO (newdr)
3413 = size_int (BIGGEST_ALIGNMENT);
3414 dr = newdr;
3415 simd_lane_access = true;
3416 }
3417 }
3418 }
3419 }
3420 }
3421 if (!simd_lane_access && maybe_gather)
3422 {
3423 dr = newdr;
3424 gather = true;
3425 }
3426 }
3427 if (!gather && !simd_lane_access)
3428 free_data_ref (newdr);
3429 }
3430
3431 if (!gather && !simd_lane_access)
3432 {
3433 if (dump_enabled_p ())
3434 {
3435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3436 "not vectorized: data ref analysis "
3437 "failed ");
3438 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3439 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3440 }
3441
3442 if (bb_vinfo)
3443 break;
3444
3445 return false;
3446 }
3447 }
3448
3449 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3450 {
3451 if (dump_enabled_p ())
3452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3453 "not vectorized: base addr of dr is a "
3454 "constant\n");
3455
3456 if (bb_vinfo)
3457 break;
3458
3459 if (gather || simd_lane_access)
3460 free_data_ref (dr);
3461 return false;
3462 }
3463
3464 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3465 {
3466 if (dump_enabled_p ())
3467 {
3468 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3469 "not vectorized: volatile type ");
3470 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3471 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3472 }
3473
3474 if (bb_vinfo)
3475 break;
3476
3477 return false;
3478 }
3479
3480 if (stmt_can_throw_internal (stmt))
3481 {
3482 if (dump_enabled_p ())
3483 {
3484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3485 "not vectorized: statement can throw an "
3486 "exception ");
3487 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3488 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3489 }
3490
3491 if (bb_vinfo)
3492 break;
3493
3494 if (gather || simd_lane_access)
3495 free_data_ref (dr);
3496 return false;
3497 }
3498
3499 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3500 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3501 {
3502 if (dump_enabled_p ())
3503 {
3504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3505 "not vectorized: statement is bitfield "
3506 "access ");
3507 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3508 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3509 }
3510
3511 if (bb_vinfo)
3512 break;
3513
3514 if (gather || simd_lane_access)
3515 free_data_ref (dr);
3516 return false;
3517 }
3518
3519 base = unshare_expr (DR_BASE_ADDRESS (dr));
3520 offset = unshare_expr (DR_OFFSET (dr));
3521 init = unshare_expr (DR_INIT (dr));
3522
3523 if (is_gimple_call (stmt)
3524 && (!gimple_call_internal_p (stmt)
3525 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3526 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3527 {
3528 if (dump_enabled_p ())
3529 {
3530 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3531 "not vectorized: dr in a call ");
3532 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3533 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3534 }
3535
3536 if (bb_vinfo)
3537 break;
3538
3539 if (gather || simd_lane_access)
3540 free_data_ref (dr);
3541 return false;
3542 }
3543
3544 /* Update DR field in stmt_vec_info struct. */
3545
3546 /* If the dataref is in an inner-loop of the loop that is considered for
3547 vectorization, we also want to analyze the access relative to
3548 the outer-loop (DR contains information only relative to the
3549 inner-most enclosing loop). We do that by building a reference to the
3550 first location accessed by the inner-loop, and analyze it relative to
3551 the outer-loop. */
3552 if (loop && nested_in_vect_loop_p (loop, stmt))
3553 {
3554 tree outer_step, outer_base, outer_init;
3555 HOST_WIDE_INT pbitsize, pbitpos;
3556 tree poffset;
3557 machine_mode pmode;
3558 int punsignedp, pvolatilep;
3559 affine_iv base_iv, offset_iv;
3560 tree dinit;
3561
3562 /* Build a reference to the first location accessed by the
3563 inner-loop: *(BASE+INIT). (The first location is actually
3564 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3565 tree inner_base = build_fold_indirect_ref
3566 (fold_build_pointer_plus (base, init));
3567
3568 if (dump_enabled_p ())
3569 {
3570 dump_printf_loc (MSG_NOTE, vect_location,
3571 "analyze in outer-loop: ");
3572 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3573 dump_printf (MSG_NOTE, "\n");
3574 }
3575
3576 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3577 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3578 gcc_assert (outer_base != NULL_TREE);
3579
3580 if (pbitpos % BITS_PER_UNIT != 0)
3581 {
3582 if (dump_enabled_p ())
3583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3584 "failed: bit offset alignment.\n");
3585 return false;
3586 }
3587
3588 outer_base = build_fold_addr_expr (outer_base);
3589 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3590 &base_iv, false))
3591 {
3592 if (dump_enabled_p ())
3593 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3594 "failed: evolution of base is not affine.\n");
3595 return false;
3596 }
3597
3598 if (offset)
3599 {
3600 if (poffset)
3601 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3602 poffset);
3603 else
3604 poffset = offset;
3605 }
3606
3607 if (!poffset)
3608 {
3609 offset_iv.base = ssize_int (0);
3610 offset_iv.step = ssize_int (0);
3611 }
3612 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3613 &offset_iv, false))
3614 {
3615 if (dump_enabled_p ())
3616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3617 "evolution of offset is not affine.\n");
3618 return false;
3619 }
3620
3621 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3622 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3623 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3624 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3625 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3626
3627 outer_step = size_binop (PLUS_EXPR,
3628 fold_convert (ssizetype, base_iv.step),
3629 fold_convert (ssizetype, offset_iv.step));
3630
3631 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3632 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3633 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3634 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3635 STMT_VINFO_DR_OFFSET (stmt_info) =
3636 fold_convert (ssizetype, offset_iv.base);
3637 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3638 size_int (highest_pow2_factor (offset_iv.base));
3639
3640 if (dump_enabled_p ())
3641 {
3642 dump_printf_loc (MSG_NOTE, vect_location,
3643 "\touter base_address: ");
3644 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3645 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3646 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3647 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3648 STMT_VINFO_DR_OFFSET (stmt_info));
3649 dump_printf (MSG_NOTE,
3650 "\n\touter constant offset from base address: ");
3651 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3652 STMT_VINFO_DR_INIT (stmt_info));
3653 dump_printf (MSG_NOTE, "\n\touter step: ");
3654 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3655 STMT_VINFO_DR_STEP (stmt_info));
3656 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3657 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3658 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3659 dump_printf (MSG_NOTE, "\n");
3660 }
3661 }
3662
3663 if (STMT_VINFO_DATA_REF (stmt_info))
3664 {
3665 if (dump_enabled_p ())
3666 {
3667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3668 "not vectorized: more than one data ref "
3669 "in stmt: ");
3670 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3671 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3672 }
3673
3674 if (bb_vinfo)
3675 break;
3676
3677 if (gather || simd_lane_access)
3678 free_data_ref (dr);
3679 return false;
3680 }
3681
3682 STMT_VINFO_DATA_REF (stmt_info) = dr;
3683 if (simd_lane_access)
3684 {
3685 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3686 free_data_ref (datarefs[i]);
3687 datarefs[i] = dr;
3688 }
3689
3690 /* Set vectype for STMT. */
3691 scalar_type = TREE_TYPE (DR_REF (dr));
3692 STMT_VINFO_VECTYPE (stmt_info)
3693 = get_vectype_for_scalar_type (scalar_type);
3694 if (!STMT_VINFO_VECTYPE (stmt_info))
3695 {
3696 if (dump_enabled_p ())
3697 {
3698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3699 "not vectorized: no vectype for stmt: ");
3700 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3701 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3702 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3703 scalar_type);
3704 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3705 }
3706
3707 if (bb_vinfo)
3708 break;
3709
3710 if (gather || simd_lane_access)
3711 {
3712 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3713 if (gather)
3714 free_data_ref (dr);
3715 }
3716 return false;
3717 }
3718 else
3719 {
3720 if (dump_enabled_p ())
3721 {
3722 dump_printf_loc (MSG_NOTE, vect_location,
3723 "got vectype for stmt: ");
3724 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3725 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3726 STMT_VINFO_VECTYPE (stmt_info));
3727 dump_printf (MSG_NOTE, "\n");
3728 }
3729 }
3730
3731 /* Adjust the minimal vectorization factor according to the
3732 vector type. */
3733 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3734 if (vf > *min_vf)
3735 *min_vf = vf;
3736
3737 if (gather)
3738 {
3739 tree off;
3740
3741 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3742 if (gather
3743 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3744 gather = false;
3745 if (!gather)
3746 {
3747 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3748 free_data_ref (dr);
3749 if (dump_enabled_p ())
3750 {
3751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3752 "not vectorized: not suitable for gather "
3753 "load ");
3754 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3755 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3756 }
3757 return false;
3758 }
3759
3760 datarefs[i] = dr;
3761 STMT_VINFO_GATHER_P (stmt_info) = true;
3762 }
3763 else if (loop_vinfo
3764 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3765 {
3766 if (nested_in_vect_loop_p (loop, stmt))
3767 {
3768 if (dump_enabled_p ())
3769 {
3770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3771 "not vectorized: not suitable for strided "
3772 "load ");
3773 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3774 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3775 }
3776 return false;
3777 }
3778 STMT_VINFO_STRIDED_P (stmt_info) = true;
3779 }
3780 }
3781
3782 /* If we stopped analysis at the first dataref we could not analyze
3783 when trying to vectorize a basic-block mark the rest of the datarefs
3784 as not vectorizable and truncate the vector of datarefs. That
3785 avoids wasting time analyzing their dependences. */
3786 if (i != datarefs.length ())
3787 {
3788 gcc_assert (bb_vinfo != NULL);
3789 for (unsigned j = i; j < datarefs.length (); ++j)
3790 {
3791 data_reference_p dr = datarefs[j];
3792 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3793 free_data_ref (dr);
3794 }
3795 datarefs.truncate (i);
3796 }
3797
3798 return true;
3799 }
3800
3801
3802 /* Function vect_get_new_vect_var.
3803
3804 Returns a name for a new variable. The current naming scheme prepends
3805 the prefix "vect", "stmp" or "vectp" (depending on the value of
3806 VAR_KIND) and an underscore to NAME, if NAME is provided; otherwise
3807 the prefix alone is used. */
3808
3809 tree
3810 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3811 {
3812 const char *prefix;
3813 tree new_vect_var;
3814
3815 switch (var_kind)
3816 {
3817 case vect_simple_var:
3818 prefix = "vect";
3819 break;
3820 case vect_scalar_var:
3821 prefix = "stmp";
3822 break;
3823 case vect_pointer_var:
3824 prefix = "vectp";
3825 break;
3826 default:
3827 gcc_unreachable ();
3828 }
3829
3830 if (name)
3831 {
3832 char* tmp = concat (prefix, "_", name, NULL);
3833 new_vect_var = create_tmp_reg (type, tmp);
3834 free (tmp);
3835 }
3836 else
3837 new_vect_var = create_tmp_reg (type, prefix);
3838
3839 return new_vect_var;
3840 }
3841
3842 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3843
3844 static void
3845 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3846 stmt_vec_info stmt_info)
3847 {
3848 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3849 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3850 int misalign = DR_MISALIGNMENT (dr);
3851 if (misalign == -1)
3852 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3853 else
3854 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3855 }
3856
3857 /* Function vect_create_addr_base_for_vector_ref.
3858
3859 Create an expression that computes the address of the first memory location
3860 that will be accessed for a data reference.
3861
3862 Input:
3863 STMT: The statement containing the data reference.
3864 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3865 OFFSET: Optional. If supplied, it is added to the initial address.
3866 LOOP: Specify the loop-nest relative to which the address is computed.
3867 For example, when the dataref is in an inner-loop nested in an
3868 outer-loop that is now being vectorized, LOOP can be either the
3869 outer-loop, or the inner-loop. The first memory location accessed
3870 by the following dataref ('in' points to short):
3871
3872 for (i=0; i<N; i++)
3873 for (j=0; j<M; j++)
3874 s += in[i+j]
3875
3876 is as follows:
3877 if LOOP=i_loop: &in (relative to i_loop)
3878 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3879 BYTE_OFFSET: Optional, defaults to NULL. If supplied, it is added to the
3880 initial address. Unlike OFFSET, which is a number of elements to
3881 be added, BYTE_OFFSET is measured in bytes.
3882
3883 Output:
3884 1. Return an SSA_NAME whose value is the address of the memory location of
3885 the first vector of the data reference.
3886 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3887 these statement(s) which define the returned SSA_NAME.
3888
3889 FORNOW: We are only handling array accesses with step 1. */
3890
3891 tree
3892 vect_create_addr_base_for_vector_ref (gimple stmt,
3893 gimple_seq *new_stmt_list,
3894 tree offset,
3895 struct loop *loop,
3896 tree byte_offset)
3897 {
3898 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3899 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3900 tree data_ref_base;
3901 const char *base_name;
3902 tree addr_base;
3903 tree dest;
3904 gimple_seq seq = NULL;
3905 tree base_offset;
3906 tree init;
3907 tree vect_ptr_type;
3908 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3909 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3910
3911 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3912 {
3913 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3914
3915 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3916
3917 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3918 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3919 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3920 }
3921 else
3922 {
3923 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3924 base_offset = unshare_expr (DR_OFFSET (dr));
3925 init = unshare_expr (DR_INIT (dr));
3926 }
3927
3928 if (loop_vinfo)
3929 base_name = get_name (data_ref_base);
3930 else
3931 {
3932 base_offset = ssize_int (0);
3933 init = ssize_int (0);
3934 base_name = get_name (DR_REF (dr));
3935 }
3936
3937 /* Create base_offset */
3938 base_offset = size_binop (PLUS_EXPR,
3939 fold_convert (sizetype, base_offset),
3940 fold_convert (sizetype, init));
3941
3942 if (offset)
3943 {
3944 offset = fold_build2 (MULT_EXPR, sizetype,
3945 fold_convert (sizetype, offset), step);
3946 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3947 base_offset, offset);
3948 }
3949 if (byte_offset)
3950 {
3951 byte_offset = fold_convert (sizetype, byte_offset);
3952 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3953 base_offset, byte_offset);
3954 }
3955
3956 /* base + base_offset */
3957 if (loop_vinfo)
3958 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3959 else
3960 {
3961 addr_base = build1 (ADDR_EXPR,
3962 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3963 unshare_expr (DR_REF (dr)));
3964 }
3965
3966 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3967 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3968 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
3969 gimple_seq_add_seq (new_stmt_list, seq);
3970
3971 if (DR_PTR_INFO (dr)
3972 && TREE_CODE (addr_base) == SSA_NAME
3973 && !SSA_NAME_PTR_INFO (addr_base))
3974 {
3975 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
3976 if (offset || byte_offset)
3977 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3978 }
3979
3980 if (dump_enabled_p ())
3981 {
3982 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3983 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3984 dump_printf (MSG_NOTE, "\n");
3985 }
3986
3987 return addr_base;
3988 }
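/* Editorial sketch (not part of GCC): the address arithmetic that
   vect_create_addr_base_for_vector_ref folds above, restated as plain C.
   The hypothetical sketch_addr_base assumes BASE is a byte pointer and that
   DR_OFFSET, DR_INIT, OFFSET (in elements), BYTE_OFFSET and the element
   size are already known as integers; it only illustrates how the pieces
   combine and is not the GCC API.  */

static char *
sketch_addr_base (char *base, long dr_offset, long dr_init,
		  long offset_elems, long byte_offset, long elem_size)
{
  /* base_offset = DR_OFFSET + DR_INIT, as in the size_binop above.  */
  long base_offset = dr_offset + dr_init;
  /* OFFSET is counted in elements, so it is scaled by the element size
     (the MULT_EXPR by STEP above); BYTE_OFFSET is added unscaled.  */
  base_offset += offset_elems * elem_size;
  base_offset += byte_offset;
  /* addr_base = base + base_offset (fold_build_pointer_plus above).  */
  return base + base_offset;
}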
3989
3990
3991 /* Function vect_create_data_ref_ptr.
3992
3993 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3994 location accessed in the loop by STMT, along with the def-use update
3995 chain to appropriately advance the pointer through the loop iterations.
3996 Also set aliasing information for the pointer. This pointer is used by
3997 the callers to this function to create a memory reference expression for
3998 vector load/store access.
3999
4000 Input:
4001 1. STMT: a stmt that references memory. Expected to be of the form
4002 GIMPLE_ASSIGN <name, data-ref> or
4003 GIMPLE_ASSIGN <data-ref, name>.
4004 2. AGGR_TYPE: the type of the reference, which should be either a vector
4005 or an array.
4006 3. AT_LOOP: the loop where the vector memref is to be created.
4007 4. OFFSET (optional): an offset to be added to the initial address accessed
4008 by the data-ref in STMT.
4009 5. BSI: location where the new stmts are to be placed if there is no loop
4010 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4011 pointing to the initial address.
4012 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4013 to the initial address accessed by the data-ref in STMT. This is
4014 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4015 in bytes.
4016
4017 Output:
4018 1. Declare a new ptr to vector_type, and have it point to the base of the
4019 data reference (initial address accessed by the data reference).
4020 For example, for vector of type V8HI, the following code is generated:
4021
4022 v8hi *ap;
4023 ap = (v8hi *)initial_address;
4024
4025 if OFFSET is not supplied:
4026 initial_address = &a[init];
4027 if OFFSET is supplied:
4028 initial_address = &a[init + OFFSET];
4029 if BYTE_OFFSET is supplied:
4030 initial_address = &a[init] + BYTE_OFFSET;
4031
4032 Return the initial_address in INITIAL_ADDRESS.
4033
4034 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4035 update the pointer in each iteration of the loop.
4036
4037 Return the increment stmt that updates the pointer in PTR_INCR.
4038
4039 3. Set INV_P to true if the access pattern of the data reference in the
4040 vectorized loop is invariant. Set it to false otherwise.
4041
4042 4. Return the pointer. */
4043
4044 tree
4045 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4046 tree offset, tree *initial_address,
4047 gimple_stmt_iterator *gsi, gimple *ptr_incr,
4048 bool only_init, bool *inv_p, tree byte_offset)
4049 {
4050 const char *base_name;
4051 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4052 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4053 struct loop *loop = NULL;
4054 bool nested_in_vect_loop = false;
4055 struct loop *containing_loop = NULL;
4056 tree aggr_ptr_type;
4057 tree aggr_ptr;
4058 tree new_temp;
4059 gimple_seq new_stmt_list = NULL;
4060 edge pe = NULL;
4061 basic_block new_bb;
4062 tree aggr_ptr_init;
4063 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4064 tree aptr;
4065 gimple_stmt_iterator incr_gsi;
4066 bool insert_after;
4067 tree indx_before_incr, indx_after_incr;
4068 gimple incr;
4069 tree step;
4070 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4071
4072 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4073 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4074
4075 if (loop_vinfo)
4076 {
4077 loop = LOOP_VINFO_LOOP (loop_vinfo);
4078 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4079 containing_loop = (gimple_bb (stmt))->loop_father;
4080 pe = loop_preheader_edge (loop);
4081 }
4082 else
4083 {
4084 gcc_assert (bb_vinfo);
4085 only_init = true;
4086 *ptr_incr = NULL;
4087 }
4088
4089 /* Check the step (evolution) of the load in LOOP, and record
4090 whether it's invariant. */
4091 if (nested_in_vect_loop)
4092 step = STMT_VINFO_DR_STEP (stmt_info);
4093 else
4094 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4095
4096 if (integer_zerop (step))
4097 *inv_p = true;
4098 else
4099 *inv_p = false;
4100
4101 /* Create an expression for the first address accessed by this load
4102 in LOOP. */
4103 base_name = get_name (DR_BASE_ADDRESS (dr));
4104
4105 if (dump_enabled_p ())
4106 {
4107 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4108 dump_printf_loc (MSG_NOTE, vect_location,
4109 "create %s-pointer variable to type: ",
4110 get_tree_code_name (TREE_CODE (aggr_type)));
4111 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4112 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4113 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4114 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4115 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4116 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4117 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4118 else
4119 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4120 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4121 dump_printf (MSG_NOTE, "\n");
4122 }
4123
4124 /* (1) Create the new aggregate-pointer variable.
4125 Vector and array types inherit the alias set of their component
4126 type by default, so we need to use a ref-all pointer if the data
4127 reference does not conflict with the created aggregate data
4128 reference because it is not addressable. */
4129 bool need_ref_all = false;
4130 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4131 get_alias_set (DR_REF (dr))))
4132 need_ref_all = true;
4133 /* Likewise for any of the data references in the stmt group. */
4134 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4135 {
4136 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4137 do
4138 {
4139 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4140 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4141 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4142 get_alias_set (DR_REF (sdr))))
4143 {
4144 need_ref_all = true;
4145 break;
4146 }
4147 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4148 }
4149 while (orig_stmt);
4150 }
4151 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4152 need_ref_all);
4153 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4154
4155
4156 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4157 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4158 def-use update cycles for the pointer: one relative to the outer-loop
4159 (LOOP), which is what steps (3) and (4) below do. The other is relative
4160 to the inner-loop (which is the inner-most loop containing the dataref),
4161 and this is done by step (5) below.
4162
4163 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4164 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4165 redundant. Steps (3),(4) create the following:
4166
4167 vp0 = &base_addr;
4168 LOOP: vp1 = phi(vp0,vp2)
4169 ...
4170 ...
4171 vp2 = vp1 + step
4172 goto LOOP
4173
4174 If there is an inner-loop nested in loop, then step (5) will also be
4175 applied, and an additional update in the inner-loop will be created:
4176
4177 vp0 = &base_addr;
4178 LOOP: vp1 = phi(vp0,vp2)
4179 ...
4180 inner: vp3 = phi(vp1,vp4)
4181 vp4 = vp3 + inner_step
4182 if () goto inner
4183 ...
4184 vp2 = vp1 + step
4185 if () goto LOOP */
4186
4187 /* (2) Calculate the initial address of the aggregate-pointer, and set
4188 the aggregate-pointer to point to it before the loop. */
4189
4190 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4191
4192 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4193 offset, loop, byte_offset);
4194 if (new_stmt_list)
4195 {
4196 if (pe)
4197 {
4198 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4199 gcc_assert (!new_bb);
4200 }
4201 else
4202 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4203 }
4204
4205 *initial_address = new_temp;
4206 aggr_ptr_init = new_temp;
4207
4208 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4209 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4210 inner-loop nested in LOOP (during outer-loop vectorization). */
4211
4212 /* No update in loop is required. */
4213 if (only_init && (!loop_vinfo || at_loop == loop))
4214 aptr = aggr_ptr_init;
4215 else
4216 {
4217 /* The step of the aggregate pointer is the type size. */
4218 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4219 /* One exception to the above is when the scalar step of the load in
4220 LOOP is zero. In this case the step here is also zero. */
4221 if (*inv_p)
4222 iv_step = size_zero_node;
4223 else if (tree_int_cst_sgn (step) == -1)
4224 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4225
4226 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4227
4228 create_iv (aggr_ptr_init,
4229 fold_convert (aggr_ptr_type, iv_step),
4230 aggr_ptr, loop, &incr_gsi, insert_after,
4231 &indx_before_incr, &indx_after_incr);
4232 incr = gsi_stmt (incr_gsi);
4233 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4234
4235 /* Copy the points-to information if it exists. */
4236 if (DR_PTR_INFO (dr))
4237 {
4238 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4239 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4240 }
4241 if (ptr_incr)
4242 *ptr_incr = incr;
4243
4244 aptr = indx_before_incr;
4245 }
4246
4247 if (!nested_in_vect_loop || only_init)
4248 return aptr;
4249
4250
4251 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4252 nested in LOOP, if it exists. */
4253
4254 gcc_assert (nested_in_vect_loop);
4255 if (!only_init)
4256 {
4257 standard_iv_increment_position (containing_loop, &incr_gsi,
4258 &insert_after);
4259 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4260 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4261 &indx_after_incr);
4262 incr = gsi_stmt (incr_gsi);
4263 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4264
4265 /* Copy the points-to information if it exists. */
4266 if (DR_PTR_INFO (dr))
4267 {
4268 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4269 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4270 }
4271 if (ptr_incr)
4272 *ptr_incr = incr;
4273
4274 return indx_before_incr;
4275 }
4276 else
4277 gcc_unreachable ();
4278 }
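/* Editorial sketch (not part of GCC): the two def-use update cycles
   described in the big comment above, written as ordinary C loops.  The
   hypothetical sketch_data_ref_ptr_update assumes the trip counts and byte
   steps are known; vp1 and vp3 stand for the loop-header PHIs, vp2 and vp4
   for the increments created by create_iv in steps (3)-(5).  */

static void
sketch_data_ref_ptr_update (char *base_addr, long outer_trips,
			    long inner_trips, long step, long inner_step)
{
  char *vp2 = base_addr;		/* vp0 = &base_addr  */
  for (long i = 0; i < outer_trips; i++)
    {
      char *vp1 = vp2;			/* LOOP: vp1 = phi (vp0, vp2)  */
      char *vp4 = vp1;
      for (long j = 0; j < inner_trips; j++)
	{
	  char *vp3 = vp4;		/* inner: vp3 = phi (vp1, vp4)  */
	  (void) vp3;			/* ... the vector access uses vp3 ...  */
	  vp4 = vp3 + inner_step;	/* vp4 = vp3 + inner_step  */
	}
      vp2 = vp1 + step;			/* vp2 = vp1 + step  */
    }
}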
4279
4280
4281 /* Function bump_vector_ptr
4282
4283 Increment a pointer (to a vector type) by vector-size. If requested,
4284 i.e. if PTR-INCR is given, then also connect the new increment stmt
4285 to the existing def-use update-chain of the pointer, by modifying
4286 the PTR_INCR as illustrated below:
4287
4288 The pointer def-use update-chain before this function:
4289 DATAREF_PTR = phi (p_0, p_2)
4290 ....
4291 PTR_INCR: p_2 = DATAREF_PTR + step
4292
4293 The pointer def-use update-chain after this function:
4294 DATAREF_PTR = phi (p_0, p_2)
4295 ....
4296 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4297 ....
4298 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4299
4300 Input:
4301 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4302 in the loop.
4303 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4304 the loop. The increment amount across iterations is expected
4305 to be vector_size.
4306 BSI - location where the new update stmt is to be placed.
4307 STMT - the original scalar memory-access stmt that is being vectorized.
4308 BUMP - optional. The offset by which to bump the pointer. If not given,
4309 the offset is assumed to be vector_size.
4310
4311 Output: Return NEW_DATAREF_PTR as illustrated above.
4312
4313 */
4314
4315 tree
4316 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4317 gimple stmt, tree bump)
4318 {
4319 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4320 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4321 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4322 tree update = TYPE_SIZE_UNIT (vectype);
4323 gassign *incr_stmt;
4324 ssa_op_iter iter;
4325 use_operand_p use_p;
4326 tree new_dataref_ptr;
4327
4328 if (bump)
4329 update = bump;
4330
4331 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4332 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4333 else
4334 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4335 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4336 dataref_ptr, update);
4337 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4338
4339 /* Copy the points-to information if it exists. */
4340 if (DR_PTR_INFO (dr))
4341 {
4342 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4343 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4344 }
4345
4346 if (!ptr_incr)
4347 return new_dataref_ptr;
4348
4349 /* Update the vector-pointer's cross-iteration increment. */
4350 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4351 {
4352 tree use = USE_FROM_PTR (use_p);
4353
4354 if (use == dataref_ptr)
4355 SET_USE (use_p, new_dataref_ptr);
4356 else
4357 gcc_assert (tree_int_cst_compare (use, update) == 0);
4358 }
4359
4360 return new_dataref_ptr;
4361 }
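/* Editorial sketch (not part of GCC): the rewiring documented above, as
   straight-line C.  BUMP and STEP are assumed byte counts; before the
   rewrite the next-iteration pointer is DATAREF_PTR + STEP, afterwards it
   is NEW_DATAREF_PTR + STEP, exactly the before/after chains shown in the
   comment above.  */

static char *
sketch_bump_vector_ptr (char *dataref_ptr, long bump, long step)
{
  char *new_dataref_ptr = dataref_ptr + bump;	/* NEW_DATAREF_PTR = DATAREF_PTR + BUMP  */
  char *p_2 = new_dataref_ptr + step;		/* PTR_INCR: p_2 = NEW_DATAREF_PTR + step  */
  return p_2;					/* value that reaches the loop-header PHI  */
}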
4362
4363
4364 /* Function vect_create_destination_var.
4365
4366 Create a new temporary of type VECTYPE. */
4367
4368 tree
4369 vect_create_destination_var (tree scalar_dest, tree vectype)
4370 {
4371 tree vec_dest;
4372 const char *name;
4373 char *new_name;
4374 tree type;
4375 enum vect_var_kind kind;
4376
4377 kind = vectype ? vect_simple_var : vect_scalar_var;
4378 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4379
4380 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4381
4382 name = get_name (scalar_dest);
4383 if (name)
4384 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4385 else
4386 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4387 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4388 free (new_name);
4389
4390 return vec_dest;
4391 }
4392
4393 /* Function vect_grouped_store_supported.
4394
4395 Returns TRUE if interleave high and interleave low permutations
4396 are supported, and FALSE otherwise. */
4397
4398 bool
4399 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4400 {
4401 machine_mode mode = TYPE_MODE (vectype);
4402
4403 /* vect_permute_store_chain requires the group size to be equal to 3 or
4404 be a power of two. */
4405 if (count != 3 && exact_log2 (count) == -1)
4406 {
4407 if (dump_enabled_p ())
4408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4409 "the size of the group of accesses"
4410 " is not a power of 2 or not eqaul to 3\n");
4411 return false;
4412 }
4413
4414 /* Check that the permutation is supported. */
4415 if (VECTOR_MODE_P (mode))
4416 {
4417 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4418 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4419
4420 if (count == 3)
4421 {
4422 unsigned int j0 = 0, j1 = 0, j2 = 0;
4423 unsigned int i, j;
4424
4425 for (j = 0; j < 3; j++)
4426 {
4427 int nelt0 = ((3 - j) * nelt) % 3;
4428 int nelt1 = ((3 - j) * nelt + 1) % 3;
4429 int nelt2 = ((3 - j) * nelt + 2) % 3;
4430 for (i = 0; i < nelt; i++)
4431 {
4432 if (3 * i + nelt0 < nelt)
4433 sel[3 * i + nelt0] = j0++;
4434 if (3 * i + nelt1 < nelt)
4435 sel[3 * i + nelt1] = nelt + j1++;
4436 if (3 * i + nelt2 < nelt)
4437 sel[3 * i + nelt2] = 0;
4438 }
4439 if (!can_vec_perm_p (mode, false, sel))
4440 {
4441 if (dump_enabled_p ())
4442 dump_printf (MSG_MISSED_OPTIMIZATION,
4443 "permutaion op not supported by target.\n");
4444 return false;
4445 }
4446
4447 for (i = 0; i < nelt; i++)
4448 {
4449 if (3 * i + nelt0 < nelt)
4450 sel[3 * i + nelt0] = 3 * i + nelt0;
4451 if (3 * i + nelt1 < nelt)
4452 sel[3 * i + nelt1] = 3 * i + nelt1;
4453 if (3 * i + nelt2 < nelt)
4454 sel[3 * i + nelt2] = nelt + j2++;
4455 }
4456 if (!can_vec_perm_p (mode, false, sel))
4457 {
4458 if (dump_enabled_p ())
4459 dump_printf (MSG_MISSED_OPTIMIZATION,
4460 "permutaion op not supported by target.\n");
4461 return false;
4462 }
4463 }
4464 return true;
4465 }
4466 else
4467 {
4468 /* If length is not equal to 3 then only a power of 2 is supported. */
4469 gcc_assert (exact_log2 (count) != -1);
4470
4471 for (i = 0; i < nelt / 2; i++)
4472 {
4473 sel[i * 2] = i;
4474 sel[i * 2 + 1] = i + nelt;
4475 }
4476 if (can_vec_perm_p (mode, false, sel))
4477 {
4478 for (i = 0; i < nelt; i++)
4479 sel[i] += nelt / 2;
4480 if (can_vec_perm_p (mode, false, sel))
4481 return true;
4482 }
4483 }
4484 }
4485
4486 if (dump_enabled_p ())
4487 dump_printf (MSG_MISSED_OPTIMIZATION,
4488 "permutaion op not supported by target.\n");
4489 return false;
4490 }
4491
4492
4493 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4494 type VECTYPE. */
4495
4496 bool
4497 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4498 {
4499 return vect_lanes_optab_supported_p ("vec_store_lanes",
4500 vec_store_lanes_optab,
4501 vectype, count);
4502 }
4503
4504
4505 /* Function vect_permute_store_chain.
4506
4507 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4508 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4509 the data correctly for the stores. Return the final references for stores
4510 in RESULT_CHAIN.
4511
4512 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4513 The input is 4 vectors each containing 8 elements. We assign a number to
4514 each element, the input sequence is:
4515
4516 1st vec: 0 1 2 3 4 5 6 7
4517 2nd vec: 8 9 10 11 12 13 14 15
4518 3rd vec: 16 17 18 19 20 21 22 23
4519 4th vec: 24 25 26 27 28 29 30 31
4520
4521 The output sequence should be:
4522
4523 1st vec: 0 8 16 24 1 9 17 25
4524 2nd vec: 2 10 18 26 3 11 19 27
4525 3rd vec: 4 12 20 28 5 13 21 29
4526 4th vec: 6 14 22 30 7 15 23 31
4527
4528 i.e., we interleave the contents of the four vectors in their order.
4529
4530 We use interleave_high/low instructions to create such output. The input of
4531 each interleave_high/low operation is two vectors:
4532 1st vec 2nd vec
4533 0 1 2 3 4 5 6 7
4534 the even elements of the result vector are obtained left-to-right from the
4535 high/low elements of the first vector. The odd elements of the result are
4536 obtained left-to-right from the high/low elements of the second vector.
4537 The output of interleave_high will be: 0 4 1 5
4538 and of interleave_low: 2 6 3 7
4539
4540
4541 The permutation is done in log LENGTH stages. In each stage interleave_high
4542 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4543 where the first argument is taken from the first half of DR_CHAIN and the
4544 second argument from its second half.
4545 In our example,
4546
4547 I1: interleave_high (1st vec, 3rd vec)
4548 I2: interleave_low (1st vec, 3rd vec)
4549 I3: interleave_high (2nd vec, 4th vec)
4550 I4: interleave_low (2nd vec, 4th vec)
4551
4552 The output for the first stage is:
4553
4554 I1: 0 16 1 17 2 18 3 19
4555 I2: 4 20 5 21 6 22 7 23
4556 I3: 8 24 9 25 10 26 11 27
4557 I4: 12 28 13 29 14 30 15 31
4558
4559 The output of the second stage, i.e. the final result is:
4560
4561 I1: 0 8 16 24 1 9 17 25
4562 I2: 2 10 18 26 3 11 19 27
4563 I3: 4 12 20 28 5 13 21 29
4564 I4: 6 14 22 30 7 15 23 31. */
4565
4566 void
4567 vect_permute_store_chain (vec<tree> dr_chain,
4568 unsigned int length,
4569 gimple stmt,
4570 gimple_stmt_iterator *gsi,
4571 vec<tree> *result_chain)
4572 {
4573 tree vect1, vect2, high, low;
4574 gimple perm_stmt;
4575 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4576 tree perm_mask_low, perm_mask_high;
4577 tree data_ref;
4578 tree perm3_mask_low, perm3_mask_high;
4579 unsigned int i, n, log_length = exact_log2 (length);
4580 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4581 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4582
4583 result_chain->quick_grow (length);
4584 memcpy (result_chain->address (), dr_chain.address (),
4585 length * sizeof (tree));
4586
4587 if (length == 3)
4588 {
4589 unsigned int j0 = 0, j1 = 0, j2 = 0;
4590
4591 for (j = 0; j < 3; j++)
4592 {
4593 int nelt0 = ((3 - j) * nelt) % 3;
4594 int nelt1 = ((3 - j) * nelt + 1) % 3;
4595 int nelt2 = ((3 - j) * nelt + 2) % 3;
4596
4597 for (i = 0; i < nelt; i++)
4598 {
4599 if (3 * i + nelt0 < nelt)
4600 sel[3 * i + nelt0] = j0++;
4601 if (3 * i + nelt1 < nelt)
4602 sel[3 * i + nelt1] = nelt + j1++;
4603 if (3 * i + nelt2 < nelt)
4604 sel[3 * i + nelt2] = 0;
4605 }
4606 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4607
4608 for (i = 0; i < nelt; i++)
4609 {
4610 if (3 * i + nelt0 < nelt)
4611 sel[3 * i + nelt0] = 3 * i + nelt0;
4612 if (3 * i + nelt1 < nelt)
4613 sel[3 * i + nelt1] = 3 * i + nelt1;
4614 if (3 * i + nelt2 < nelt)
4615 sel[3 * i + nelt2] = nelt + j2++;
4616 }
4617 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4618
4619 vect1 = dr_chain[0];
4620 vect2 = dr_chain[1];
4621
4622 /* Create interleaving stmt:
4623 low = VEC_PERM_EXPR <vect1, vect2,
4624 {j, nelt, *, j + 1, nelt + j + 1, *,
4625 j + 2, nelt + j + 2, *, ...}> */
4626 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4627 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4628 vect2, perm3_mask_low);
4629 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4630
4631 vect1 = data_ref;
4632 vect2 = dr_chain[2];
4633 /* Create interleaving stmt:
4634 low = VEC_PERM_EXPR <vect1, vect2,
4635 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4636 6, 7, nelt + j + 2, ...}> */
4637 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4638 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4639 vect2, perm3_mask_high);
4640 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4641 (*result_chain)[j] = data_ref;
4642 }
4643 }
4644 else
4645 {
4646 /* If length is not equal to 3 then only a power of 2 is supported. */
4647 gcc_assert (exact_log2 (length) != -1);
4648
4649 for (i = 0, n = nelt / 2; i < n; i++)
4650 {
4651 sel[i * 2] = i;
4652 sel[i * 2 + 1] = i + nelt;
4653 }
4654 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4655
4656 for (i = 0; i < nelt; i++)
4657 sel[i] += nelt / 2;
4658 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4659
4660 for (i = 0, n = log_length; i < n; i++)
4661 {
4662 for (j = 0; j < length/2; j++)
4663 {
4664 vect1 = dr_chain[j];
4665 vect2 = dr_chain[j+length/2];
4666
4667 /* Create interleaving stmt:
4668 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4669 ...}> */
4670 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4671 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4672 vect2, perm_mask_high);
4673 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4674 (*result_chain)[2*j] = high;
4675
4676 /* Create interleaving stmt:
4677 low = VEC_PERM_EXPR <vect1, vect2,
4678 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4679 ...}> */
4680 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4681 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4682 vect2, perm_mask_low);
4683 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4684 (*result_chain)[2*j+1] = low;
4685 }
4686 memcpy (dr_chain.address (), result_chain->address (),
4687 length * sizeof (tree));
4688 }
4689 }
4690 }
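/* Editorial sketch (not part of GCC): the power-of-two branch above,
   replayed on plain integer arrays so the numeric example in the function
   comment can be reproduced.  The hypothetical sketch fixes LENGTH = 4 and
   NELT = 8 as in that example; "high" takes elements {0, nelt, 1, nelt+1,
   ...} of the concatenation of two chain vectors and "low" takes the same
   pattern shifted by nelt/2, exactly like the two selectors built above.  */

static void
sketch_permute_store_chain (int chain[4][8])
{
  enum { LENGTH = 4, NELT = 8 };
  int result[LENGTH][NELT];

  for (int stage = 0; stage < 2; stage++)	/* log2 (LENGTH) stages  */
    {
      for (int j = 0; j < LENGTH / 2; j++)
	{
	  const int *v1 = chain[j];		   /* first half of the chain  */
	  const int *v2 = chain[j + LENGTH / 2];   /* second half  */
	  for (int i = 0; i < NELT / 2; i++)
	    {
	      /* interleave_high: v1[0] v2[0] v1[1] v2[1] ...  */
	      result[2 * j][2 * i] = v1[i];
	      result[2 * j][2 * i + 1] = v2[i];
	      /* interleave_low: v1[nelt/2] v2[nelt/2] ...  */
	      result[2 * j + 1][2 * i] = v1[NELT / 2 + i];
	      result[2 * j + 1][2 * i + 1] = v2[NELT / 2 + i];
	    }
	}
      for (int j = 0; j < LENGTH; j++)
	for (int i = 0; i < NELT; i++)
	  chain[j][i] = result[j][i];
    }
}

/* Feeding {0..7}, {8..15}, {16..23}, {24..31} through this sketch yields
   {0,8,16,24,1,9,17,25}, {2,10,18,26,3,11,19,27}, {4,12,20,28,5,13,21,29}
   and {6,14,22,30,7,15,23,31}, matching the final result in the comment.  */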
4691
4692 /* Function vect_setup_realignment
4693
4694 This function is called when vectorizing an unaligned load using
4695 the dr_explicit_realign[_optimized] scheme.
4696 This function generates the following code at the loop prolog:
4697
4698 p = initial_addr;
4699 x msq_init = *(floor(p)); # prolog load
4700 realignment_token = call target_builtin;
4701 loop:
4702 x msq = phi (msq_init, ---)
4703
4704 The stmts marked with x are generated only for the case of
4705 dr_explicit_realign_optimized.
4706
4707 The code above sets up a new (vector) pointer, pointing to the first
4708 location accessed by STMT, and a "floor-aligned" load using that pointer.
4709 It also generates code to compute the "realignment-token" (if the relevant
4710 target hook was defined), and creates a phi-node at the loop-header bb
4711 whose arguments are the result of the prolog-load (created by this
4712 function) and the result of a load that takes place in the loop (to be
4713 created by the caller to this function).
4714
4715 For the case of dr_explicit_realign_optimized:
4716 The caller to this function uses the phi-result (msq) to create the
4717 realignment code inside the loop, and sets up the missing phi argument,
4718 as follows:
4719 loop:
4720 msq = phi (msq_init, lsq)
4721 lsq = *(floor(p')); # load in loop
4722 result = realign_load (msq, lsq, realignment_token);
4723
4724 For the case of dr_explicit_realign:
4725 loop:
4726 msq = *(floor(p)); # load in loop
4727 p' = p + (VS-1);
4728 lsq = *(floor(p')); # load in loop
4729 result = realign_load (msq, lsq, realignment_token);
4730
4731 Input:
4732 STMT - (scalar) load stmt to be vectorized. This load accesses
4733 a memory location that may be unaligned.
4734 BSI - place where new code is to be inserted.
4735 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4736 is used.
4737
4738 Output:
4739 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4740 target hook, if defined.
4741 Return value - the result of the loop-header phi node. */
4742
4743 tree
4744 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4745 tree *realignment_token,
4746 enum dr_alignment_support alignment_support_scheme,
4747 tree init_addr,
4748 struct loop **at_loop)
4749 {
4750 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4751 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4752 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4753 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4754 struct loop *loop = NULL;
4755 edge pe = NULL;
4756 tree scalar_dest = gimple_assign_lhs (stmt);
4757 tree vec_dest;
4758 gimple inc;
4759 tree ptr;
4760 tree data_ref;
4761 basic_block new_bb;
4762 tree msq_init = NULL_TREE;
4763 tree new_temp;
4764 gphi *phi_stmt;
4765 tree msq = NULL_TREE;
4766 gimple_seq stmts = NULL;
4767 bool inv_p;
4768 bool compute_in_loop = false;
4769 bool nested_in_vect_loop = false;
4770 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4771 struct loop *loop_for_initial_load = NULL;
4772
4773 if (loop_vinfo)
4774 {
4775 loop = LOOP_VINFO_LOOP (loop_vinfo);
4776 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4777 }
4778
4779 gcc_assert (alignment_support_scheme == dr_explicit_realign
4780 || alignment_support_scheme == dr_explicit_realign_optimized);
4781
4782 /* We need to generate three things:
4783 1. the misalignment computation
4784 2. the extra vector load (for the optimized realignment scheme).
4785 3. the phi node for the two vectors from which the realignment is
4786 done (for the optimized realignment scheme). */
4787
4788 /* 1. Determine where to generate the misalignment computation.
4789
4790 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4791 calculation will be generated by this function, outside the loop (in the
4792 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4793 caller, inside the loop.
4794
4795 Background: If the misalignment remains fixed throughout the iterations of
4796 the loop, then both realignment schemes are applicable, and also the
4797 misalignment computation can be done outside LOOP. This is because we are
4798 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4799 are a multiple of VS (the Vector Size), and therefore the misalignment in
4800 different vectorized LOOP iterations is always the same.
4801 The problem arises only if the memory access is in an inner-loop nested
4802 inside LOOP, which is now being vectorized using outer-loop vectorization.
4803 This is the only case when the misalignment of the memory access may not
4804 remain fixed throughout the iterations of the inner-loop (as explained in
4805 detail in vect_supportable_dr_alignment). In this case, not only is the
4806 optimized realignment scheme not applicable, but also the misalignment
4807 computation (and generation of the realignment token that is passed to
4808 REALIGN_LOAD) have to be done inside the loop.
4809
4810 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4811 or not, which in turn determines if the misalignment is computed inside
4812 the inner-loop, or outside LOOP. */
4813
4814 if (init_addr != NULL_TREE || !loop_vinfo)
4815 {
4816 compute_in_loop = true;
4817 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4818 }
4819
4820
4821 /* 2. Determine where to generate the extra vector load.
4822
4823 For the optimized realignment scheme, instead of generating two vector
4824 loads in each iteration, we generate a single extra vector load in the
4825 preheader of the loop, and in each iteration reuse the result of the
4826 vector load from the previous iteration. In case the memory access is in
4827 an inner-loop nested inside LOOP, which is now being vectorized using
4828 outer-loop vectorization, we need to determine whether this initial vector
4829 load should be generated at the preheader of the inner-loop, or can be
4830 generated at the preheader of LOOP. If the memory access has no evolution
4831 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4832 to be generated inside LOOP (in the preheader of the inner-loop). */
4833
4834 if (nested_in_vect_loop)
4835 {
4836 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4837 bool invariant_in_outerloop =
4838 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4839 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4840 }
4841 else
4842 loop_for_initial_load = loop;
4843 if (at_loop)
4844 *at_loop = loop_for_initial_load;
4845
4846 if (loop_for_initial_load)
4847 pe = loop_preheader_edge (loop_for_initial_load);
4848
4849 /* 3. For the case of the optimized realignment, create the first vector
4850 load at the loop preheader. */
4851
4852 if (alignment_support_scheme == dr_explicit_realign_optimized)
4853 {
4854 /* Create msq_init = *(floor(p1)) in the loop preheader */
4855 gassign *new_stmt;
4856
4857 gcc_assert (!compute_in_loop);
4858 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4859 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4860 NULL_TREE, &init_addr, NULL, &inc,
4861 true, &inv_p);
4862 new_temp = copy_ssa_name (ptr);
4863 new_stmt = gimple_build_assign
4864 (new_temp, BIT_AND_EXPR, ptr,
4865 build_int_cst (TREE_TYPE (ptr),
4866 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4867 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4868 gcc_assert (!new_bb);
4869 data_ref
4870 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4871 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4872 new_stmt = gimple_build_assign (vec_dest, data_ref);
4873 new_temp = make_ssa_name (vec_dest, new_stmt);
4874 gimple_assign_set_lhs (new_stmt, new_temp);
4875 if (pe)
4876 {
4877 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4878 gcc_assert (!new_bb);
4879 }
4880 else
4881 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4882
4883 msq_init = gimple_assign_lhs (new_stmt);
4884 }
4885
4886 /* 4. Create realignment token using a target builtin, if available.
4887 It is done either inside the containing loop, or before LOOP (as
4888 determined above). */
4889
4890 if (targetm.vectorize.builtin_mask_for_load)
4891 {
4892 gcall *new_stmt;
4893 tree builtin_decl;
4894
4895 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4896 if (!init_addr)
4897 {
4898 /* Generate the INIT_ADDR computation outside LOOP. */
4899 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4900 NULL_TREE, loop);
4901 if (loop)
4902 {
4903 pe = loop_preheader_edge (loop);
4904 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4905 gcc_assert (!new_bb);
4906 }
4907 else
4908 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4909 }
4910
4911 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4912 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4913 vec_dest =
4914 vect_create_destination_var (scalar_dest,
4915 gimple_call_return_type (new_stmt));
4916 new_temp = make_ssa_name (vec_dest, new_stmt);
4917 gimple_call_set_lhs (new_stmt, new_temp);
4918
4919 if (compute_in_loop)
4920 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4921 else
4922 {
4923 /* Generate the misalignment computation outside LOOP. */
4924 pe = loop_preheader_edge (loop);
4925 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4926 gcc_assert (!new_bb);
4927 }
4928
4929 *realignment_token = gimple_call_lhs (new_stmt);
4930
4931 /* The result of the CALL_EXPR to this builtin is determined from
4932 the value of the parameter and no global variables are touched
4933 which makes the builtin a "const" function. Requiring the
4934 builtin to have the "const" attribute makes it unnecessary
4935 to call mark_call_clobbered. */
4936 gcc_assert (TREE_READONLY (builtin_decl));
4937 }
4938
4939 if (alignment_support_scheme == dr_explicit_realign)
4940 return msq;
4941
4942 gcc_assert (!compute_in_loop);
4943 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4944
4945
4946 /* 5. Create msq = phi <msq_init, lsq> in loop */
4947
4948 pe = loop_preheader_edge (containing_loop);
4949 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4950 msq = make_ssa_name (vec_dest);
4951 phi_stmt = create_phi_node (msq, containing_loop->header);
4952 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4953
4954 return msq;
4955 }
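/* Editorial sketch (not part of GCC): the dr_explicit_realign idea above on
   plain bytes.  VS is an assumed vector size in bytes (a power of two);
   "floor (p)" corresponds to the BIT_AND_EXPR with the negated alignment
   generated above, and the final loop plays the role of realign_load,
   picking the bytes of the unaligned access out of the two aligned chunks
   that cover it, with the misalignment acting as the "token".  */

static void
sketch_explicit_realign (const unsigned char *p, unsigned long vs,
			 unsigned char *result)
{
  unsigned long addr = (unsigned long) p;	/* assumes pointers fit in long  */
  const unsigned char *msq = (const unsigned char *) (addr & -vs);
  const unsigned char *lsq = (const unsigned char *) ((addr + vs - 1) & -vs);
  unsigned long shift = addr & (vs - 1);	/* misalignment in bytes  */

  /* result = realign_load (msq, lsq, token).  */
  for (unsigned long i = 0; i < vs; i++)
    result[i] = (i < vs - shift
		 ? msq[shift + i]
		 : lsq[i - (vs - shift)]);
}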
4956
4957
4958 /* Function vect_grouped_load_supported.
4959
4960 Returns TRUE if even and odd permutations are supported,
4961 and FALSE otherwise. */
4962
4963 bool
4964 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4965 {
4966 machine_mode mode = TYPE_MODE (vectype);
4967
4968 /* vect_permute_load_chain requires the group size to be equal to 3 or
4969 be a power of two. */
4970 if (count != 3 && exact_log2 (count) == -1)
4971 {
4972 if (dump_enabled_p ())
4973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4974 "the size of the group of accesses"
4975 " is not a power of 2 or not equal to 3\n");
4976 return false;
4977 }
4978
4979 /* Check that the permutation is supported. */
4980 if (VECTOR_MODE_P (mode))
4981 {
4982 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
4983 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4984
4985 if (count == 3)
4986 {
4987 unsigned int k;
4988 for (k = 0; k < 3; k++)
4989 {
4990 for (i = 0; i < nelt; i++)
4991 if (3 * i + k < 2 * nelt)
4992 sel[i] = 3 * i + k;
4993 else
4994 sel[i] = 0;
4995 if (!can_vec_perm_p (mode, false, sel))
4996 {
4997 if (dump_enabled_p ())
4998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4999 "shuffle of 3 loads is not supported by"
5000 " target\n");
5001 return false;
5002 }
5003 for (i = 0, j = 0; i < nelt; i++)
5004 if (3 * i + k < 2 * nelt)
5005 sel[i] = i;
5006 else
5007 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5008 if (!can_vec_perm_p (mode, false, sel))
5009 {
5010 if (dump_enabled_p ())
5011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5012 "shuffle of 3 loads is not supported by"
5013 " target\n");
5014 return false;
5015 }
5016 }
5017 return true;
5018 }
5019 else
5020 {
5021 /* If length is not equal to 3 then only a power of 2 is supported. */
5022 gcc_assert (exact_log2 (count) != -1);
5023 for (i = 0; i < nelt; i++)
5024 sel[i] = i * 2;
5025 if (can_vec_perm_p (mode, false, sel))
5026 {
5027 for (i = 0; i < nelt; i++)
5028 sel[i] = i * 2 + 1;
5029 if (can_vec_perm_p (mode, false, sel))
5030 return true;
5031 }
5032 }
5033 }
5034
5035 if (dump_enabled_p ())
5036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5037 "extract even/odd not supported by target\n");
5038 return false;
5039 }
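/* Editorial sketch (not part of GCC): the two selectors whose support is
   checked above for COUNT == 3, built into caller-provided arrays.  NELT
   and K are assumed inputs; applying the first selector to <V0, V1> keeps
   every third element that lies within the first two vectors, and applying
   the second selector to that intermediate result and V2 fills the
   remaining slots from V2, so the final vector holds elements K, K + 3,
   K + 6, ... of the interleaved group.  */

static void
sketch_load3_selectors (unsigned int nelt, unsigned int k,
			unsigned char *sel_low, unsigned char *sel_high)
{
  unsigned int i, j = 0;

  for (i = 0; i < nelt; i++)
    sel_low[i] = 3 * i + k < 2 * nelt ? 3 * i + k : 0;

  for (i = 0; i < nelt; i++)
    if (3 * i + k < 2 * nelt)
      sel_high[i] = i;			/* keep the element picked above  */
    else
      sel_high[i] = nelt + ((nelt + k) % 3) + 3 * (j++);  /* take it from V2  */
}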
5040
5041 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5042 type VECTYPE. */
5043
5044 bool
5045 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5046 {
5047 return vect_lanes_optab_supported_p ("vec_load_lanes",
5048 vec_load_lanes_optab,
5049 vectype, count);
5050 }
5051
5052 /* Function vect_permute_load_chain.
5053
5054 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5055 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5056 the input data correctly. Return the final references for loads in
5057 RESULT_CHAIN.
5058
5059 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5060 The input is 4 vectors each containing 8 elements. We assign a number to each
5061 element, the input sequence is:
5062
5063 1st vec: 0 1 2 3 4 5 6 7
5064 2nd vec: 8 9 10 11 12 13 14 15
5065 3rd vec: 16 17 18 19 20 21 22 23
5066 4th vec: 24 25 26 27 28 29 30 31
5067
5068 The output sequence should be:
5069
5070 1st vec: 0 4 8 12 16 20 24 28
5071 2nd vec: 1 5 9 13 17 21 25 29
5072 3rd vec: 2 6 10 14 18 22 26 30
5073 4th vec: 3 7 11 15 19 23 27 31
5074
5075 i.e., the first output vector should contain the first elements of each
5076 interleaving group, etc.
5077
5078 We use extract_even/odd instructions to create such output. The input of
5079 each extract_even/odd operation is two vectors
5080 1st vec 2nd vec
5081 0 1 2 3 4 5 6 7
5082
5083 and the output is the vector of extracted even/odd elements. The output of
5084 extract_even will be: 0 2 4 6
5085 and of extract_odd: 1 3 5 7
5086
5087
5088 The permutation is done in log LENGTH stages. In each stage extract_even
5089 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5090 their order. In our example,
5091
5092 E1: extract_even (1st vec, 2nd vec)
5093 E2: extract_odd (1st vec, 2nd vec)
5094 E3: extract_even (3rd vec, 4th vec)
5095 E4: extract_odd (3rd vec, 4th vec)
5096
5097 The output for the first stage will be:
5098
5099 E1: 0 2 4 6 8 10 12 14
5100 E2: 1 3 5 7 9 11 13 15
5101 E3: 16 18 20 22 24 26 28 30
5102 E4: 17 19 21 23 25 27 29 31
5103
5104 In order to proceed and create the correct sequence for the next stage (or
5105 for the correct output, if the second stage is the last one, as in our
5106 example), we first put the output of extract_even operation and then the
5107 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5108 The input for the second stage is:
5109
5110 1st vec (E1): 0 2 4 6 8 10 12 14
5111 2nd vec (E3): 16 18 20 22 24 26 28 30
5112 3rd vec (E2): 1 3 5 7 9 11 13 15
5113 4th vec (E4): 17 19 21 23 25 27 29 31
5114
5115 The output of the second stage:
5116
5117 E1: 0 4 8 12 16 20 24 28
5118 E2: 2 6 10 14 18 22 26 30
5119 E3: 1 5 9 13 17 21 25 29
5120 E4: 3 7 11 15 19 23 27 31
5121
5122 And RESULT_CHAIN after reordering:
5123
5124 1st vec (E1): 0 4 8 12 16 20 24 28
5125 2nd vec (E3): 1 5 9 13 17 21 25 29
5126 3rd vec (E2): 2 6 10 14 18 22 26 30
5127 4th vec (E4): 3 7 11 15 19 23 27 31. */
5128
5129 static void
5130 vect_permute_load_chain (vec<tree> dr_chain,
5131 unsigned int length,
5132 gimple stmt,
5133 gimple_stmt_iterator *gsi,
5134 vec<tree> *result_chain)
5135 {
5136 tree data_ref, first_vect, second_vect;
5137 tree perm_mask_even, perm_mask_odd;
5138 tree perm3_mask_low, perm3_mask_high;
5139 gimple perm_stmt;
5140 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5141 unsigned int i, j, log_length = exact_log2 (length);
5142 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5143 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5144
5145 result_chain->quick_grow (length);
5146 memcpy (result_chain->address (), dr_chain.address (),
5147 length * sizeof (tree));
5148
5149 if (length == 3)
5150 {
5151 unsigned int k;
5152
5153 for (k = 0; k < 3; k++)
5154 {
5155 for (i = 0; i < nelt; i++)
5156 if (3 * i + k < 2 * nelt)
5157 sel[i] = 3 * i + k;
5158 else
5159 sel[i] = 0;
5160 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5161
5162 for (i = 0, j = 0; i < nelt; i++)
5163 if (3 * i + k < 2 * nelt)
5164 sel[i] = i;
5165 else
5166 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5167
5168 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5169
5170 first_vect = dr_chain[0];
5171 second_vect = dr_chain[1];
5172
5173 /* Create interleaving stmt (low part of):
5174 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5175 ...}> */
5176 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5177 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5178 second_vect, perm3_mask_low);
5179 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5180
5181 /* Create interleaving stmt (high part of):
5182 high = VEC_PERM_EXPR <first_vect, second_vect, {0, 1, 2, ...,
5183 nelt + ((nelt + k) % 3), ...}> */
5184 first_vect = data_ref;
5185 second_vect = dr_chain[2];
5186 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5187 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5188 second_vect, perm3_mask_high);
5189 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5190 (*result_chain)[k] = data_ref;
5191 }
5192 }
5193 else
5194 {
5195 /* If length is not equal to 3 then only a power of 2 is supported. */
5196 gcc_assert (exact_log2 (length) != -1);
5197
5198 for (i = 0; i < nelt; ++i)
5199 sel[i] = i * 2;
5200 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5201
5202 for (i = 0; i < nelt; ++i)
5203 sel[i] = i * 2 + 1;
5204 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
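/* For nelt == 8 these masks are {0 2 4 6 8 10 12 14} (extract_even)
   and {1 3 5 7 9 11 13 15} (extract_odd).  */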
5205
5206 for (i = 0; i < log_length; i++)
5207 {
5208 for (j = 0; j < length; j += 2)
5209 {
5210 first_vect = dr_chain[j];
5211 second_vect = dr_chain[j+1];
5212
5213 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5214 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5215 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5216 first_vect, second_vect,
5217 perm_mask_even);
5218 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5219 (*result_chain)[j/2] = data_ref;
5220
5221 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5222 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5223 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5224 first_vect, second_vect,
5225 perm_mask_odd);
5226 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5227 (*result_chain)[j/2+length/2] = data_ref;
5228 }
5229 memcpy (dr_chain.address (), result_chain->address (),
5230 length * sizeof (tree));
5231 }
5232 }
5233 }
5234
5235 /* Function vect_shift_permute_load_chain.
5236
5237 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5238 a sequence of stmts to reorder the input data accordingly.
5239 Return the final references for loads in RESULT_CHAIN.
5240 Return true if it succeeded, false otherwise.
5241
5242 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5243 The input is 3 vectors each containing 8 elements. We assign a
5244 number to each element, the input sequence is:
5245
5246 1st vec: 0 1 2 3 4 5 6 7
5247 2nd vec: 8 9 10 11 12 13 14 15
5248 3rd vec: 16 17 18 19 20 21 22 23
5249
5250 The output sequence should be:
5251
5252 1st vec: 0 3 6 9 12 15 18 21
5253 2nd vec: 1 4 7 10 13 16 19 22
5254 3rd vec: 2 5 8 11 14 17 20 23
5255
5256 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5257
5258 First we shuffle all 3 vectors to get correct elements order:
5259
5260 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5261 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5262 3rd vec: (16 19 22) (17 20 23) (18 21)
5263
5264 Next we unite and shift vector 3 times:
5265
5266 1st step:
5267 shift right by 6 the concatenation of:
5268 "1st vec" and "2nd vec"
5269 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5270 "2nd vec" and "3rd vec"
5271 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5272 "3rd vec" and "1st vec"
5273 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5274 | New vectors |
5275
5276 So that now new vectors are:
5277
5278 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5279 2nd vec: (10 13) (16 19 22) (17 20 23)
5280 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5281
5282 2nd step:
5283 shift right by 5 the concatenation of:
5284 "1st vec" and "3rd vec"
5285 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5286 "2nd vec" and "1st vec"
5287 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5288 "3rd vec" and "2nd vec"
5289 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5290 | New vectors |
5291
5292 So that now new vectors are:
5293
5294 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5295 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5296 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5297
5298 3rd step:
5299 shift right by 5 the concatenation of:
5300 "1st vec" and "1st vec"
5301 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5302 shift right by 3 the concatenation of:
5303 "2nd vec" and "2nd vec"
5304 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5305 | New vectors |
5306
5307 So that now all vectors are READY:
5308 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5309 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5310 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5311
5312 This algorithm is faster than the one in vect_permute_load_chain if:
5313 1. "shift of a concatenation" is faster than general permutation.
5314 This is usually so.
5315 2. The TARGET machine can't execute vector instructions in parallel.
5316 This is because each step of the algorithm depends on the previous one.
5317 The algorithm in vect_permute_load_chain is much more parallel.
5318
5319 The algorithm is applicable only when the LOAD CHAIN LENGTH is less than VF.
5320 */
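
/* Editorial note: a small standalone sketch (independent of the vectorizer;
   the names below are illustrative only) that evaluates the selector
   formulas used by the LENGTH == 3 path below for an 8-element vector, so
   the masks quoted in the comments ({0 3 6 1 4 7 2 5}, {6 7 8 9 10 11 12 13},
   {5 6 7 8 9 10 11 12}, ...) can be checked by hand.  It mirrors only the
   index arithmetic; the can_vec_perm_p checks and statement generation are
   omitted.  Guarded out so it does not affect this translation unit.  */
#if 0
#include <stdio.h>

#define NELT 8

static void
print_sel (const char *name, const unsigned char sel[NELT])
{
  printf ("%s:", name);
  for (unsigned int i = 0; i < NELT; i++)
    printf (" %u", (unsigned int) sel[i]);
  printf ("\n");
}

int
main (void)
{
  unsigned char sel[NELT];
  unsigned int i, k = 0, l = 0;

  /* perm3_mask: gathers elements 0 3 6 1 4 7 2 5 of a single vector.  */
  for (i = 0; i < NELT; i++)
    {
      if (3 * k + (l % 3) >= NELT)
        {
          k = 0;
          l += 3 - (NELT % 3);
        }
      sel[i] = 3 * k + (l % 3);
      k++;
    }
  print_sel ("perm3_mask ", sel);

  /* shift1_mask: shift the two-vector concatenation right by
     2 * (NELT / 3) + (NELT % 3) == 6 elements.  */
  for (i = 0; i < NELT; i++)
    sel[i] = 2 * (NELT / 3) + (NELT % 3) + i;
  print_sel ("shift1_mask", sel);

  /* shift2_mask: shift right by 2 * (NELT / 3) + 1 == 5 elements.  */
  for (i = 0; i < NELT; i++)
    sel[i] = 2 * (NELT / 3) + 1 + i;
  print_sel ("shift2_mask", sel);

  return 0;
}
#endif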
5321
5322 static bool
5323 vect_shift_permute_load_chain (vec<tree> dr_chain,
5324 unsigned int length,
5325 gimple stmt,
5326 gimple_stmt_iterator *gsi,
5327 vec<tree> *result_chain)
5328 {
5329 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5330 tree perm2_mask1, perm2_mask2, perm3_mask;
5331 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5332 gimple perm_stmt;
5333
5334 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5335 unsigned int i;
5336 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5337 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5338 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5339 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5340
5341 result_chain->quick_grow (length);
5342 memcpy (result_chain->address (), dr_chain.address (),
5343 length * sizeof (tree));
5344
5345 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5346 {
5347 unsigned int j, log_length = exact_log2 (length);
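/* Generating permutation constant to group the even elements of one
   vector in its low half and the odd elements in its high half.
   For vector length 8 it is {0 2 4 6 1 3 5 7}.  */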
5348 for (i = 0; i < nelt / 2; ++i)
5349 sel[i] = i * 2;
5350 for (i = 0; i < nelt / 2; ++i)
5351 sel[nelt / 2 + i] = i * 2 + 1;
5352 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5353 {
5354 if (dump_enabled_p ())
5355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5356 "shuffle of 2 fields structure is not "
5357 "supported by target\n");
5358 return false;
5359 }
5360 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5361
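/* Generating permutation constant to group the odd elements of one
   vector in its low half and the even elements in its high half.
   For vector length 8 it is {1 3 5 7 0 2 4 6}.  */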
5362 for (i = 0; i < nelt / 2; ++i)
5363 sel[i] = i * 2 + 1;
5364 for (i = 0; i < nelt / 2; ++i)
5365 sel[nelt / 2 + i] = i * 2;
5366 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5367 {
5368 if (dump_enabled_p ())
5369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5370 "shuffle of 2 fields structure is not "
5371 "supported by target\n");
5372 return false;
5373 }
5374 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5375
5376 /* Generating permutation constant to shift all elements.
5377 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5378 for (i = 0; i < nelt; i++)
5379 sel[i] = nelt / 2 + i;
5380 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5381 {
5382 if (dump_enabled_p ())
5383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5384 "shift permutation is not supported by target\n");
5385 return false;
5386 }
5387 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5388
5389 /* Generating permutation constant to select vector from 2.
5390 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5391 for (i = 0; i < nelt / 2; i++)
5392 sel[i] = i;
5393 for (i = nelt / 2; i < nelt; i++)
5394 sel[i] = nelt + i;
5395 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5396 {
5397 if (dump_enabled_p ())
5398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5399 "select is not supported by target\n");
5400 return false;
5401 }
5402 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5403
5404 for (i = 0; i < log_length; i++)
5405 {
5406 for (j = 0; j < length; j += 2)
5407 {
5408 first_vect = dr_chain[j];
5409 second_vect = dr_chain[j + 1];
5410
5411 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5412 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5413 first_vect, first_vect,
5414 perm2_mask1);
5415 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5416 vect[0] = data_ref;
5417
5418 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5419 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5420 second_vect, second_vect,
5421 perm2_mask2);
5422 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5423 vect[1] = data_ref;
5424
5425 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5426 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5427 vect[0], vect[1], shift1_mask);
5428 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5429 (*result_chain)[j/2 + length/2] = data_ref;
5430
5431 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5432 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5433 vect[0], vect[1], select_mask);
5434 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5435 (*result_chain)[j/2] = data_ref;
5436 }
5437 memcpy (dr_chain.address (), result_chain->address (),
5438 length * sizeof (tree));
5439 }
5440 return true;
5441 }
5442 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5443 {
5444 unsigned int k = 0, l = 0;
5445
5446 /* Generating permutation constant to get all elements in right order.
5447 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5448 for (i = 0; i < nelt; i++)
5449 {
5450 if (3 * k + (l % 3) >= nelt)
5451 {
5452 k = 0;
5453 l += (3 - (nelt % 3));
5454 }
5455 sel[i] = 3 * k + (l % 3);
5456 k++;
5457 }
5458 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5459 {
5460 if (dump_enabled_p ())
5461 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5462 "shuffle of 3 fields structure is not "
5463 "supported by target\n");
5464 return false;
5465 }
5466 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5467
5468 /* Generating permutation constant to shift all elements.
5469 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5470 for (i = 0; i < nelt; i++)
5471 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5472 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5473 {
5474 if (dump_enabled_p ())
5475 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5476 "shift permutation is not supported by target\n");
5477 return false;
5478 }
5479 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5480
5481 /* Generating permutation constant to shift all elements.
5482 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5483 for (i = 0; i < nelt; i++)
5484 sel[i] = 2 * (nelt / 3) + 1 + i;
5485 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5486 {
5487 if (dump_enabled_p ())
5488 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5489 "shift permutation is not supported by target\n");
5490 return false;
5491 }
5492 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5493
5494 /* Generating permutation constant to shift all elements.
5495 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5496 for (i = 0; i < nelt; i++)
5497 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5498 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5499 {
5500 if (dump_enabled_p ())
5501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5502 "shift permutation is not supported by target\n");
5503 return false;
5504 }
5505 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5506
5507 /* Generating permutation constant to shift all elements.
5508 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5509 for (i = 0; i < nelt; i++)
5510 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5511 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5512 {
5513 if (dump_enabled_p ())
5514 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5515 "shift permutation is not supported by target\n");
5516 return false;
5517 }
5518 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5519
5520 for (k = 0; k < 3; k++)
5521 {
5522 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5523 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5524 dr_chain[k], dr_chain[k],
5525 perm3_mask);
5526 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5527 vect[k] = data_ref;
5528 }
5529
5530 for (k = 0; k < 3; k++)
5531 {
5532 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5533 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5534 vect[k % 3], vect[(k + 1) % 3],
5535 shift1_mask);
5536 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5537 vect_shift[k] = data_ref;
5538 }
5539
5540 for (k = 0; k < 3; k++)
5541 {
5542 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5543 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5544 vect_shift[(4 - k) % 3],
5545 vect_shift[(3 - k) % 3],
5546 shift2_mask);
5547 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5548 vect[k] = data_ref;
5549 }
5550
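/* For nelt == 8 the final vectors land at RESULT_CHAIN indices 1, 2 and 0:
   vect[2] is already in the right order, while vect[0] and vect[1] still
   need the shift3/shift4 permutations below.  */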
5551 (*result_chain)[3 - (nelt % 3)] = vect[2];
5552
5553 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5554 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5555 vect[0], shift3_mask);
5556 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5557 (*result_chain)[nelt % 3] = data_ref;
5558
5559 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5560 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5561 vect[1], shift4_mask);
5562 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5563 (*result_chain)[0] = data_ref;
5564 return true;
5565 }
5566 return false;
5567 }
5568
5569 /* Function vect_transform_grouped_load.
5570
5571 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5572 to perform their permutation and ascribe the resulting vectorized statements to
5573 the scalar statements.
5574 */
5575
5576 void
5577 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5578 gimple_stmt_iterator *gsi)
5579 {
5580 machine_mode mode;
5581 vec<tree> result_chain = vNULL;
5582
5583 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5584 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
5585 permuted vectors that are ready for vector computation. */
5586 result_chain.create (size);
5587
5588 /* If the reassociation width for the vector type is 2 or greater, the target
5589 machine can execute 2 or more vector instructions in parallel. Otherwise
5590 try to get the chain for the load group using vect_shift_permute_load_chain. */
5591 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5592 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5593 || exact_log2 (size) != -1
5594 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5595 gsi, &result_chain))
5596 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5597 vect_record_grouped_load_vectors (stmt, result_chain);
5598 result_chain.release ();
5599 }
5600
5601 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5602 generated as part of the vectorization of STMT. Assign the statement
5603 for each vector to the associated scalar statement. */
5604
5605 void
5606 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5607 {
5608 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5609 gimple next_stmt, new_stmt;
5610 unsigned int i, gap_count;
5611 tree tmp_data_ref;
5612
5613 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5614 Since we scan the chain starting from its first node, their order
5615 corresponds to the order of data-refs in RESULT_CHAIN. */
5616 next_stmt = first_stmt;
5617 gap_count = 1;
5618 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5619 {
5620 if (!next_stmt)
5621 break;
5622
5623 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5624 code elimination pass later. No need to check for the first stmt in
5625 the group, since it always exists.
5626 GROUP_GAP is the number of steps in elements from the previous
5627 access (if there is no gap GROUP_GAP is 1). We skip loads that
5628 correspond to the gaps. */
5629 if (next_stmt != first_stmt
5630 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5631 {
5632 gap_count++;
5633 continue;
5634 }
5635
5636 while (next_stmt)
5637 {
5638 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5639 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5640 copies, and we put the new vector statement in the first available
5641 RELATED_STMT. */
5642 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5643 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5644 else
5645 {
5646 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5647 {
5648 gimple prev_stmt =
5649 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5650 gimple rel_stmt =
5651 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5652 while (rel_stmt)
5653 {
5654 prev_stmt = rel_stmt;
5655 rel_stmt =
5656 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5657 }
5658
5659 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5660 new_stmt;
5661 }
5662 }
5663
5664 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5665 gap_count = 1;
5666 /* If NEXT_STMT accesses the same DR as the previous statement,
5667 put the same TMP_DATA_REF as its vectorized statement; otherwise
5668 get the next data-ref from RESULT_CHAIN. */
5669 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5670 break;
5671 }
5672 }
5673 }
5674
5675 /* Function vect_can_force_dr_alignment_p.
5676
5677 Returns whether the alignment of a DECL can be forced to be aligned
5678 on an ALIGNMENT-bit boundary. */
5679
5680 bool
5681 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5682 {
5683 if (TREE_CODE (decl) != VAR_DECL)
5684 return false;
5685
5686 if (decl_in_symtab_p (decl)
5687 && !symtab_node::get (decl)->can_increase_alignment_p ())
5688 return false;
5689
5690 if (TREE_STATIC (decl))
5691 return (alignment <= MAX_OFILE_ALIGNMENT);
5692 else
5693 return (alignment <= MAX_STACK_ALIGNMENT);
5694 }
5695
5696
5697 /* Return whether the data reference DR is supported with respect to its
5698 alignment.
5699 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5700 it is aligned, i.e., check if it is possible to vectorize it with different
5701 alignment. */
5702
5703 enum dr_alignment_support
5704 vect_supportable_dr_alignment (struct data_reference *dr,
5705 bool check_aligned_accesses)
5706 {
5707 gimple stmt = DR_STMT (dr);
5708 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5709 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5710 machine_mode mode = TYPE_MODE (vectype);
5711 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5712 struct loop *vect_loop = NULL;
5713 bool nested_in_vect_loop = false;
5714
5715 if (aligned_access_p (dr) && !check_aligned_accesses)
5716 return dr_aligned;
5717
5718 /* For now assume all conditional loads/stores support unaligned
5719 access without any special code. */
5720 if (is_gimple_call (stmt)
5721 && gimple_call_internal_p (stmt)
5722 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5723 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5724 return dr_unaligned_supported;
5725
5726 if (loop_vinfo)
5727 {
5728 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5729 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5730 }
5731
5732 /* Possibly unaligned access. */
5733
5734 /* We can choose between using the implicit realignment scheme (generating
5735 a misaligned_move stmt) and the explicit realignment scheme (generating
5736 aligned loads with a REALIGN_LOAD). There are two variants to the
5737 explicit realignment scheme: optimized, and unoptimized.
5738 We can optimize the realignment only if the step between consecutive
5739 vector loads is equal to the vector size. Since the vector memory
5740 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5741 is guaranteed that the misalignment amount remains the same throughout the
5742 execution of the vectorized loop. Therefore, we can create the
5743 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5744 at the loop preheader.
5745
5746 However, in the case of outer-loop vectorization, when vectorizing a
5747 memory access in the inner-loop nested within the LOOP that is now being
5748 vectorized, while it is guaranteed that the misalignment of the
5749 vectorized memory access will remain the same in different outer-loop
5750 iterations, it is *not* guaranteed that it will remain the same throughout
5751 the execution of the inner-loop. This is because the inner-loop advances
5752 with the original scalar step (and not in steps of VS). If the inner-loop
5753 step happens to be a multiple of VS, then the misalignment remains fixed
5754 and we can use the optimized realignment scheme. For example:
5755
5756 for (i=0; i<N; i++)
5757 for (j=0; j<M; j++)
5758 s += a[i+j];
5759
5760 When vectorizing the i-loop in the above example, the step between
5761 consecutive vector loads is 1, and so the misalignment does not remain
5762 fixed across the execution of the inner-loop, and the realignment cannot
5763 be optimized (as illustrated in the following pseudo vectorized loop):
5764
5765 for (i=0; i<N; i+=4)
5766 for (j=0; j<M; j++){
5767 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5768 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5769 // (assuming that we start from an aligned address).
5770 }
5771
5772 We therefore have to use the unoptimized realignment scheme:
5773
5774 for (i=0; i<N; i+=4)
5775 for (j=k; j<M; j+=4)
5776 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5777 // that the misalignment of the initial address is
5778 // 0).
5779
5780 The loop can then be vectorized as follows:
5781
5782 for (k=0; k<4; k++){
5783 rt = get_realignment_token (&vp[k]);
5784 for (i=0; i<N; i+=4){
5785 v1 = vp[i+k];
5786 for (j=k; j<M; j+=4){
5787 v2 = vp[i+j+VS-1];
5788 va = REALIGN_LOAD <v1,v2,rt>;
5789 vs += va;
5790 v1 = v2;
5791 }
5792 }
5793 } */
5794
5795 if (DR_IS_READ (dr))
5796 {
5797 bool is_packed = false;
5798 tree type = (TREE_TYPE (DR_REF (dr)));
5799
5800 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5801 && (!targetm.vectorize.builtin_mask_for_load
5802 || targetm.vectorize.builtin_mask_for_load ()))
5803 {
5804 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5805 if ((nested_in_vect_loop
5806 && (TREE_INT_CST_LOW (DR_STEP (dr))
5807 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5808 || !loop_vinfo)
5809 return dr_explicit_realign;
5810 else
5811 return dr_explicit_realign_optimized;
5812 }
5813 if (!known_alignment_for_access_p (dr))
5814 is_packed = not_size_aligned (DR_REF (dr));
5815
5816 if ((TYPE_USER_ALIGN (type) && !is_packed)
5817 || targetm.vectorize.
5818 support_vector_misalignment (mode, type,
5819 DR_MISALIGNMENT (dr), is_packed))
5820 /* Can't software pipeline the loads, but can at least do them. */
5821 return dr_unaligned_supported;
5822 }
5823 else
5824 {
5825 bool is_packed = false;
5826 tree type = (TREE_TYPE (DR_REF (dr)));
5827
5828 if (!known_alignment_for_access_p (dr))
5829 is_packed = not_size_aligned (DR_REF (dr));
5830
5831 if ((TYPE_USER_ALIGN (type) && !is_packed)
5832 || targetm.vectorize.
5833 support_vector_misalignment (mode, type,
5834 DR_MISALIGNMENT (dr), is_packed))
5835 return dr_unaligned_supported;
5836 }
5837
5838 /* Unsupported. */
5839 return dr_unaligned_unsupported;
5840 }