gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2014 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "tree.h"
  28 #include "stor-layout.h"
  29 #include "tm_p.h"
  30 #include "target.h"
  31 #include "basic-block.h"
  32 #include "gimple-pretty-print.h"
  33 #include "tree-ssa-alias.h"
  34 #include "internal-fn.h"
  35 #include "tree-eh.h"
  36 #include "gimple-expr.h"
  37 #include "is-a.h"
  38 #include "gimple.h"
  39 #include "gimplify.h"
  40 #include "gimple-iterator.h"
  41 #include "gimplify-me.h"
  42 #include "gimple-ssa.h"
  43 #include "tree-phinodes.h"
  44 #include "ssa-iterators.h"
  45 #include "stringpool.h"
  46 #include "tree-ssanames.h"
  47 #include "tree-ssa-loop-ivopts.h"
  48 #include "tree-ssa-loop-manip.h"
  49 #include "tree-ssa-loop.h"
  50 #include "dumpfile.h"
  51 #include "cfgloop.h"
  52 #include "tree-chrec.h"
  53 #include "tree-scalar-evolution.h"
  54 #include "tree-vectorizer.h"
  55 #include "diagnostic-core.h"
  56 #include "cgraph.h"
  57 /* Need to include rtl.h, expr.h, etc. for optabs.  */
  58 #include "expr.h"
  59 #include "optabs.h"
  60 #include "builtins.h"
  61 #include "varasm.h"
  62
  63 /* Return true if load- or store-lanes optab OPTAB is implemented for
  64    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  65
  66 static bool
  67 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  68                               tree vectype, unsigned HOST_WIDE_INT count)
  69 {
  70   enum machine_mode mode, array_mode;
  71   bool limit_p;
  72
  73   mode = TYPE_MODE (vectype);
  74   limit_p = !targetm.array_mode_supported_p (mode, count);
  75   array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
  76                               MODE_INT, limit_p);
  77
  78   if (array_mode == BLKmode)
  79     {
  80       if (dump_enabled_p ())
  81         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  82                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
  83                          GET_MODE_NAME (mode), count);
  84       return false;
  85     }
  86
  87   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  88     {
  89       if (dump_enabled_p ())
  90         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  91                          "cannot use %s<%s><%s>\n", name,
  92                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  93       return false;
  94     }
  95
  96   if (dump_enabled_p ())
  97     dump_printf_loc (MSG_NOTE, vect_location,
  98                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  99                      GET_MODE_NAME (mode));
 100
 101   return true;
 102 }
 103
 104
 105 /* Return the smallest scalar part of STMT.
 106    This is used to determine the vectype of the stmt.  We generally set the
 107    vectype according to the type of the result (lhs).  For stmts whose
 108    result-type is different than the type of the arguments (e.g., demotion,
 109    promotion), vectype will be reset appropriately (later).  Note that we have
 110    to visit the smallest datatype in this function, because that determines the
 111    VF.  If the smallest datatype in the loop is present only as the rhs of a
 112    promotion operation - we'd miss it.
 113    Such a case, where a variable of this datatype does not appear in the lhs
 114    anywhere in the loop, can only occur if it's an invariant: e.g.:
 115    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 116    invariant motion.  However, we cannot rely on invariant motion to always
 117    take invariants out of the loop, and so in the case of promotion we also
 118    have to check the rhs.
 119    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 120    types.  */
 121
 122 tree
 123 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
 124                                HOST_WIDE_INT *rhs_size_unit)
 125 {
 126   tree scalar_type = gimple_expr_type (stmt);
 127   HOST_WIDE_INT lhs, rhs;
 128
 129   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 130
 131   if (is_gimple_assign (stmt)
 132       && (gimple_assign_cast_p (stmt)
 133           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 134           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 135           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 136     {
 137       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 138
 139       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 140       if (rhs < lhs)
 141         scalar_type = rhs_type;
 142     }
 143
 144   *lhs_size_unit = lhs;
 145   *rhs_size_unit = rhs;
 146   return scalar_type;
 147 }
 148
 149
 150 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 151    tested at run-time.  Return TRUE if DDR was successfully inserted.
 152    Return false if versioning is not supported.  */
 153
 154 static bool
 155 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 156 {
 157   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 158
 159   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 160     return false;
 161
 162   if (dump_enabled_p ())
 163     {
 164       dump_printf_loc (MSG_NOTE, vect_location,
 165                        "mark for run-time aliasing test between ");
 166       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
 167       dump_printf (MSG_NOTE,  " and ");
 168       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
 169       dump_printf (MSG_NOTE, "\n");
 170     }
 171
 172   if (optimize_loop_nest_for_size_p (loop))
 173     {
 174       if (dump_enabled_p ())
 175         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 176                          "versioning not supported when optimizing"
 177                          " for size.\n");
 178       return false;
 179     }
 180
 181   /* FORNOW: We don't support versioning with outer-loop vectorization.  */
 182   if (loop->inner)
 183     {
 184       if (dump_enabled_p ())
 185         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 186                          "versioning not yet supported for outer-loops.\n");
 187       return false;
 188     }
 189
 190   /* FORNOW: We don't support creating runtime alias tests for non-constant
 191      step.  */
 192   if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
 193       || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
 194     {
 195       if (dump_enabled_p ())
 196         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 197                          "versioning not yet supported for non-constant "
 198                          "step\n");
 199       return false;
 200     }
 201
 202   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 203   return true;
 204 }
 205
 206
 207 /* Function vect_analyze_data_ref_dependence.
 208
 209    Return TRUE if there (might) exist a dependence between a memory-reference
 210    DRA and a memory-reference DRB.  When versioning for alias may check a
 211    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 212    the data dependence.  */
 213
 214 static bool
 215 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 216                                   loop_vec_info loop_vinfo, int *max_vf)
 217 {
 218   unsigned int i;
 219   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 220   struct data_reference *dra = DDR_A (ddr);
 221   struct data_reference *drb = DDR_B (ddr);
 222   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 223   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 224   lambda_vector dist_v;
 225   unsigned int loop_depth;
 226
 227   /* In loop analysis all data references should be vectorizable.  */
 228   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 229       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 230     gcc_unreachable ();
 231
 232   /* Independent data accesses.  */
 233   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 234     return false;
 235
 236   if (dra == drb
 237       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 238     return false;
 239
 240   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 241      least two scalar iterations, there is always also a true dependence.
 242      As the vectorizer does not re-order loads and stores we can ignore
 243      the anti-dependence if TBAA can disambiguate both DRs similar to the
 244      case with known negative distance anti-dependences (positive
 245      distance anti-dependences would violate TBAA constraints).  */
 246   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 247        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 248       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 249                                  get_alias_set (DR_REF (drb))))
 250     return false;
 251
 252   /* Unknown data dependence.  */
 253   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 254     {
 255       /* If user asserted safelen consecutive iterations can be
 256          executed concurrently, assume independence.  */
 257       if (loop->safelen >= 2)
 258         {
 259           if (loop->safelen < *max_vf)
 260             *max_vf = loop->safelen;
 261           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 262           return false;
 263         }
 264
 265       if (STMT_VINFO_GATHER_P (stmtinfo_a)
 266           || STMT_VINFO_GATHER_P (stmtinfo_b))
 267         {
 268           if (dump_enabled_p ())
 269             {
 270               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 271                                "versioning for alias not supported for: "
 272                                "can't determine dependence between ");
 273               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 274                                  DR_REF (dra));
 275               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 276               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 277                                  DR_REF (drb));
 278               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 279             }
 280           return true;
 281         }
 282
 283       if (dump_enabled_p ())
 284         {
 285           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 286                            "versioning for alias required: "
 287                            "can't determine dependence between ");
 288           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 289                              DR_REF (dra));
 290           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 291           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 292                              DR_REF (drb));
 293           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 294         }
 295
 296       /* Add to list of ddrs that need to be tested at run-time.  */
 297       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 298     }
 299
 300   /* Known data dependence.  */
 301   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 302     {
 303       /* If user asserted safelen consecutive iterations can be
 304          executed concurrently, assume independence.  */
 305       if (loop->safelen >= 2)
 306         {
 307           if (loop->safelen < *max_vf)
 308             *max_vf = loop->safelen;
 309           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 310           return false;
 311         }
 312
 313       if (STMT_VINFO_GATHER_P (stmtinfo_a)
 314           || STMT_VINFO_GATHER_P (stmtinfo_b))
 315         {
 316           if (dump_enabled_p ())
 317             {
 318               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 319                                "versioning for alias not supported for: "
 320                                "bad dist vector for ");
 321               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 322                                  DR_REF (dra));
 323               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 324               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 325                                  DR_REF (drb));
 326               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 327             }
 328           return true;
 329         }
 330
 331       if (dump_enabled_p ())
 332         {
 333           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 334                            "versioning for alias required: "
 335                            "bad dist vector for ");
 336           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 337           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 338           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 339           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 340         }
 341       /* Add to list of ddrs that need to be tested at run-time.  */
 342       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 343     }
 344
 345   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 346   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 347     {
 348       int dist = dist_v[loop_depth];
 349
 350       if (dump_enabled_p ())
 351         dump_printf_loc (MSG_NOTE, vect_location,
 352                          "dependence distance  = %d.\n", dist);
 353
 354       if (dist == 0)
 355         {
 356           if (dump_enabled_p ())
 357             {
 358               dump_printf_loc (MSG_NOTE, vect_location,
 359                                "dependence distance == 0 between ");
 360               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 361               dump_printf (MSG_NOTE, " and ");
 362               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 363               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 364             }
 365
 366           /* When we perform grouped accesses and perform implicit CSE
 367              by detecting equal accesses and doing disambiguation with
 368              runtime alias tests like for
 369                 .. = a[i];
 370                 .. = a[i+1];
 371                 a[i] = ..;
 372                 a[i+1] = ..;
 373                 *p = ..;
 374                 .. = a[i];
 375                 .. = a[i+1];
 376              where we will end up loading { a[i], a[i+1] } once, make
 377              sure that inserting group loads before the first load and
 378              stores after the last store will do the right thing.  */
 379           if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 380                && GROUP_SAME_DR_STMT (stmtinfo_a))
 381               || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
 382                   && GROUP_SAME_DR_STMT (stmtinfo_b)))
 383             {
 384               gimple earlier_stmt;
 385               earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 386               if (DR_IS_WRITE
 387                     (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 388                 {
 389                   if (dump_enabled_p ())
 390                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 391                                      "READ_WRITE dependence in interleaving."
 392                                      "\n");
 393                   return true;
 394                 }
 395             }
 396
 397           continue;
 398         }
 399
 400       if (dist > 0 && DDR_REVERSED_P (ddr))
 401         {
 402           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 403              reversed (to make distance vector positive), and the actual
 404              distance is negative.  */
 405           if (dump_enabled_p ())
 406             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 407                              "dependence distance negative.\n");
 408           /* Record a negative dependence distance to later limit the
 409              amount of stmt copying / unrolling we can perform.
 410              Only need to handle read-after-write dependence.  */
 411           if (DR_IS_READ (drb)
 412               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 413                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 414             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 415           continue;
 416         }
 417
 418       if (abs (dist) >= 2
 419           && abs (dist) < *max_vf)
 420         {
 421           /* The dependence distance requires reduction of the maximal
 422              vectorization factor.  */
 423           *max_vf = abs (dist);
 424           if (dump_enabled_p ())
 425             dump_printf_loc (MSG_NOTE, vect_location,
 426                              "adjusting maximal vectorization factor to %i\n",
 427                              *max_vf);
 428         }
 429
 430       if (abs (dist) >= *max_vf)
 431         {
 432           /* Dependence distance does not create dependence, as far as
 433              vectorization is concerned, in this case.  */
 434           if (dump_enabled_p ())
 435             dump_printf_loc (MSG_NOTE, vect_location,
 436                              "dependence distance >= VF.\n");
 437           continue;
 438         }
 439
 440       if (dump_enabled_p ())
 441         {
 442           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 443                        "not vectorized, possible dependence "
 444                        "between data-refs ");
 445           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 446           dump_printf (MSG_NOTE,  " and ");
 447           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 448           dump_printf (MSG_NOTE,  "\n");
 449         }
 450
 451       return true;
 452     }
 453
 454   return false;
 455 }
 456
 457 /* Function vect_analyze_data_ref_dependences.
 458
 459    Examine all the data references in the loop, and make sure there do not
 460    exist any data dependences between them.  Set *MAX_VF according to
 461    the maximum vectorization factor the data dependences allow.  */
 462
 463 bool
 464 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
 465 {
 466   unsigned int i;
 467   struct data_dependence_relation *ddr;
 468
 469   if (dump_enabled_p ())
 470     dump_printf_loc (MSG_NOTE, vect_location,
 471                      "=== vect_analyze_data_ref_dependences ===\n");
 472
 473   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 474   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 475                                 &LOOP_VINFO_DDRS (loop_vinfo),
 476                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 477     return false;
 478
 479   FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 480     if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 481       return false;
 482
 483   return true;
 484 }
 485
 486
 487 /* Function vect_slp_analyze_data_ref_dependence.
 488
 489    Return TRUE if there (might) exist a dependence between a memory-reference
 490    DRA and a memory-reference DRB.  When versioning for alias may check a
 491    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 492    the data dependence.  */
 493
 494 static bool
 495 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 496 {
 497   struct data_reference *dra = DDR_A (ddr);
 498   struct data_reference *drb = DDR_B (ddr);
 499
 500   /* We need to check dependences of statements marked as unvectorizable
 501      as well, they still can prohibit vectorization.  */
 502
 503   /* Independent data accesses.  */
 504   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 505     return false;
 506
 507   if (dra == drb)
 508     return false;
 509
 510   /* Read-read is OK.  */
 511   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 512     return false;
 513
 514   /* If dra and drb are part of the same interleaving chain consider
 515      them independent.  */
 516   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 517       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 518           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 519     return false;
 520
 521   /* Unknown data dependence.  */
 522   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 523     {
 524       if  (dump_enabled_p ())
 525         {
 526           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 527                            "can't determine dependence between ");
 528           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 529           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 530           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 531           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 532         }
 533     }
 534   else if (dump_enabled_p ())
 535     {
 536       dump_printf_loc (MSG_NOTE, vect_location,
 537                        "determined dependence between ");
 538       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 539       dump_printf (MSG_NOTE, " and ");
 540       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 541       dump_printf (MSG_NOTE,  "\n");
 542     }
 543
 544   /* We do not vectorize basic blocks with write-write dependencies.  */
 545   if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
 546     return true;
 547
 548   /* If we have a read-write dependence check that the load is before the store.
 549      When we vectorize basic blocks, vector load can be only before
 550      corresponding scalar load, and vector store can be only after its
 551      corresponding scalar store.  So the order of the acceses is preserved in
 552      case the load is before the store.  */
 553   gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 554   if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 555     {
 556       /* That only holds for load-store pairs taking part in vectorization.  */
 557       if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
 558           && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
 559         return false;
 560     }
 561
 562   return true;
 563 }
 564
 565
 566 /* Function vect_analyze_data_ref_dependences.
 567
 568    Examine all the data references in the basic-block, and make sure there
 569    do not exist any data dependences between them.  Set *MAX_VF according to
 570    the maximum vectorization factor the data dependences allow.  */
 571
 572 bool
 573 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
 574 {
 575   struct data_dependence_relation *ddr;
 576   unsigned int i;
 577
 578   if (dump_enabled_p ())
 579     dump_printf_loc (MSG_NOTE, vect_location,
 580                      "=== vect_slp_analyze_data_ref_dependences ===\n");
 581
 582   if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
 583                                 &BB_VINFO_DDRS (bb_vinfo),
 584                                 vNULL, true))
 585     return false;
 586
 587   FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
 588     if (vect_slp_analyze_data_ref_dependence (ddr))
 589       return false;
 590
 591   return true;
 592 }
 593
 594
 595 /* Function vect_compute_data_ref_alignment
 596
 597    Compute the misalignment of the data reference DR.
 598
 599    Output:
 600    1. If during the misalignment computation it is found that the data reference
 601       cannot be vectorized then false is returned.
 602    2. DR_MISALIGNMENT (DR) is defined.
 603
 604    FOR NOW: No analysis is actually performed. Misalignment is calculated
 605    only for trivial cases. TODO.  */
 606
 607 static bool
 608 vect_compute_data_ref_alignment (struct data_reference *dr)
 609 {
 610   gimple stmt = DR_STMT (dr);
 611   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 612   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 613   struct loop *loop = NULL;
 614   tree ref = DR_REF (dr);
 615   tree vectype;
 616   tree base, base_addr;
 617   bool base_aligned;
 618   tree misalign;
 619   tree aligned_to, alignment;
 620
 621   if (dump_enabled_p ())
 622     dump_printf_loc (MSG_NOTE, vect_location,
 623                      "vect_compute_data_ref_alignment:\n");
 624
 625   if (loop_vinfo)
 626     loop = LOOP_VINFO_LOOP (loop_vinfo);
 627
 628   /* Initialize misalignment to unknown.  */
 629   SET_DR_MISALIGNMENT (dr, -1);
 630
 631   /* Strided loads perform only component accesses, misalignment information
 632      is irrelevant for them.  */
 633   if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
 634     return true;
 635
 636   misalign = DR_INIT (dr);
 637   aligned_to = DR_ALIGNED_TO (dr);
 638   base_addr = DR_BASE_ADDRESS (dr);
 639   vectype = STMT_VINFO_VECTYPE (stmt_info);
 640
 641   /* In case the dataref is in an inner-loop of the loop that is being
 642      vectorized (LOOP), we use the base and misalignment information
 643      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 644      stays the same throughout the execution of the inner-loop, which is why
 645      we have to check that the stride of the dataref in the inner-loop evenly
 646      divides by the vector size.  */
 647   if (loop && nested_in_vect_loop_p (loop, stmt))
 648     {
 649       tree step = DR_STEP (dr);
 650       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
 651
 652       if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
 653         {
 654           if (dump_enabled_p ())
 655             dump_printf_loc (MSG_NOTE, vect_location,
 656                              "inner step divides the vector-size.\n");
 657           misalign = STMT_VINFO_DR_INIT (stmt_info);
 658           aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
 659           base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
 660         }
 661       else
 662         {
 663           if (dump_enabled_p ())
 664             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 665                              "inner step doesn't divide the vector-size.\n");
 666           misalign = NULL_TREE;
 667         }
 668     }
 669
 670   /* Similarly, if we're doing basic-block vectorization, we can only use
 671      base and misalignment information relative to an innermost loop if the
 672      misalignment stays the same throughout the execution of the loop.
 673      As above, this is the case if the stride of the dataref evenly divides
 674      by the vector size.  */
 675   if (!loop)
 676     {
 677       tree step = DR_STEP (dr);
 678       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
 679
 680       if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
 681         {
 682           if (dump_enabled_p ())
 683             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 684                              "SLP: step doesn't divide the vector-size.\n");
 685           misalign = NULL_TREE;
 686         }
 687     }
 688
 689   base = build_fold_indirect_ref (base_addr);
 690   alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
 691
 692   if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
 693       || !misalign)
 694     {
 695       if (dump_enabled_p ())
 696         {
 697           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 698                            "Unknown alignment for access: ");
 699           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
 700           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 701         }
 702       return true;
 703     }
 704
 705   if ((DECL_P (base)
 706        && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
 707                                 alignment) >= 0)
 708       || (TREE_CODE (base_addr) == SSA_NAME
 709           && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
 710                                                       TREE_TYPE (base_addr)))),
 711                                    alignment) >= 0)
 712       || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
 713     base_aligned = true;
 714   else
 715     base_aligned = false;
 716
 717   if (!base_aligned)
 718     {
 719       /* Do not change the alignment of global variables here if
 720          flag_section_anchors is enabled as we already generated
 721          RTL for other functions.  Most global variables should
 722          have been aligned during the IPA increase_alignment pass.  */
 723       if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
 724           || (TREE_STATIC (base) && flag_section_anchors))
 725         {
 726           if (dump_enabled_p ())
 727             {
 728               dump_printf_loc (MSG_NOTE, vect_location,
 729                                "can't force alignment of ref: ");
 730               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 731               dump_printf (MSG_NOTE, "\n");
 732             }
 733           return true;
 734         }
 735
 736       /* Force the alignment of the decl.
 737          NOTE: This is the only change to the code we make during
 738          the analysis phase, before deciding to vectorize the loop.  */
 739       if (dump_enabled_p ())
 740         {
 741           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 742           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 743           dump_printf (MSG_NOTE, "\n");
 744         }
 745
 746       ((dataref_aux *)dr->aux)->base_decl = base;
 747       ((dataref_aux *)dr->aux)->base_misaligned = true;
 748     }
 749
 750   /* If this is a backward running DR then first access in the larger
 751      vectype actually is N-1 elements before the address in the DR.
 752      Adjust misalign accordingly.  */
 753   if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
 754     {
 755       tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
 756       /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
 757          otherwise we wouldn't be here.  */
 758       offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
 759       /* PLUS because DR_STEP was negative.  */
 760       misalign = size_binop (PLUS_EXPR, misalign, offset);
 761     }
 762
 763   /* Modulo alignment.  */
 764   misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
 765
 766   if (!tree_fits_uhwi_p (misalign))
 767     {
 768       /* Negative or overflowed misalignment value.  */
 769       if (dump_enabled_p ())
 770         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 771                          "unexpected misalign value\n");
 772       return false;
 773     }
 774
 775   SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));
 776
 777   if (dump_enabled_p ())
 778     {
 779       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 780                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
 781       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 782       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 783     }
 784
 785   return true;
 786 }
 787
 788
 789 /* Function vect_compute_data_refs_alignment
 790
 791    Compute the misalignment of the data references recorded in LOOP_VINFO,
 792    or in BB_VINFO when LOOP_VINFO is null.  Return FALSE if a data reference
 793    cannot be handled, unless BB_VINFO is given: then the statement of such
 794    a data reference is only marked as not vectorizable.  */
 795
 796 static bool
 797 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
 798                                   bb_vec_info bb_vinfo)
 799 {
 800   unsigned int ix;
 801   struct data_reference *ref;
 802   vec<data_reference_p> refs = (loop_vinfo
 803                                 ? LOOP_VINFO_DATAREFS (loop_vinfo)
 804                                 : BB_VINFO_DATAREFS (bb_vinfo));
 805
 806   FOR_EACH_VEC_ELT (refs, ix, ref)
 807     {
 808       stmt_vec_info ref_info = vinfo_for_stmt (DR_STMT (ref));
 809
 810       /* Statements already known not to be vectorizable are left alone,
 811          and nothing remains to be done once the computation succeeds.  */
 812       if (!STMT_VINFO_VECTORIZABLE (ref_info)
 813           || vect_compute_data_ref_alignment (ref))
 814         continue;
 815
 816       /* The computation failed; that is fatal unless BB_VINFO is given.  */
 817       if (!bb_vinfo)
 818         return false;
 819       STMT_VINFO_VECTORIZABLE (ref_info) = false;
 820     }
 821   return true;
 822 }
 823
 824
 825 /* Function vect_update_misalignment_for_peel
 826
 827    DR - the data reference whose misalignment is to be adjusted.
 828    DR_PEEL - the data reference whose misalignment is being made
 829              zero in the vector loop by the peel.
 830    NPEEL - the number of iterations in the peel loop if the misalignment
 831            of DR_PEEL is known at compile time.
 832    The misalignment of DR is set to -1 if it is neither known to become
 833    zero together with that of DR_PEEL nor computable from NPEEL.  */
 834
 835 static void
 836 vect_update_misalignment_for_peel (struct data_reference *dr,
 837                                    struct data_reference *dr_peel, int npeel)
 838 {
 839   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
 840   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
 841   int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
 842   int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
 843   vec<dr_p> aligned_with_peel = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
 844   struct data_reference *other;
 845   unsigned int ix;
 846
 847   /* For interleaved data accesses the step in the loop must be multiplied by
 848      the size of the interleaving group.  */
 849   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 850     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
 851   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
 852     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
 853
 854   /* It can be assumed that the data refs with the same alignment as dr_peel
 855      are aligned in the vector loop.  */
 856   FOR_EACH_VEC_ELT (aligned_with_peel, ix, other)
 857     if (other == dr)
 858       {
 859         gcc_assert (DR_MISALIGNMENT (dr) / dr_size
 860                     == DR_MISALIGNMENT (dr_peel) / dr_peel_size);
 861         SET_DR_MISALIGNMENT (dr, 0);
 862         return;
 863       }
 864
 865   if (!known_alignment_for_access_p (dr)
 866       || !known_alignment_for_access_p (dr_peel))
 867     {
 868       if (dump_enabled_p ())
 869         dump_printf_loc (MSG_NOTE, vect_location,
 870                          "Setting misalignment to -1.\n");
 871       SET_DR_MISALIGNMENT (dr, -1);
 872       return;
 873     }
 874
 875   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 876   bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
 877   int misal = DR_MISALIGNMENT (dr);
 878   misal += (negative ? -npeel : npeel) * dr_size;
 879   misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
 880   SET_DR_MISALIGNMENT (dr, misal);
 881 }
 882
 883
 884 /* Function vect_verify_datarefs_alignment
 885
 886    Return TRUE if all data references in the loop (LOOP_VINFO) or, when
 887    LOOP_VINFO is null, in the basic block (BB_VINFO) can be handled with
 888    respect to alignment.  */
 889
 890 bool
 891 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
 892 {
 893   unsigned int ix;
 894   struct data_reference *dr;
 895   vec<data_reference_p> refs = (loop_vinfo
 896                                 ? LOOP_VINFO_DATAREFS (loop_vinfo)
 897                                 : BB_VINFO_DATAREFS (bb_vinfo));
 898
 899   FOR_EACH_VEC_ELT (refs, ix, dr)
 900     {
 901       gimple stmt = DR_STMT (dr);
 902       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 903
 904       /* Irrelevant statements and statements marked as not vectorizable
 905          need no check.  */
 906       if (!STMT_VINFO_RELEVANT_P (stmt_info)
 907           || !STMT_VINFO_VECTORIZABLE (stmt_info))
 908         continue;
 909
 910       /* For interleaving, only the alignment of the first access matters.  */
 911       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
 912           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
 913         continue;
 914
 915       /* Strided loads perform only component accesses, alignment is
 916          irrelevant for them.  */
 917       if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
 918         continue;
 919
 920       enum dr_alignment_support support
 921         = vect_supportable_dr_alignment (dr, false);
 922
 923       if (support)
 924         {
 925           if (support != dr_aligned && dump_enabled_p ())
 926             dump_printf_loc (MSG_NOTE, vect_location,
 927                              "Vectorizing an unaligned access.\n");
 928           continue;
 929         }
 930
 931       /* The access cannot be supported: report it and give up.  */
 932       if (dump_enabled_p ())
 933         {
 934           if (DR_IS_READ (dr))
 935             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 936                              "not vectorized: unsupported unaligned load.");
 937           else
 938             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 939                              "not vectorized: unsupported unaligned store.");
 940           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dr));
 941           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 942         }
 943       return false;
 944     }
 945   return true;
 946 }
 947
 948 /* Given a memory reference EXP, return whether its alignment is less
 949    than the size of its type.  */
 950
 951 static bool
 952 not_size_aligned (tree exp)
 953 {
 954   tree size = TYPE_SIZE (TREE_TYPE (exp));
 955
 956   /* A size that does not fit an unsigned HOST_WIDE_INT counts as well.  */
 957   return (!tree_fits_uhwi_p (size)
 958           || tree_to_uhwi (size) > get_object_alignment (exp));
 959 }
 960
 961 /* Function vector_alignment_reachable_p
 962
 963    Return true if the vector alignment of DR can be reached by peeling
 964    a few iterations of the loop, and false otherwise.  */
 965
 966 static bool
 967 vector_alignment_reachable_p (struct data_reference *dr)
 968 {
 969   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
 970   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 971
 972   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 973     {
 974       /* FORNOW: handle only known alignment.  */
 975       if (!known_alignment_for_access_p (dr))
 976         return false;
 977
 978       /* For interleaved access we peel only if number of iterations in
 979          the prolog loop ({VF - misalignment}), is a multiple of the
 980          number of the interleaved accesses.  */
 981       int nunits = TYPE_VECTOR_SUBPARTS (vectype);
 982       int unit_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nunits;
 983       int mis_in_units = DR_MISALIGNMENT (dr) / unit_size;
 984
 985       if ((nunits - mis_in_units) % GROUP_SIZE (stmt_info) != 0)
 986         return false;
 987     }
 988
 989   if (!known_alignment_for_access_p (dr))
 990     {
 991       /* With an unknown misalignment, rely on a user-specified alignment
 992          of an access that is not packed, or else ask the target.  */
 993       tree type = TREE_TYPE (DR_REF (dr));
 994       bool is_packed = not_size_aligned (DR_REF (dr));
 995
 996       if (dump_enabled_p ())
 997         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 998                          "Unknown misalignment, is_packed = %d\n",is_packed);
 999       return ((TYPE_USER_ALIGN (type) && !is_packed)
1000               || targetm.vectorize.vector_alignment_reachable (type,
1001                                                                is_packed));
1002     }
1003
1004   /* If misalignment is known at the compile time then allow peeling
1005      only if natural alignment is reachable through peeling.  */
1006   if (!aligned_access_p (dr))
1007     {
1008       HOST_WIDE_INT elmsize
1009         = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1010
1011       if (dump_enabled_p ())
1012         {
1013           dump_printf_loc (MSG_NOTE, vect_location,
1014                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1015           dump_printf (MSG_NOTE,
1016                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1017         }
1018
1019       if (DR_MISALIGNMENT (dr) % elmsize != 0)
1020         {
1021           if (dump_enabled_p ())
1022             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1023                              "data size does not divide the misalignment.\n");
1024           return false;
1025         }
1026     }
1027
1028   return true;
1029 }
1030
1031
1032 /* Calculate the cost of the memory access represented by DR (a load or a
1033    store), reporting it via INSIDE_COST, OUTSIDE_COST and BODY_COST_VEC.  */
1034
1035 static void
1036 vect_get_data_access_cost (struct data_reference *dr,
1037                            unsigned int *inside_cost,
1038                            unsigned int *outside_cost,
1039                            stmt_vector_for_cost *body_cost_vec)
1040 {
1041   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1042   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1043   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1044   int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1045   int ncopies = vf / nunits;
1046
1047   if (!DR_IS_READ (dr))
1048     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1049   else
1050     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1051                         NULL, body_cost_vec, false);
1052
1053   if (dump_enabled_p ())
1054     dump_printf_loc (MSG_NOTE, vect_location,
1055                      "vect_get_data_access_cost: inside_cost = %d, "
1056                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1057 }
1058
1059
1060 /* Insert DR into peeling hash table with NPEEL as key.  */
1061
1062 static void
1063 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1064                           int npeel)
1065 {
1066   bool supported = vect_supportable_dr_alignment (dr, true);
1067   hash_table<peel_info_hasher> *htab = LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1068   struct _vect_peel_info key;
1069
1070   key.npeel = npeel;
1071   _vect_peel_info *entry = htab->find (&key);
1072   if (!entry)
1073     {
1074       entry = XNEW (struct _vect_peel_info);
1075       entry->npeel = npeel;
1076       entry->dr = dr;
1077       entry->count = 1;
1078       _vect_peel_info **new_slot = htab->find_slot (entry, INSERT);
1079       *new_slot = entry;
1080     }
1081   else
1082     entry->count++;
1083
1084   /* Without a cost limit the option with the highest count is chosen, so
1085      strongly favor aligning an access that is not supported as it is.  */
1086   if (!supported && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1087     entry->count += VECT_MAX_COST;
1088 }
1089
1090
1091 /* Traverse peeling hash table to find peeling option that aligns maximum
1092    number of data accesses.  The option in *SLOT is recorded in MAX if it
1093    has a higher count, or the same count and a smaller peeling amount.  */
1094
1095 int
1096 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1097                                      _vect_peel_extended_info *max)
1098 {
1099   vect_peel_info cand = *slot;
1100   vect_peel_info best = &max->peel_info;
1101
1102   if (cand->count < best->count
1103       || (cand->count == best->count && cand->npeel >= best->npeel))
1104     return 1;
1105
1106   best->npeel = cand->npeel;
1107   best->count = cand->count;
1108   best->dr = cand->dr;
1109   return 1;
1110 }
1111
1112
1113 /* Traverse peeling hash table and calculate cost for each peeling option.
1114    Find the one with the lowest cost: *SLOT is recorded in MIN if cheaper.  */
1115
1116 int
1117 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1118                                    _vect_peel_extended_info *min)
1119 {
1120   vect_peel_info elem = *slot;
1121   int save_misalignment, dummy;
1122   unsigned int inside_cost = 0, outside_cost = 0, i;
1123   gimple stmt = DR_STMT (elem->dr);
1124   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1125   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1126   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1127   struct data_reference *dr;
1128   stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1129   int single_iter_cost;
1130
1131   prologue_cost_vec.create (2);
1132   body_cost_vec.create (2);
1133   epilogue_cost_vec.create (2);
1134
1135   FOR_EACH_VEC_ELT (datarefs, i, dr)
1136     {
1137       stmt = DR_STMT (dr);
1138       stmt_info = vinfo_for_stmt (stmt);
1139       /* For interleaving, only the alignment of the first access
1140          matters.  */
1141       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1142           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1143         continue;
1144       /* Cost DR as if the loop were peeled for ELEM, then undo the update.  */
1145       save_misalignment = DR_MISALIGNMENT (dr);
1146       vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1147       vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1148                                  &body_cost_vec);
1149       SET_DR_MISALIGNMENT (dr, save_misalignment);
1150     }
1151
1152   single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1153   outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1154                                                &dummy, single_iter_cost,
1155                                                &prologue_cost_vec,
1156                                                &epilogue_cost_vec);
1157
1158   /* Prologue and epilogue costs are added to the target model later.
1159      These costs depend only on the scalar iteration cost, the
1160      number of peeling iterations finally chosen, and the number of
1161      misaligned statements.  So discard the information found here.  */
1162   prologue_cost_vec.release ();
1163   epilogue_cost_vec.release ();
1164   /* A lower inside cost wins; the outside cost only breaks ties.  */
1165   if (inside_cost < min->inside_cost
1166       || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1167     {
1168       min->inside_cost = inside_cost;
1169       min->outside_cost = outside_cost;
1170       min->body_cost_vec.release ();
1171       min->body_cost_vec = body_cost_vec;
1172       min->peel_info.dr = elem->dr;
1173       min->peel_info.npeel = elem->npeel;
1174     }
1175   else
1176     body_cost_vec.release ();
1177
1178   return 1;
1179 }
1180
1181
1182 /* Choose the best peeling option in the peeling hash table of LOOP_VINFO:
1183    the one aligning the most accesses if the cost model is unlimited, the
1184    cheapest one otherwise.  Return its data reference and store its peeling
1185    amount in *NPEEL and its body cost vector in *BODY_COST_VEC.  */
1186
1187 static struct data_reference *
1188 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1189                                        unsigned int *npeel,
1190                                        stmt_vector_for_cost *body_cost_vec)
1191 {
1192   hash_table<peel_info_hasher> *htab = LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1193   struct _vect_peel_extended_info res;
1194
1195   /* Give every result a defined value in case no option gets selected.  */
1196   res.peel_info.dr = NULL;
1197   res.peel_info.npeel = 0;
1198   res.peel_info.count = 0;
1199   res.body_cost_vec = stmt_vector_for_cost ();
1200
1201   if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1202     {
1203       res.inside_cost = INT_MAX;
1204       res.outside_cost = INT_MAX;
1205       htab->traverse <_vect_peel_extended_info *,
1206                       vect_peeling_hash_get_lowest_cost> (&res);
1207     }
1208   else
1209     htab->traverse <_vect_peel_extended_info *,
1210                     vect_peeling_hash_get_most_frequent> (&res);
1211
1212   *npeel = res.peel_info.npeel;
1213   *body_cost_vec = res.body_cost_vec;
1214   return res.peel_info.dr;
1215 }
1216
1217
1218 /* Function vect_enhance_data_refs_alignment
1219
1220    This pass will use loop versioning and loop peeling in order to enhance
1221    the alignment of data references in the loop.
1222
1223    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1224    original loop is to be vectorized.  Any other loops that are created by
1225    the transformations performed in this pass - are not supposed to be
1226    vectorized.  This restriction will be relaxed.
1227
1228    This pass will require a cost model to guide it whether to apply peeling
1229    or versioning or a combination of the two.  For example, the scheme that
1230    Intel uses when given a loop with several memory accesses, is as follows:
1231    choose one memory access ('p') whose alignment you want to force by doing
1232    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1233    other accesses are not necessarily aligned, or (2) use loop versioning to
1234    generate one loop in which all accesses are aligned, and another loop in
1235    which only 'p' is necessarily aligned.
1236
1237    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1238    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1239    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1240
1241    Devising a cost model is the most critical aspect of this work.  It will
1242    guide us on which access to peel for, whether to use loop versioning, how
1243    many versions to create, etc.  The cost model will probably consist of
1244    generic considerations as well as target specific considerations (on
1245    powerpc for example, misaligned stores are more painful than misaligned
1246    loads).
1247
1248    Here are the general steps involved in alignment enhancements:
1249
1250      -- original loop, before alignment analysis:
1251         for (i=0; i<N; i++){
1252           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1253           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1254         }
1255
1256      -- After vect_compute_data_refs_alignment:
1257         for (i=0; i<N; i++){
1258           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1259           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1260         }
1261
1262      -- Possibility 1: we do loop versioning:
1263      if (p is aligned) {
1264         for (i=0; i<N; i++){    # loop 1A
1265           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1266           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1267         }
1268      }
1269      else {
1270         for (i=0; i<N; i++){    # loop 1B
1271           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1272           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1273         }
1274      }
1275
1276      -- Possibility 2: we do loop peeling:
1277      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1278         x = q[i];
1279         p[i] = y;
1280      }
1281      for (i = 3; i < N; i++){   # loop 2A
1282         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1283         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1284      }
1285
1286      -- Possibility 3: combination of loop peeling and versioning:
1287      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1288         x = q[i];
1289         p[i] = y;
1290      }
1291      if (p is aligned) {
1292         for (i = 3; i<N; i++){  # loop 3A
1293           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1294           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1295         }
1296      }
1297      else {
1298         for (i = 3; i<N; i++){  # loop 3B
1299           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1300           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1301         }
1302      }
1303
1304      These loops are later passed to loop_transform to be vectorized.  The
1305      vectorizer will use the alignment information to guide the transformation
1306      (whether to generate regular loads/stores, or with special handling for
1307      misalignment).  */
1308
1309 bool
1310 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1311 {
1312   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1313   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1314   enum dr_alignment_support supportable_dr_alignment;
1315   struct data_reference *dr0 = NULL, *first_store = NULL;
1316   struct data_reference *dr;
1317   unsigned int i, j;
1318   bool do_peeling = false;
1319   bool do_versioning = false;
1320   bool stat;
1321   gimple stmt;
1322   stmt_vec_info stmt_info;
1323   unsigned int npeel = 0;
1324   bool all_misalignments_unknown = true;
1325   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1326   unsigned possible_npeel_number = 1;
1327   tree vectype;
1328   unsigned int nelements, mis, same_align_drs_max = 0;
1329   stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1330
1331   if (dump_enabled_p ())
1332     dump_printf_loc (MSG_NOTE, vect_location,
1333                      "=== vect_enhance_data_refs_alignment ===\n");
1334
1335   /* While cost model enhancements are expected in the future, the high level
1336      view of the code at this time is as follows:
1337
1338      A) If there is a misaligned access then see if peeling to align
1339         this access can make all data references satisfy
1340         vect_supportable_dr_alignment.  If so, update data structures
1341         as needed and return true.
1342
1343      B) If peeling wasn't possible and there is a data reference with an
1344         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1345         then see if loop versioning checks can be used to make all data
1346         references satisfy vect_supportable_dr_alignment.  If so, update
1347         data structures as needed and return true.
1348
1349      C) If neither peeling nor versioning were successful then return false if
1350         any data reference does not satisfy vect_supportable_dr_alignment.
1351
1352      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1353
1354      Note, Possibility 3 above (which is peeling and versioning together) is not
1355      being done at this time.  */
1356
1357   /* (1) Peeling to force alignment.  */
1358
1359   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1360      Considerations:
1361      + How many accesses will become aligned due to the peeling
1362      - How many accesses will become unaligned due to the peeling,
1363        and the cost of misaligned accesses.
1364      - The cost of peeling (the extra runtime checks, the increase
1365        in code size).  */
1366
1367   FOR_EACH_VEC_ELT (datarefs, i, dr)
1368     {
1369       stmt = DR_STMT (dr);
1370       stmt_info = vinfo_for_stmt (stmt);
1371
1372       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1373         continue;
1374
1375       /* For interleaving, only the alignment of the first access
1376          matters.  */
1377       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1378           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1379         continue;
1380
1381       /* For invariant accesses there is nothing to enhance.  */
1382       if (integer_zerop (DR_STEP (dr)))
1383         continue;
1384
1385       /* Strided loads perform only component accesses, alignment is
1386          irrelevant for them.  */
1387       if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1388         continue;
1389
1390       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1391       do_peeling = vector_alignment_reachable_p (dr);
1392       if (do_peeling)
1393         {
1394           if (known_alignment_for_access_p (dr))
1395             {
1396               unsigned int npeel_tmp;
1397               bool negative = tree_int_cst_compare (DR_STEP (dr),
1398                                                     size_zero_node) < 0;
1399
1400               /* Save info about DR in the hash table.  */
1401               if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1402                 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1403                   = new hash_table<peel_info_hasher> (1);
1404
1405               vectype = STMT_VINFO_VECTYPE (stmt_info);
1406               nelements = TYPE_VECTOR_SUBPARTS (vectype);
1407               mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1408                                                 TREE_TYPE (DR_REF (dr))));
1409               npeel_tmp = (negative
1410                            ? (mis - nelements) : (nelements - mis))
1411                   & (nelements - 1);
1412
1413               /* For multiple types, it is possible that the bigger type access
1414                  will have more than one peeling option.  E.g., a loop with two
1415                  types: one of size (vector size / 4), and the other one of
1416                  size (vector size / 8).  Vectorization factor will be 8.  If
1417                  both accesses are misaligned by 3, the first one needs one
1418                  scalar iteration to be aligned, and the second one needs 5.
1419                  But the first one will be aligned also by peeling 5 scalar
1420                  iterations, and in that case both accesses will be aligned.
1421                  Hence, except for the immediate peeling amount, we also want
1422                  to try to add full vector size, while we don't exceed
1423                  vectorization factor.
1424                  We do this automatically for cost model, since we calculate
1425                  cost for every peeling option.  */
1426               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1427                 possible_npeel_number = vf /nelements;
1428
1429               /* Handle the aligned case. We may decide to align some other
1430                  access, making DR unaligned.  */
1431               if (DR_MISALIGNMENT (dr) == 0)
1432                 {
1433                   npeel_tmp = 0;
1434                   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1435                     possible_npeel_number++;
1436                 }
1437
1438               for (j = 0; j < possible_npeel_number; j++)
1439                 {
1440                   gcc_assert (npeel_tmp <= vf);
1441                   vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1442                   npeel_tmp += nelements;
1443                 }
1444
1445               all_misalignments_unknown = false;
1446               /* Data-ref that was chosen for the case that all the
1447                  misalignments are unknown is not relevant anymore, since we
1448                  have a data-ref with known alignment.  */
1449               dr0 = NULL;
1450             }
1451           else
1452             {
1453               /* If we don't know any misalignment values, we prefer
1454                  peeling for data-ref that has the maximum number of data-refs
1455                  with the same alignment, unless the target prefers to align
1456                  stores over load.  */
1457               if (all_misalignments_unknown)
1458                 {
1459                   unsigned same_align_drs
1460                     = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1461                   if (!dr0
1462                       || same_align_drs_max < same_align_drs)
1463                     {
1464                       same_align_drs_max = same_align_drs;
1465                       dr0 = dr;
1466                     }
1467                   /* For data-refs with the same number of related
1468                      accesses prefer the one where the misalign
1469                      computation will be invariant in the outermost loop.  */
1470                   else if (same_align_drs_max == same_align_drs)
1471                     {
1472                       struct loop *ivloop0, *ivloop;
1473                       ivloop0 = outermost_invariant_loop_for_expr
1474                           (loop, DR_BASE_ADDRESS (dr0));
1475                       ivloop = outermost_invariant_loop_for_expr
1476                           (loop, DR_BASE_ADDRESS (dr));
1477                       if ((ivloop && !ivloop0)
1478                           || (ivloop && ivloop0
1479                               && flow_loop_nested_p (ivloop, ivloop0)))
1480                         dr0 = dr;
1481                     }
1482
1483                   if (!first_store && DR_IS_WRITE (dr))
1484                     first_store = dr;
1485                 }
1486
1487               /* If there are both known and unknown misaligned accesses in the
1488                  loop, we choose peeling amount according to the known
1489                  accesses.  */
1490               if (!supportable_dr_alignment)
1491                 {
1492                   dr0 = dr;
1493                   if (!first_store && DR_IS_WRITE (dr))
1494                     first_store = dr;
1495                 }
1496             }
1497         }
1498       else
1499         {
1500           if (!aligned_access_p (dr))
1501             {
1502               if (dump_enabled_p ())
1503                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1504                                  "vector alignment may not be reachable\n");
1505               break;
1506             }
1507         }
1508     }
1509
1510   /* Check if we can possibly peel the loop.  */
1511   if (!vect_can_advance_ivs_p (loop_vinfo)
1512       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1513     do_peeling = false;
1514
1515   if (do_peeling && all_misalignments_unknown
1516       && vect_supportable_dr_alignment (dr0, false))
1517     {
1518
1519       /* Check if the target requires to prefer stores over loads, i.e., if
1520          misaligned stores are more expensive than misaligned loads (taking
1521          drs with same alignment into account).  */
1522       if (first_store && DR_IS_READ (dr0))
1523         {
1524           unsigned int load_inside_cost = 0, load_outside_cost = 0;
1525           unsigned int store_inside_cost = 0, store_outside_cost = 0;
1526           unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1527           unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1528           stmt_vector_for_cost dummy;
1529           dummy.create (2);
1530
1531           vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1532                                      &dummy);
1533           vect_get_data_access_cost (first_store, &store_inside_cost,
1534                                      &store_outside_cost, &dummy);
1535
1536           dummy.release ();
1537
1538           /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1539              aligning the load DR0).  */
1540           load_inside_penalty = store_inside_cost;
1541           load_outside_penalty = store_outside_cost;
1542           for (i = 0;
1543                STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1544                           DR_STMT (first_store))).iterate (i, &dr);
1545                i++)
1546             if (DR_IS_READ (dr))
1547               {
1548                 load_inside_penalty += load_inside_cost;
1549                 load_outside_penalty += load_outside_cost;
1550               }
1551             else
1552               {
1553                 load_inside_penalty += store_inside_cost;
1554                 load_outside_penalty += store_outside_cost;
1555               }
1556
1557           /* Calculate the penalty for leaving DR0 unaligned (by
1558              aligning the FIRST_STORE).  */
1559           store_inside_penalty = load_inside_cost;
1560           store_outside_penalty = load_outside_cost;
1561           for (i = 0;
1562                STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1563                       DR_STMT (dr0))).iterate (i, &dr);
1564                i++)
1565             if (DR_IS_READ (dr))
1566               {
1567                 store_inside_penalty += load_inside_cost;
1568                 store_outside_penalty += load_outside_cost;
1569               }
1570             else
1571               {
1572                 store_inside_penalty += store_inside_cost;
1573                 store_outside_penalty += store_outside_cost;
1574               }
1575
1576           if (load_inside_penalty > store_inside_penalty
1577               || (load_inside_penalty == store_inside_penalty
1578                   && load_outside_penalty > store_outside_penalty))
1579             dr0 = first_store;
1580         }
1581
1582       /* In case there are only loads with different unknown misalignments, use
1583          peeling only if it may help to align other accesses in the loop.  */
1584       if (!first_store
1585           && !STMT_VINFO_SAME_ALIGN_REFS (
1586                   vinfo_for_stmt (DR_STMT (dr0))).length ()
1587           && vect_supportable_dr_alignment (dr0, false)
1588               != dr_unaligned_supported)
1589         do_peeling = false;
1590     }
1591
1592   if (do_peeling && !dr0)
1593     {
1594       /* Peeling is possible, but there is no data access that is not supported
1595          unless aligned. So we try to choose the best possible peeling.  */
1596
1597       /* We should get here only if there are drs with known misalignment.  */
1598       gcc_assert (!all_misalignments_unknown);
1599
1600       /* Choose the best peeling from the hash table.  */
1601       dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1602                                                    &body_cost_vec);
1603       if (!dr0 || !npeel)
1604         do_peeling = false;
1605     }
1606
1607   if (do_peeling)
1608     {
1609       stmt = DR_STMT (dr0);
1610       stmt_info = vinfo_for_stmt (stmt);
1611       vectype = STMT_VINFO_VECTYPE (stmt_info);
1612       nelements = TYPE_VECTOR_SUBPARTS (vectype);
1613
1614       if (known_alignment_for_access_p (dr0))
1615         {
1616           bool negative = tree_int_cst_compare (DR_STEP (dr0),
1617                                                 size_zero_node) < 0;
1618           if (!npeel)
1619             {
1620               /* Since it's known at compile time, compute the number of
1621                  iterations in the peeled loop (the peeling factor) for use in
1622                  updating DR_MISALIGNMENT values.  The peeling factor is the
1623                  vectorization factor minus the misalignment as an element
1624                  count.  */
1625               mis = DR_MISALIGNMENT (dr0);
1626               mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1627               npeel = ((negative ? mis - nelements : nelements - mis)
1628                        & (nelements - 1));
1629             }
1630
1631           /* For interleaved data access every iteration accesses all the
1632              members of the group, therefore we divide the number of iterations
1633              by the group size.  */
1634           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1635           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1636             npeel /= GROUP_SIZE (stmt_info);
1637
1638           if (dump_enabled_p ())
1639             dump_printf_loc (MSG_NOTE, vect_location,
1640                              "Try peeling by %d\n", npeel);
1641         }
1642
1643       /* Ensure that all data refs can be vectorized after the peel.  */
1644       FOR_EACH_VEC_ELT (datarefs, i, dr)
1645         {
1646           int save_misalignment;
1647
1648           if (dr == dr0)
1649             continue;
1650
1651           stmt = DR_STMT (dr);
1652           stmt_info = vinfo_for_stmt (stmt);
1653           /* For interleaving, only the alignment of the first access
1654             matters.  */
1655           if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1656               && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1657             continue;
1658
1659           /* Strided loads perform only component accesses, alignment is
1660              irrelevant for them.  */
1661           if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1662             continue;
1663
1664           save_misalignment = DR_MISALIGNMENT (dr);
1665           vect_update_misalignment_for_peel (dr, dr0, npeel);
1666           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1667           SET_DR_MISALIGNMENT (dr, save_misalignment);
1668
1669           if (!supportable_dr_alignment)
1670             {
1671               do_peeling = false;
1672               break;
1673             }
1674         }
1675
1676       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1677         {
1678           stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1679           if (!stat)
1680             do_peeling = false;
1681           else
1682             {
1683               body_cost_vec.release ();
1684               return stat;
1685             }
1686         }
1687
1688       if (do_peeling)
1689         {
1690           unsigned max_allowed_peel
1691             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1692           if (max_allowed_peel != (unsigned)-1)
1693             {
1694               unsigned max_peel = npeel;
1695               if (max_peel == 0)
1696                 {
1697                   gimple dr_stmt = DR_STMT (dr0);
1698                   stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1699                   tree vtype = STMT_VINFO_VECTYPE (vinfo);
1700                   max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1701                 }
1702               if (max_peel > max_allowed_peel)
1703                 {
1704                   do_peeling = false;
1705                   if (dump_enabled_p ())
1706                     dump_printf_loc (MSG_NOTE, vect_location,
1707                         "Disable peeling, max peels reached: %d\n", max_peel);
1708                 }
1709             }
1710         }
1711
1712       if (do_peeling)
1713         {
1714           stmt_info_for_cost *si;
1715           void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1716
1717           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1718              If the misalignment of DR_i is identical to that of dr0 then set
1719              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
1720              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1721              by the peeling factor times the element size of DR_i (MOD the
1722              vectorization factor times the size).  Otherwise, the
1723              misalignment of DR_i must be set to unknown.  */
1724           FOR_EACH_VEC_ELT (datarefs, i, dr)
1725             if (dr != dr0)
1726               vect_update_misalignment_for_peel (dr, dr0, npeel);
1727
1728           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1729           if (npeel)
1730             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1731           else
1732             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1733               = DR_MISALIGNMENT (dr0);
1734           SET_DR_MISALIGNMENT (dr0, 0);
1735           if (dump_enabled_p ())
1736             {
1737               dump_printf_loc (MSG_NOTE, vect_location,
1738                                "Alignment of access forced using peeling.\n");
1739               dump_printf_loc (MSG_NOTE, vect_location,
1740                                "Peeling for alignment will be applied.\n");
1741             }
1742           /* We've delayed passing the inside-loop peeling costs to the
1743              target cost model until we were sure peeling would happen.
1744              Do so now.  */
1745           if (body_cost_vec.exists ())
1746             {
1747               FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1748                 {
1749                   struct _stmt_vec_info *stmt_info
1750                     = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1751                   (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1752                                         si->misalign, vect_body);
1753                 }
1754               body_cost_vec.release ();
1755             }
1756
1757           stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1758           gcc_assert (stat);
1759           return stat;
1760         }
1761     }
1762
1763   body_cost_vec.release ();
1764
1765   /* (2) Versioning to force alignment.  */
1766
1767   /* Try versioning if:
1768      1) optimize loop for speed
1769      2) there is at least one unsupported misaligned data ref with an unknown
1770         misalignment, and
1771      3) all misaligned data refs with a known misalignment are supported, and
1772      4) the number of runtime alignment checks is within reason.  */
1773
1774   do_versioning =
1775         optimize_loop_nest_for_speed_p (loop)
1776         && (!loop->inner); /* FORNOW */
1777
1778   if (do_versioning)
1779     {
1780       FOR_EACH_VEC_ELT (datarefs, i, dr)
1781         {
1782           stmt = DR_STMT (dr);
1783           stmt_info = vinfo_for_stmt (stmt);
1784
1785           /* For interleaving, only the alignment of the first access
1786              matters.  */
1787           if (aligned_access_p (dr)
1788               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1789                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1790             continue;
1791
1792           /* Strided loads perform only component accesses, alignment is
1793              irrelevant for them.  */
1794           if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1795             continue;
1796
1797           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1798
1799           if (!supportable_dr_alignment)
1800             {
1801               gimple stmt;
1802               int mask;
1803               tree vectype;
1804
1805               if (known_alignment_for_access_p (dr)
1806                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1807                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1808                 {
1809                   do_versioning = false;
1810                   break;
1811                 }
1812
1813               stmt = DR_STMT (dr);
1814               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1815               gcc_assert (vectype);
1816
1817               /* The rightmost bits of an aligned address must be zeros.
1818                  Construct the mask needed for this test.  For example,
1819                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1820                  mask must be 15 = 0xf. */
1821               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1822
1823               /* FORNOW: use the same mask to test all potentially unaligned
1824                  references in the loop.  The vectorizer currently supports
1825                  a single vector size, see the reference to
1826                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1827                  vectorization factor is computed.  */
1828               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1829                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1830               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1831               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1832                       DR_STMT (dr));
1833             }
1834         }
1835
1836       /* Versioning requires at least one misaligned data reference.  */
1837       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1838         do_versioning = false;
1839       else if (!do_versioning)
1840         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1841     }
1842
1843   if (do_versioning)
1844     {
1845       vec<gimple> may_misalign_stmts
1846         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1847       gimple stmt;
1848
1849       /* It can now be assumed that the data references in the statements
1850          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1851          of the loop being vectorized.  */
1852       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1853         {
1854           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1855           dr = STMT_VINFO_DATA_REF (stmt_info);
1856           SET_DR_MISALIGNMENT (dr, 0);
1857           if (dump_enabled_p ())
1858             dump_printf_loc (MSG_NOTE, vect_location,
1859                              "Alignment of access forced using versioning.\n");
1860         }
1861
1862       if (dump_enabled_p ())
1863         dump_printf_loc (MSG_NOTE, vect_location,
1864                          "Versioning for alignment will be applied.\n");
1865
1866       /* Peeling and versioning can't be done together at this time.  */
1867       gcc_assert (! (do_peeling && do_versioning));
1868
1869       stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1870       gcc_assert (stat);
1871       return stat;
1872     }
1873
1874   /* This point is reached if neither peeling nor versioning is being done.  */
1875   gcc_assert (! (do_peeling || do_versioning));
1876
1877   stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1878   return stat;
1879 }
1880
1881
1882 /* Function vect_find_same_alignment_drs.
1883
1884    Update group and alignment relations according to the chosen
1885    vectorization factor.

        DDR relates two data references, DDR_A and DDR_B.  LOOP_VINFO supplies
        the loop and its vectorization factor.  If some distance vector of DDR
        has, at this loop's position in the nest, a distance that is zero, or
        that is a multiple of the vectorization factor while GET_MODE_SIZE of
        both reference types is equal, each reference is pushed onto the
        other's STMT_VINFO_SAME_ALIGN_REFS vector.

        Nothing is recorded when DDR_ARE_DEPENDENT is chrec_known or
        chrec_dont_know, when both sides are the same data reference, when
        there are no distance vectors, or when the two DR_STEPs differ.  */
1886
1887 static void
1888 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1889                               loop_vec_info loop_vinfo)
1890 {
1891   unsigned int i;
1892   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1893   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1894   struct data_reference *dra = DDR_A (ddr);
1895   struct data_reference *drb = DDR_B (ddr);
1896   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1897   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
       /* Sizes of the modes of the two referenced types; compared below
          before a non-zero distance is accepted.  */
1898   int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1899   int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1900   lambda_vector dist_v;
1901   unsigned int loop_depth;
1902
1903   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1904     return;
1905
       /* A relation of a reference with itself carries no information.  */
1906   if (dra == drb)
1907     return;
1908
1909   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1910     return;
1911
1912   /* Loop-based vectorization and known data dependence.  */
1913   if (DDR_NUM_DIST_VECTS (ddr) == 0)
1914     return;
1915
1916   /* Data-dependence analysis reports a distance vector of zero
1917      for data-references that overlap only in the first iteration
1918      but have different sign step (see PR45764).
1919      So as a sanity check require equal DR_STEP.  */
1920   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1921     return;
1922
       /* Position of LOOP in the relation's loop nest; it selects the
          component of each distance vector that is examined.  */
1923   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1924   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1925     {
1926       int dist = dist_v[loop_depth];
1927
1928       if (dump_enabled_p ())
1929         dump_printf_loc (MSG_NOTE, vect_location,
1930                          "dependence distance  = %d.\n", dist);
1931
1932       /* Same loop iteration, or iterations a multiple of the
              vectorization factor apart with equal access sizes.  */
1933       if (dist == 0
1934           || (dist % vectorization_factor == 0 && dra_size == drb_size))
1935         {
1936           /* Two references with distance zero have the same alignment.
                  The relation is recorded in both directions.  */
1937           STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1938           STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1939           if (dump_enabled_p ())
1940             {
1941               dump_printf_loc (MSG_NOTE, vect_location,
1942                                "accesses have the same alignment.\n");
1943               dump_printf (MSG_NOTE,
1944                            "dependence distance modulo vf == 0 between ");
1945               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1946               dump_printf (MSG_NOTE,  " and ");
1947               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1948               dump_printf (MSG_NOTE, "\n");
1949             }
1950         }
1951     }
1952 }
1953
1954
1955 /* Function vect_analyze_data_refs_alignment
1956
1957    Analyze the alignment of the data-references in the loop.
1958    Return FALSE if a data reference is found that cannot be vectorized.

        When LOOP_VINFO is non-null, every dependence relation in
        LOOP_VINFO_DDRS is first passed to vect_find_same_alignment_drs; this
        step is skipped otherwise.  LOOP_VINFO and BB_VINFO are then both
        handed to vect_compute_data_refs_alignment, and its failure is what
        makes this function return FALSE.  */
1959
1960 bool
1961 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1962                                   bb_vec_info bb_vinfo)
1963 {
1964   if (dump_enabled_p ())
1965     dump_printf_loc (MSG_NOTE, vect_location,
1966                      "=== vect_analyze_data_refs_alignment ===\n");
1967
1968   /* Mark groups of data references with same alignment using
1969      data dependence information.  This is done only for loop
          vectorization, i.e. when LOOP_VINFO is set.  */
1970   if (loop_vinfo)
1971     {
1972       vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1973       struct data_dependence_relation *ddr;
1974       unsigned int i;
1975
1976       FOR_EACH_VEC_ELT (ddrs, i, ddr)
1977         vect_find_same_alignment_drs (ddr, loop_vinfo);
1978     }
1979
       /* Compute the alignment of the data references; give up on
          vectorization if that fails.  */
1980   if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1981     {
1982       if (dump_enabled_p ())
1983         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1984                          "not vectorized: can't calculate alignment "
1985                          "for data ref.\n");
1986       return false;
1987     }
1988
1989   return true;
1990 }
1991
1992
1993 /* Analyze groups of accesses: check that DR belongs to a group of
1994    accesses of legal size, step, etc.  Detect gaps, single element
1995    interleaving, and other special cases.  Set grouped access info.
1996    Collect groups of interleaved stores for further use in SLP analysis.

        DR_STEP (DR) is read with TREE_INT_CST_LOW, so it must be an
        INTEGER_CST; the caller in this file only gets here for constant
        steps.

        Return false when the access cannot be handled as (part of) an
        interleaving group.  In basic-block vectorization (the statement has
        a BB_VINFO) a non-consecutive access that belongs to no group is
        instead marked as not vectorizable and true is returned.  Statements
        that are group members but not the first element of their group are
        accepted without further checks.  */
1997
1998 static bool
1999 vect_analyze_group_access (struct data_reference *dr)
2000 {
2001   tree step = DR_STEP (dr);
2002   tree scalar_type = TREE_TYPE (DR_REF (dr));
2003   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2004   gimple stmt = DR_STMT (dr);
2005   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2006   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2007   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2008   HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
       /* LAST_ACCESSED_ELEMENT is the 1-based position, in elements, of the
          last group member seen; it starts at the first element.  */
2009   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2010   bool slp_impossible = false;
2011   struct loop *loop = NULL;
2012
2013   if (loop_vinfo)
2014     loop = LOOP_VINFO_LOOP (loop_vinfo);
2015
2016   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2017      size of the interleaving group (including gaps).  */
2018   groupsize = absu_hwi (dr_step) / type_size;
2019
2020   /* Not consecutive access is possible only if it is a part of interleaving.  */
2021   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2022     {
2023       /* Check if this DR is a part of interleaving, and is a single
2024          element of the group that is accessed in the loop.  */
2025
2026       /* Gaps are supported only for loads. STEP must be a multiple of the type
2027          size.  The size of the group must be a power of 2.  */
2028       if (DR_IS_READ (dr)
2029           && (dr_step % type_size) == 0
2030           && groupsize > 0
2031           && exact_log2 (groupsize) != -1)
2032         {
               /* STMT becomes the first (and only accessed) element of a
                  group of GROUPSIZE elements.  */
2033           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2034           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2035           if (dump_enabled_p ())
2036             {
2037               dump_printf_loc (MSG_NOTE, vect_location,
2038                                "Detected single element interleaving ");
2039               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2040               dump_printf (MSG_NOTE, " step ");
2041               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2042               dump_printf (MSG_NOTE, "\n");
2043             }
2044
               /* In loop vectorization the gaps require peeling for gaps,
                  which is refused when LOOP has an inner loop.  */
2045           if (loop_vinfo)
2046             {
2047               if (dump_enabled_p ())
2048                 dump_printf_loc (MSG_NOTE, vect_location,
2049                                  "Data access with gaps requires scalar "
2050                                  "epilogue loop\n");
2051               if (loop->inner)
2052                 {
2053                   if (dump_enabled_p ())
2054                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2055                                      "Peeling for outer loop is not"
2056                                      " supported\n");
2057                   return false;
2058                 }
2059
2060               LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2061             }
2062
2063           return true;
2064         }
2065
2066       if (dump_enabled_p ())
2067         {
2068           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2069                            "not consecutive access ");
2070           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2071           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2072         }
2073
2074       if (bb_vinfo)
2075         {
2076           /* Mark the statement as unvectorizable.  */
2077           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2078           return true;
2079         }
2080
2081       return false;
2082     }
2083
2084   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2085     {
2086       /* First stmt in the interleaving chain. Check the chain.  */
2087       gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2088       struct data_reference *data_ref = dr;
2089       unsigned int count = 1;
2090       tree prev_init = DR_INIT (data_ref);
2091       gimple prev = stmt;
           /* GAPS accumulates the number of skipped elements between
              consecutive members of the chain.  */
2092       HOST_WIDE_INT diff, gaps = 0;
2093       unsigned HOST_WIDE_INT count_in_bytes;
2094
2095       while (next)
2096         {
2097           /* Skip same data-refs.  In case that two or more stmts share
2098              data-ref (supported only for loads), we vectorize only the first
2099              stmt, and the rest get their vectorized loads from the first
2100              one.  */
2101           if (!tree_int_cst_compare (DR_INIT (data_ref),
2102                                      DR_INIT (STMT_VINFO_DATA_REF (
2103                                                    vinfo_for_stmt (next)))))
2104             {
2105               if (DR_IS_WRITE (data_ref))
2106                 {
2107                   if (dump_enabled_p ())
2108                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2109                                      "Two store stmts share the same dr.\n");
2110                   return false;
2111                 }
2112
2113               /* For load use the same data-ref load.  */
2114               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2115
                   /* Advance without counting NEXT as a new member.  */
2116               prev = next;
2117               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2118               continue;
2119             }
2120
2121           prev = next;
2122           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2123
2124           /* All group members have the same STEP by construction.  */
2125           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2126
2127           /* Check that the distance between two accesses is equal to the type
2128              size. Otherwise, we have gaps.  DIFF is that distance counted
                  in elements.  */
2129           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2130                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2131           if (diff != 1)
2132             {
2133               /* FORNOW: SLP of accesses with gaps is not supported.  */
2134               slp_impossible = true;
2135               if (DR_IS_WRITE (data_ref))
2136                 {
2137                   if (dump_enabled_p ())
2138                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2139                                      "interleaved store with gaps\n");
2140                   return false;
2141                 }
2142
2143               gaps += diff - 1;
2144             }
2145
2146           last_accessed_element += diff;
2147
2148           /* Store the gap from the previous member of the group. If there is no
2149              gap in the access, GROUP_GAP is always 1.  */
2150           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2151
2152           prev_init = DR_INIT (data_ref);
2153           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2154           /* Count the number of data-refs in the chain.  */
2155           count++;
2156         }
2157
2158       /* COUNT is the number of accesses found, we multiply it by the size of
2159          the type to get COUNT_IN_BYTES.  */
2160       count_in_bytes = type_size * count;
2161
2162       /* Check that the size of the interleaving (including gaps) is not
2163          greater than STEP.  */
2164       if (dr_step != 0
2165           && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2166         {
2167           if (dump_enabled_p ())
2168             {
2169               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2170                                "interleaving size is greater than step for ");
2171               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2172                                  DR_REF (dr));
2173               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2174             }
2175           return false;
2176         }
2177
2178       /* Check that the size of the interleaving is equal to STEP for stores,
2179          i.e., that there are no gaps.  */
2180       if (dr_step != 0
2181           && absu_hwi (dr_step) != count_in_bytes)
2182         {
2183           if (DR_IS_READ (dr))
2184             {
2185               slp_impossible = true;
2186               /* There is a gap after the last load in the group. This gap is a
2187                  difference between the groupsize and the number of elements.
2188                  When there is no gap, this difference should be 0.  */
2189               GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2190             }
2191           else
2192             {
2193               if (dump_enabled_p ())
2194                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2195                                  "interleaved store with gaps\n");
2196               return false;
2197             }
2198         }
2199
2200       /* Check that STEP is a multiple of type size.  */
2201       if (dr_step != 0
2202           && (dr_step % type_size) != 0)
2203         {
2204           if (dump_enabled_p ())
2205             {
2206               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2207                                "step is not a multiple of type size: step ");
2208               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2209               dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2210               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2211                                  TYPE_SIZE_UNIT (scalar_type));
2212               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2213             }
2214           return false;
2215         }
2216
           /* GROUPSIZE is zero when |STEP| is smaller than the type size
              (e.g. a zero step); fall back to the number of members.  */
2217       if (groupsize == 0)
2218         groupsize = count;
2219
2220       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2221       if (dump_enabled_p ())
2222         dump_printf_loc (MSG_NOTE, vect_location,
2223                          "Detected interleaving of size %d\n", (int)groupsize);
2224
2225       /* SLP: create an SLP data structure for every interleaving group of
2226          stores for further analysis in vect_analyse_slp.  */
2227       if (DR_IS_WRITE (dr) && !slp_impossible)
2228         {
2229           if (loop_vinfo)
2230             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2231           if (bb_vinfo)
2232             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2233         }
2234
2235       /* There is a gap in the end of the group.  As for single element
              interleaving, loop vectorization then needs peeling for gaps,
              which is refused when LOOP has an inner loop.  */
2236       if (groupsize - last_accessed_element > 0 && loop_vinfo)
2237         {
2238           if (dump_enabled_p ())
2239             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2240                              "Data access with gaps requires scalar "
2241                              "epilogue loop\n");
2242           if (loop->inner)
2243             {
2244               if (dump_enabled_p ())
2245                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2246                                  "Peeling for outer loop is not supported\n");
2247               return false;
2248             }
2249
2250           LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2251         }
2252     }
2253
2254   return true;
2255 }
2256
2257
2258 /* Analyze the access pattern of the data-reference DR.
2259    In case of non-consecutive accesses call vect_analyze_group_access() to
2260    analyze groups of accesses.

        Return true for:
          - a zero-step (invariant) load in loop vectorization, unless the
            statement is nested in an inner loop of the vectorized loop;
          - a load nested in an inner loop whose STMT_VINFO_DR_STEP is zero;
          - a constant step equal to the scalar type size or to its negation
            (a consecutive access);
          - a non-constant step on a statement with STMT_VINFO_STRIDE_LOAD_P;
          - any other constant step accepted by vect_analyze_group_access.
        Return false otherwise.  On the zero-step, nested and consecutive
        paths GROUP_FIRST_ELEMENT of the statement is cleared.  */
2261
2262 static bool
2263 vect_analyze_data_ref_access (struct data_reference *dr)
2264 {
2265   tree step = DR_STEP (dr);
2266   tree scalar_type = TREE_TYPE (DR_REF (dr));
2267   gimple stmt = DR_STMT (dr);
2268   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2269   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2270   struct loop *loop = NULL;
2271
2272   if (loop_vinfo)
2273     loop = LOOP_VINFO_LOOP (loop_vinfo);
2274
       /* In loop vectorization a data reference without a step cannot be
          analyzed.  NOTE(review): without a LOOP_VINFO a null STEP is not
          rejected here although STEP is inspected below; presumably
          basic-block data references always carry a step -- confirm.  */
2275   if (loop_vinfo && !step)
2276     {
2277       if (dump_enabled_p ())
2278         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2279                          "bad data-ref access in loop\n");
2280       return false;
2281     }
2282
2283   /* Allow invariant loads in not nested loops.  Invariant stores are
          rejected by the final DR_IS_READ test.  */
2284   if (loop_vinfo && integer_zerop (step))
2285     {
2286       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2287       if (nested_in_vect_loop_p (loop, stmt))
2288         {
2289           if (dump_enabled_p ())
2290             dump_printf_loc (MSG_NOTE, vect_location,
2291                              "zero step in inner loop of nest\n");
2292           return false;
2293         }
2294       return DR_IS_READ (dr);
2295     }
2296
2297   if (loop && nested_in_vect_loop_p (loop, stmt))
2298     {
2299       /* Interleaved accesses are not yet supported within outer-loop
2300         vectorization for references in the inner-loop.  */
2301       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2302
2303       /* For the rest of the analysis we use the outer-loop step.  */
2304       step = STMT_VINFO_DR_STEP (stmt_info);
2305       if (integer_zerop (step))
2306         {
2307           if (dump_enabled_p ())
2308             dump_printf_loc (MSG_NOTE, vect_location,
2309                              "zero step in outer loop.\n");
               /* Only loads are accepted with a zero outer-loop step.  */
2310           if (DR_IS_READ (dr))
2311             return true;
2312           else
2313             return false;
2314         }
2315     }
2316
2317   /* Consecutive?  The step is the scalar type size, or its negation
          for an access that runs backwards.  */
2318   if (TREE_CODE (step) == INTEGER_CST)
2319     {
2320       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2321       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2322           || (dr_step < 0
2323               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2324         {
2325           /* Mark that it is not interleaving.  */
2326           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2327           return true;
2328         }
2329     }
2330
       /* Anything that is not consecutive is rejected for statements nested
          in an inner loop.  */
2331   if (loop && nested_in_vect_loop_p (loop, stmt))
2332     {
2333       if (dump_enabled_p ())
2334         dump_printf_loc (MSG_NOTE, vect_location,
2335                          "grouped access in outer loop.\n");
2336       return false;
2337     }
2338
2339   /* Assume this is a DR handled by non-constant strided load case.  */
2340   if (TREE_CODE (step) != INTEGER_CST)
2341     return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2342
2343   /* Not consecutive access - check if it's a part of interleaving group.
          STEP is known to be an INTEGER_CST at this point.  */
2344   return vect_analyze_group_access (dr);
2345 }
2346
2347
2348
2349 /*  A helper function used in the comparator function to sort data
2350     references.  T1 and T2 are two data references to be compared.
2351     The function returns -1, 0, or 1.  */
2352
2353 static int
2354 compare_tree (tree t1, tree t2)
2355 {
2356   int i, cmp;
2357   enum tree_code code;
2358   char tclass;
2359
2360   if (t1 == t2)
2361     return 0;
2362   if (t1 == NULL)
2363     return -1;
2364   if (t2 == NULL)
2365     return 1;
2366
2367
2368   if (TREE_CODE (t1) != TREE_CODE (t2))
2369     return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2370
2371   code = TREE_CODE (t1);
2372   switch (code)
2373     {
2374     /* For const values, we can just use hash values for comparisons.  */
2375     case INTEGER_CST:
2376     case REAL_CST:
2377     case FIXED_CST:
2378     case STRING_CST:
2379     case COMPLEX_CST:
2380     case VECTOR_CST:
2381       {
2382         hashval_t h1 = iterative_hash_expr (t1, 0);
2383         hashval_t h2 = iterative_hash_expr (t2, 0);
2384         if (h1 != h2)
2385           return h1 < h2 ? -1 : 1;
2386         break;
2387       }
2388
2389     case SSA_NAME:
2390       cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2391       if (cmp != 0)
2392         return cmp;
2393
2394       if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2395         return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2396       break;
2397
2398     default:
2399       tclass = TREE_CODE_CLASS (code);
2400
2401       /* For var-decl, we could compare their UIDs.  */
2402       if (tclass == tcc_declaration)
2403         {
2404           if (DECL_UID (t1) != DECL_UID (t2))
2405             return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2406           break;
2407         }
2408
2409       /* For expressions with operands, compare their operands recursively.  */
2410       for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2411         {
2412           cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2413           if (cmp != 0)
2414             return cmp;
2415         }
2416     }
2417
2418   return 0;
2419 }
2420
2421
2422 /* Compare two data-references DRA and DRB to group them into chunks
2423    suitable for grouping.  */
2424
2425 static int
2426 dr_group_sort_cmp (const void *dra_, const void *drb_)
2427 {
2428   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2429   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2430   int cmp;
2431
2432   /* Stabilize sort.  */
2433   if (dra == drb)
2434     return 0;
2435
2436   /* Ordering of DRs according to base.  */
2437   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2438     {
2439       cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2440       if (cmp != 0)
2441         return cmp;
2442     }
2443
2444   /* And according to DR_OFFSET.  */
2445   if (!dr_equal_offsets_p (dra, drb))
2446     {
2447       cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2448       if (cmp != 0)
2449         return cmp;
2450     }
2451
2452   /* Put reads before writes.  */
2453   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2454     return DR_IS_READ (dra) ? -1 : 1;
2455
2456   /* Then sort after access size.  */
2457   if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2458                         TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2459     {
2460       cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2461                           TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2462       if (cmp != 0)
2463         return cmp;
2464     }
2465
2466   /* And after step.  */
2467   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2468     {
2469       cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2470       if (cmp != 0)
2471         return cmp;
2472     }
2473
2474   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2475   cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2476   if (cmp == 0)
2477     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2478   return cmp;
2479 }
2480
2481 /* Function vect_analyze_data_ref_accesses.
2482
2483    Analyze the access pattern of all the data references in the loop.
2484
2485    FORNOW: the only access pattern that is considered vectorizable is a
2486            simple step 1 (consecutive) access.
2487
2488    FORNOW: handle only arrays and pointer accesses.  */
2489
2490 bool
2491 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2492 {
2493   unsigned int i;
2494   vec<data_reference_p> datarefs;
2495   struct data_reference *dr;
2496
2497   if (dump_enabled_p ())
2498     dump_printf_loc (MSG_NOTE, vect_location,
2499                      "=== vect_analyze_data_ref_accesses ===\n");
2500
2501   if (loop_vinfo)
2502     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2503   else
2504     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2505
2506   if (datarefs.is_empty ())
2507     return true;
2508
2509   /* Sort the array of datarefs to make building the interleaving chains
2510      linear.  Don't modify the original vector's order, it is needed for
2511      determining what dependencies are reversed.  */
2512   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2513   datarefs_copy.qsort (dr_group_sort_cmp);
2514
2515   /* Build the interleaving chains.  */
2516   for (i = 0; i < datarefs_copy.length () - 1;)
2517     {
2518       data_reference_p dra = datarefs_copy[i];
2519       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2520       stmt_vec_info lastinfo = NULL;
2521       for (i = i + 1; i < datarefs_copy.length (); ++i)
2522         {
2523           data_reference_p drb = datarefs_copy[i];
2524           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2525
2526           /* ???  Imperfect sorting (non-compatible types, non-modulo
2527              accesses, same accesses) can lead to a group to be artificially
2528              split here as we don't just skip over those.  If it really
2529              matters we can push those to a worklist and re-iterate
2530              over them.  The we can just skip ahead to the next DR here.  */
2531
2532           /* Check that the data-refs have same first location (except init)
2533              and they are both either store or load (not load and store).  */
2534           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2535               || !operand_equal_p (DR_BASE_ADDRESS (dra),
2536                                    DR_BASE_ADDRESS (drb), 0)
2537               || !dr_equal_offsets_p (dra, drb))
2538             break;
2539
2540           /* Check that the data-refs have the same constant size and step.  */
2541           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2542           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2543           if (!tree_fits_uhwi_p (sza)
2544               || !tree_fits_uhwi_p (szb)
2545               || !tree_int_cst_equal (sza, szb)
2546               || !tree_fits_shwi_p (DR_STEP (dra))
2547               || !tree_fits_shwi_p (DR_STEP (drb))
2548               || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2549             break;
2550
2551           /* Do not place the same access in the interleaving chain twice.  */
2552           if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2553             break;
2554
2555           /* Check the types are compatible.
2556              ???  We don't distinguish this during sorting.  */
2557           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2558                                    TREE_TYPE (DR_REF (drb))))
2559             break;
2560
2561           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2562           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2563           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2564           gcc_assert (init_a < init_b);
2565
2566           /* If init_b == init_a + the size of the type * k, we have an
2567              interleaving, and DRA is accessed before DRB.  */
2568           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2569           if ((init_b - init_a) % type_size_a != 0)
2570             break;
2571
2572           /* The step (if not zero) is greater than the difference between
2573              data-refs' inits.  This splits groups into suitable sizes.  */
2574           HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2575           if (step != 0 && step <= (init_b - init_a))
2576             break;
2577
2578           if (dump_enabled_p ())
2579             {
2580               dump_printf_loc (MSG_NOTE, vect_location,
2581                                "Detected interleaving ");
2582               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2583               dump_printf (MSG_NOTE,  " and ");
2584               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2585               dump_printf (MSG_NOTE, "\n");
2586             }
2587
2588           /* Link the found element into the group list.  */
2589           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2590             {
2591               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2592               lastinfo = stmtinfo_a;
2593             }
2594           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2595           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2596           lastinfo = stmtinfo_b;
2597         }
2598     }
2599
2600   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2601     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2602         && !vect_analyze_data_ref_access (dr))
2603       {
2604         if (dump_enabled_p ())
2605           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2606                            "not vectorized: complicated access pattern.\n");
2607
2608         if (bb_vinfo)
2609           {
2610             /* Mark the statement as not vectorizable.  */
2611             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2612             continue;
2613           }
2614         else
2615           {
2616             datarefs_copy.release ();
2617             return false;
2618           }
2619       }
2620
2621   datarefs_copy.release ();
2622   return true;
2623 }
2624
2625
2626 /* Operator == between two dr_with_seg_len objects.
2627
2628    This equality operator is used to make sure two data refs
2629    are the same one so that we will consider to combine the
2630    aliasing checks of those two pairs of data dependent data
2631    refs.  */
2632
2633 static bool
2634 operator == (const dr_with_seg_len& d1,
2635              const dr_with_seg_len& d2)
2636 {
2637   return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2638                           DR_BASE_ADDRESS (d2.dr), 0)
2639            && compare_tree (d1.offset, d2.offset) == 0
2640            && compare_tree (d1.seg_len, d2.seg_len) == 0;
2641 }
2642
2643 /* Function comp_dr_with_seg_len_pair.
2644
2645    Comparison function for sorting objects of dr_with_seg_len_pair_t
2646    so that we can combine aliasing checks in one scan.  */
2647
2648 static int
2649 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2650 {
2651   const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2652   const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2653
2654   const dr_with_seg_len &p11 = p1->first,
2655                         &p12 = p1->second,
2656                         &p21 = p2->first,
2657                         &p22 = p2->second;
2658
2659   /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2660      if a and c have the same basic address snd step, and b and d have the same
2661      address and step.  Therefore, if any a&c or b&d don't have the same address
2662      and step, we don't care the order of those two pairs after sorting.  */
2663   int comp_res;
2664
2665   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2666                                 DR_BASE_ADDRESS (p21.dr))) != 0)
2667     return comp_res;
2668   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2669                                 DR_BASE_ADDRESS (p22.dr))) != 0)
2670     return comp_res;
2671   if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2672     return comp_res;
2673   if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2674     return comp_res;
2675   if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2676     return comp_res;
2677   if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2678     return comp_res;
2679
2680   return 0;
2681 }
2682
/* Exchange the values of A and B through a temporary copy.  */

template <class T> static void
swap (T& a, T& b)
{
  T saved_a = a;
  a = b;
  b = saved_a;
}
2690
2691 /* Function vect_vfa_segment_size.
2692
2693    Create an expression that computes the size of segment
2694    that will be accessed for a data reference.  The functions takes into
2695    account that realignment loads may access one more vector.
2696
2697    Input:
2698      DR: The data reference.
2699      LENGTH_FACTOR: segment length to consider.
2700
2701    Return an expression whose value is the size of segment which will be
2702    accessed by DR.  */
2703
2704 static tree
2705 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2706 {
2707   tree segment_length;
2708
2709   if (integer_zerop (DR_STEP (dr)))
2710     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2711   else
2712     segment_length = size_binop (MULT_EXPR,
2713                                  fold_convert (sizetype, DR_STEP (dr)),
2714                                  fold_convert (sizetype, length_factor));
2715
2716   if (vect_supportable_dr_alignment (dr, false)
2717         == dr_explicit_realign_optimized)
2718     {
2719       tree vector_size = TYPE_SIZE_UNIT
2720                           (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2721
2722       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2723     }
2724   return segment_length;
2725 }
2726
2727 /* Function vect_prune_runtime_alias_test_list.
2728
2729    Prune a list of ddrs to be tested at run-time by versioning for alias.
2730    Merge several alias checks into one if possible.
2731    Return FALSE if resulting list of ddrs is longer then allowed by
2732    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
2733
2734 bool
2735 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2736 {
2737   vec<ddr_p> may_alias_ddrs =
2738     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2739   vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2740     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2741   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2742   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2743
2744   ddr_p ddr;
2745   unsigned int i;
2746   tree length_factor;
2747
2748   if (dump_enabled_p ())
2749     dump_printf_loc (MSG_NOTE, vect_location,
2750                      "=== vect_prune_runtime_alias_test_list ===\n");
2751
2752   if (may_alias_ddrs.is_empty ())
2753     return true;
2754
2755   /* Basically, for each pair of dependent data refs store_ptr_0
2756      and load_ptr_0, we create an expression:
2757
2758      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2759      || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2760
2761      for aliasing checks.  However, in some cases we can decrease
2762      the number of checks by combining two checks into one.  For
2763      example, suppose we have another pair of data refs store_ptr_0
2764      and load_ptr_1, and if the following condition is satisfied:
2765
2766      load_ptr_0 < load_ptr_1  &&
2767      load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2768
2769      (this condition means, in each iteration of vectorized loop,
2770      the accessed memory of store_ptr_0 cannot be between the memory
2771      of load_ptr_0 and load_ptr_1.)
2772
2773      we then can use only the following expression to finish the
2774      alising checks between store_ptr_0 & load_ptr_0 and
2775      store_ptr_0 & load_ptr_1:
2776
2777      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2778      || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2779
2780      Note that we only consider that load_ptr_0 and load_ptr_1 have the
2781      same basic address.  */
2782
2783   comp_alias_ddrs.create (may_alias_ddrs.length ());
2784
2785   /* First, we collect all data ref pairs for aliasing checks.  */
2786   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2787     {
2788       struct data_reference *dr_a, *dr_b;
2789       gimple dr_group_first_a, dr_group_first_b;
2790       tree segment_length_a, segment_length_b;
2791       gimple stmt_a, stmt_b;
2792
2793       dr_a = DDR_A (ddr);
2794       stmt_a = DR_STMT (DDR_A (ddr));
2795       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2796       if (dr_group_first_a)
2797         {
2798           stmt_a = dr_group_first_a;
2799           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2800         }
2801
2802       dr_b = DDR_B (ddr);
2803       stmt_b = DR_STMT (DDR_B (ddr));
2804       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2805       if (dr_group_first_b)
2806         {
2807           stmt_b = dr_group_first_b;
2808           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2809         }
2810
2811       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2812         length_factor = scalar_loop_iters;
2813       else
2814         length_factor = size_int (vect_factor);
2815       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2816       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2817
2818       dr_with_seg_len_pair_t dr_with_seg_len_pair
2819           (dr_with_seg_len (dr_a, segment_length_a),
2820            dr_with_seg_len (dr_b, segment_length_b));
2821
2822       if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2823         swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2824
2825       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2826     }
2827
2828   /* Second, we sort the collected data ref pairs so that we can scan
2829      them once to combine all possible aliasing checks.  */
2830   comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2831
2832   /* Third, we scan the sorted dr pairs and check if we can combine
2833      alias checks of two neighbouring dr pairs.  */
2834   for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2835     {
2836       /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2).  */
2837       dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2838                       *dr_b1 = &comp_alias_ddrs[i-1].second,
2839                       *dr_a2 = &comp_alias_ddrs[i].first,
2840                       *dr_b2 = &comp_alias_ddrs[i].second;
2841
2842       /* Remove duplicate data ref pairs.  */
2843       if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2844         {
2845           if (dump_enabled_p ())
2846             {
2847               dump_printf_loc (MSG_NOTE, vect_location,
2848                                "found equal ranges ");
2849               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2850                                  DR_REF (dr_a1->dr));
2851               dump_printf (MSG_NOTE,  ", ");
2852               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2853                                  DR_REF (dr_b1->dr));
2854               dump_printf (MSG_NOTE,  " and ");
2855               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2856                                  DR_REF (dr_a2->dr));
2857               dump_printf (MSG_NOTE,  ", ");
2858               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2859                                  DR_REF (dr_b2->dr));
2860               dump_printf (MSG_NOTE, "\n");
2861             }
2862
2863           comp_alias_ddrs.ordered_remove (i--);
2864           continue;
2865         }
2866
2867       if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2868         {
2869           /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2870              and DR_A1 and DR_A2 are two consecutive memrefs.  */
2871           if (*dr_a1 == *dr_a2)
2872             {
2873               swap (dr_a1, dr_b1);
2874               swap (dr_a2, dr_b2);
2875             }
2876
2877           if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2878                                 DR_BASE_ADDRESS (dr_a2->dr),
2879                                 0)
2880               || !tree_fits_shwi_p (dr_a1->offset)
2881               || !tree_fits_shwi_p (dr_a2->offset))
2882             continue;
2883
2884           HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2885                                 - tree_to_shwi (dr_a1->offset));
2886
2887
2888           /* Now we check if the following condition is satisfied:
2889
2890              DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2891
2892              where DIFF = DR_A2->OFFSET - DR_A1->OFFSET.  However,
2893              SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2894              have to make a best estimation.  We can get the minimum value
2895              of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2896              then either of the following two conditions can guarantee the
2897              one above:
2898
2899              1: DIFF <= MIN_SEG_LEN_B
2900              2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2901
2902              */
2903
2904           HOST_WIDE_INT  min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2905                                           ? tree_to_shwi (dr_b1->seg_len)
2906                                           : vect_factor);
2907
2908           if (diff <= min_seg_len_b
2909               || (tree_fits_shwi_p (dr_a1->seg_len)
2910                   && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2911             {
2912               if (dump_enabled_p ())
2913                 {
2914                   dump_printf_loc (MSG_NOTE, vect_location,
2915                                    "merging ranges for ");
2916                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2917                                      DR_REF (dr_a1->dr));
2918                   dump_printf (MSG_NOTE,  ", ");
2919                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2920                                      DR_REF (dr_b1->dr));
2921                   dump_printf (MSG_NOTE,  " and ");
2922                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2923                                      DR_REF (dr_a2->dr));
2924                   dump_printf (MSG_NOTE,  ", ");
2925                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2926                                      DR_REF (dr_b2->dr));
2927                   dump_printf (MSG_NOTE, "\n");
2928                 }
2929
2930               dr_a1->seg_len = size_binop (PLUS_EXPR,
2931                                            dr_a2->seg_len, size_int (diff));
2932               comp_alias_ddrs.ordered_remove (i--);
2933             }
2934         }
2935     }
2936
2937   dump_printf_loc (MSG_NOTE, vect_location,
2938                    "improved number of alias checks from %d to %d\n",
2939                    may_alias_ddrs.length (), comp_alias_ddrs.length ());
2940   if ((int) comp_alias_ddrs.length () >
2941       PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2942     return false;
2943
2944   return true;
2945 }
2946
2947 /* Check whether a non-affine read in stmt is suitable for gather load
2948    and if so, return a builtin decl for that operation.  */
2949
2950 tree
2951 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2952                    tree *offp, int *scalep)
2953 {
2954   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2955   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2956   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2957   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2958   tree offtype = NULL_TREE;
2959   tree decl, base, off;
2960   enum machine_mode pmode;
2961   int punsignedp, pvolatilep;
2962
2963   base = DR_REF (dr);
2964   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2965      see if we can use the def stmt of the address.  */
2966   if (is_gimple_call (stmt)
2967       && gimple_call_internal_p (stmt)
2968       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2969           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2970       && TREE_CODE (base) == MEM_REF
2971       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
2972       && integer_zerop (TREE_OPERAND (base, 1))
2973       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
2974     {
2975       gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
2976       if (is_gimple_assign (def_stmt)
2977           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
2978         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
2979     }
2980
2981   /* The gather builtins need address of the form
2982      loop_invariant + vector * {1, 2, 4, 8}
2983      or
2984      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2985      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2986      of loop invariants/SSA_NAMEs defined in the loop, with casts,
2987      multiplications and additions in it.  To get a vector, we need
2988      a single SSA_NAME that will be defined in the loop and will
2989      contain everything that is not loop invariant and that can be
2990      vectorized.  The following code attempts to find such a preexistng
2991      SSA_NAME OFF and put the loop invariants into a tree BASE
2992      that can be gimplified before the loop.  */
2993   base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
2994                               &pmode, &punsignedp, &pvolatilep, false);
2995   gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2996
2997   if (TREE_CODE (base) == MEM_REF)
2998     {
2999       if (!integer_zerop (TREE_OPERAND (base, 1)))
3000         {
3001           if (off == NULL_TREE)
3002             {
3003               offset_int moff = mem_ref_offset (base);
3004               off = wide_int_to_tree (sizetype, moff);
3005             }
3006           else
3007             off = size_binop (PLUS_EXPR, off,
3008                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3009         }
3010       base = TREE_OPERAND (base, 0);
3011     }
3012   else
3013     base = build_fold_addr_expr (base);
3014
3015   if (off == NULL_TREE)
3016     off = size_zero_node;
3017
3018   /* If base is not loop invariant, either off is 0, then we start with just
3019      the constant offset in the loop invariant BASE and continue with base
3020      as OFF, otherwise give up.
3021      We could handle that case by gimplifying the addition of base + off
3022      into some SSA_NAME and use that as off, but for now punt.  */
3023   if (!expr_invariant_in_loop_p (loop, base))
3024     {
3025       if (!integer_zerop (off))
3026         return NULL_TREE;
3027       off = base;
3028       base = size_int (pbitpos / BITS_PER_UNIT);
3029     }
3030   /* Otherwise put base + constant offset into the loop invariant BASE
3031      and continue with OFF.  */
3032   else
3033     {
3034       base = fold_convert (sizetype, base);
3035       base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3036     }
3037
3038   /* OFF at this point may be either a SSA_NAME or some tree expression
3039      from get_inner_reference.  Try to peel off loop invariants from it
3040      into BASE as long as possible.  */
3041   STRIP_NOPS (off);
3042   while (offtype == NULL_TREE)
3043     {
3044       enum tree_code code;
3045       tree op0, op1, add = NULL_TREE;
3046
3047       if (TREE_CODE (off) == SSA_NAME)
3048         {
3049           gimple def_stmt = SSA_NAME_DEF_STMT (off);
3050
3051           if (expr_invariant_in_loop_p (loop, off))
3052             return NULL_TREE;
3053
3054           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3055             break;
3056
3057           op0 = gimple_assign_rhs1 (def_stmt);
3058           code = gimple_assign_rhs_code (def_stmt);
3059           op1 = gimple_assign_rhs2 (def_stmt);
3060         }
3061       else
3062         {
3063           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3064             return NULL_TREE;
3065           code = TREE_CODE (off);
3066           extract_ops_from_tree (off, &code, &op0, &op1);
3067         }
3068       switch (code)
3069         {
3070         case POINTER_PLUS_EXPR:
3071         case PLUS_EXPR:
3072           if (expr_invariant_in_loop_p (loop, op0))
3073             {
3074               add = op0;
3075               off = op1;
3076             do_add:
3077               add = fold_convert (sizetype, add);
3078               if (scale != 1)
3079                 add = size_binop (MULT_EXPR, add, size_int (scale));
3080               base = size_binop (PLUS_EXPR, base, add);
3081               continue;
3082             }
3083           if (expr_invariant_in_loop_p (loop, op1))
3084             {
3085               add = op1;
3086               off = op0;
3087               goto do_add;
3088             }
3089           break;
3090         case MINUS_EXPR:
3091           if (expr_invariant_in_loop_p (loop, op1))
3092             {
3093               add = fold_convert (sizetype, op1);
3094               add = size_binop (MINUS_EXPR, size_zero_node, add);
3095               off = op0;
3096               goto do_add;
3097             }
3098           break;
3099         case MULT_EXPR:
3100           if (scale == 1 && tree_fits_shwi_p (op1))
3101             {
3102               scale = tree_to_shwi (op1);
3103               off = op0;
3104               continue;
3105             }
3106           break;
3107         case SSA_NAME:
3108           off = op0;
3109           continue;
3110         CASE_CONVERT:
3111           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3112               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3113             break;
3114           if (TYPE_PRECISION (TREE_TYPE (op0))
3115               == TYPE_PRECISION (TREE_TYPE (off)))
3116             {
3117               off = op0;
3118               continue;
3119             }
3120           if (TYPE_PRECISION (TREE_TYPE (op0))
3121               < TYPE_PRECISION (TREE_TYPE (off)))
3122             {
3123               off = op0;
3124               offtype = TREE_TYPE (off);
3125               STRIP_NOPS (off);
3126               continue;
3127             }
3128           break;
3129         default:
3130           break;
3131         }
3132       break;
3133     }
3134
3135   /* If at the end OFF still isn't a SSA_NAME or isn't
3136      defined in the loop, punt.  */
3137   if (TREE_CODE (off) != SSA_NAME
3138       || expr_invariant_in_loop_p (loop, off))
3139     return NULL_TREE;
3140
3141   if (offtype == NULL_TREE)
3142     offtype = TREE_TYPE (off);
3143
3144   decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3145                                            offtype, scale);
3146   if (decl == NULL_TREE)
3147     return NULL_TREE;
3148
3149   if (basep)
3150     *basep = base;
3151   if (offp)
3152     *offp = off;
3153   if (scalep)
3154     *scalep = scale;
3155   return decl;
3156 }
3157
3158 /* Function vect_analyze_data_refs.
3159
3160    Find all the data references in the loop (LOOP_VINFO) or, when LOOP_VINFO
3161    is NULL, in the basic block (BB_VINFO).  *N_STMTS is incremented for each
3162    non-debug stmt scanned; *MIN_VF is raised to the largest number of vector
3163    subparts found.  Return false if a data-ref is unhandled or the loop cannot
3164    be vectorized.  For a basic block most failures instead stop the scan; the
3165    stmts of the failing and all following data-refs are marked unvectorizable.
3166
3167    The general structure of the analysis of data refs in the vectorizer is:
3168    1- vect_analyze_data_refs(loop/bb): find and check all data-refs.
3169    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3170    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3171    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.  */
3172
3173 bool
3174 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3175                         bb_vec_info bb_vinfo,
3176                         int *min_vf, unsigned *n_stmts)
3177 {
3178   struct loop *loop = NULL;
3179   basic_block bb = NULL;
3180   unsigned int i;
3181   vec<data_reference_p> datarefs;
3182   struct data_reference *dr;
3183   tree scalar_type;
3184
3185   if (dump_enabled_p ())
3186     dump_printf_loc (MSG_NOTE, vect_location,
3187                      "=== vect_analyze_data_refs ===\n");
3188   /* Collect the data references of all stmts, one stmt at a time.  */
3189   if (loop_vinfo)
3190     {
3191       basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3192
3193       loop = LOOP_VINFO_LOOP (loop_vinfo);
3194       datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3195       if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3196         {
3197           if (dump_enabled_p ())
3198             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3199                              "not vectorized: loop contains function calls"
3200                              " or data references that cannot be analyzed\n");
3201           return false;
3202         }
3203
3204       for (i = 0; i < loop->num_nodes; i++)
3205         {
3206           gimple_stmt_iterator gsi;
3207
3208           for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3209             {
3210               gimple stmt = gsi_stmt (gsi);
3211               if (is_gimple_debug (stmt))
3212                 continue;
3213               ++*n_stmts;
3214               if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3215                 {
3216                   if (is_gimple_call (stmt) && loop->safelen)
3217                     {
3218                       tree fndecl = gimple_call_fndecl (stmt), op;
3219                       if (fndecl != NULL_TREE)
3220                         {
3221                           struct cgraph_node *node = cgraph_get_node (fndecl);
3222                           if (node != NULL && node->simd_clones != NULL)
3223                             {
3224                               unsigned int j, n = gimple_call_num_args (stmt);
3225                               for (j = 0; j < n; j++)
3226                                 {
3227                                   op = gimple_call_arg (stmt, j);
3228                                   if (DECL_P (op)
3229                                       || (REFERENCE_CLASS_P (op)
3230                                           && get_base_address (op)))
3231                                     break;
3232                                 }
3233                               op = gimple_call_lhs (stmt);
3234                               /* Ignore #pragma omp declare simd functions
3235                                  if they don't have data references in the
3236                                  call stmt itself.  */
3237                               if (j == n
3238                                   && !(op
3239                                        && (DECL_P (op)
3240                                            || (REFERENCE_CLASS_P (op)
3241                                                && get_base_address (op)))))
3242                                 continue;
3243                             }
3244                         }
3245                     }
3246                   LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3247                   if (dump_enabled_p ())
3248                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3249                                      "not vectorized: loop contains function "
3250                                      "calls or data references that cannot "
3251                                      "be analyzed\n");
3252                   return false;
3253                 }
3254             }
3255         }
3256
3257       LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3258     }
3259   else
3260     {
3261       gimple_stmt_iterator gsi;
3262
3263       bb = BB_VINFO_BB (bb_vinfo);
3264       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3265         {
3266           gimple stmt = gsi_stmt (gsi);
3267           if (is_gimple_debug (stmt))
3268             continue;
3269           ++*n_stmts;
3270           if (!find_data_references_in_stmt (NULL, stmt,
3271                                              &BB_VINFO_DATAREFS (bb_vinfo)))
3272             {
3273               /* Mark the rest of the basic-block as unvectorizable.  */
3274               for (; !gsi_end_p (gsi); gsi_next (&gsi))
3275                 {
3276                   stmt = gsi_stmt (gsi);
3277                   STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3278                 }
3279               break;
3280             }
3281         }
3282
3283       datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3284     }
3285
3286   /* Go through the data-refs, check that the analysis succeeded.  Update
3287      pointer from stmt_vec_info struct to DR and vectype.  */
3288
3289   FOR_EACH_VEC_ELT (datarefs, i, dr)
3290     {
3291       gimple stmt;
3292       stmt_vec_info stmt_info;
3293       tree base, offset, init;
3294       bool gather = false;  /* DR was re-analyzed as a gather load.  */
3295       bool simd_lane_access = false;  /* DR is an IFN_GOMP_SIMD_LANE access.  */
3296       int vf;
3297       /* Re-entered with DR = DATAREFS[I] after a clobber DR was removed.  */
3298 again:
3299       if (!dr || !DR_REF (dr))
3300         {
3301           if (dump_enabled_p ())
3302             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3303                              "not vectorized: unhandled data-ref\n");
3304           return false;
3305         }
3306
3307       stmt = DR_STMT (dr);
3308       stmt_info = vinfo_for_stmt (stmt);
3309
3310       /* Discard clobbers from the dataref vector.  We will remove
3311          clobber stmts during vectorization.  */
3312       if (gimple_clobber_p (stmt))
3313         {
3314           free_data_ref (dr);
3315           if (i == datarefs.length () - 1)
3316             {
3317               datarefs.pop ();
3318               break;
3319             }
3320           datarefs.ordered_remove (i);
3321           dr = datarefs[i];
3322           goto again;
3323         }
3324
3325       /* Check that analysis of the data-ref succeeded.  */
3326       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3327           || !DR_STEP (dr))
3328         {
3329           bool maybe_gather
3330             = DR_IS_READ (dr)
3331               && !TREE_THIS_VOLATILE (DR_REF (dr))
3332               && targetm.vectorize.builtin_gather != NULL;
3333           bool maybe_simd_lane_access
3334             = loop_vinfo && loop->simduid;
3335
3336           /* If target supports vector gather loads, or if this might be
3337              a SIMD lane access, see if they can't be used.  */
3338           if (loop_vinfo
3339               && (maybe_gather || maybe_simd_lane_access)
3340               && !nested_in_vect_loop_p (loop, stmt))
3341             {
3342               struct data_reference *newdr
3343                 = create_data_ref (NULL, loop_containing_stmt (stmt),
3344                                    DR_REF (dr), stmt, true);
3345               gcc_assert (newdr != NULL && DR_REF (newdr));
3346               if (DR_BASE_ADDRESS (newdr)
3347                   && DR_OFFSET (newdr)
3348                   && DR_INIT (newdr)
3349                   && DR_STEP (newdr)
3350                   && integer_zerop (DR_STEP (newdr)))
3351                 {
3352                   if (maybe_simd_lane_access)
3353                     {
3354                       tree off = DR_OFFSET (newdr);
3355                       STRIP_NOPS (off);
3356                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3357                           && TREE_CODE (off) == MULT_EXPR
3358                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3359                         {
3360                           tree step = TREE_OPERAND (off, 1);
3361                           off = TREE_OPERAND (off, 0);
3362                           STRIP_NOPS (off);
3363                           if (CONVERT_EXPR_P (off)
3364                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3365                                                                           0)))
3366                                  < TYPE_PRECISION (TREE_TYPE (off)))
3367                             off = TREE_OPERAND (off, 0);
3368                           if (TREE_CODE (off) == SSA_NAME)
3369                             {
3370                               gimple def = SSA_NAME_DEF_STMT (off);
3371                               tree reft = TREE_TYPE (DR_REF (newdr));
3372                               if (is_gimple_call (def)
3373                                   && gimple_call_internal_p (def)
3374                                   && (gimple_call_internal_fn (def)
3375                                       == IFN_GOMP_SIMD_LANE))
3376                                 {
3377                                   tree arg = gimple_call_arg (def, 0);
3378                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
3379                                   arg = SSA_NAME_VAR (arg);
3380                                   if (arg == loop->simduid
3381                                       /* For now.  */
3382                                       && tree_int_cst_equal
3383                                            (TYPE_SIZE_UNIT (reft),
3384                                             step))
3385                                     {
3386                                       DR_OFFSET (newdr) = ssize_int (0);
3387                                       DR_STEP (newdr) = step;
3388                                       DR_ALIGNED_TO (newdr)
3389                                         = size_int (BIGGEST_ALIGNMENT);
3390                                       dr = newdr;
3391                                       simd_lane_access = true;
3392                                     }
3393                                 }
3394                             }
3395                         }
3396                     }
3397                   if (!simd_lane_access && maybe_gather)
3398                     {
3399                       dr = newdr;
3400                       gather = true;
3401                     }
3402                 }
3403               if (!gather && !simd_lane_access)
3404                 free_data_ref (newdr);
3405             }
3406
3407           if (!gather && !simd_lane_access)
3408             {
3409               if (dump_enabled_p ())
3410                 {
3411                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3412                                    "not vectorized: data ref analysis "
3413                                    "failed ");
3414                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3415                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3416                 }
3417
3418               if (bb_vinfo)
3419                 break;
3420
3421               return false;
3422             }
3423         }
3424
3425       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3426         {
3427           if (dump_enabled_p ())
3428             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3429                              "not vectorized: base addr of dr is a "
3430                              "constant\n");
3431
3432           if (bb_vinfo)
3433             break;
3434
3435           if (gather || simd_lane_access)
3436             free_data_ref (dr);
3437           return false;
3438         }
3439
3440       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3441         {
3442           if (dump_enabled_p ())
3443             {
3444               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3445                                "not vectorized: volatile type ");
3446               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3447               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3448             }
3449           if (bb_vinfo)
3450             break;
3451           if (gather || simd_lane_access)
3452             free_data_ref (dr);  /* Replacement DR is not in DATAREFS yet.  */
3453           return false;
3454         }
3455
3456       if (stmt_can_throw_internal (stmt))
3457         {
3458           if (dump_enabled_p ())
3459             {
3460               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3461                                "not vectorized: statement can throw an "
3462                                "exception ");
3463               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3464               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3465             }
3466
3467           if (bb_vinfo)
3468             break;
3469
3470           if (gather || simd_lane_access)
3471             free_data_ref (dr);
3472           return false;
3473         }
3474
3475       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3476           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3477         {
3478           if (dump_enabled_p ())
3479             {
3480               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3481                                "not vectorized: statement is bitfield "
3482                                "access ");
3483               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3484               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3485             }
3486
3487           if (bb_vinfo)
3488             break;
3489
3490           if (gather || simd_lane_access)
3491             free_data_ref (dr);
3492           return false;
3493         }
3494
3495       base = unshare_expr (DR_BASE_ADDRESS (dr));
3496       offset = unshare_expr (DR_OFFSET (dr));
3497       init = unshare_expr (DR_INIT (dr));
3498
3499       if (is_gimple_call (stmt)
3500           && (!gimple_call_internal_p (stmt)
3501               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3502                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3503         {
3504           if (dump_enabled_p ())
3505             {
3506               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3507                                "not vectorized: dr in a call ");
3508               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3509               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3510             }
3511
3512           if (bb_vinfo)
3513             break;
3514
3515           if (gather || simd_lane_access)
3516             free_data_ref (dr);
3517           return false;
3518         }
3519
3520       /* Update DR field in stmt_vec_info struct.  */
3521
3522       /* If the dataref is in an inner-loop of the loop that is considered for
3523          vectorization, we also want to analyze the access relative to
3524          the outer-loop (DR contains information only relative to the
3525          inner-most enclosing loop).  We do that by building a reference to the
3526          first location accessed by the inner-loop, and analyze it relative to
3527          the outer-loop.  */
3528       if (loop && nested_in_vect_loop_p (loop, stmt))
3529         {
3530           tree outer_step, outer_base, outer_init;
3531           HOST_WIDE_INT pbitsize, pbitpos;
3532           tree poffset;
3533           enum machine_mode pmode;
3534           int punsignedp, pvolatilep;
3535           affine_iv base_iv, offset_iv;
3536           tree dinit;
3537
3538           /* Build a reference to the first location accessed by the
3539              inner-loop: *(BASE+INIT).  (The first location is actually
3540              BASE+INIT+OFFSET, but we add OFFSET separately later).  */
3541           tree inner_base = build_fold_indirect_ref
3542                                 (fold_build_pointer_plus (base, init));
3543
3544           if (dump_enabled_p ())
3545             {
3546               dump_printf_loc (MSG_NOTE, vect_location,
3547                                "analyze in outer-loop: ");
3548               dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3549               dump_printf (MSG_NOTE, "\n");
3550             }
3551
3552           outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3553                           &poffset, &pmode, &punsignedp, &pvolatilep, false);
3554           gcc_assert (outer_base != NULL_TREE);
3555
3556           if (pbitpos % BITS_PER_UNIT != 0)
3557             {
3558               if (dump_enabled_p ())
3559                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3560                                  "failed: bit offset alignment.\n");
3561               return false;
3562             }
3563
3564           outer_base = build_fold_addr_expr (outer_base);
3565           if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3566                           &base_iv, false))
3567             {
3568               if (dump_enabled_p ())
3569                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3570                                  "failed: evolution of base is not affine.\n");
3571               return false;
3572             }
3573
3574           if (offset)
3575             {
3576               if (poffset)
3577                 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3578                                        poffset);
3579               else
3580                 poffset = offset;
3581             }
3582
3583           if (!poffset)
3584             {
3585               offset_iv.base = ssize_int (0);
3586               offset_iv.step = ssize_int (0);
3587             }
3588           else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3589                                &offset_iv, false))
3590             {
3591               if (dump_enabled_p ())
3592                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3593                                  "evolution of offset is not affine.\n");
3594               return false;
3595             }
3596
3597           outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3598           split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3599           outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3600           split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3601           outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3602
3603           outer_step = size_binop (PLUS_EXPR,
3604                                 fold_convert (ssizetype, base_iv.step),
3605                                 fold_convert (ssizetype, offset_iv.step));
3606
3607           STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3608           /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3609           STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3610           STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3611           STMT_VINFO_DR_OFFSET (stmt_info) =
3612                                 fold_convert (ssizetype, offset_iv.base);
3613           STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3614                                 size_int (highest_pow2_factor (offset_iv.base));
3615
3616           if (dump_enabled_p ())
3617             {
3618               dump_printf_loc (MSG_NOTE, vect_location,
3619                                "\touter base_address: ");
3620               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3621                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3622               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3623               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3624                                  STMT_VINFO_DR_OFFSET (stmt_info));
3625               dump_printf (MSG_NOTE,
3626                            "\n\touter constant offset from base address: ");
3627               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3628                                  STMT_VINFO_DR_INIT (stmt_info));
3629               dump_printf (MSG_NOTE, "\n\touter step: ");
3630               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3631                                  STMT_VINFO_DR_STEP (stmt_info));
3632               dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3633               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3634                                  STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3635               dump_printf (MSG_NOTE, "\n");
3636             }
3637         }
3638
3639       if (STMT_VINFO_DATA_REF (stmt_info))
3640         {
3641           if (dump_enabled_p ())
3642             {
3643               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3644                                "not vectorized: more than one data ref "
3645                                "in stmt: ");
3646               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3647               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3648             }
3649
3650           if (bb_vinfo)
3651             break;
3652
3653           if (gather || simd_lane_access)
3654             free_data_ref (dr);
3655           return false;
3656         }
3657       /* Record DR; a SIMD lane access DR also replaces DATAREFS[I].  */
3658       STMT_VINFO_DATA_REF (stmt_info) = dr;
3659       if (simd_lane_access)
3660         {
3661           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3662           free_data_ref (datarefs[i]);
3663           datarefs[i] = dr;
3664         }
3665
3666       /* Set vectype for STMT.  */
3667       scalar_type = TREE_TYPE (DR_REF (dr));
3668       STMT_VINFO_VECTYPE (stmt_info)
3669         = get_vectype_for_scalar_type (scalar_type);
3670       if (!STMT_VINFO_VECTYPE (stmt_info))
3671         {
3672           if (dump_enabled_p ())
3673             {
3674               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3675                                "not vectorized: no vectype for stmt: ");
3676               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3677               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3678               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3679                                  scalar_type);
3680               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3681             }
3682
3683           if (bb_vinfo)
3684             break;
3685
3686           if (gather || simd_lane_access)
3687             {
3688               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3689               if (gather)
3690                 free_data_ref (dr);
3691             }
3692           return false;
3693         }
3694       else
3695         {
3696           if (dump_enabled_p ())
3697             {
3698               dump_printf_loc (MSG_NOTE, vect_location,
3699                                "got vectype for stmt: ");
3700               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3701               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3702                                  STMT_VINFO_VECTYPE (stmt_info));
3703               dump_printf (MSG_NOTE, "\n");
3704             }
3705         }
3706
3707       /* Adjust the minimal vectorization factor according to the
3708          vector type.  */
3709       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3710       if (vf > *min_vf)
3711         *min_vf = vf;
3712
3713       if (gather)
3714         {
3715           tree off;
3716
3717           gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3718           if (gather
3719               && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3720             gather = false;
3721           if (!gather)
3722             {
3723               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3724               free_data_ref (dr);
3725               if (dump_enabled_p ())
3726                 {
3727                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3728                                    "not vectorized: not suitable for gather "
3729                                    "load ");
3730                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3731                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3732                 }
3733               return false;
3734             }
3735           free_data_ref (datarefs[i]);  /* Superseded by the gather DR.  */
3736           datarefs[i] = dr;
3737           STMT_VINFO_GATHER_P (stmt_info) = true;
3738         }
3739       else if (loop_vinfo
3740                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3741         {
3742           if (nested_in_vect_loop_p (loop, stmt)
3743               || !DR_IS_READ (dr))
3744             {
3745               if (dump_enabled_p ())
3746                 {
3747                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3748                                    "not vectorized: not suitable for strided "
3749                                    "load ");
3750                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3751                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3752                 }
3753               return false;
3754             }
3755           STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3756         }
3757     }
3758
3759   /* If we stopped analysis at the first dataref we could not analyze
3760      when trying to vectorize a basic-block mark the rest of the datarefs
3761      as not vectorizable and truncate the vector of datarefs.  That
3762      avoids spending useless time in analyzing their dependence.  */
3763   if (i != datarefs.length ())
3764     {
3765       gcc_assert (bb_vinfo != NULL);
3766       for (unsigned j = i; j < datarefs.length (); ++j)
3767         {
3768           data_reference_p dr = datarefs[j];
3769           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3770           free_data_ref (dr);
3771         }
3772       datarefs.truncate (i);
3773     }
3774
3775   return true;
3776 }
3777
3778
3779 /* Function vect_get_new_vect_var.
3780
3781    Return a new variable of type TYPE, made with create_tmp_reg, for use by
3782    the vectorizer.  VAR_KIND selects the name prefix: "vect" for
3783    vect_simple_var, "stmp" for vect_scalar_var and "vectp" for
3784    vect_pointer_var.  The name is "<prefix>_<NAME>" if NAME is provided.  */
3785
3786 tree
3787 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3788 {
3789   const char *kind_prefix;
3790   char *full_name;
3791   tree var;
3792
3793   /* Select the prefix that identifies the kind of variable being
3794      created.  */
3795   if (var_kind == vect_simple_var)
3796     kind_prefix = "vect";
3797   else if (var_kind == vect_scalar_var)
3798     kind_prefix = "stmp";
3799   else if (var_kind == vect_pointer_var)
3800     kind_prefix = "vectp";
3801   else
3802     /* No other kind of variable is expected here.  */
3803     gcc_unreachable ();
3804
3805   /* Without NAME the prefix alone names the variable.  */
3806   if (!name)
3807     return create_tmp_reg (type, kind_prefix);
3808
3809   /* Otherwise build "<prefix>_<NAME>".  The concatenated string is only
3810      needed while the variable is created and is released right
3811      afterwards.  */
3812   full_name = concat (kind_prefix, "_", name, NULL);
3813   var = create_tmp_reg (type, full_name);
3814   free (full_name);
3815
3816   return var;
3817 }
3818
3819
3820 /* Function vect_create_addr_base_for_vector_ref.
3821
3822    Build the address of the first memory location accessed by the data
3823    reference of STMT, as a pointer to the vector type of STMT.
3824
3825    Input:
3826    STMT: The statement containing the data reference.
3827    NEW_STMT_LIST: Sequence to which the statements that compute the
3828             address are appended.
3829    OFFSET: Optional.  If supplied, it is multiplied by the size of the
3830             scalar access and added to the initial address.
3831    LOOP:    The loop relative to which the address is computed.  When the
3832             dataref is in an inner-loop nested in the outer-loop that is
3833             being vectorized, LOOP can be either loop.  For example, for
3834             the following dataref ('in' points to short):
3835
3836                 for (i=0; i<N; i++)
3837                    for (j=0; j<M; j++)
3838                      s += in[i+j]
3839
3840             the first memory location accessed is:
3841             if LOOP=i_loop:     &in             (relative to i_loop)
3842             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
3843
3844    Output:
3845    1. Return the gimple operand holding the address of the first vector
3846       of the data reference.
3847    2. The caller must insert the statements appended to NEW_STMT_LIST.
3848
3849    FORNOW: We are only handling array accesses with step 1.  */
3850
3851 tree
3852 vect_create_addr_base_for_vector_ref (gimple stmt,
3853                                       gimple_seq *new_stmt_list,
3854                                       tree offset,
3855                                       struct loop *loop)
3856 {
3857   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3858   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3859   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3860   tree elem_size = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3861   tree base, var_off, const_off, total_off;
3862   tree ptr_type, ptr_var, addr;
3863   const char *name;
3864   gimple_seq stmts = NULL;
3865   bool outer_p;
3866
3867   /* The outer-loop description of the access is used only when LOOP is
3868      not the loop that immediately contains STMT.  */
3869   outer_p = loop_vinfo && loop && loop != gimple_bb (stmt)->loop_father;
3870   if (outer_p)
3871     gcc_assert (nested_in_vect_loop_p (LOOP_VINFO_LOOP (loop_vinfo), stmt));
3872
3873   base = unshare_expr (outer_p
3874                        ? STMT_VINFO_DR_BASE_ADDRESS (stmt_info)
3875                        : DR_BASE_ADDRESS (dr));
3876   var_off = unshare_expr (outer_p
3877                           ? STMT_VINFO_DR_OFFSET (stmt_info)
3878                           : DR_OFFSET (dr));
3879   const_off = unshare_expr (outer_p
3880                             ? STMT_VINFO_DR_INIT (stmt_info)
3881                             : DR_INIT (dr));
3882
3883   /* Without a loop_vec_info the address is taken from the reference
3884      itself further down, so the offsets are zeroed.  */
3885   if (!loop_vinfo)
3886     {
3887       var_off = ssize_int (0);
3888       const_off = ssize_int (0);
3889       name = get_name (DR_REF (dr));
3890     }
3891   else
3892     name = get_name (base);
3893
3894   /* Add up the variable and the constant part of the offset.  */
3895   var_off = fold_convert (sizetype, var_off);
3896   const_off = fold_convert (sizetype, const_off);
3897   total_off = size_binop (PLUS_EXPR, var_off, const_off);
3898
3899   /* OFFSET is scaled by the size of the scalar access.  */
3900   if (offset)
3901     {
3902       tree scaled = fold_convert (sizetype, offset);
3903       offset = fold_build2 (MULT_EXPR, sizetype, scaled, elem_size);
3904       total_off = fold_build2 (PLUS_EXPR, sizetype, total_off, offset);
3905     }
3906
3907   /* base + total_off for loops; otherwise simply the address of the
3908      reference, to which TOTAL_OFF does not contribute.  */
3909   if (loop_vinfo)
3910     addr = fold_build_pointer_plus (base, total_off);
3911   else
3912     {
3913       tree ref_ptr_type = build_pointer_type (TREE_TYPE (DR_REF (dr)));
3914       addr = build1 (ADDR_EXPR, ref_ptr_type, unshare_expr (DR_REF (dr)));
3915     }
3916
3917   /* Gimplify the address as a pointer to the vector type.  */
3918   ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3919   addr = fold_convert (ptr_type, addr);
3920   ptr_var = vect_get_new_vect_var (ptr_type, vect_pointer_var, name);
3921   addr = force_gimple_operand (addr, &stmts, false, ptr_var);
3922   gimple_seq_add_seq (new_stmt_list, stmts);
3923
3924   /* Copy the points-to information of the data reference; once OFFSET
3925      has been added its alignment is marked as unknown.  */
3926   if (DR_PTR_INFO (dr) && TREE_CODE (addr) == SSA_NAME)
3927     {
3928       duplicate_ssa_name_ptr_info (addr, DR_PTR_INFO (dr));
3929       if (offset)
3930         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr));
3931     }
3932
3933   if (dump_enabled_p ())
3934     {
3935       dump_printf_loc (MSG_NOTE, vect_location, "created ");
3936       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr);
3937       dump_printf (MSG_NOTE, "\n");
3938     }
3939
3940   return addr;
3941 }
3942
3943
3944 /* Function vect_create_data_ref_ptr.
3945
3946    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3947    location accessed in the loop by STMT, along with the def-use update
3948    chain to appropriately advance the pointer through the loop iterations.
3949    Also set aliasing information for the pointer.  This pointer is used by
3950    the callers to this function to create a memory reference expression for
3951    vector load/store access.
3952
3953    Input:
3954    1. STMT: a stmt that references memory. Expected to be of the form
3955          GIMPLE_ASSIGN <name, data-ref> or
3956          GIMPLE_ASSIGN <data-ref, name>.
3957    2. AGGR_TYPE: the type of the reference, which should be either a vector
3958         or an array.
3959    3. AT_LOOP: the loop where the vector memref is to be created.
3960    4. OFFSET (optional): an offset to be added to the initial address accessed
3961         by the data-ref in STMT.
3962    5. BSI: location where the new stmts are to be placed if there is no loop
3963    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3964         pointing to the initial address.
3965
3966    Output:
3967    1. Declare a new ptr to vector_type, and have it point to the base of the
3968       data reference (initial addressed accessed by the data reference).
3969       For example, for vector of type V8HI, the following code is generated:
3970
3971       v8hi *ap;
3972       ap = (v8hi *)initial_address;
3973
3974       if OFFSET is not supplied:
3975          initial_address = &a[init];
3976       if OFFSET is supplied:
3977          initial_address = &a[init + OFFSET];
3978
3979       Return the initial_address in INITIAL_ADDRESS.
3980
3981    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
3982       update the pointer in each iteration of the loop.
3983
3984       Return the increment stmt that updates the pointer in PTR_INCR.
3985
3986    3. Set INV_P to true if the access pattern of the data reference in the
3987       vectorized loop is invariant.  Set it to false otherwise.
3988
3989    4. Return the pointer.  */
3990
3991 tree
3992 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3993                           tree offset, tree *initial_address,
3994                           gimple_stmt_iterator *gsi, gimple *ptr_incr,
3995                           bool only_init, bool *inv_p)
3996 {
3997   const char *base_name;
3998   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3999   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4000   struct loop *loop = NULL;
4001   bool nested_in_vect_loop = false;
4002   struct loop *containing_loop = NULL;
4003   tree aggr_ptr_type;
4004   tree aggr_ptr;
4005   tree new_temp;
4006   gimple vec_stmt;
4007   gimple_seq new_stmt_list = NULL;
4008   edge pe = NULL;
4009   basic_block new_bb;
4010   tree aggr_ptr_init;
4011   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4012   tree aptr;
4013   gimple_stmt_iterator incr_gsi;
4014   bool insert_after;
4015   tree indx_before_incr, indx_after_incr;
4016   gimple incr;
4017   tree step;
4018   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4019
4020   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4021               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4022
4023   if (loop_vinfo)
4024     {
4025       loop = LOOP_VINFO_LOOP (loop_vinfo);
4026       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4027       containing_loop = (gimple_bb (stmt))->loop_father;
4028       pe = loop_preheader_edge (loop);
4029     }
4030   else
4031     {
4032       gcc_assert (bb_vinfo);
4033       only_init = true;
4034       *ptr_incr = NULL;
4035     }
4036
4037   /* Check the step (evolution) of the load in LOOP, and record
4038      whether it's invariant.  */
4039   if (nested_in_vect_loop)
4040     step = STMT_VINFO_DR_STEP (stmt_info);
4041   else
4042     step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4043
4044   if (integer_zerop (step))
4045     *inv_p = true;
4046   else
4047     *inv_p = false;
4048
4049   /* Create an expression for the first address accessed by this load
4050      in LOOP.  */
4051   base_name = get_name (DR_BASE_ADDRESS (dr));
4052
4053   if (dump_enabled_p ())
4054     {
4055       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4056       dump_printf_loc (MSG_NOTE, vect_location,
4057                        "create %s-pointer variable to type: ",
4058                        get_tree_code_name (TREE_CODE (aggr_type)));
4059       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4060       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4061         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4062       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4063         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4064       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4065         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4066       else
4067         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4068       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4069       dump_printf (MSG_NOTE, "\n");
4070     }
4071
4072   /* (1) Create the new aggregate-pointer variable.
4073      Vector and array types inherit the alias set of their component
4074      type by default so we need to use a ref-all pointer if the data
4075      reference does not conflict with the created aggregated data
4076      reference because it is not addressable.  */
4077   bool need_ref_all = false;
4078   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4079                               get_alias_set (DR_REF (dr))))
4080     need_ref_all = true;
4081   /* Likewise for any of the data references in the stmt group.  */
4082   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4083     {
4084       gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4085       do
4086         {
4087           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4088           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4089           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4090                                       get_alias_set (DR_REF (sdr))))
4091             {
4092               need_ref_all = true;
4093               break;
4094             }
4095           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4096         }
4097       while (orig_stmt);
4098     }
4099   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4100                                                need_ref_all);
4101   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4102
4103
4104   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4105      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4106      def-use update cycles for the pointer: one relative to the outer-loop
4107      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4108      to the inner-loop (which is the inner-most loop containing the dataref),
4109      and this is done be step (5) below.
4110
4111      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4112      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4113      redundant.  Steps (3),(4) create the following:
4114
4115         vp0 = &base_addr;
4116         LOOP:   vp1 = phi(vp0,vp2)
4117                 ...
4118                 ...
4119                 vp2 = vp1 + step
4120                 goto LOOP
4121
4122      If there is an inner-loop nested in loop, then step (5) will also be
4123      applied, and an additional update in the inner-loop will be created:
4124
4125         vp0 = &base_addr;
4126         LOOP:   vp1 = phi(vp0,vp2)
4127                 ...
4128         inner:     vp3 = phi(vp1,vp4)
4129                    vp4 = vp3 + inner_step
4130                    if () goto inner
4131                 ...
4132                 vp2 = vp1 + step
4133                 if () goto LOOP   */
4134
4135   /* (2) Calculate the initial address of the aggregate-pointer, and set
4136      the aggregate-pointer to point to it before the loop.  */
4137
4138   /* Create: (&(base[init_val+offset]) in the loop preheader.  */
4139
4140   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4141                                                    offset, loop);
4142   if (new_stmt_list)
4143     {
4144       if (pe)
4145         {
4146           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4147           gcc_assert (!new_bb);
4148         }
4149       else
4150         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4151     }
4152
4153   *initial_address = new_temp;
4154
4155   /* Create: p = (aggr_type *) initial_base  */
4156   if (TREE_CODE (new_temp) != SSA_NAME
4157       || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4158     {
4159       vec_stmt = gimple_build_assign (aggr_ptr,
4160                                       fold_convert (aggr_ptr_type, new_temp));
4161       aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4162       /* Copy the points-to information if it exists. */
4163       if (DR_PTR_INFO (dr))
4164         duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
4165       gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4166       if (pe)
4167         {
4168           new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4169           gcc_assert (!new_bb);
4170         }
4171       else
4172         gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4173     }
4174   else
4175     aggr_ptr_init = new_temp;
4176
4177   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4178      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4179      inner-loop nested in LOOP (during outer-loop vectorization).  */
4180
4181   /* No update in loop is required.  */
4182   if (only_init && (!loop_vinfo || at_loop == loop))
4183     aptr = aggr_ptr_init;
4184   else
4185     {
4186       /* The step of the aggregate pointer is the type size.  */
4187       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4188       /* One exception to the above is when the scalar step of the load in
4189          LOOP is zero. In this case the step here is also zero.  */
4190       if (*inv_p)
4191         iv_step = size_zero_node;
4192       else if (tree_int_cst_sgn (step) == -1)
4193         iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4194
4195       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4196
4197       create_iv (aggr_ptr_init,
4198                  fold_convert (aggr_ptr_type, iv_step),
4199                  aggr_ptr, loop, &incr_gsi, insert_after,
4200                  &indx_before_incr, &indx_after_incr);
4201       incr = gsi_stmt (incr_gsi);
4202       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4203
4204       /* Copy the points-to information if it exists. */
4205       if (DR_PTR_INFO (dr))
4206         {
4207           duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4208           duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4209         }
4210       if (ptr_incr)
4211         *ptr_incr = incr;
4212
4213       aptr = indx_before_incr;
4214     }
4215
4216   if (!nested_in_vect_loop || only_init)
4217     return aptr;
4218
4219
4220   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4221      nested in LOOP, if exists.  */
4222
4223   gcc_assert (nested_in_vect_loop);
4224   if (!only_init)
4225     {
4226       standard_iv_increment_position (containing_loop, &incr_gsi,
4227                                       &insert_after);
4228       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4229                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4230                  &indx_after_incr);
4231       incr = gsi_stmt (incr_gsi);
4232       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4233
4234       /* Copy the points-to information if it exists. */
4235       if (DR_PTR_INFO (dr))
4236         {
4237           duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4238           duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4239         }
4240       if (ptr_incr)
4241         *ptr_incr = incr;
4242
4243       return indx_before_incr;
4244     }
4245   else
4246     gcc_unreachable ();
4247 }
4248
4249
4250 /* Function bump_vector_ptr
4251
4252    Increment a pointer (to a vector type) by vector-size. If requested,
4253    i.e. if PTR-INCR is given, then also connect the new increment stmt
4254    to the existing def-use update-chain of the pointer, by modifying
4255    the PTR_INCR as illustrated below:
4256
4257    The pointer def-use update-chain before this function:
4258                         DATAREF_PTR = phi (p_0, p_2)
4259                         ....
4260         PTR_INCR:       p_2 = DATAREF_PTR + step
4261
4262    The pointer def-use update-chain after this function:
4263                         DATAREF_PTR = phi (p_0, p_2)
4264                         ....
4265                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4266                         ....
4267         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4268
4269    Input:
4270    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4271                  in the loop.
4272    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4273               the loop.  The increment amount across iterations is expected
4274               to be vector_size.
4275    BSI - location where the new update stmt is to be placed.
4276    STMT - the original scalar memory-access stmt that is being vectorized.
4277    BUMP - optional. The offset by which to bump the pointer. If not given,
4278           the offset is assumed to be vector_size.
4279
4280    Output: Return NEW_DATAREF_PTR as illustrated above.
4281
4282 */
4283
4284 tree
4285 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4286                  gimple stmt, tree bump)
4287 {
4288   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4289   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4290   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4291   tree update = TYPE_SIZE_UNIT (vectype);
4292   gimple incr_stmt;
4293   ssa_op_iter iter;
4294   use_operand_p use_p;
4295   tree new_dataref_ptr;
4296
4297   if (bump)
4298     update = bump;
4299
4300   new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
4301   incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
4302                                             dataref_ptr, update);
4303   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4304
4305   /* Copy the points-to information if it exists. */
4306   if (DR_PTR_INFO (dr))
4307     {
4308       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4309       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4310     }
4311
4312   if (!ptr_incr)
4313     return new_dataref_ptr;
4314
4315   /* Update the vector-pointer's cross-iteration increment.  */
4316   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4317     {
4318       tree use = USE_FROM_PTR (use_p);
4319
4320       if (use == dataref_ptr)
4321         SET_USE (use_p, new_dataref_ptr);
4322       else
4323         gcc_assert (tree_int_cst_compare (use, update) == 0);
4324     }
4325
4326   return new_dataref_ptr;
4327 }
4328
4329
4330 /* Function vect_create_destination_var.
4331
4332    Create a new temporary of type VECTYPE.  */
4333
4334 tree
4335 vect_create_destination_var (tree scalar_dest, tree vectype)
4336 {
4337   tree vec_dest;
4338   const char *name;
4339   char *new_name;
4340   tree type;
4341   enum vect_var_kind kind;
4342
4343   kind = vectype ? vect_simple_var : vect_scalar_var;
4344   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4345
4346   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4347
4348   name = get_name (scalar_dest);
4349   if (name)
4350     asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4351   else
4352     asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
4353   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4354   free (new_name);
4355
4356   return vec_dest;
4357 }
4358
4359 /* Function vect_grouped_store_supported.
4360
4361    Returns TRUE if interleave high and interleave low permutations
4362    are supported, and FALSE otherwise.  */
4363
4364 bool
4365 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4366 {
4367   enum machine_mode mode = TYPE_MODE (vectype);
4368
4369   /* vect_permute_store_chain requires the group size to be equal to 3 or
4370      be a power of two.  */
4371   if (count != 3 && exact_log2 (count) == -1)
4372     {
4373       if (dump_enabled_p ())
4374         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4375                          "the size of the group of accesses"
4376                          " is not a power of 2 or not eqaul to 3\n");
4377       return false;
4378     }
4379
4380   /* Check that the permutation is supported.  */
4381   if (VECTOR_MODE_P (mode))
4382     {
4383       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4384       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4385
4386       if (count == 3)
4387         {
4388           unsigned int j0 = 0, j1 = 0, j2 = 0;
4389           unsigned int i, j;
4390
4391           for (j = 0; j < 3; j++)
4392             {
4393               int nelt0 = ((3 - j) * nelt) % 3;
4394               int nelt1 = ((3 - j) * nelt + 1) % 3;
4395               int nelt2 = ((3 - j) * nelt + 2) % 3;
4396               for (i = 0; i < nelt; i++)
4397                 {
4398                   if (3 * i + nelt0 < nelt)
4399                     sel[3 * i + nelt0] = j0++;
4400                   if (3 * i + nelt1 < nelt)
4401                     sel[3 * i + nelt1] = nelt + j1++;
4402                   if (3 * i + nelt2 < nelt)
4403                     sel[3 * i + nelt2] = 0;
4404                 }
4405               if (!can_vec_perm_p (mode, false, sel))
4406                 {
4407                   if (dump_enabled_p ())
4408                     dump_printf (MSG_MISSED_OPTIMIZATION,
4409                                  "permutaion op not supported by target.\n");
4410                   return false;
4411                 }
4412
4413               for (i = 0; i < nelt; i++)
4414                 {
4415                   if (3 * i + nelt0 < nelt)
4416                     sel[3 * i + nelt0] = 3 * i + nelt0;
4417                   if (3 * i + nelt1 < nelt)
4418                     sel[3 * i + nelt1] = 3 * i + nelt1;
4419                   if (3 * i + nelt2 < nelt)
4420                     sel[3 * i + nelt2] = nelt + j2++;
4421                 }
4422               if (!can_vec_perm_p (mode, false, sel))
4423                 {
4424                   if (dump_enabled_p ())
4425                     dump_printf (MSG_MISSED_OPTIMIZATION,
4426                                  "permutaion op not supported by target.\n");
4427                   return false;
4428                 }
4429             }
4430           return true;
4431         }
4432       else
4433         {
4434           /* If length is not equal to 3 then only power of 2 is supported.  */
4435           gcc_assert (exact_log2 (count) != -1);
4436
4437           for (i = 0; i < nelt / 2; i++)
4438             {
4439               sel[i * 2] = i;
4440               sel[i * 2 + 1] = i + nelt;
4441             }
4442             if (can_vec_perm_p (mode, false, sel))
4443               {
4444                 for (i = 0; i < nelt; i++)
4445                   sel[i] += nelt / 2;
4446                 if (can_vec_perm_p (mode, false, sel))
4447                   return true;
4448               }
4449         }
4450     }
4451
4452   if (dump_enabled_p ())
4453     dump_printf (MSG_MISSED_OPTIMIZATION,
4454                  "permutaion op not supported by target.\n");
4455   return false;
4456 }
4457
4458
4459 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4460    type VECTYPE.  */
4461
4462 bool
4463 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4464 {
4465   return vect_lanes_optab_supported_p ("vec_store_lanes",
4466                                        vec_store_lanes_optab,
4467                                        vectype, count);
4468 }
4469
4470
4471 /* Function vect_permute_store_chain.
4472
4473    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4474    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4475    the data correctly for the stores.  Return the final references for stores
4476    in RESULT_CHAIN.
4477
4478    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4479    The input is 4 vectors each containing 8 elements.  We assign a number to
4480    each element, the input sequence is:
4481
4482    1st vec:   0  1  2  3  4  5  6  7
4483    2nd vec:   8  9 10 11 12 13 14 15
4484    3rd vec:  16 17 18 19 20 21 22 23
4485    4th vec:  24 25 26 27 28 29 30 31
4486
4487    The output sequence should be:
4488
4489    1st vec:  0  8 16 24  1  9 17 25
4490    2nd vec:  2 10 18 26  3 11 19 27
4491    3rd vec:  4 12 20 28  5 13 21 30
4492    4th vec:  6 14 22 30  7 15 23 31
4493
4494    i.e., we interleave the contents of the four vectors in their order.
4495
4496    We use interleave_high/low instructions to create such output.  The input of
4497    each interleave_high/low operation is two vectors:
4498    1st vec    2nd vec
4499    0 1 2 3    4 5 6 7
4500    the even elements of the result vector are obtained left-to-right from the
4501    high/low elements of the first vector.  The odd elements of the result are
4502    obtained left-to-right from the high/low elements of the second vector.
4503    The output of interleave_high will be:   0 4 1 5
4504    and of interleave_low:                   2 6 3 7
4505
4506
4507    The permutation is done in log LENGTH stages.  In each stage interleave_high
4508    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4509    where the first argument is taken from the first half of DR_CHAIN and the
4510    second argument from it's second half.
4511    In our example,
4512
4513    I1: interleave_high (1st vec, 3rd vec)
4514    I2: interleave_low (1st vec, 3rd vec)
4515    I3: interleave_high (2nd vec, 4th vec)
4516    I4: interleave_low (2nd vec, 4th vec)
4517
4518    The output for the first stage is:
4519
4520    I1:  0 16  1 17  2 18  3 19
4521    I2:  4 20  5 21  6 22  7 23
4522    I3:  8 24  9 25 10 26 11 27
4523    I4: 12 28 13 29 14 30 15 31
4524
4525    The output of the second stage, i.e. the final result is:
4526
4527    I1:  0  8 16 24  1  9 17 25
4528    I2:  2 10 18 26  3 11 19 27
4529    I3:  4 12 20 28  5 13 21 30
4530    I4:  6 14 22 30  7 15 23 31.  */
4531
4532 void
4533 vect_permute_store_chain (vec<tree> dr_chain,
4534                           unsigned int length,
4535                           gimple stmt,
4536                           gimple_stmt_iterator *gsi,
4537                           vec<tree> *result_chain)
4538 {
4539   tree vect1, vect2, high, low;
4540   gimple perm_stmt;
4541   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4542   tree perm_mask_low, perm_mask_high;
4543   tree data_ref;
4544   tree perm3_mask_low, perm3_mask_high;
4545   unsigned int i, n, log_length = exact_log2 (length);
4546   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4547   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4548
4549   result_chain->quick_grow (length);
4550   memcpy (result_chain->address (), dr_chain.address (),
4551           length * sizeof (tree));
4552
4553   if (length == 3)
4554     {
4555       unsigned int j0 = 0, j1 = 0, j2 = 0;
4556
4557       for (j = 0; j < 3; j++)
4558         {
4559           int nelt0 = ((3 - j) * nelt) % 3;
4560           int nelt1 = ((3 - j) * nelt + 1) % 3;
4561           int nelt2 = ((3 - j) * nelt + 2) % 3;
4562
4563           for (i = 0; i < nelt; i++)
4564             {
4565               if (3 * i + nelt0 < nelt)
4566                 sel[3 * i + nelt0] = j0++;
4567               if (3 * i + nelt1 < nelt)
4568                 sel[3 * i + nelt1] = nelt + j1++;
4569               if (3 * i + nelt2 < nelt)
4570                 sel[3 * i + nelt2] = 0;
4571             }
4572           perm3_mask_low = vect_gen_perm_mask (vectype, sel);
4573           gcc_assert (perm3_mask_low != NULL);
4574
4575           for (i = 0; i < nelt; i++)
4576             {
4577               if (3 * i + nelt0 < nelt)
4578                 sel[3 * i + nelt0] = 3 * i + nelt0;
4579               if (3 * i + nelt1 < nelt)
4580                 sel[3 * i + nelt1] = 3 * i + nelt1;
4581               if (3 * i + nelt2 < nelt)
4582                 sel[3 * i + nelt2] = nelt + j2++;
4583             }
4584           perm3_mask_high = vect_gen_perm_mask (vectype, sel);
4585           gcc_assert (perm3_mask_high != NULL);
4586
4587           vect1 = dr_chain[0];
4588           vect2 = dr_chain[1];
4589
4590           /* Create interleaving stmt:
4591              low = VEC_PERM_EXPR <vect1, vect2,
4592                                   {j, nelt, *, j + 1, nelt + j + 1, *,
4593                                    j + 2, nelt + j + 2, *, ...}>  */
4594           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4595           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4596                                                     vect1, vect2,
4597                                                     perm3_mask_low);
4598           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4599
4600           vect1 = data_ref;
4601           vect2 = dr_chain[2];
4602           /* Create interleaving stmt:
4603              low = VEC_PERM_EXPR <vect1, vect2,
4604                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
4605                                    6, 7, nelt + j + 2, ...}>  */
4606           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4607           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4608                                                     vect1, vect2,
4609                                                     perm3_mask_high);
4610           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4611           (*result_chain)[j] = data_ref;
4612         }
4613     }
4614   else
4615     {
4616       /* If length is not equal to 3 then only power of 2 is supported.  */
4617       gcc_assert (exact_log2 (length) != -1);
4618
4619       for (i = 0, n = nelt / 2; i < n; i++)
4620         {
4621           sel[i * 2] = i;
4622           sel[i * 2 + 1] = i + nelt;
4623         }
4624         perm_mask_high = vect_gen_perm_mask (vectype, sel);
4625         gcc_assert (perm_mask_high != NULL);
4626
4627         for (i = 0; i < nelt; i++)
4628           sel[i] += nelt / 2;
4629         perm_mask_low = vect_gen_perm_mask (vectype, sel);
4630         gcc_assert (perm_mask_low != NULL);
4631
4632         for (i = 0, n = log_length; i < n; i++)
4633           {
4634             for (j = 0; j < length/2; j++)
4635               {
4636                 vect1 = dr_chain[j];
4637                 vect2 = dr_chain[j+length/2];
4638
4639                 /* Create interleaving stmt:
4640                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4641                                                         ...}>  */
4642                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4643                 perm_stmt
4644                   = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4645                                                   vect1, vect2, perm_mask_high);
4646                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4647                 (*result_chain)[2*j] = high;
4648
4649                 /* Create interleaving stmt:
4650                    low = VEC_PERM_EXPR <vect1, vect2,
4651                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4652                                          ...}>  */
4653                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4654                 perm_stmt
4655                   = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4656                                                   vect1, vect2, perm_mask_low);
4657                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4658                 (*result_chain)[2*j+1] = low;
4659               }
4660             memcpy (dr_chain.address (), result_chain->address (),
4661                     length * sizeof (tree));
4662           }
4663     }
4664 }
4665
4666 /* Function vect_setup_realignment
4667
4668    This function is called when vectorizing an unaligned load using
4669    the dr_explicit_realign[_optimized] scheme.
4670    This function generates the following code at the loop prolog:
4671
4672       p = initial_addr;
4673    x  msq_init = *(floor(p));   # prolog load
4674       realignment_token = call target_builtin;
4675     loop:
4676    x  msq = phi (msq_init, ---)
4677
4678    The stmts marked with x are generated only for the case of
4679    dr_explicit_realign_optimized.
4680
4681    The code above sets up a new (vector) pointer, pointing to the first
4682    location accessed by STMT, and a "floor-aligned" load using that pointer.
4683    It also generates code to compute the "realignment-token" (if the relevant
4684    target hook was defined), and creates a phi-node at the loop-header bb
4685    whose arguments are the result of the prolog-load (created by this
4686    function) and the result of a load that takes place in the loop (to be
4687    created by the caller to this function).
4688
4689    For the case of dr_explicit_realign_optimized:
4690    The caller to this function uses the phi-result (msq) to create the
4691    realignment code inside the loop, and sets up the missing phi argument,
4692    as follows:
4693     loop:
4694       msq = phi (msq_init, lsq)
4695       lsq = *(floor(p'));        # load in loop
4696       result = realign_load (msq, lsq, realignment_token);
4697
4698    For the case of dr_explicit_realign:
4699     loop:
4700       msq = *(floor(p));        # load in loop
4701       p' = p + (VS-1);
4702       lsq = *(floor(p'));       # load in loop
4703       result = realign_load (msq, lsq, realignment_token);
4704
4705    Input:
4706    STMT - (scalar) load stmt to be vectorized. This load accesses
4707           a memory location that may be unaligned.
4708    BSI - place where new code is to be inserted.
4709    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4710                               is used.
4711
4712    Output:
4713    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4714                        target hook, if defined.
4715    Return value - the result of the loop-header phi node.  */
4716
4717 tree
4718 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4719                         tree *realignment_token,
4720                         enum dr_alignment_support alignment_support_scheme,
4721                         tree init_addr,
4722                         struct loop **at_loop)
4723 {
4724   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4725   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4726   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4727   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4728   struct loop *loop = NULL;
4729   edge pe = NULL;
4730   tree scalar_dest = gimple_assign_lhs (stmt);
4731   tree vec_dest;
4732   gimple inc;
4733   tree ptr;
4734   tree data_ref;
4735   gimple new_stmt;
4736   basic_block new_bb;
4737   tree msq_init = NULL_TREE;
4738   tree new_temp;
4739   gimple phi_stmt;
4740   tree msq = NULL_TREE;
4741   gimple_seq stmts = NULL;
4742   bool inv_p;
4743   bool compute_in_loop = false;
4744   bool nested_in_vect_loop = false;
4745   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4746   struct loop *loop_for_initial_load = NULL;
4747
4748   if (loop_vinfo)
4749     {
4750       loop = LOOP_VINFO_LOOP (loop_vinfo);
4751       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4752     }
4753
4754   gcc_assert (alignment_support_scheme == dr_explicit_realign
4755               || alignment_support_scheme == dr_explicit_realign_optimized);
4756
4757   /* We need to generate three things:
4758      1. the misalignment computation
4759      2. the extra vector load (for the optimized realignment scheme).
4760      3. the phi node for the two vectors from which the realignment is
4761       done (for the optimized realignment scheme).  */
4762
4763   /* 1. Determine where to generate the misalignment computation.
4764
4765      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4766      calculation will be generated by this function, outside the loop (in the
4767      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4768      caller, inside the loop.
4769
4770      Background: If the misalignment remains fixed throughout the iterations of
4771      the loop, then both realignment schemes are applicable, and also the
4772      misalignment computation can be done outside LOOP.  This is because we are
4773      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4774      are a multiple of VS (the Vector Size), and therefore the misalignment in
4775      different vectorized LOOP iterations is always the same.
4776      The problem arises only if the memory access is in an inner-loop nested
4777      inside LOOP, which is now being vectorized using outer-loop vectorization.
4778      This is the only case when the misalignment of the memory access may not
4779      remain fixed throughout the iterations of the inner-loop (as explained in
4780      detail in vect_supportable_dr_alignment).  In this case, not only is the
4781      optimized realignment scheme not applicable, but also the misalignment
4782      computation (and generation of the realignment token that is passed to
4783      REALIGN_LOAD) have to be done inside the loop.
4784
4785      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4786      or not, which in turn determines if the misalignment is computed inside
4787      the inner-loop, or outside LOOP.  */
4788
4789   if (init_addr != NULL_TREE || !loop_vinfo)
4790     {
4791       compute_in_loop = true;
4792       gcc_assert (alignment_support_scheme == dr_explicit_realign);
4793     }
4794
4795
4796   /* 2. Determine where to generate the extra vector load.
4797
4798      For the optimized realignment scheme, instead of generating two vector
4799      loads in each iteration, we generate a single extra vector load in the
4800      preheader of the loop, and in each iteration reuse the result of the
4801      vector load from the previous iteration.  In case the memory access is in
4802      an inner-loop nested inside LOOP, which is now being vectorized using
4803      outer-loop vectorization, we need to determine whether this initial vector
4804      load should be generated at the preheader of the inner-loop, or can be
4805      generated at the preheader of LOOP.  If the memory access has no evolution
4806      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4807      to be generated inside LOOP (in the preheader of the inner-loop).  */
4808
4809   if (nested_in_vect_loop)
4810     {
4811       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4812       bool invariant_in_outerloop =
4813             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4814       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4815     }
4816   else
4817     loop_for_initial_load = loop;
4818   if (at_loop)
4819     *at_loop = loop_for_initial_load;
4820
4821   if (loop_for_initial_load)
4822     pe = loop_preheader_edge (loop_for_initial_load);
4823
4824   /* 3. For the case of the optimized realignment, create the first vector
4825       load at the loop preheader.  */
4826
4827   if (alignment_support_scheme == dr_explicit_realign_optimized)
4828     {
4829       /* Create msq_init = *(floor(p1)) in the loop preheader  */
4830
4831       gcc_assert (!compute_in_loop);
4832       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4833       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4834                                       NULL_TREE, &init_addr, NULL, &inc,
4835                                       true, &inv_p);
4836       new_temp = copy_ssa_name (ptr, NULL);
4837       new_stmt = gimple_build_assign_with_ops
4838                    (BIT_AND_EXPR, new_temp, ptr,
4839                     build_int_cst (TREE_TYPE (ptr),
4840                                    -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4841       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4842       gcc_assert (!new_bb);
4843       data_ref
4844         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4845                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4846       new_stmt = gimple_build_assign (vec_dest, data_ref);
4847       new_temp = make_ssa_name (vec_dest, new_stmt);
4848       gimple_assign_set_lhs (new_stmt, new_temp);
4849       if (pe)
4850         {
4851           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4852           gcc_assert (!new_bb);
4853         }
4854       else
4855          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4856
4857       msq_init = gimple_assign_lhs (new_stmt);
4858     }
4859
4860   /* 4. Create realignment token using a target builtin, if available.
4861       It is done either inside the containing loop, or before LOOP (as
4862       determined above).  */
4863
4864   if (targetm.vectorize.builtin_mask_for_load)
4865     {
4866       tree builtin_decl;
4867
4868       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
4869       if (!init_addr)
4870         {
4871           /* Generate the INIT_ADDR computation outside LOOP.  */
4872           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4873                                                         NULL_TREE, loop);
4874           if (loop)
4875             {
4876               pe = loop_preheader_edge (loop);
4877               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4878               gcc_assert (!new_bb);
4879             }
4880           else
4881              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4882         }
4883
4884       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4885       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4886       vec_dest =
4887         vect_create_destination_var (scalar_dest,
4888                                      gimple_call_return_type (new_stmt));
4889       new_temp = make_ssa_name (vec_dest, new_stmt);
4890       gimple_call_set_lhs (new_stmt, new_temp);
4891
4892       if (compute_in_loop)
4893         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4894       else
4895         {
4896           /* Generate the misalignment computation outside LOOP.  */
4897           pe = loop_preheader_edge (loop);
4898           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4899           gcc_assert (!new_bb);
4900         }
4901
4902       *realignment_token = gimple_call_lhs (new_stmt);
4903
4904       /* The result of the CALL_EXPR to this builtin is determined from
4905          the value of the parameter and no global variables are touched
4906          which makes the builtin a "const" function.  Requiring the
4907          builtin to have the "const" attribute makes it unnecessary
4908          to call mark_call_clobbered.  */
4909       gcc_assert (TREE_READONLY (builtin_decl));
4910     }
4911
4912   if (alignment_support_scheme == dr_explicit_realign)
4913     return msq;
4914
4915   gcc_assert (!compute_in_loop);
4916   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4917
4918
4919   /* 5. Create msq = phi <msq_init, lsq> in loop  */
4920
4921   pe = loop_preheader_edge (containing_loop);
4922   vec_dest = vect_create_destination_var (scalar_dest, vectype);
4923   msq = make_ssa_name (vec_dest, NULL);
4924   phi_stmt = create_phi_node (msq, containing_loop->header);
4925   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4926
4927   return msq;
4928 }
4929
4930
4931 /* Function vect_grouped_load_supported.
4932
4933    Returns TRUE if even and odd permutations are supported,
4934    and FALSE otherwise.  */
4935
4936 bool
4937 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4938 {
4939   enum machine_mode mode = TYPE_MODE (vectype);
4940
4941   /* vect_permute_load_chain requires the group size to be equal to 3 or
4942      be a power of two.  */
4943   if (count != 3 && exact_log2 (count) == -1)
4944     {
4945       if (dump_enabled_p ())
4946         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4947                          "the size of the group of accesses"
4948                          " is not a power of 2 or not equal to 3\n");
4949       return false;
4950     }
4951
4952   /* Check that the permutation is supported.  */
4953   if (VECTOR_MODE_P (mode))
4954     {
4955       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
4956       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4957
4958       if (count == 3)
4959         {
4960           unsigned int k;
4961           for (k = 0; k < 3; k++)
4962             {
4963               for (i = 0; i < nelt; i++)
4964                 if (3 * i + k < 2 * nelt)
4965                   sel[i] = 3 * i + k;
4966                 else
4967                   sel[i] = 0;
4968               if (!can_vec_perm_p (mode, false, sel))
4969                 {
4970                   if (dump_enabled_p ())
4971                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4972                                      "shuffle of 3 loads is not supported by"
4973                                      " target\n");
4974                     return false;
4975                 }
4976               for (i = 0, j = 0; i < nelt; i++)
4977                 if (3 * i + k < 2 * nelt)
4978                   sel[i] = i;
4979                 else
4980                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
4981               if (!can_vec_perm_p (mode, false, sel))
4982                 {
4983                   if (dump_enabled_p ())
4984                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4985                                      "shuffle of 3 loads is not supported by"
4986                                      " target\n");
4987                   return false;
4988                 }
4989             }
4990           return true;
4991         }
4992       else
4993         {
4994           /* If length is not equal to 3 then only power of 2 is supported.  */
4995           gcc_assert (exact_log2 (count) != -1);
4996           for (i = 0; i < nelt; i++)
4997             sel[i] = i * 2;
4998           if (can_vec_perm_p (mode, false, sel))
4999             {
5000               for (i = 0; i < nelt; i++)
5001                 sel[i] = i * 2 + 1;
5002               if (can_vec_perm_p (mode, false, sel))
5003                 return true;
5004             }
5005         }
5006     }
5007
5008   if (dump_enabled_p ())
5009     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5010                      "extract even/odd not supported by target\n");
5011   return false;
5012 }
5013
5014 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5015    type VECTYPE.  */
5016
5017 bool
5018 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5019 {
5020   return vect_lanes_optab_supported_p ("vec_load_lanes",
5021                                        vec_load_lanes_optab,
5022                                        vectype, count);
5023 }
5024
5025 /* Function vect_permute_load_chain.
5026
5027    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5028    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5029    the input data correctly.  Return the final references for loads in
5030    RESULT_CHAIN.
5031
5032    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5033    The input is 4 vectors each containing 8 elements. We assign a number to each
5034    element, the input sequence is:
5035
5036    1st vec:   0  1  2  3  4  5  6  7
5037    2nd vec:   8  9 10 11 12 13 14 15
5038    3rd vec:  16 17 18 19 20 21 22 23
5039    4th vec:  24 25 26 27 28 29 30 31
5040
5041    The output sequence should be:
5042
5043    1st vec:  0 4  8 12 16 20 24 28
5044    2nd vec:  1 5  9 13 17 21 25 29
5045    3rd vec:  2 6 10 14 18 22 26 30
5046    4th vec:  3 7 11 15 19 23 27 31
5047
5048    i.e., the first output vector should contain the first elements of each
5049    interleaving group, etc.
5050
5051    We use extract_even/odd instructions to create such output.  The input of
5052    each extract_even/odd operation is two vectors
5053    1st vec    2nd vec
5054    0 1 2 3    4 5 6 7
5055
5056    and the output is the vector of extracted even/odd elements.  The output of
5057    extract_even will be:   0 2 4 6
5058    and of extract_odd:     1 3 5 7
5059
5060
5061    The permutation is done in log LENGTH stages.  In each stage extract_even
5062    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5063    their order.  In our example,
5064
5065    E1: extract_even (1st vec, 2nd vec)
5066    E2: extract_odd (1st vec, 2nd vec)
5067    E3: extract_even (3rd vec, 4th vec)
5068    E4: extract_odd (3rd vec, 4th vec)
5069
5070    The output for the first stage will be:
5071
5072    E1:  0  2  4  6  8 10 12 14
5073    E2:  1  3  5  7  9 11 13 15
5074    E3: 16 18 20 22 24 26 28 30
5075    E4: 17 19 21 23 25 27 29 31
5076
5077    In order to proceed and create the correct sequence for the next stage (or
5078    for the correct output, if the second stage is the last one, as in our
5079    example), we first put the output of extract_even operation and then the
5080    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5081    The input for the second stage is:
5082
5083    1st vec (E1):  0  2  4  6  8 10 12 14
5084    2nd vec (E3): 16 18 20 22 24 26 28 30
5085    3rd vec (E2):  1  3  5  7  9 11 13 15
5086    4th vec (E4): 17 19 21 23 25 27 29 31
5087
5088    The output of the second stage:
5089
5090    E1: 0 4  8 12 16 20 24 28
5091    E2: 2 6 10 14 18 22 26 30
5092    E3: 1 5  9 13 17 21 25 29
5093    E4: 3 7 11 15 19 23 27 31
5094
5095    And RESULT_CHAIN after reordering:
5096
5097    1st vec (E1):  0 4  8 12 16 20 24 28
5098    2nd vec (E3):  1 5  9 13 17 21 25 29
5099    3rd vec (E2):  2 6 10 14 18 22 26 30
5100    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5101
5102 static void
5103 vect_permute_load_chain (vec<tree> dr_chain,
5104                          unsigned int length,
5105                          gimple stmt,
5106                          gimple_stmt_iterator *gsi,
5107                          vec<tree> *result_chain)
5108 {
5109   tree data_ref, first_vect, second_vect;
5110   tree perm_mask_even, perm_mask_odd;
5111   tree perm3_mask_low, perm3_mask_high;
5112   gimple perm_stmt;
5113   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5114   unsigned int i, j, log_length = exact_log2 (length);
5115   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5116   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5117
5118   result_chain->quick_grow (length);
5119   memcpy (result_chain->address (), dr_chain.address (),
5120           length * sizeof (tree));
5121
5122   if (length == 3)
5123     {
5124       unsigned int k;
5125
5126       for (k = 0; k < 3; k++)
5127         {
5128           for (i = 0; i < nelt; i++)
5129             if (3 * i + k < 2 * nelt)
5130               sel[i] = 3 * i + k;
5131             else
5132               sel[i] = 0;
5133           perm3_mask_low = vect_gen_perm_mask (vectype, sel);
5134           gcc_assert (perm3_mask_low != NULL);
5135
5136           for (i = 0, j = 0; i < nelt; i++)
5137             if (3 * i + k < 2 * nelt)
5138               sel[i] = i;
5139             else
5140               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5141
5142           perm3_mask_high = vect_gen_perm_mask (vectype, sel);
5143           gcc_assert (perm3_mask_high != NULL);
5144
5145           first_vect = dr_chain[0];
5146           second_vect = dr_chain[1];
5147
5148           /* Create interleaving stmt (low part of):
5149              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5150                                                              ...}>  */
5151           data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_low");
5152           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5153                                                     first_vect, second_vect,
5154                                                     perm3_mask_low);
5155           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5156
5157           /* Create interleaving stmt (high part of):
5158              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5159                                                               ...}>  */
5160           first_vect = data_ref;
5161           second_vect = dr_chain[2];
5162           data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3_high");
5163           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5164                                                     first_vect, second_vect,
5165                                                     perm3_mask_high);
5166           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5167           (*result_chain)[k] = data_ref;
5168         }
5169     }
5170   else
5171     {
5172       /* If length is not equal to 3 then only power of 2 is supported.  */
5173       gcc_assert (exact_log2 (length) != -1);
5174
5175       for (i = 0; i < nelt; ++i)
5176         sel[i] = i * 2;
5177       perm_mask_even = vect_gen_perm_mask (vectype, sel);
5178       gcc_assert (perm_mask_even != NULL);
5179
5180       for (i = 0; i < nelt; ++i)
5181         sel[i] = i * 2 + 1;
5182       perm_mask_odd = vect_gen_perm_mask (vectype, sel);
5183       gcc_assert (perm_mask_odd != NULL);
5184
5185       for (i = 0; i < log_length; i++)
5186         {
5187           for (j = 0; j < length; j += 2)
5188             {
5189               first_vect = dr_chain[j];
5190               second_vect = dr_chain[j+1];
5191
5192               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5193               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5194               perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5195                                                         first_vect, second_vect,
5196                                                         perm_mask_even);
5197               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5198               (*result_chain)[j/2] = data_ref;
5199
5200               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5201               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5202               perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5203                                                         first_vect, second_vect,
5204                                                         perm_mask_odd);
5205               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5206               (*result_chain)[j/2+length/2] = data_ref;
5207             }
5208           memcpy (dr_chain.address (), result_chain->address (),
5209                   length * sizeof (tree));
5210         }
5211     }
5212 }
5213
5214 /* Function vect_shift_permute_load_chain.
5215
5216    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5217    sequence of stmts to reorder the input data accordingly.
5218    Return the final references for loads in RESULT_CHAIN.
5219    Return true if successful, false otherwise.
5220
5221    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5222    The input is 3 vectors each containing 8 elements.  We assign a
5223    number to each element, the input sequence is:
5224
5225    1st vec:   0  1  2  3  4  5  6  7
5226    2nd vec:   8  9 10 11 12 13 14 15
5227    3rd vec:  16 17 18 19 20 21 22 23
5228
5229    The output sequence should be:
5230
5231    1st vec:  0 3 6  9 12 15 18 21
5232    2nd vec:  1 4 7 10 13 16 19 22
5233    3rd vec:  2 5 8 11 14 17 20 23
5234
5235    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5236
5237    First we shuffle all 3 vectors to get correct elements order:
5238
5239    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5240    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5241    3rd vec:  (16 19 22) (17 20 23) (18 21)
5242
5243    Next we unite and shift vector 3 times:
5244
5245    1st step:
5246      shift right by 6 the concatenation of:
5247      "1st vec" and  "2nd vec"
5248        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5249      "2nd vec" and  "3rd vec"
5250        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5251      "3rd vec" and  "1st vec"
5252        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5253                              | New vectors                   |
5254
5255      So that now new vectors are:
5256
5257      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5258      2nd vec:  (10 13) (16 19 22) (17 20 23)
5259      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5260
5261    2nd step:
5262      shift right by 5 the concatenation of:
5263      "1st vec" and  "3rd vec"
5264        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5265      "2nd vec" and  "1st vec"
5266        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5267      "3rd vec" and  "2nd vec"
5268        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5269                           | New vectors                   |
5270
5271      So that now new vectors are:
5272
5273      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5274      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5275      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5276
5277    3rd step:
5278      shift right by 5 the concatenation of:
5279      "1st vec" and  "1st vec"
5280        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5281      shift right by 3 the concatenation of:
5282      "2nd vec" and  "2nd vec"
5283                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5284                           | New vectors                   |
5285
5286      So that now all vectors are READY:
5287      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5288      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5289      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5290
5291    This algorithm is faster than one in vect_permute_load_chain if:
5292      1.  "shift of a concatenation" is faster than general permutation.
5293          This is usually so.
5294      2.  The TARGET machine can't execute vector instructions in parallel.
5295          This is because each step of the algorithm depends on previous.
5296          The algorithm in vect_permute_load_chain is much more parallel.
5297
5298    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5299 */
5300
5301 static bool
5302 vect_shift_permute_load_chain (vec<tree> dr_chain,
5303                                unsigned int length,
5304                                gimple stmt,
5305                                gimple_stmt_iterator *gsi,
5306                                vec<tree> *result_chain)
5307 {
5308   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5309   tree perm2_mask1, perm2_mask2, perm3_mask;
5310   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5311   gimple perm_stmt;
5312
5313   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5314   unsigned int i;
5315   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5316   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5317   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5318   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5319
5320   result_chain->quick_grow (length);
5321   memcpy (result_chain->address (), dr_chain.address (),
5322           length * sizeof (tree));
5323
5324   if (length == 2 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5325     {
5326       for (i = 0; i < nelt / 2; ++i)
5327         sel[i] = i * 2;
5328       for (i = 0; i < nelt / 2; ++i)
5329         sel[nelt / 2 + i] = i * 2 + 1;
5330       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5331         {
5332           if (dump_enabled_p ())
5333             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5334                              "shuffle of 2 fields structure is not \
5335                               supported by target\n");
5336           return false;
5337         }
5338       perm2_mask1 = vect_gen_perm_mask (vectype, sel);
5339       gcc_assert (perm2_mask1 != NULL);
5340
5341       for (i = 0; i < nelt / 2; ++i)
5342         sel[i] = i * 2 + 1;
5343       for (i = 0; i < nelt / 2; ++i)
5344         sel[nelt / 2 + i] = i * 2;
5345       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5346         {
5347           if (dump_enabled_p ())
5348             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5349                              "shuffle of 2 fields structure is not \
5350                               supported by target\n");
5351           return false;
5352         }
5353       perm2_mask2 = vect_gen_perm_mask (vectype, sel);
5354       gcc_assert (perm2_mask2 != NULL);
5355
5356       /* Generating permutation constant to shift all elements.
5357          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5358       for (i = 0; i < nelt; i++)
5359         sel[i] = nelt / 2 + i;
5360       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5361         {
5362           if (dump_enabled_p ())
5363             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5364                              "shift permutation is not supported by target\n");
5365           return false;
5366         }
5367       shift1_mask = vect_gen_perm_mask (vectype, sel);
5368       gcc_assert (shift1_mask != NULL);
5369
5370       /* Generating permutation constant to select vector from 2.
5371          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5372       for (i = 0; i < nelt / 2; i++)
5373         sel[i] = i;
5374       for (i = nelt / 2; i < nelt; i++)
5375         sel[i] = nelt + i;
5376       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5377         {
5378           if (dump_enabled_p ())
5379             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5380                              "select is not supported by target\n");
5381           return false;
5382         }
5383       select_mask = vect_gen_perm_mask (vectype, sel);
5384       gcc_assert (select_mask != NULL);
5385
5386       first_vect = dr_chain[0];
5387       second_vect = dr_chain[1];
5388
5389       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5390       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5391                                                 first_vect, first_vect,
5392                                                 perm2_mask1);
5393       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5394       vect[0] = data_ref;
5395
5396       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5397       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5398                                                 second_vect, second_vect,
5399                                                 perm2_mask2);
5400       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5401       vect[1] = data_ref;
5402
5403       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5404       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5405                                                 vect[0], vect[1],
5406                                                 shift1_mask);
5407       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5408       (*result_chain)[1] = data_ref;
5409
5410       data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5411       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5412                                                 vect[0], vect[1],
5413                                                 select_mask);
5414       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5415       (*result_chain)[0] = data_ref;
5416
5417       return true;
5418     }
5419   if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5420     {
5421       unsigned int k = 0, l = 0;
5422
5423       /* Generating permutation constant to get all elements in rigth order.
5424          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5425       for (i = 0; i < nelt; i++)
5426         {
5427           if (3 * k + (l % 3) >= nelt)
5428             {
5429               k = 0;
5430               l += (3 - (nelt % 3));
5431             }
5432           sel[i] = 3 * k + (l % 3);
5433           k++;
5434         }
5435       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5436         {
5437           if (dump_enabled_p ())
5438             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5439                              "shuffle of 3 fields structure is not \
5440                               supported by target\n");
5441           return false;
5442         }
5443       perm3_mask = vect_gen_perm_mask (vectype, sel);
5444       gcc_assert (perm3_mask != NULL);
5445
5446       /* Generating permutation constant to shift all elements.
5447          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5448       for (i = 0; i < nelt; i++)
5449         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5450       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5451         {
5452           if (dump_enabled_p ())
5453             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5454                              "shift permutation is not supported by target\n");
5455           return false;
5456         }
5457       shift1_mask = vect_gen_perm_mask (vectype, sel);
5458       gcc_assert (shift1_mask != NULL);
5459
5460       /* Generating permutation constant to shift all elements.
5461          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5462       for (i = 0; i < nelt; i++)
5463         sel[i] = 2 * (nelt / 3) + 1 + i;
5464       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5465         {
5466           if (dump_enabled_p ())
5467             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5468                              "shift permutation is not supported by target\n");
5469           return false;
5470         }
5471       shift2_mask = vect_gen_perm_mask (vectype, sel);
5472       gcc_assert (shift2_mask != NULL);
5473
5474       /* Generating permutation constant to shift all elements.
5475          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5476       for (i = 0; i < nelt; i++)
5477         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5478       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5479         {
5480           if (dump_enabled_p ())
5481             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5482                              "shift permutation is not supported by target\n");
5483           return false;
5484         }
5485       shift3_mask = vect_gen_perm_mask (vectype, sel);
5486       gcc_assert (shift3_mask != NULL);
5487
5488       /* Generating permutation constant to shift all elements.
5489          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5490       for (i = 0; i < nelt; i++)
5491         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5492       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5493         {
5494           if (dump_enabled_p ())
5495             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5496                              "shift permutation is not supported by target\n");
5497           return false;
5498         }
5499       shift4_mask = vect_gen_perm_mask (vectype, sel);
5500       gcc_assert (shift4_mask != NULL);
5501
5502       for (k = 0; k < 3; k++)
5503         {
5504           data_ref = make_temp_ssa_name (vectype, NULL, "vect_suffle3");
5505           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5506                                                     dr_chain[k], dr_chain[k],
5507                                                     perm3_mask);
5508           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5509           vect[k] = data_ref;
5510         }
5511
5512       for (k = 0; k < 3; k++)
5513         {
5514           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5515           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5516                                                     vect[k % 3],
5517                                                     vect[(k + 1) % 3],
5518                                                     shift1_mask);
5519           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5520           vect_shift[k] = data_ref;
5521         }
5522
5523       for (k = 0; k < 3; k++)
5524         {
5525           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5526           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5527                                                     vect_shift[(4 - k) % 3],
5528                                                     vect_shift[(3 - k) % 3],
5529                                                     shift2_mask);
5530           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5531           vect[k] = data_ref;
5532         }
5533
5534       (*result_chain)[3 - (nelt % 3)] = vect[2];
5535
5536       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5537       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5538                                                 vect[0], vect[0],
5539                                                 shift3_mask);
5540       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5541       (*result_chain)[nelt % 3] = data_ref;
5542
5543       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5544       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5545                                                 vect[1], vect[1],
5546                                                 shift4_mask);
5547       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5548       (*result_chain)[0] = data_ref;
5549       return true;
5550     }
5551   return false;
5552 }
5553
5554 /* Function vect_transform_grouped_load.
5555
5556    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5557    to perform their permutation and ascribe the result vectorized statements to
5558    the scalar statements.
5559 */
5560
5561 void
5562 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5563                              gimple_stmt_iterator *gsi)
5564 {
5565   enum machine_mode mode;
5566   vec<tree> result_chain = vNULL;
5567
5568   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5569      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5570      vectors, that are ready for vector computation.  */
5571   result_chain.create (size);
5572
5573   /* If reassociation width for vector type is 2 or greater target machine can
5574      execute 2 or more vector instructions in parallel.  Otherwise try to
5575      get chain for loads group using vect_shift_permute_load_chain.  */
5576   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5577   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5578       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5579                                          gsi, &result_chain))
5580     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5581   vect_record_grouped_load_vectors (stmt, result_chain);
5582   result_chain.release ();
5583 }
5584
5585 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5586    generated as part of the vectorization of STMT.  Assign the statement
5587    for each vector to the associated scalar statement.  */
5588
5589 void
5590 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5591 {
5592   gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5593   gimple next_stmt, new_stmt;
5594   unsigned int i, gap_count;
5595   tree tmp_data_ref;
5596
5597   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5598      Since we scan the chain starting from it's first node, their order
5599      corresponds the order of data-refs in RESULT_CHAIN.  */
5600   next_stmt = first_stmt;
5601   gap_count = 1;
5602   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5603     {
5604       if (!next_stmt)
5605         break;
5606
5607       /* Skip the gaps.  Loads created for the gaps will be removed by dead
5608        code elimination pass later.  No need to check for the first stmt in
5609        the group, since it always exists.
5610        GROUP_GAP is the number of steps in elements from the previous
5611        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5612        correspond to the gaps.  */
5613       if (next_stmt != first_stmt
5614           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5615       {
5616         gap_count++;
5617         continue;
5618       }
5619
5620       while (next_stmt)
5621         {
5622           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5623           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5624              copies, and we put the new vector statement in the first available
5625              RELATED_STMT.  */
5626           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5627             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5628           else
5629             {
5630               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5631                 {
5632                   gimple prev_stmt =
5633                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5634                   gimple rel_stmt =
5635                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5636                   while (rel_stmt)
5637                     {
5638                       prev_stmt = rel_stmt;
5639                       rel_stmt =
5640                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5641                     }
5642
5643                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5644                     new_stmt;
5645                 }
5646             }
5647
5648           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5649           gap_count = 1;
5650           /* If NEXT_STMT accesses the same DR as the previous statement,
5651              put the same TMP_DATA_REF as its vectorized statement; otherwise
5652              get the next data-ref from RESULT_CHAIN.  */
5653           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5654             break;
5655         }
5656     }
5657 }
5658
5659 /* Function vect_force_dr_alignment_p.
5660
5661    Returns whether the alignment of a DECL can be forced to be aligned
5662    on ALIGNMENT bit boundary.  */
5663
5664 bool
5665 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5666 {
5667   if (TREE_CODE (decl) != VAR_DECL)
5668     return false;
5669
5670   /* With -fno-toplevel-reorder we may have already output the constant.  */
5671   if (TREE_ASM_WRITTEN (decl))
5672     return false;
5673
5674   /* Constant pool entries may be shared and not properly merged by LTO.  */
5675   if (DECL_IN_CONSTANT_POOL (decl))
5676     return false;
5677
5678   if (TREE_PUBLIC (decl) || DECL_EXTERNAL (decl))
5679     {
5680       symtab_node *snode;
5681
5682       /* We cannot change alignment of symbols that may bind to symbols
5683          in other translation unit that may contain a definition with lower
5684          alignment.  */
5685       if (!decl_binds_to_current_def_p (decl))
5686         return false;
5687
5688       /* When compiling partition, be sure the symbol is not output by other
5689          partition.  */
5690       snode = symtab_get_node (decl);
5691       if (flag_ltrans
5692           && (snode->in_other_partition
5693               || symtab_get_symbol_partitioning_class (snode) == SYMBOL_DUPLICATE))
5694         return false;
5695     }
5696
5697   /* Do not override the alignment as specified by the ABI when the used
5698      attribute is set.  */
5699   if (DECL_PRESERVE_P (decl))
5700     return false;
5701
5702   /* Do not override explicit alignment set by the user when an explicit
5703      section name is also used.  This is a common idiom used by many
5704      software projects.  */
5705   if (TREE_STATIC (decl)
5706       && DECL_SECTION_NAME (decl) != NULL
5707       && !symtab_get_node (decl)->implicit_section)
5708     return false;
5709
5710   /* If symbol is an alias, we need to check that target is OK.  */
5711   if (TREE_STATIC (decl))
5712     {
5713       tree target = symtab_alias_ultimate_target (symtab_get_node (decl))->decl;
5714       if (target != decl)
5715         {
5716           if (DECL_PRESERVE_P (target))
5717             return false;
5718           decl = target;
5719         }
5720     }
5721
5722   if (TREE_STATIC (decl))
5723     return (alignment <= MAX_OFILE_ALIGNMENT);
5724   else
5725     return (alignment <= MAX_STACK_ALIGNMENT);
5726 }
5727
5728
5729 /* Return whether the data reference DR is supported with respect to its
5730    alignment.
5731    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5732    it is aligned, i.e., check if it is possible to vectorize it with different
5733    alignment.  */
5734
5735 enum dr_alignment_support
5736 vect_supportable_dr_alignment (struct data_reference *dr,
5737                                bool check_aligned_accesses)
5738 {
5739   gimple stmt = DR_STMT (dr);
5740   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5741   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5742   enum machine_mode mode = TYPE_MODE (vectype);
5743   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5744   struct loop *vect_loop = NULL;
5745   bool nested_in_vect_loop = false;
5746
5747   if (aligned_access_p (dr) && !check_aligned_accesses)
5748     return dr_aligned;
5749
5750   /* For now assume all conditional loads/stores support unaligned
5751      access without any special code.  */
5752   if (is_gimple_call (stmt)
5753       && gimple_call_internal_p (stmt)
5754       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5755           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5756     return dr_unaligned_supported;
5757
5758   if (loop_vinfo)
5759     {
5760       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5761       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5762     }
5763
5764   /* Possibly unaligned access.  */
5765
5766   /* We can choose between using the implicit realignment scheme (generating
5767      a misaligned_move stmt) and the explicit realignment scheme (generating
5768      aligned loads with a REALIGN_LOAD).  There are two variants to the
5769      explicit realignment scheme: optimized, and unoptimized.
5770      We can optimize the realignment only if the step between consecutive
5771      vector loads is equal to the vector size.  Since the vector memory
5772      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5773      is guaranteed that the misalignment amount remains the same throughout the
5774      execution of the vectorized loop.  Therefore, we can create the
5775      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5776      at the loop preheader.
5777
5778      However, in the case of outer-loop vectorization, when vectorizing a
5779      memory access in the inner-loop nested within the LOOP that is now being
5780      vectorized, while it is guaranteed that the misalignment of the
5781      vectorized memory access will remain the same in different outer-loop
5782      iterations, it is *not* guaranteed that is will remain the same throughout
5783      the execution of the inner-loop.  This is because the inner-loop advances
5784      with the original scalar step (and not in steps of VS).  If the inner-loop
5785      step happens to be a multiple of VS, then the misalignment remains fixed
5786      and we can use the optimized realignment scheme.  For example:
5787
5788       for (i=0; i<N; i++)
5789         for (j=0; j<M; j++)
5790           s += a[i+j];
5791
5792      When vectorizing the i-loop in the above example, the step between
5793      consecutive vector loads is 1, and so the misalignment does not remain
5794      fixed across the execution of the inner-loop, and the realignment cannot
5795      be optimized (as illustrated in the following pseudo vectorized loop):
5796
5797       for (i=0; i<N; i+=4)
5798         for (j=0; j<M; j++){
5799           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5800                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
5801                          // (assuming that we start from an aligned address).
5802           }
5803
5804      We therefore have to use the unoptimized realignment scheme:
5805
5806       for (i=0; i<N; i+=4)
5807           for (j=k; j<M; j+=4)
5808           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5809                            // that the misalignment of the initial address is
5810                            // 0).
5811
5812      The loop can then be vectorized as follows:
5813
5814       for (k=0; k<4; k++){
5815         rt = get_realignment_token (&vp[k]);
5816         for (i=0; i<N; i+=4){
5817           v1 = vp[i+k];
5818           for (j=k; j<M; j+=4){
5819             v2 = vp[i+j+VS-1];
5820             va = REALIGN_LOAD <v1,v2,rt>;
5821             vs += va;
5822             v1 = v2;
5823           }
5824         }
5825     } */
5826
5827   if (DR_IS_READ (dr))
5828     {
5829       bool is_packed = false;
5830       tree type = (TREE_TYPE (DR_REF (dr)));
5831
5832       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5833           && (!targetm.vectorize.builtin_mask_for_load
5834               || targetm.vectorize.builtin_mask_for_load ()))
5835         {
5836           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5837           if ((nested_in_vect_loop
5838                && (TREE_INT_CST_LOW (DR_STEP (dr))
5839                    != GET_MODE_SIZE (TYPE_MODE (vectype))))
5840               || !loop_vinfo)
5841             return dr_explicit_realign;
5842           else
5843             return dr_explicit_realign_optimized;
5844         }
5845       if (!known_alignment_for_access_p (dr))
5846         is_packed = not_size_aligned (DR_REF (dr));
5847
5848       if ((TYPE_USER_ALIGN (type) && !is_packed)
5849           || targetm.vectorize.
5850                support_vector_misalignment (mode, type,
5851                                             DR_MISALIGNMENT (dr), is_packed))
5852         /* Can't software pipeline the loads, but can at least do them.  */
5853         return dr_unaligned_supported;
5854     }
5855   else
5856     {
5857       bool is_packed = false;
5858       tree type = (TREE_TYPE (DR_REF (dr)));
5859
5860       if (!known_alignment_for_access_p (dr))
5861         is_packed = not_size_aligned (DR_REF (dr));
5862
5863      if ((TYPE_USER_ALIGN (type) && !is_packed)
5864          || targetm.vectorize.
5865               support_vector_misalignment (mode, type,
5866                                            DR_MISALIGNMENT (dr), is_packed))
5867        return dr_unaligned_supported;
5868     }
5869
5870   /* Unsupported.  */
5871   return dr_unaligned_unsupported;
5872 }