gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "alias.h"
28 #include "symtab.h"
29 #include "tree.h"
30 #include "fold-const.h"
31 #include "stor-layout.h"
32 #include "tm_p.h"
33 #include "target.h"
34 #include "predict.h"
35 #include "hard-reg-set.h"
36 #include "function.h"
37 #include "dominance.h"
38 #include "cfg.h"
39 #include "basic-block.h"
40 #include "gimple-pretty-print.h"
41 #include "tree-ssa-alias.h"
42 #include "internal-fn.h"
43 #include "tree-eh.h"
44 #include "gimple-expr.h"
45 #include "gimple.h"
46 #include "gimplify.h"
47 #include "gimple-iterator.h"
48 #include "gimplify-me.h"
49 #include "gimple-ssa.h"
50 #include "tree-phinodes.h"
51 #include "ssa-iterators.h"
52 #include "stringpool.h"
53 #include "tree-ssanames.h"
54 #include "tree-ssa-loop-ivopts.h"
55 #include "tree-ssa-loop-manip.h"
56 #include "tree-ssa-loop.h"
57 #include "cfgloop.h"
58 #include "tree-chrec.h"
59 #include "tree-scalar-evolution.h"
60 #include "tree-vectorizer.h"
61 #include "diagnostic-core.h"
62 #include "cgraph.h"
63 /* Need to include rtl.h, expr.h, etc. for optabs. */
64 #include "rtl.h"
65 #include "flags.h"
66 #include "insn-config.h"
67 #include "expmed.h"
68 #include "dojump.h"
69 #include "explow.h"
70 #include "calls.h"
71 #include "emit-rtl.h"
72 #include "varasm.h"
73 #include "stmt.h"
74 #include "expr.h"
75 #include "insn-codes.h"
76 #include "optabs.h"
77 #include "builtins.h"
78
79 /* Return true if load- or store-lanes optab OPTAB is implemented for
80 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
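/* For example, on a target with 128-bit V4SImode vectors, COUNT == 2
   asks for a 256-bit integer array mode; if no such mode exists, or the
   target provides no load/store-lanes pattern for it, the optab handler
   is CODE_FOR_nothing and false is returned.  */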
81
82 static bool
83 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
84 tree vectype, unsigned HOST_WIDE_INT count)
85 {
86 machine_mode mode, array_mode;
87 bool limit_p;
88
89 mode = TYPE_MODE (vectype);
90 limit_p = !targetm.array_mode_supported_p (mode, count);
91 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
92 MODE_INT, limit_p);
93
94 if (array_mode == BLKmode)
95 {
96 if (dump_enabled_p ())
97 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
98 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
99 GET_MODE_NAME (mode), count);
100 return false;
101 }
102
103 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
104 {
105 if (dump_enabled_p ())
106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
107 "cannot use %s<%s><%s>\n", name,
108 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
109 return false;
110 }
111
112 if (dump_enabled_p ())
113 dump_printf_loc (MSG_NOTE, vect_location,
114 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
115 GET_MODE_NAME (mode));
116
117 return true;
118 }
119
120
121 /* Return the smallest scalar part of STMT.
122 This is used to determine the vectype of the stmt. We generally set the
123 vectype according to the type of the result (lhs). For stmts whose
124 result-type is different from the type of the arguments (e.g., demotion,
125 promotion), vectype will be reset appropriately (later). Note that we have
126 to visit the smallest datatype in this function, because that determines the
127 VF. If the smallest datatype in the loop is present only as the rhs of a
128 promotion operation, we'd miss it.
129 Such a case, where a variable of this datatype does not appear in the lhs
130 anywhere in the loop, can only occur if it's an invariant: e.g.:
131 'int_x = (int) short_inv', which we'd expect to have been optimized away by
132 invariant motion. However, we cannot rely on invariant motion to always
133 take invariants out of the loop, and so in the case of promotion we also
134 have to check the rhs.
135 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
136 types. */
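/* For example, in a loop like

     short *a;
     int *b;
     for (i = 0; i < n; i++)
       b[i] = (int) a[i];

   the result (lhs) type is int, but the vectorization factor has to be
   derived from the narrower short type that appears only on the rhs of
   the promotion; this function returns that smaller scalar type.  */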
137
138 tree
139 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
140 HOST_WIDE_INT *rhs_size_unit)
141 {
142 tree scalar_type = gimple_expr_type (stmt);
143 HOST_WIDE_INT lhs, rhs;
144
145 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
146
147 if (is_gimple_assign (stmt)
148 && (gimple_assign_cast_p (stmt)
149 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
150 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
151 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
152 {
153 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
154
155 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
156 if (rhs < lhs)
157 scalar_type = rhs_type;
158 }
159
160 *lhs_size_unit = lhs;
161 *rhs_size_unit = rhs;
162 return scalar_type;
163 }
164
165
166 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
167 tested at run-time. Return TRUE if DDR was successfully inserted.
168 Return false if versioning is not supported. */
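/* For example, in

     void
     f (int *p, int *q, int n)
     {
       for (int i = 0; i < n; i++)
         p[i] = q[i] + 1;
     }

   the compiler cannot prove that p and q do not overlap.  Marking the
   DDR here makes the vectorizer version the loop under a runtime check
   that the two accessed ranges are disjoint, and only the versioned
   copy is vectorized.  */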
169
170 static bool
171 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
172 {
173 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
174
175 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
176 return false;
177
178 if (dump_enabled_p ())
179 {
180 dump_printf_loc (MSG_NOTE, vect_location,
181 "mark for run-time aliasing test between ");
182 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
183 dump_printf (MSG_NOTE, " and ");
184 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
185 dump_printf (MSG_NOTE, "\n");
186 }
187
188 if (optimize_loop_nest_for_size_p (loop))
189 {
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
192 "versioning not supported when optimizing"
193 " for size.\n");
194 return false;
195 }
196
197 /* FORNOW: We don't support versioning with outer-loop vectorization. */
198 if (loop->inner)
199 {
200 if (dump_enabled_p ())
201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
202 "versioning not yet supported for outer-loops.\n");
203 return false;
204 }
205
206 /* FORNOW: We don't support creating runtime alias tests for non-constant
207 step. */
208 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
209 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
210 {
211 if (dump_enabled_p ())
212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
213 "versioning not yet supported for non-constant "
214 "step\n");
215 return false;
216 }
217
218 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
219 return true;
220 }
221
222
223 /* Function vect_analyze_data_ref_dependence.
224
225 Return TRUE if there (might) exist a dependence between a memory-reference
226 DRA and a memory-reference DRB. When versioning for alias may check a
227 dependence at run-time, return FALSE. Adjust *MAX_VF according to
228 the data dependence. */
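/* For example, in

     for (i = 0; i < n; i++)
       a[i + 4] = a[i] + 1;

   the value stored to a[i + 4] is read again 4 iterations later, so the
   dependence distance is 4 and *MAX_VF is capped at 4 below; the loop
   can still be vectorized with a vectorization factor of up to 4.  */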
229
230 static bool
231 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
232 loop_vec_info loop_vinfo, int *max_vf)
233 {
234 unsigned int i;
235 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
236 struct data_reference *dra = DDR_A (ddr);
237 struct data_reference *drb = DDR_B (ddr);
238 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
239 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
240 lambda_vector dist_v;
241 unsigned int loop_depth;
242
243 /* In loop analysis all data references should be vectorizable. */
244 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
245 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
246 gcc_unreachable ();
247
248 /* Independent data accesses. */
249 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
250 return false;
251
252 if (dra == drb
253 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
254 return false;
255
256 /* Even if we have an anti-dependence then, as the vectorized loop covers at
257 least two scalar iterations, there is always also a true dependence.
258 As the vectorizer does not re-order loads and stores we can ignore
259 the anti-dependence if TBAA can disambiguate both DRs similar to the
260 case with known negative distance anti-dependences (positive
261 distance anti-dependences would violate TBAA constraints). */
262 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
263 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
264 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
265 get_alias_set (DR_REF (drb))))
266 return false;
267
268 /* Unknown data dependence. */
269 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
270 {
271 /* If user asserted safelen consecutive iterations can be
272 executed concurrently, assume independence. */
273 if (loop->safelen >= 2)
274 {
275 if (loop->safelen < *max_vf)
276 *max_vf = loop->safelen;
277 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
278 return false;
279 }
280
281 if (STMT_VINFO_GATHER_P (stmtinfo_a)
282 || STMT_VINFO_GATHER_P (stmtinfo_b))
283 {
284 if (dump_enabled_p ())
285 {
286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
287 "versioning for alias not supported for: "
288 "can't determine dependence between ");
289 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
290 DR_REF (dra));
291 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
292 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
293 DR_REF (drb));
294 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
295 }
296 return true;
297 }
298
299 if (dump_enabled_p ())
300 {
301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
302 "versioning for alias required: "
303 "can't determine dependence between ");
304 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
305 DR_REF (dra));
306 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
307 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
308 DR_REF (drb));
309 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
310 }
311
312 /* Add to list of ddrs that need to be tested at run-time. */
313 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
314 }
315
316 /* Known data dependence. */
317 if (DDR_NUM_DIST_VECTS (ddr) == 0)
318 {
319 /* If user asserted safelen consecutive iterations can be
320 executed concurrently, assume independence. */
321 if (loop->safelen >= 2)
322 {
323 if (loop->safelen < *max_vf)
324 *max_vf = loop->safelen;
325 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
326 return false;
327 }
328
329 if (STMT_VINFO_GATHER_P (stmtinfo_a)
330 || STMT_VINFO_GATHER_P (stmtinfo_b))
331 {
332 if (dump_enabled_p ())
333 {
334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
335 "versioning for alias not supported for: "
336 "bad dist vector for ");
337 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
338 DR_REF (dra));
339 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
340 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
341 DR_REF (drb));
342 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
343 }
344 return true;
345 }
346
347 if (dump_enabled_p ())
348 {
349 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
350 "versioning for alias required: "
351 "bad dist vector for ");
352 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
353 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
354 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
355 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
356 }
357 /* Add to list of ddrs that need to be tested at run-time. */
358 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
359 }
360
361 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
362 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
363 {
364 int dist = dist_v[loop_depth];
365
366 if (dump_enabled_p ())
367 dump_printf_loc (MSG_NOTE, vect_location,
368 "dependence distance = %d.\n", dist);
369
370 if (dist == 0)
371 {
372 if (dump_enabled_p ())
373 {
374 dump_printf_loc (MSG_NOTE, vect_location,
375 "dependence distance == 0 between ");
376 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
377 dump_printf (MSG_NOTE, " and ");
378 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
379 dump_printf (MSG_NOTE, "\n");
380 }
381
382 /* When we perform grouped accesses and perform implicit CSE
383 by detecting equal accesses and doing disambiguation with
384 runtime alias tests like for
385 .. = a[i];
386 .. = a[i+1];
387 a[i] = ..;
388 a[i+1] = ..;
389 *p = ..;
390 .. = a[i];
391 .. = a[i+1];
392 where we will end up loading { a[i], a[i+1] } once, make
393 sure that inserting group loads before the first load and
394 stores after the last store will do the right thing.
395 Similar for groups like
396 a[i] = ...;
397 ... = a[i];
398 a[i+1] = ...;
399 where loads from the group interleave with the store. */
400 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
401 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
402 {
403 gimple earlier_stmt;
404 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
405 if (DR_IS_WRITE
406 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
407 {
408 if (dump_enabled_p ())
409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
410 "READ_WRITE dependence in interleaving."
411 "\n");
412 return true;
413 }
414 }
415
416 continue;
417 }
418
419 if (dist > 0 && DDR_REVERSED_P (ddr))
420 {
421 /* If DDR_REVERSED_P the order of the data-refs in DDR was
422 reversed (to make distance vector positive), and the actual
423 distance is negative. */
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
426 "dependence distance negative.\n");
427 /* Record a negative dependence distance to later limit the
428 amount of stmt copying / unrolling we can perform.
429 Only need to handle read-after-write dependence. */
430 if (DR_IS_READ (drb)
431 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
432 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
433 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
434 continue;
435 }
436
437 if (abs (dist) >= 2
438 && abs (dist) < *max_vf)
439 {
440 /* The dependence distance requires reduction of the maximal
441 vectorization factor. */
442 *max_vf = abs (dist);
443 if (dump_enabled_p ())
444 dump_printf_loc (MSG_NOTE, vect_location,
445 "adjusting maximal vectorization factor to %i\n",
446 *max_vf);
447 }
448
449 if (abs (dist) >= *max_vf)
450 {
451 /* Dependence distance does not create dependence, as far as
452 vectorization is concerned, in this case. */
453 if (dump_enabled_p ())
454 dump_printf_loc (MSG_NOTE, vect_location,
455 "dependence distance >= VF.\n");
456 continue;
457 }
458
459 if (dump_enabled_p ())
460 {
461 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
462 "not vectorized, possible dependence "
463 "between data-refs ");
464 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
465 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
466 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
467 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
468 }
469
470 return true;
471 }
472
473 return false;
474 }
475
476 /* Function vect_analyze_data_ref_dependences.
477
478 Examine all the data references in the loop, and make sure there do not
479 exist any data dependences between them. Set *MAX_VF according to
480 the maximum vectorization factor the data dependences allow. */
481
482 bool
483 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
484 {
485 unsigned int i;
486 struct data_dependence_relation *ddr;
487
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_NOTE, vect_location,
490 "=== vect_analyze_data_ref_dependences ===\n");
491
492 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
493 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
494 &LOOP_VINFO_DDRS (loop_vinfo),
495 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
496 return false;
497
498 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
499 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
500 return false;
501
502 return true;
503 }
504
505
506 /* Function vect_slp_analyze_data_ref_dependence.
507
508 Return TRUE if there (might) exist a dependence between a memory-reference
509 DRA and a memory-reference DRB that would prevent vectorization of the
510 basic block.  Unlike the loop variant there is no *MAX_VF to adjust and
511 no run-time alias versioning here. */
512
513 static bool
514 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
515 {
516 struct data_reference *dra = DDR_A (ddr);
517 struct data_reference *drb = DDR_B (ddr);
518
519 /* We need to check dependences of statements marked as unvectorizable
520 as well, they still can prohibit vectorization. */
521
522 /* Independent data accesses. */
523 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
524 return false;
525
526 if (dra == drb)
527 return false;
528
529 /* Read-read is OK. */
530 if (DR_IS_READ (dra) && DR_IS_READ (drb))
531 return false;
532
533 /* If dra and drb are part of the same interleaving chain consider
534 them independent. */
535 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
536 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
537 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
538 return false;
539
540 /* Unknown data dependence. */
541 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
542 {
543 if (dump_enabled_p ())
544 {
545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
546 "can't determine dependence between ");
547 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
548 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
549 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
550 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
551 }
552 }
553 else if (dump_enabled_p ())
554 {
555 dump_printf_loc (MSG_NOTE, vect_location,
556 "determined dependence between ");
557 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
558 dump_printf (MSG_NOTE, " and ");
559 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
560 dump_printf (MSG_NOTE, "\n");
561 }
562
563 /* We do not vectorize basic blocks with write-write dependencies. */
564 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
565 return true;
566
567 /* If we have a read-write dependence, check that the load is before the store.
568 When we vectorize basic blocks, a vector load can only be placed before the
569 corresponding scalar load, and a vector store can only be placed after its
570 corresponding scalar store.  So the order of the accesses is preserved in
571 case the load is before the store. */
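/* For example, in a basic block containing

     x = a[0];
     a[0] = y;

   the scalar load precedes the store, so a vector load emitted at the
   position of the scalar load still reads the old value and the
   dependence is harmless; with the opposite statement order the
   dependence is reported and SLP is given up.  */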
572 gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
573 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
574 {
575 /* That only holds for load-store pairs taking part in vectorization. */
576 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
577 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
578 return false;
579 }
580
581 return true;
582 }
583
584
585 /* Function vect_slp_analyze_data_ref_dependences.
586
587 Examine all the data references in the basic-block, and make sure there
588 do not exist any data dependences between them.  Return FALSE as soon as
589 a dependence that prevents basic-block vectorization is found. */
590
591 bool
592 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
593 {
594 struct data_dependence_relation *ddr;
595 unsigned int i;
596
597 if (dump_enabled_p ())
598 dump_printf_loc (MSG_NOTE, vect_location,
599 "=== vect_slp_analyze_data_ref_dependences ===\n");
600
601 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
602 &BB_VINFO_DDRS (bb_vinfo),
603 vNULL, true))
604 return false;
605
606 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
607 if (vect_slp_analyze_data_ref_dependence (ddr))
608 return false;
609
610 return true;
611 }
612
613
614 /* Function vect_compute_data_ref_alignment
615
616 Compute the misalignment of the data reference DR.
617
618 Output:
619 1. If during the misalignment computation it is found that the data reference
620 cannot be vectorized then false is returned.
621 2. DR_MISALIGNMENT (DR) is defined.
622
623 FOR NOW: No analysis is actually performed. Misalignment is calculated
624 only for trivial cases. TODO. */
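/* For example, for a 16-byte aligned array 'int a[N]' accessed as
   a[i + 1] with a V4SImode vectype, DR_BASE_ADDRESS is &a[0], DR_INIT
   is 4, and the computed DR_MISALIGNMENT is 4 bytes.  */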
625
626 static bool
627 vect_compute_data_ref_alignment (struct data_reference *dr)
628 {
629 gimple stmt = DR_STMT (dr);
630 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
631 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
632 struct loop *loop = NULL;
633 tree ref = DR_REF (dr);
634 tree vectype;
635 tree base, base_addr;
636 bool base_aligned;
637 tree misalign = NULL_TREE;
638 tree aligned_to;
639 unsigned HOST_WIDE_INT alignment;
640
641 if (dump_enabled_p ())
642 dump_printf_loc (MSG_NOTE, vect_location,
643 "vect_compute_data_ref_alignment:\n");
644
645 if (loop_vinfo)
646 loop = LOOP_VINFO_LOOP (loop_vinfo);
647
648 /* Initialize misalignment to unknown. */
649 SET_DR_MISALIGNMENT (dr, -1);
650
651 /* Strided accesses perform only component accesses, misalignment information
652 is irrelevant for them. */
653 if (STMT_VINFO_STRIDED_P (stmt_info)
654 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
655 return true;
656
657 if (tree_fits_shwi_p (DR_STEP (dr)))
658 misalign = DR_INIT (dr);
659 aligned_to = DR_ALIGNED_TO (dr);
660 base_addr = DR_BASE_ADDRESS (dr);
661 vectype = STMT_VINFO_VECTYPE (stmt_info);
662
663 /* In case the dataref is in an inner-loop of the loop that is being
664 vectorized (LOOP), we use the base and misalignment information
665 relative to the outer-loop (LOOP). This is ok only if the misalignment
666 stays the same throughout the execution of the inner-loop, which is why
667 we have to check that the stride of the dataref in the inner-loop is
668 divisible by the vector size. */
669 if (loop && nested_in_vect_loop_p (loop, stmt))
670 {
671 tree step = DR_STEP (dr);
672
673 if (tree_fits_shwi_p (step)
674 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
675 {
676 if (dump_enabled_p ())
677 dump_printf_loc (MSG_NOTE, vect_location,
678 "inner step divides the vector-size.\n");
679 misalign = STMT_VINFO_DR_INIT (stmt_info);
680 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
681 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
682 }
683 else
684 {
685 if (dump_enabled_p ())
686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
687 "inner step doesn't divide the vector-size.\n");
688 misalign = NULL_TREE;
689 }
690 }
691
692 /* Similarly we can only use base and misalignment information relative to
693 an innermost loop if the misalignment stays the same throughout the
694 execution of the loop.  As above, this is the case if the stride of the
695 dataref (times the vectorization factor) is divisible by the vector size. */
696 else
697 {
698 tree step = DR_STEP (dr);
699 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
700
701 if (tree_fits_shwi_p (step)
702 && ((tree_to_shwi (step) * vf)
703 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
704 {
705 if (dump_enabled_p ())
706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
707 "step doesn't divide the vector-size.\n");
708 misalign = NULL_TREE;
709 }
710 }
711
712 alignment = TYPE_ALIGN_UNIT (vectype);
713
714 if ((compare_tree_int (aligned_to, alignment) < 0)
715 || !misalign)
716 {
717 if (dump_enabled_p ())
718 {
719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
720 "Unknown alignment for access: ");
721 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
722 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
723 }
724 return true;
725 }
726
727 /* To look at alignment of the base we have to preserve an inner MEM_REF
728 as that carries alignment information of the actual access. */
729 base = ref;
730 while (handled_component_p (base))
731 base = TREE_OPERAND (base, 0);
732 if (TREE_CODE (base) == MEM_REF)
733 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
734 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
735
736 if (get_object_alignment (base) >= TYPE_ALIGN (vectype))
737 base_aligned = true;
738 else
739 base_aligned = false;
740
741 if (!base_aligned)
742 {
743 /* Strip an inner MEM_REF to a bare decl if possible. */
744 if (TREE_CODE (base) == MEM_REF
745 && integer_zerop (TREE_OPERAND (base, 1))
746 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
747 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
748
749 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
750 {
751 if (dump_enabled_p ())
752 {
753 dump_printf_loc (MSG_NOTE, vect_location,
754 "can't force alignment of ref: ");
755 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
756 dump_printf (MSG_NOTE, "\n");
757 }
758 return true;
759 }
760
761 /* Force the alignment of the decl.
762 NOTE: This is the only change to the code we make during
763 the analysis phase, before deciding to vectorize the loop. */
764 if (dump_enabled_p ())
765 {
766 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
767 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
768 dump_printf (MSG_NOTE, "\n");
769 }
770
771 ((dataref_aux *)dr->aux)->base_decl = base;
772 ((dataref_aux *)dr->aux)->base_misaligned = true;
773 }
774
775 /* If this is a backward running DR then the first access in the larger
776 vectype actually is N-1 elements before the address in the DR.
777 Adjust misalign accordingly. */
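/* For example, for a V4SImode access with DR_STEP == -4 and a recorded
   misalignment of 12 bytes, the first vector element is loaded from
   (4 - 1) * 4 == 12 bytes below DR's address, so the effective
   misalignment computed below is 12 + 3 * -4 == 0.  */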
778 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
779 {
780 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
781 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
782 otherwise we wouldn't be here. */
783 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
784 /* PLUS because DR_STEP was negative. */
785 misalign = size_binop (PLUS_EXPR, misalign, offset);
786 }
787
788 SET_DR_MISALIGNMENT (dr,
789 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
790
791 if (dump_enabled_p ())
792 {
793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
794 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
795 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
796 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
797 }
798
799 return true;
800 }
801
802
803 /* Function vect_compute_data_refs_alignment
804
805 Compute the misalignment of data references in the loop.
806 Return FALSE if a data reference is found that cannot be vectorized. */
807
808 static bool
809 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
810 bb_vec_info bb_vinfo)
811 {
812 vec<data_reference_p> datarefs;
813 struct data_reference *dr;
814 unsigned int i;
815
816 if (loop_vinfo)
817 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
818 else
819 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
820
821 FOR_EACH_VEC_ELT (datarefs, i, dr)
822 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
823 && !vect_compute_data_ref_alignment (dr))
824 {
825 if (bb_vinfo)
826 {
827 /* Mark unsupported statement as unvectorizable. */
828 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
829 continue;
830 }
831 else
832 return false;
833 }
834
835 return true;
836 }
837
838
839 /* Function vect_update_misalignment_for_peel
840
841 DR - the data reference whose misalignment is to be adjusted.
842 DR_PEEL - the data reference whose misalignment is being made
843 zero in the vector loop by the peel.
844 NPEEL - the number of iterations in the peel loop if the misalignment
845 of DR_PEEL is known at compile time. */
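/* For example, peeling NPEEL == 3 iterations for DR_PEEL also advances
   a forward DR with element size 4 and misalignment 8 by 3 * 4 == 12
   bytes; with a 16-byte aligned vectype its new misalignment becomes
   (8 + 12) & 15 == 4.  */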
846
847 static void
848 vect_update_misalignment_for_peel (struct data_reference *dr,
849 struct data_reference *dr_peel, int npeel)
850 {
851 unsigned int i;
852 vec<dr_p> same_align_drs;
853 struct data_reference *current_dr;
854 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
855 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
856 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
857 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
858
859 /* For interleaved data accesses the step in the loop must be multiplied by
860 the size of the interleaving group. */
861 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
862 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
863 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
864 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
865
866 /* It can be assumed that the data refs with the same alignment as dr_peel
867 are aligned in the vector loop. */
868 same_align_drs
869 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
870 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
871 {
872 if (current_dr != dr)
873 continue;
874 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
875 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
876 SET_DR_MISALIGNMENT (dr, 0);
877 return;
878 }
879
880 if (known_alignment_for_access_p (dr)
881 && known_alignment_for_access_p (dr_peel))
882 {
883 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
884 int misal = DR_MISALIGNMENT (dr);
885 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
886 misal += negative ? -npeel * dr_size : npeel * dr_size;
887 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
888 SET_DR_MISALIGNMENT (dr, misal);
889 return;
890 }
891
892 if (dump_enabled_p ())
893 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
894 SET_DR_MISALIGNMENT (dr, -1);
895 }
896
897
898 /* Function vect_verify_datarefs_alignment
899
900 Return TRUE if all data references in the loop can be
901 handled with respect to alignment. */
902
903 bool
904 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
905 {
906 vec<data_reference_p> datarefs;
907 struct data_reference *dr;
908 enum dr_alignment_support supportable_dr_alignment;
909 unsigned int i;
910
911 if (loop_vinfo)
912 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
913 else
914 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
915
916 FOR_EACH_VEC_ELT (datarefs, i, dr)
917 {
918 gimple stmt = DR_STMT (dr);
919 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
920
921 if (!STMT_VINFO_RELEVANT_P (stmt_info))
922 continue;
923
924 /* For interleaving, only the alignment of the first access matters.
925 Skip statements marked as not vectorizable. */
926 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
927 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
928 || !STMT_VINFO_VECTORIZABLE (stmt_info))
929 continue;
930
931 /* Strided accesses perform only component accesses, alignment is
932 irrelevant for them. */
933 if (STMT_VINFO_STRIDED_P (stmt_info)
934 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
935 continue;
936
937 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
938 if (!supportable_dr_alignment)
939 {
940 if (dump_enabled_p ())
941 {
942 if (DR_IS_READ (dr))
943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
944 "not vectorized: unsupported unaligned load.");
945 else
946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
947 "not vectorized: unsupported unaligned "
948 "store.");
949
950 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
951 DR_REF (dr));
952 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
953 }
954 return false;
955 }
956 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
957 dump_printf_loc (MSG_NOTE, vect_location,
958 "Vectorizing an unaligned access.\n");
959 }
960 return true;
961 }
962
963 /* Given a memory reference EXP, return whether its alignment is less
964 than its size. */
965
966 static bool
967 not_size_aligned (tree exp)
968 {
969 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
970 return true;
971
972 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
973 > get_object_alignment (exp));
974 }
975
976 /* Function vector_alignment_reachable_p
977
978 Return true if vector alignment for DR is reachable by peeling
979 a few loop iterations. Return false otherwise. */
980
981 static bool
982 vector_alignment_reachable_p (struct data_reference *dr)
983 {
984 gimple stmt = DR_STMT (dr);
985 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
986 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
987
988 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
989 {
990 /* For interleaved accesses we peel only if the number of iterations in
991 the prolog loop (VF - misalignment) is a multiple of the
992 number of the interleaved accesses. */
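/* E.g. with 4 elements per vector and a misalignment of 1 element,
   peeling would need 4 - 1 == 3 scalar iterations; for a group of
   size 2 that is not a multiple of the group size, so peeling cannot
   align this access.  */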
993 int elem_size, mis_in_elements;
994 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
995
996 /* FORNOW: handle only known alignment. */
997 if (!known_alignment_for_access_p (dr))
998 return false;
999
1000 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1001 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1002
1003 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1004 return false;
1005 }
1006
1007 /* If misalignment is known at the compile time then allow peeling
1008 only if natural alignment is reachable through peeling. */
1009 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1010 {
1011 HOST_WIDE_INT elmsize =
1012 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1013 if (dump_enabled_p ())
1014 {
1015 dump_printf_loc (MSG_NOTE, vect_location,
1016 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1017 dump_printf (MSG_NOTE,
1018 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1019 }
1020 if (DR_MISALIGNMENT (dr) % elmsize)
1021 {
1022 if (dump_enabled_p ())
1023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1024 "data size does not divide the misalignment.\n");
1025 return false;
1026 }
1027 }
1028
1029 if (!known_alignment_for_access_p (dr))
1030 {
1031 tree type = TREE_TYPE (DR_REF (dr));
1032 bool is_packed = not_size_aligned (DR_REF (dr));
1033 if (dump_enabled_p ())
1034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1035 "Unknown misalignment, is_packed = %d\n",is_packed);
1036 if ((TYPE_USER_ALIGN (type) && !is_packed)
1037 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1038 return true;
1039 else
1040 return false;
1041 }
1042
1043 return true;
1044 }
1045
1046
1047 /* Calculate the cost of the memory access represented by DR. */
1048
1049 static void
1050 vect_get_data_access_cost (struct data_reference *dr,
1051 unsigned int *inside_cost,
1052 unsigned int *outside_cost,
1053 stmt_vector_for_cost *body_cost_vec)
1054 {
1055 gimple stmt = DR_STMT (dr);
1056 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1057 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1058 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1059 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1060 int ncopies = vf / nunits;
1061
1062 if (DR_IS_READ (dr))
1063 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1064 NULL, body_cost_vec, false);
1065 else
1066 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1067
1068 if (dump_enabled_p ())
1069 dump_printf_loc (MSG_NOTE, vect_location,
1070 "vect_get_data_access_cost: inside_cost = %d, "
1071 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1072 }
1073
1074
1075 /* Insert DR into peeling hash table with NPEEL as key. */
1076
1077 static void
1078 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1079 int npeel)
1080 {
1081 struct _vect_peel_info elem, *slot;
1082 _vect_peel_info **new_slot;
1083 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1084
1085 elem.npeel = npeel;
1086 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1087 if (slot)
1088 slot->count++;
1089 else
1090 {
1091 slot = XNEW (struct _vect_peel_info);
1092 slot->npeel = npeel;
1093 slot->dr = dr;
1094 slot->count = 1;
1095 new_slot
1096 = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1097 *new_slot = slot;
1098 }
1099
1100 if (!supportable_dr_alignment
1101 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1102 slot->count += VECT_MAX_COST;
1103 }
1104
1105
1106 /* Traverse the peeling hash table to find the peeling option that aligns
1107 the maximum number of data accesses. */
1108
1109 int
1110 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1111 _vect_peel_extended_info *max)
1112 {
1113 vect_peel_info elem = *slot;
1114
1115 if (elem->count > max->peel_info.count
1116 || (elem->count == max->peel_info.count
1117 && max->peel_info.npeel > elem->npeel))
1118 {
1119 max->peel_info.npeel = elem->npeel;
1120 max->peel_info.count = elem->count;
1121 max->peel_info.dr = elem->dr;
1122 }
1123
1124 return 1;
1125 }
1126
1127
1128 /* Traverse peeling hash table and calculate cost for each peeling option.
1129 Find the one with the lowest cost. */
1130
1131 int
1132 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1133 _vect_peel_extended_info *min)
1134 {
1135 vect_peel_info elem = *slot;
1136 int save_misalignment, dummy;
1137 unsigned int inside_cost = 0, outside_cost = 0, i;
1138 gimple stmt = DR_STMT (elem->dr);
1139 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1140 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1141 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1142 struct data_reference *dr;
1143 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1144
1145 prologue_cost_vec.create (2);
1146 body_cost_vec.create (2);
1147 epilogue_cost_vec.create (2);
1148
1149 FOR_EACH_VEC_ELT (datarefs, i, dr)
1150 {
1151 stmt = DR_STMT (dr);
1152 stmt_info = vinfo_for_stmt (stmt);
1153 /* For interleaving, only the alignment of the first access
1154 matters. */
1155 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1156 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1157 continue;
1158
1159 save_misalignment = DR_MISALIGNMENT (dr);
1160 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1161 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1162 &body_cost_vec);
1163 SET_DR_MISALIGNMENT (dr, save_misalignment);
1164 }
1165
1166 outside_cost += vect_get_known_peeling_cost
1167 (loop_vinfo, elem->npeel, &dummy,
1168 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1169 &prologue_cost_vec, &epilogue_cost_vec);
1170
1171 /* Prologue and epilogue costs are added to the target model later.
1172 These costs depend only on the scalar iteration cost, the
1173 number of peeling iterations finally chosen, and the number of
1174 misaligned statements. So discard the information found here. */
1175 prologue_cost_vec.release ();
1176 epilogue_cost_vec.release ();
1177
1178 if (inside_cost < min->inside_cost
1179 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1180 {
1181 min->inside_cost = inside_cost;
1182 min->outside_cost = outside_cost;
1183 min->body_cost_vec.release ();
1184 min->body_cost_vec = body_cost_vec;
1185 min->peel_info.dr = elem->dr;
1186 min->peel_info.npeel = elem->npeel;
1187 }
1188 else
1189 body_cost_vec.release ();
1190
1191 return 1;
1192 }
1193
1194
1195 /* Choose best peeling option by traversing peeling hash table and either
1196 choosing an option with the lowest cost (if cost model is enabled) or the
1197 option that aligns as many accesses as possible. */
1198
1199 static struct data_reference *
1200 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1201 unsigned int *npeel,
1202 stmt_vector_for_cost *body_cost_vec)
1203 {
1204 struct _vect_peel_extended_info res;
1205
1206 res.peel_info.dr = NULL;
1207 res.body_cost_vec = stmt_vector_for_cost ();
1208
1209 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1210 {
1211 res.inside_cost = INT_MAX;
1212 res.outside_cost = INT_MAX;
1213 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1214 ->traverse <_vect_peel_extended_info *,
1215 vect_peeling_hash_get_lowest_cost> (&res);
1216 }
1217 else
1218 {
1219 res.peel_info.count = 0;
1220 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1221 ->traverse <_vect_peel_extended_info *,
1222 vect_peeling_hash_get_most_frequent> (&res);
1223 }
1224
1225 *npeel = res.peel_info.npeel;
1226 *body_cost_vec = res.body_cost_vec;
1227 return res.peel_info.dr;
1228 }
1229
1230
1231 /* Function vect_enhance_data_refs_alignment
1232
1233 This pass will use loop versioning and loop peeling in order to enhance
1234 the alignment of data references in the loop.
1235
1236 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1237 original loop is to be vectorized. Any other loops that are created by
1238 the transformations performed in this pass - are not supposed to be
1239 vectorized. This restriction will be relaxed.
1240
1241 This pass will require a cost model to guide it whether to apply peeling
1242 or versioning or a combination of the two. For example, the scheme that
1243 Intel uses when given a loop with several memory accesses, is as follows:
1244 choose one memory access ('p') whose alignment you want to force by doing
1245 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1246 other accesses are not necessarily aligned, or (2) use loop versioning to
1247 generate one loop in which all accesses are aligned, and another loop in
1248 which only 'p' is necessarily aligned.
1249
1250 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1251 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1252 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1253
1254 Devising a cost model is the most critical aspect of this work. It will
1255 guide us on which access to peel for, whether to use loop versioning, how
1256 many versions to create, etc. The cost model will probably consist of
1257 generic considerations as well as target specific considerations (on
1258 powerpc for example, misaligned stores are more painful than misaligned
1259 loads).
1260
1261 Here are the general steps involved in alignment enhancements:
1262
1263 -- original loop, before alignment analysis:
1264 for (i=0; i<N; i++){
1265 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1266 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1267 }
1268
1269 -- After vect_compute_data_refs_alignment:
1270 for (i=0; i<N; i++){
1271 x = q[i]; # DR_MISALIGNMENT(q) = 3
1272 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1273 }
1274
1275 -- Possibility 1: we do loop versioning:
1276 if (p is aligned) {
1277 for (i=0; i<N; i++){ # loop 1A
1278 x = q[i]; # DR_MISALIGNMENT(q) = 3
1279 p[i] = y; # DR_MISALIGNMENT(p) = 0
1280 }
1281 }
1282 else {
1283 for (i=0; i<N; i++){ # loop 1B
1284 x = q[i]; # DR_MISALIGNMENT(q) = 3
1285 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1286 }
1287 }
1288
1289 -- Possibility 2: we do loop peeling:
1290 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1291 x = q[i];
1292 p[i] = y;
1293 }
1294 for (i = 3; i < N; i++){ # loop 2A
1295 x = q[i]; # DR_MISALIGNMENT(q) = 0
1296 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1297 }
1298
1299 -- Possibility 3: combination of loop peeling and versioning:
1300 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1301 x = q[i];
1302 p[i] = y;
1303 }
1304 if (p is aligned) {
1305 for (i = 3; i<N; i++){ # loop 3A
1306 x = q[i]; # DR_MISALIGNMENT(q) = 0
1307 p[i] = y; # DR_MISALIGNMENT(p) = 0
1308 }
1309 }
1310 else {
1311 for (i = 3; i<N; i++){ # loop 3B
1312 x = q[i]; # DR_MISALIGNMENT(q) = 0
1313 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1314 }
1315 }
1316
1317 These loops are later passed to loop_transform to be vectorized. The
1318 vectorizer will use the alignment information to guide the transformation
1319 (whether to generate regular loads/stores, or with special handling for
1320 misalignment). */
1321
1322 bool
1323 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1324 {
1325 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1326 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1327 enum dr_alignment_support supportable_dr_alignment;
1328 struct data_reference *dr0 = NULL, *first_store = NULL;
1329 struct data_reference *dr;
1330 unsigned int i, j;
1331 bool do_peeling = false;
1332 bool do_versioning = false;
1333 bool stat;
1334 gimple stmt;
1335 stmt_vec_info stmt_info;
1336 unsigned int npeel = 0;
1337 bool all_misalignments_unknown = true;
1338 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1339 unsigned possible_npeel_number = 1;
1340 tree vectype;
1341 unsigned int nelements, mis, same_align_drs_max = 0;
1342 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1343
1344 if (dump_enabled_p ())
1345 dump_printf_loc (MSG_NOTE, vect_location,
1346 "=== vect_enhance_data_refs_alignment ===\n");
1347
1348 /* While cost model enhancements are expected in the future, the high level
1349 view of the code at this time is as follows:
1350
1351 A) If there is a misaligned access then see if peeling to align
1352 this access can make all data references satisfy
1353 vect_supportable_dr_alignment. If so, update data structures
1354 as needed and return true.
1355
1356 B) If peeling wasn't possible and there is a data reference with an
1357 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1358 then see if loop versioning checks can be used to make all data
1359 references satisfy vect_supportable_dr_alignment. If so, update
1360 data structures as needed and return true.
1361
1362 C) If neither peeling nor versioning were successful then return false if
1363 any data reference does not satisfy vect_supportable_dr_alignment.
1364
1365 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1366
1367 Note, Possibility 3 above (which is peeling and versioning together) is not
1368 being done at this time. */
1369
1370 /* (1) Peeling to force alignment. */
1371
1372 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1373 Considerations:
1374 + How many accesses will become aligned due to the peeling
1375 - How many accesses will become unaligned due to the peeling,
1376 and the cost of misaligned accesses.
1377 - The cost of peeling (the extra runtime checks, the increase
1378 in code size). */
1379
1380 FOR_EACH_VEC_ELT (datarefs, i, dr)
1381 {
1382 stmt = DR_STMT (dr);
1383 stmt_info = vinfo_for_stmt (stmt);
1384
1385 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1386 continue;
1387
1388 /* For interleaving, only the alignment of the first access
1389 matters. */
1390 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1391 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1392 continue;
1393
1394 /* For invariant accesses there is nothing to enhance. */
1395 if (integer_zerop (DR_STEP (dr)))
1396 continue;
1397
1398 /* Strided accesses perform only component accesses, alignment is
1399 irrelevant for them. */
1400 if (STMT_VINFO_STRIDED_P (stmt_info)
1401 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1402 continue;
1403
1404 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1405 do_peeling = vector_alignment_reachable_p (dr);
1406 if (do_peeling)
1407 {
1408 if (known_alignment_for_access_p (dr))
1409 {
1410 unsigned int npeel_tmp;
1411 bool negative = tree_int_cst_compare (DR_STEP (dr),
1412 size_zero_node) < 0;
1413
1414 /* Save info about DR in the hash table. */
1415 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1416 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1417 = new hash_table<peel_info_hasher> (1);
1418
1419 vectype = STMT_VINFO_VECTYPE (stmt_info);
1420 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1421 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1422 TREE_TYPE (DR_REF (dr))));
1423 npeel_tmp = (negative
1424 ? (mis - nelements) : (nelements - mis))
1425 & (nelements - 1);
1426
1427 /* For multiple types, it is possible that the bigger type access
1428 will have more than one peeling option. E.g., a loop with two
1429 types: one of size (vector size / 4), and the other one of
1430 size (vector size / 8).  The vectorization factor will be 8.  If both
1431 accesses are misaligned by 3, the first one needs one scalar
1432 iteration to be aligned, and the second one needs 5.  But
1433 the first one will also be aligned by peeling 5 scalar
1434 iterations, and in that case both accesses will be aligned.
1435 Hence, except for the immediate peeling amount, we also want
1436 to try to add full vector size, while we don't exceed
1437 vectorization factor.
1438 We do this automatically for the cost model, since we calculate the cost
1439 for every peeling option. */
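/* For example, for a V4SI access misaligned by 3 elements with VF == 8,
   npeel_tmp starts at (4 - 3) & 3 == 1 and possible_npeel_number below
   is 8 / 4 == 2, so the peeling amounts 1 and 5 both get recorded in
   the hash table.  */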
1440 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1441 {
1442 if (STMT_SLP_TYPE (stmt_info))
1443 possible_npeel_number
1444 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1445 else
1446 possible_npeel_number = vf / nelements;
1447 }
1448
1449 /* Handle the aligned case. We may decide to align some other
1450 access, making DR unaligned. */
1451 if (DR_MISALIGNMENT (dr) == 0)
1452 {
1453 npeel_tmp = 0;
1454 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1455 possible_npeel_number++;
1456 }
1457
1458 for (j = 0; j < possible_npeel_number; j++)
1459 {
1460 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1461 npeel_tmp += nelements;
1462 }
1463
1464 all_misalignments_unknown = false;
1465 /* Data-ref that was chosen for the case that all the
1466 misalignments are unknown is not relevant anymore, since we
1467 have a data-ref with known alignment. */
1468 dr0 = NULL;
1469 }
1470 else
1471 {
1472 /* If we don't know any misalignment values, we prefer
1473 peeling for data-ref that has the maximum number of data-refs
1474 with the same alignment, unless the target prefers to align
1475 stores over loads. */
1476 if (all_misalignments_unknown)
1477 {
1478 unsigned same_align_drs
1479 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1480 if (!dr0
1481 || same_align_drs_max < same_align_drs)
1482 {
1483 same_align_drs_max = same_align_drs;
1484 dr0 = dr;
1485 }
1486 /* For data-refs with the same number of related
1487 accesses prefer the one where the misalign
1488 computation will be invariant in the outermost loop. */
1489 else if (same_align_drs_max == same_align_drs)
1490 {
1491 struct loop *ivloop0, *ivloop;
1492 ivloop0 = outermost_invariant_loop_for_expr
1493 (loop, DR_BASE_ADDRESS (dr0));
1494 ivloop = outermost_invariant_loop_for_expr
1495 (loop, DR_BASE_ADDRESS (dr));
1496 if ((ivloop && !ivloop0)
1497 || (ivloop && ivloop0
1498 && flow_loop_nested_p (ivloop, ivloop0)))
1499 dr0 = dr;
1500 }
1501
1502 if (!first_store && DR_IS_WRITE (dr))
1503 first_store = dr;
1504 }
1505
1506 /* If there are both known and unknown misaligned accesses in the
1507 loop, we choose peeling amount according to the known
1508 accesses. */
1509 if (!supportable_dr_alignment)
1510 {
1511 dr0 = dr;
1512 if (!first_store && DR_IS_WRITE (dr))
1513 first_store = dr;
1514 }
1515 }
1516 }
1517 else
1518 {
1519 if (!aligned_access_p (dr))
1520 {
1521 if (dump_enabled_p ())
1522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1523 "vector alignment may not be reachable\n");
1524 break;
1525 }
1526 }
1527 }
1528
1529 /* Check if we can possibly peel the loop. */
1530 if (!vect_can_advance_ivs_p (loop_vinfo)
1531 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1532 do_peeling = false;
1533
1534 if (do_peeling
1535 && all_misalignments_unknown
1536 && vect_supportable_dr_alignment (dr0, false))
1537 {
1538 /* Check if the target requires to prefer stores over loads, i.e., if
1539 misaligned stores are more expensive than misaligned loads (taking
1540 drs with same alignment into account). */
1541 if (first_store && DR_IS_READ (dr0))
1542 {
1543 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1544 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1545 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1546 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1547 stmt_vector_for_cost dummy;
1548 dummy.create (2);
1549
1550 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1551 &dummy);
1552 vect_get_data_access_cost (first_store, &store_inside_cost,
1553 &store_outside_cost, &dummy);
1554
1555 dummy.release ();
1556
1557 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1558 aligning the load DR0). */
1559 load_inside_penalty = store_inside_cost;
1560 load_outside_penalty = store_outside_cost;
1561 for (i = 0;
1562 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1563 DR_STMT (first_store))).iterate (i, &dr);
1564 i++)
1565 if (DR_IS_READ (dr))
1566 {
1567 load_inside_penalty += load_inside_cost;
1568 load_outside_penalty += load_outside_cost;
1569 }
1570 else
1571 {
1572 load_inside_penalty += store_inside_cost;
1573 load_outside_penalty += store_outside_cost;
1574 }
1575
1576 /* Calculate the penalty for leaving DR0 unaligned (by
1577 aligning the FIRST_STORE). */
1578 store_inside_penalty = load_inside_cost;
1579 store_outside_penalty = load_outside_cost;
1580 for (i = 0;
1581 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1582 DR_STMT (dr0))).iterate (i, &dr);
1583 i++)
1584 if (DR_IS_READ (dr))
1585 {
1586 store_inside_penalty += load_inside_cost;
1587 store_outside_penalty += load_outside_cost;
1588 }
1589 else
1590 {
1591 store_inside_penalty += store_inside_cost;
1592 store_outside_penalty += store_outside_cost;
1593 }
1594
1595 if (load_inside_penalty > store_inside_penalty
1596 || (load_inside_penalty == store_inside_penalty
1597 && load_outside_penalty > store_outside_penalty))
1598 dr0 = first_store;
1599 }
1600
1601 /* In case there are only loads with different unknown misalignments, use
1602 peeling only if it may help to align other accesses in the loop or
1603 if it may help to improve load bandwidth when we'd end up using
1604 unaligned loads. */
1605 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1606 if (!first_store
1607 && !STMT_VINFO_SAME_ALIGN_REFS (
1608 vinfo_for_stmt (DR_STMT (dr0))).length ()
1609 && (vect_supportable_dr_alignment (dr0, false)
1610 != dr_unaligned_supported
1611 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1612 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1613 do_peeling = false;
1614 }
1615
1616 if (do_peeling && !dr0)
1617 {
1618 /* Peeling is possible, but no data access strictly requires alignment to
1619 be supported.  So we try to choose the best possible peeling. */
1620
1621 /* We should get here only if there are drs with known misalignment. */
1622 gcc_assert (!all_misalignments_unknown);
1623
1624 /* Choose the best peeling from the hash table. */
1625 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1626 &body_cost_vec);
1627 if (!dr0 || !npeel)
1628 do_peeling = false;
1629 }
1630
1631 if (do_peeling)
1632 {
1633 stmt = DR_STMT (dr0);
1634 stmt_info = vinfo_for_stmt (stmt);
1635 vectype = STMT_VINFO_VECTYPE (stmt_info);
1636 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1637
1638 if (known_alignment_for_access_p (dr0))
1639 {
1640 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1641 size_zero_node) < 0;
1642 if (!npeel)
1643 {
1644 /* Since it's known at compile time, compute the number of
1645 iterations in the peeled loop (the peeling factor) for use in
1646 updating DR_MISALIGNMENT values. The peeling factor is the
1647 vectorization factor minus the misalignment as an element
1648 count. */
1649 mis = DR_MISALIGNMENT (dr0);
1650 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1651 npeel = ((negative ? mis - nelements : nelements - mis)
1652 & (nelements - 1));
1653 }
1654
1655 /* For interleaved data access every iteration accesses all the
1656 members of the group, therefore we divide the number of iterations
1657 by the group size. */
1658 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1659 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1660 npeel /= GROUP_SIZE (stmt_info);
1661
1662 if (dump_enabled_p ())
1663 dump_printf_loc (MSG_NOTE, vect_location,
1664 "Try peeling by %d\n", npeel);
1665 }
1666
1667 /* Ensure that all data refs can be vectorized after the peel. */
1668 FOR_EACH_VEC_ELT (datarefs, i, dr)
1669 {
1670 int save_misalignment;
1671
1672 if (dr == dr0)
1673 continue;
1674
1675 stmt = DR_STMT (dr);
1676 stmt_info = vinfo_for_stmt (stmt);
1677 /* For interleaving, only the alignment of the first access
1678 matters. */
1679 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1680 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1681 continue;
1682
1683 /* Strided accesses perform only component accesses, alignment is
1684 irrelevant for them. */
1685 if (STMT_VINFO_STRIDED_P (stmt_info)
1686 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1687 continue;
1688
1689 save_misalignment = DR_MISALIGNMENT (dr);
1690 vect_update_misalignment_for_peel (dr, dr0, npeel);
1691 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1692 SET_DR_MISALIGNMENT (dr, save_misalignment);
1693
1694 if (!supportable_dr_alignment)
1695 {
1696 do_peeling = false;
1697 break;
1698 }
1699 }
1700
1701 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1702 {
1703 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1704 if (!stat)
1705 do_peeling = false;
1706 else
1707 {
1708 body_cost_vec.release ();
1709 return stat;
1710 }
1711 }
1712
1713 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1714 if (do_peeling)
1715 {
1716 unsigned max_allowed_peel
1717 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1718 if (max_allowed_peel != (unsigned)-1)
1719 {
1720 unsigned max_peel = npeel;
1721 if (max_peel == 0)
1722 {
1723 gimple dr_stmt = DR_STMT (dr0);
1724 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1725 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1726 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1727 }
1728 if (max_peel > max_allowed_peel)
1729 {
1730 do_peeling = false;
1731 if (dump_enabled_p ())
1732 dump_printf_loc (MSG_NOTE, vect_location,
1733 "Disable peeling, max peels reached: %d\n", max_peel);
1734 }
1735 }
1736 }
1737
1738 /* Cost model #2 - if peeling may result in a remaining loop not
1739 iterating enough to be vectorized then do not peel. */
1740 if (do_peeling
1741 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1742 {
1743 unsigned max_peel
1744 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1745 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1746 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1747 do_peeling = false;
1748 }
1749
1750 if (do_peeling)
1751 {
1752 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1753 If the misalignment of DR_i is identical to that of dr0 then set
1754 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1755 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1756 by the peeling factor times the element size of DR_i (MOD the
1757 vectorization factor times the size). Otherwise, the
1758 misalignment of DR_i must be set to unknown. */
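         /* For instance (illustrative numbers only): with 4-byte elements,
            16-byte vectors and npeel == 2, a DR_i whose known misalignment
            was 12 bytes ends up with (12 + 2 * 4) % 16 == 4, while dr0
            itself (misalignment 8) becomes (8 + 2 * 4) % 16 == 0.  */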
1759 FOR_EACH_VEC_ELT (datarefs, i, dr)
1760 if (dr != dr0)
1761 vect_update_misalignment_for_peel (dr, dr0, npeel);
1762
1763 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1764 if (npeel)
1765 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1766 else
1767 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1768 = DR_MISALIGNMENT (dr0);
1769 SET_DR_MISALIGNMENT (dr0, 0);
1770 if (dump_enabled_p ())
1771 {
1772 dump_printf_loc (MSG_NOTE, vect_location,
1773 "Alignment of access forced using peeling.\n");
1774 dump_printf_loc (MSG_NOTE, vect_location,
1775 "Peeling for alignment will be applied.\n");
1776 }
1777 /* The inside-loop cost will be accounted for in vectorizable_load
1778 and vectorizable_store correctly with adjusted alignments.
1779          Drop the body_cost_vec on the floor here.  */
1780 body_cost_vec.release ();
1781
1782 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1783 gcc_assert (stat);
1784 return stat;
1785 }
1786 }
1787
1788 body_cost_vec.release ();
1789
1790 /* (2) Versioning to force alignment. */
1791
1792 /* Try versioning if:
1793 1) optimize loop for speed
1794 2) there is at least one unsupported misaligned data ref with an unknown
1795 misalignment, and
1796 3) all misaligned data refs with a known misalignment are supported, and
1797 4) the number of runtime alignment checks is within reason. */
1798
1799 do_versioning =
1800 optimize_loop_nest_for_speed_p (loop)
1801 && (!loop->inner); /* FORNOW */
1802
1803 if (do_versioning)
1804 {
1805 FOR_EACH_VEC_ELT (datarefs, i, dr)
1806 {
1807 stmt = DR_STMT (dr);
1808 stmt_info = vinfo_for_stmt (stmt);
1809
1810 /* For interleaving, only the alignment of the first access
1811 matters. */
1812 if (aligned_access_p (dr)
1813 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1814 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1815 continue;
1816
1817 if (STMT_VINFO_STRIDED_P (stmt_info))
1818 {
1819 /* Strided loads perform only component accesses, alignment is
1820 irrelevant for them. */
1821 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1822 continue;
1823 do_versioning = false;
1824 break;
1825 }
1826
1827 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1828
1829 if (!supportable_dr_alignment)
1830 {
1831 gimple stmt;
1832 int mask;
1833 tree vectype;
1834
1835 if (known_alignment_for_access_p (dr)
1836 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1837 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1838 {
1839 do_versioning = false;
1840 break;
1841 }
1842
1843 stmt = DR_STMT (dr);
1844 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1845 gcc_assert (vectype);
1846
1847 /* The rightmost bits of an aligned address must be zeros.
1848 Construct the mask needed for this test. For example,
1849 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1850 mask must be 15 = 0xf. */
1851 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
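              /* With this mask an address is vector-aligned iff
                 (addr & mask) == 0; the versioning guard built later checks
                 this for every access recorded in
                 LOOP_VINFO_MAY_MISALIGN_STMTS.  */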
1852
1853 /* FORNOW: use the same mask to test all potentially unaligned
1854 references in the loop. The vectorizer currently supports
1855 a single vector size, see the reference to
1856 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1857 vectorization factor is computed. */
1858 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1859 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1860 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1861 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1862 DR_STMT (dr));
1863 }
1864 }
1865
1866 /* Versioning requires at least one misaligned data reference. */
1867 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1868 do_versioning = false;
1869 else if (!do_versioning)
1870 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1871 }
1872
1873 if (do_versioning)
1874 {
1875 vec<gimple> may_misalign_stmts
1876 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1877 gimple stmt;
1878
1879 /* It can now be assumed that the data references in the statements
1880 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1881 of the loop being vectorized. */
1882 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1883 {
1884 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1885 dr = STMT_VINFO_DATA_REF (stmt_info);
1886 SET_DR_MISALIGNMENT (dr, 0);
1887 if (dump_enabled_p ())
1888 dump_printf_loc (MSG_NOTE, vect_location,
1889 "Alignment of access forced using versioning.\n");
1890 }
1891
1892 if (dump_enabled_p ())
1893 dump_printf_loc (MSG_NOTE, vect_location,
1894 "Versioning for alignment will be applied.\n");
1895
1896 /* Peeling and versioning can't be done together at this time. */
1897 gcc_assert (! (do_peeling && do_versioning));
1898
1899 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1900 gcc_assert (stat);
1901 return stat;
1902 }
1903
1904 /* This point is reached if neither peeling nor versioning is being done. */
1905 gcc_assert (! (do_peeling || do_versioning));
1906
1907 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1908 return stat;
1909 }
1910
1911
1912 /* Function vect_find_same_alignment_drs.
1913
1914 Update group and alignment relations according to the chosen
1915 vectorization factor. */
1916
1917 static void
1918 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1919 loop_vec_info loop_vinfo)
1920 {
1921 unsigned int i;
1922 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1923 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1924 struct data_reference *dra = DDR_A (ddr);
1925 struct data_reference *drb = DDR_B (ddr);
1926 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1927 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1928 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1929 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1930 lambda_vector dist_v;
1931 unsigned int loop_depth;
1932
1933 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1934 return;
1935
1936 if (dra == drb)
1937 return;
1938
1939 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1940 return;
1941
1942 /* Loop-based vectorization and known data dependence. */
1943 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1944 return;
1945
1946 /* Data-dependence analysis reports a distance vector of zero
1947 for data-references that overlap only in the first iteration
1948      but whose steps have different sign (see PR45764).
1949 So as a sanity check require equal DR_STEP. */
1950 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1951 return;
1952
1953 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1954 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1955 {
1956 int dist = dist_v[loop_depth];
1957
1958 if (dump_enabled_p ())
1959 dump_printf_loc (MSG_NOTE, vect_location,
1960 "dependence distance = %d.\n", dist);
1961
1962 /* Same loop iteration. */
1963 if (dist == 0
1964 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1965 {
1966 	  /* References at distance zero, or at a multiple of VF with equal element size, have the same alignment.  */
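	  /* E.g. (illustrative): for a unit-stride 4-byte access with VF == 4,
	     a dependence distance of 4 iterations means the two addresses
	     differ by 16 bytes, i.e. one vector width, so they fall into the
	     same alignment class.  */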
1967 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1968 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1969 if (dump_enabled_p ())
1970 {
1971 dump_printf_loc (MSG_NOTE, vect_location,
1972 "accesses have the same alignment.\n");
1973 dump_printf (MSG_NOTE,
1974 "dependence distance modulo vf == 0 between ");
1975 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1976 dump_printf (MSG_NOTE, " and ");
1977 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1978 dump_printf (MSG_NOTE, "\n");
1979 }
1980 }
1981 }
1982 }
1983
1984
1985 /* Function vect_analyze_data_refs_alignment
1986
1987 Analyze the alignment of the data-references in the loop.
1988 Return FALSE if a data reference is found that cannot be vectorized. */
1989
1990 bool
1991 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1992 bb_vec_info bb_vinfo)
1993 {
1994 if (dump_enabled_p ())
1995 dump_printf_loc (MSG_NOTE, vect_location,
1996 "=== vect_analyze_data_refs_alignment ===\n");
1997
1998 /* Mark groups of data references with same alignment using
1999 data dependence information. */
2000 if (loop_vinfo)
2001 {
2002 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2003 struct data_dependence_relation *ddr;
2004 unsigned int i;
2005
2006 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2007 vect_find_same_alignment_drs (ddr, loop_vinfo);
2008 }
2009
2010 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2011 {
2012 if (dump_enabled_p ())
2013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2014 "not vectorized: can't calculate alignment "
2015 "for data ref.\n");
2016 return false;
2017 }
2018
2019 return true;
2020 }
2021
2022
2023 /* Analyze groups of accesses: check that DR belongs to a group of
2024 accesses of legal size, step, etc. Detect gaps, single element
2025 interleaving, and other special cases. Set grouped access info.
2026 Collect groups of strided stores for further use in SLP analysis. */
2027
2028 static bool
2029 vect_analyze_group_access (struct data_reference *dr)
2030 {
2031 tree step = DR_STEP (dr);
2032 tree scalar_type = TREE_TYPE (DR_REF (dr));
2033 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2034 gimple stmt = DR_STMT (dr);
2035 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2036 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2037 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2038 HOST_WIDE_INT dr_step = -1;
2039 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2040 bool slp_impossible = false;
2041 struct loop *loop = NULL;
2042
2043 if (loop_vinfo)
2044 loop = LOOP_VINFO_LOOP (loop_vinfo);
2045
2046 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2047 size of the interleaving group (including gaps). */
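   /* E.g. (illustrative): 4-byte elements with DR_STEP 16 give a groupsize
      of 4, i.e. four interleaved elements per scalar iteration.  */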
2048 if (tree_fits_shwi_p (step))
2049 {
2050 dr_step = tree_to_shwi (step);
2051 groupsize = absu_hwi (dr_step) / type_size;
2052 }
2053 else
2054 groupsize = 0;
2055
2056   /* A non-consecutive access is possible only if it is part of an interleaving group.  */
2057 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2058 {
2059       /* Check whether this DR is part of an interleaving group of which it
2060          is the only element accessed in the loop.  */
2061
2062 /* Gaps are supported only for loads. STEP must be a multiple of the type
2063 size. The size of the group must be a power of 2. */
2064 if (DR_IS_READ (dr)
2065 && (dr_step % type_size) == 0
2066 && groupsize > 0
2067 && exact_log2 (groupsize) != -1)
2068 {
2069 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2070 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2071 if (dump_enabled_p ())
2072 {
2073 dump_printf_loc (MSG_NOTE, vect_location,
2074 "Detected single element interleaving ");
2075 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2076 dump_printf (MSG_NOTE, " step ");
2077 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2078 dump_printf (MSG_NOTE, "\n");
2079 }
2080
2081 if (loop_vinfo)
2082 {
2083 if (dump_enabled_p ())
2084 dump_printf_loc (MSG_NOTE, vect_location,
2085 "Data access with gaps requires scalar "
2086 "epilogue loop\n");
2087 if (loop->inner)
2088 {
2089 if (dump_enabled_p ())
2090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2091 "Peeling for outer loop is not"
2092 " supported\n");
2093 return false;
2094 }
2095
2096 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2097 }
2098
2099 return true;
2100 }
2101
2102 if (dump_enabled_p ())
2103 {
2104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2105 "not consecutive access ");
2106 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2107 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2108 }
2109
2110 if (bb_vinfo)
2111 {
2112 /* Mark the statement as unvectorizable. */
2113 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2114 return true;
2115 }
2116
2117 return false;
2118 }
2119
2120 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2121 {
2122 /* First stmt in the interleaving chain. Check the chain. */
2123 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2124 struct data_reference *data_ref = dr;
2125 unsigned int count = 1;
2126 tree prev_init = DR_INIT (data_ref);
2127 gimple prev = stmt;
2128 HOST_WIDE_INT diff, gaps = 0;
2129
2130 while (next)
2131 {
2132           /* Skip same data-refs.  If two or more stmts share a data-ref
2133              (supported only for loads), we vectorize only the first stmt,
2134              and the rest get their vectorized loads from the first
2135              one.  */
2136 if (!tree_int_cst_compare (DR_INIT (data_ref),
2137 DR_INIT (STMT_VINFO_DATA_REF (
2138 vinfo_for_stmt (next)))))
2139 {
2140 if (DR_IS_WRITE (data_ref))
2141 {
2142 if (dump_enabled_p ())
2143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2144 "Two store stmts share the same dr.\n");
2145 return false;
2146 }
2147
2148 /* For load use the same data-ref load. */
2149 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2150
2151 prev = next;
2152 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2153 continue;
2154 }
2155
2156 prev = next;
2157 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2158
2159 /* All group members have the same STEP by construction. */
2160 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2161
2162 /* Check that the distance between two accesses is equal to the type
2163 size. Otherwise, we have gaps. */
2164 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2165 - TREE_INT_CST_LOW (prev_init)) / type_size;
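	      /* E.g. (illustrative): DR_INITs of 0 and 8 with 4-byte elements
	         give diff == 2, i.e. one element of the group is skipped
	         between these two members.  */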
2166 if (diff != 1)
2167 {
2168 /* FORNOW: SLP of accesses with gaps is not supported. */
2169 slp_impossible = true;
2170 if (DR_IS_WRITE (data_ref))
2171 {
2172 if (dump_enabled_p ())
2173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2174 "interleaved store with gaps\n");
2175 return false;
2176 }
2177
2178 gaps += diff - 1;
2179 }
2180
2181 last_accessed_element += diff;
2182
2183 /* Store the gap from the previous member of the group. If there is no
2184 gap in the access, GROUP_GAP is always 1. */
2185 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2186
2187 prev_init = DR_INIT (data_ref);
2188 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2189 /* Count the number of data-refs in the chain. */
2190 count++;
2191 }
2192
2193 if (groupsize == 0)
2194 groupsize = count + gaps;
2195
2196 /* Check that the size of the interleaving is equal to count for stores,
2197 i.e., that there are no gaps. */
2198 if (groupsize != count
2199 && !DR_IS_READ (dr))
2200 {
2201 if (dump_enabled_p ())
2202 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2203 "interleaved store with gaps\n");
2204 return false;
2205 }
2206
2207 /* If there is a gap after the last load in the group it is the
2208 difference between the groupsize and the last accessed
2209 element.
2210 When there is no gap, this difference should be 0. */
2211 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
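       /* E.g. (illustrative): groupsize 4 with last_accessed_element 3
          leaves a trailing gap of one element after the group.  */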
2212
2213 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2214 if (dump_enabled_p ())
2215 {
2216 dump_printf_loc (MSG_NOTE, vect_location,
2217 "Detected interleaving of size %d starting with ",
2218 (int)groupsize);
2219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2220 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2221 dump_printf_loc (MSG_NOTE, vect_location,
2222 "There is a gap of %d elements after the group\n",
2223 (int)GROUP_GAP (vinfo_for_stmt (stmt)));
2224 }
2225
2226 /* SLP: create an SLP data structure for every interleaving group of
2227          stores for further analysis in vect_analyze_slp.  */
2228 if (DR_IS_WRITE (dr) && !slp_impossible)
2229 {
2230 if (loop_vinfo)
2231 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2232 if (bb_vinfo)
2233 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2234 }
2235
2236       /* If there is a gap at the end of the group or the group size cannot
2237 be made a multiple of the vector element count then we access excess
2238 elements in the last iteration and thus need to peel that off. */
2239 if (loop_vinfo
2240 && (groupsize - last_accessed_element > 0
2241 || exact_log2 (groupsize) == -1))
2242
2243 {
2244 if (dump_enabled_p ())
2245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2246 "Data access with gaps requires scalar "
2247 "epilogue loop\n");
2248 if (loop->inner)
2249 {
2250 if (dump_enabled_p ())
2251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2252 "Peeling for outer loop is not supported\n");
2253 return false;
2254 }
2255
2256 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2257 }
2258 }
2259
2260 return true;
2261 }
2262
2263
2264 /* Analyze the access pattern of the data-reference DR.
2265 In case of non-consecutive accesses call vect_analyze_group_access() to
2266 analyze groups of accesses. */
2267
2268 static bool
2269 vect_analyze_data_ref_access (struct data_reference *dr)
2270 {
2271 tree step = DR_STEP (dr);
2272 tree scalar_type = TREE_TYPE (DR_REF (dr));
2273 gimple stmt = DR_STMT (dr);
2274 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2275 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2276 struct loop *loop = NULL;
2277
2278 if (loop_vinfo)
2279 loop = LOOP_VINFO_LOOP (loop_vinfo);
2280
2281 if (loop_vinfo && !step)
2282 {
2283 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285 "bad data-ref access in loop\n");
2286 return false;
2287 }
2288
2289 /* Allow loads with zero step in inner-loop vectorization. */
2290 if (loop_vinfo && integer_zerop (step))
2291 {
2292 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2293 if (!nested_in_vect_loop_p (loop, stmt))
2294 return DR_IS_READ (dr);
2295 /* Allow references with zero step for outer loops marked
2296 with pragma omp simd only - it guarantees absence of
2297 loop-carried dependencies between inner loop iterations. */
2298 if (!loop->force_vectorize)
2299 {
2300 if (dump_enabled_p ())
2301 dump_printf_loc (MSG_NOTE, vect_location,
2302 "zero step in inner loop of nest\n");
2303 return false;
2304 }
2305 }
2306
2307 if (loop && nested_in_vect_loop_p (loop, stmt))
2308 {
2309 /* Interleaved accesses are not yet supported within outer-loop
2310 vectorization for references in the inner-loop. */
2311 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2312
2313 /* For the rest of the analysis we use the outer-loop step. */
2314 step = STMT_VINFO_DR_STEP (stmt_info);
2315 if (integer_zerop (step))
2316 {
2317 if (dump_enabled_p ())
2318 dump_printf_loc (MSG_NOTE, vect_location,
2319 "zero step in outer loop.\n");
2320 if (DR_IS_READ (dr))
2321 return true;
2322 else
2323 return false;
2324 }
2325 }
2326
2327 /* Consecutive? */
2328 if (TREE_CODE (step) == INTEGER_CST)
2329 {
2330 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2331 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2332 || (dr_step < 0
2333 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2334 {
2335 /* Mark that it is not interleaving. */
2336 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2337 return true;
2338 }
2339 }
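      /* E.g. (illustrative): a 4-byte access with DR_STEP 4, or with
	 DR_STEP -4 for a backward walk, is consecutive and needs no
	 interleaving analysis.  */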
2340
2341 if (loop && nested_in_vect_loop_p (loop, stmt))
2342 {
2343 if (dump_enabled_p ())
2344 dump_printf_loc (MSG_NOTE, vect_location,
2345 "grouped access in outer loop.\n");
2346 return false;
2347 }
2348
2349
2350   /* Assume this is a DR handled by the non-constant strided load case.  */
2351 if (TREE_CODE (step) != INTEGER_CST)
2352 return (STMT_VINFO_STRIDED_P (stmt_info)
2353 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2354 || vect_analyze_group_access (dr)));
2355
2356   /* Not a consecutive access - check if it is part of an interleaving group.  */
2357 return vect_analyze_group_access (dr);
2358 }
2359
2360
2361
2362 /* A helper function used in the comparator function to sort data
2363    references.  T1 and T2 are the two trees to be compared.  The function
2364    returns -1, 0, or 1.  */
2365
2366 static int
2367 compare_tree (tree t1, tree t2)
2368 {
2369 int i, cmp;
2370 enum tree_code code;
2371 char tclass;
2372
2373 if (t1 == t2)
2374 return 0;
2375 if (t1 == NULL)
2376 return -1;
2377 if (t2 == NULL)
2378 return 1;
2379
2380
2381 if (TREE_CODE (t1) != TREE_CODE (t2))
2382 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2383
2384 code = TREE_CODE (t1);
2385 switch (code)
2386 {
2387 /* For const values, we can just use hash values for comparisons. */
2388 case INTEGER_CST:
2389 case REAL_CST:
2390 case FIXED_CST:
2391 case STRING_CST:
2392 case COMPLEX_CST:
2393 case VECTOR_CST:
2394 {
2395 hashval_t h1 = iterative_hash_expr (t1, 0);
2396 hashval_t h2 = iterative_hash_expr (t2, 0);
2397 if (h1 != h2)
2398 return h1 < h2 ? -1 : 1;
2399 break;
2400 }
2401
2402 case SSA_NAME:
2403 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2404 if (cmp != 0)
2405 return cmp;
2406
2407 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2408 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2409 break;
2410
2411 default:
2412 tclass = TREE_CODE_CLASS (code);
2413
2414 /* For var-decl, we could compare their UIDs. */
2415 if (tclass == tcc_declaration)
2416 {
2417 if (DECL_UID (t1) != DECL_UID (t2))
2418 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2419 break;
2420 }
2421
2422 /* For expressions with operands, compare their operands recursively. */
2423 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2424 {
2425 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2426 if (cmp != 0)
2427 return cmp;
2428 }
2429 }
2430
2431 return 0;
2432 }
2433
2434
2435 /* Compare two data-references DRA and DRB so that related accesses end
2436    up adjacent after sorting, in chunks suitable for grouping.  */
2437
2438 static int
2439 dr_group_sort_cmp (const void *dra_, const void *drb_)
2440 {
2441 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2442 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2443 int cmp;
2444
2445 /* Stabilize sort. */
2446 if (dra == drb)
2447 return 0;
2448
2449 /* Ordering of DRs according to base. */
2450 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2451 {
2452 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2453 if (cmp != 0)
2454 return cmp;
2455 }
2456
2457 /* And according to DR_OFFSET. */
2458 if (!dr_equal_offsets_p (dra, drb))
2459 {
2460 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2461 if (cmp != 0)
2462 return cmp;
2463 }
2464
2465 /* Put reads before writes. */
2466 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2467 return DR_IS_READ (dra) ? -1 : 1;
2468
2469   /* Then sort by access size.  */
2470 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2471 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2472 {
2473 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2474 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2475 if (cmp != 0)
2476 return cmp;
2477 }
2478
2479   /* And by step.  */
2480 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2481 {
2482 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2483 if (cmp != 0)
2484 return cmp;
2485 }
2486
2487   /* Then sort by DR_INIT.  In case of identical DRs sort by stmt UID.  */
2488 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2489 if (cmp == 0)
2490 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2491 return cmp;
2492 }
2493
2494 /* Function vect_analyze_data_ref_accesses.
2495
2496 Analyze the access pattern of all the data references in the loop.
2497
2498 FORNOW: the only access pattern that is considered vectorizable is a
2499 simple step 1 (consecutive) access.
2500
2501 FORNOW: handle only arrays and pointer accesses. */
2502
2503 bool
2504 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2505 {
2506 unsigned int i;
2507 vec<data_reference_p> datarefs;
2508 struct data_reference *dr;
2509
2510 if (dump_enabled_p ())
2511 dump_printf_loc (MSG_NOTE, vect_location,
2512 "=== vect_analyze_data_ref_accesses ===\n");
2513
2514 if (loop_vinfo)
2515 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2516 else
2517 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2518
2519 if (datarefs.is_empty ())
2520 return true;
2521
2522 /* Sort the array of datarefs to make building the interleaving chains
2523 linear. Don't modify the original vector's order, it is needed for
2524 determining what dependencies are reversed. */
2525 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2526 datarefs_copy.qsort (dr_group_sort_cmp);
2527
2528 /* Build the interleaving chains. */
2529 for (i = 0; i < datarefs_copy.length () - 1;)
2530 {
2531 data_reference_p dra = datarefs_copy[i];
2532 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2533 stmt_vec_info lastinfo = NULL;
2534 for (i = i + 1; i < datarefs_copy.length (); ++i)
2535 {
2536 data_reference_p drb = datarefs_copy[i];
2537 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2538
2539           /* ??? Imperfect sorting (non-compatible types, non-modulo
2540              accesses, same accesses) can lead to a group being artificially
2541              split here as we don't just skip over those.  If it really
2542              matters we can push those to a worklist and re-iterate
2543              over them.  Then we can just skip ahead to the next DR here.  */
2544
2545           /* Check that the data-refs have the same first location (except init)
2546              and that they are both either stores or loads (not one load and one
2547              store, and not masked loads or stores).  */
2548 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2549 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2550 DR_BASE_ADDRESS (drb), 0)
2551 || !dr_equal_offsets_p (dra, drb)
2552 || !gimple_assign_single_p (DR_STMT (dra))
2553 || !gimple_assign_single_p (DR_STMT (drb)))
2554 break;
2555
2556 /* Check that the data-refs have the same constant size. */
2557 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2558 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2559 if (!tree_fits_uhwi_p (sza)
2560 || !tree_fits_uhwi_p (szb)
2561 || !tree_int_cst_equal (sza, szb))
2562 break;
2563
2564 /* Check that the data-refs have the same step. */
2565 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2566 break;
2567
2568 /* Do not place the same access in the interleaving chain twice. */
2569 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2570 break;
2571
2572 /* Check the types are compatible.
2573 ??? We don't distinguish this during sorting. */
2574 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2575 TREE_TYPE (DR_REF (drb))))
2576 break;
2577
2578 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2579 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2580 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2581 gcc_assert (init_a < init_b);
2582
2583 /* If init_b == init_a + the size of the type * k, we have an
2584 interleaving, and DRA is accessed before DRB. */
2585 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2586 if ((init_b - init_a) % type_size_a != 0)
2587 break;
2588
2589           /* For a store, require that the accesses be adjacent.  This splits
2590              groups into chunks we support (we don't support vectorization
2591              of stores with gaps).  */
2592 if (!DR_IS_READ (dra)
2593 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2594 (DR_INIT (datarefs_copy[i-1]))
2595 != type_size_a))
2596 break;
2597
2598           /* If the step (when non-zero and constant) is not greater than the
2599              difference between the data-refs' inits, split the group here;
2600              this keeps group sizes suitable.  */
2601 if (tree_fits_shwi_p (DR_STEP (dra)))
2602 {
2603 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2604 if (step != 0 && step <= (init_b - init_a))
2605 break;
2606 }
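	  /* A small illustrative example (hypothetical refs): two 4-byte loads
	     a[2*i] and a[2*i+1] have inits 0 and 4 and step 8; (4 - 0) % 4 == 0
	     and 8 > 4, so none of the checks above splits them and they end up
	     linked into one interleaving group of size 2.  */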
2607
2608 if (dump_enabled_p ())
2609 {
2610 dump_printf_loc (MSG_NOTE, vect_location,
2611 "Detected interleaving ");
2612 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2613 dump_printf (MSG_NOTE, " and ");
2614 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2615 dump_printf (MSG_NOTE, "\n");
2616 }
2617
2618 /* Link the found element into the group list. */
2619 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2620 {
2621 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2622 lastinfo = stmtinfo_a;
2623 }
2624 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2625 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2626 lastinfo = stmtinfo_b;
2627 }
2628 }
2629
2630 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2631 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2632 && !vect_analyze_data_ref_access (dr))
2633 {
2634 if (dump_enabled_p ())
2635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2636 "not vectorized: complicated access pattern.\n");
2637
2638 if (bb_vinfo)
2639 {
2640 /* Mark the statement as not vectorizable. */
2641 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2642 continue;
2643 }
2644 else
2645 {
2646 datarefs_copy.release ();
2647 return false;
2648 }
2649 }
2650
2651 datarefs_copy.release ();
2652 return true;
2653 }
2654
2655
2656 /* Operator == between two dr_with_seg_len objects.
2657
2658    This equality operator is used to make sure two data refs
2659    are the same one, so that we can consider combining the
2660    aliasing checks of two pairs of data-dependent data
2661    refs.  */
2662
2663 static bool
2664 operator == (const dr_with_seg_len& d1,
2665 const dr_with_seg_len& d2)
2666 {
2667 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2668 DR_BASE_ADDRESS (d2.dr), 0)
2669 && compare_tree (d1.offset, d2.offset) == 0
2670 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2671 }
2672
2673 /* Function comp_dr_with_seg_len_pair.
2674
2675 Comparison function for sorting objects of dr_with_seg_len_pair_t
2676 so that we can combine aliasing checks in one scan. */
2677
2678 static int
2679 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2680 {
2681 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2682 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2683
2684 const dr_with_seg_len &p11 = p1->first,
2685 &p12 = p1->second,
2686 &p21 = p2->first,
2687 &p22 = p2->second;
2688
2689   /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2690      if a and c have the same base address and step, and b and d have the same
2691      address and step.  Therefore, if either a&c or b&d don't have the same
2692      address and step, we don't care about the order of those two pairs after sorting.  */
2693 int comp_res;
2694
2695 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2696 DR_BASE_ADDRESS (p21.dr))) != 0)
2697 return comp_res;
2698 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2699 DR_BASE_ADDRESS (p22.dr))) != 0)
2700 return comp_res;
2701 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2702 return comp_res;
2703 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2704 return comp_res;
2705 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2706 return comp_res;
2707 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2708 return comp_res;
2709
2710 return 0;
2711 }
2712
2713 /* Function vect_vfa_segment_size.
2714
2715    Create an expression that computes the size of the segment
2716    that will be accessed for a data reference.  The function takes into
2717    account that realignment loads may access one more vector.
2718
2719 Input:
2720 DR: The data reference.
2721 LENGTH_FACTOR: segment length to consider.
2722
2723 Return an expression whose value is the size of segment which will be
2724 accessed by DR. */
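/* For example (illustrative): a data reference with DR_STEP 4 and a
   LENGTH_FACTOR of 8 yields a 32-byte segment; when the optimized
   explicit-realignment scheme is used, one extra vector's worth of bytes
   is added on top of that.  */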
2725
2726 static tree
2727 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2728 {
2729 tree segment_length;
2730
2731 if (integer_zerop (DR_STEP (dr)))
2732 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2733 else
2734 segment_length = size_binop (MULT_EXPR,
2735 fold_convert (sizetype, DR_STEP (dr)),
2736 fold_convert (sizetype, length_factor));
2737
2738 if (vect_supportable_dr_alignment (dr, false)
2739 == dr_explicit_realign_optimized)
2740 {
2741 tree vector_size = TYPE_SIZE_UNIT
2742 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2743
2744 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2745 }
2746 return segment_length;
2747 }
2748
2749 /* Function vect_prune_runtime_alias_test_list.
2750
2751 Prune a list of ddrs to be tested at run-time by versioning for alias.
2752 Merge several alias checks into one if possible.
2753    Return FALSE if the resulting list of ddrs is longer than allowed by
2754    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
2755
2756 bool
2757 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2758 {
2759 vec<ddr_p> may_alias_ddrs =
2760 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2761 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2762 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2763 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2764 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2765
2766 ddr_p ddr;
2767 unsigned int i;
2768 tree length_factor;
2769
2770 if (dump_enabled_p ())
2771 dump_printf_loc (MSG_NOTE, vect_location,
2772 "=== vect_prune_runtime_alias_test_list ===\n");
2773
2774 if (may_alias_ddrs.is_empty ())
2775 return true;
2776
2777 /* Basically, for each pair of dependent data refs store_ptr_0
2778 and load_ptr_0, we create an expression:
2779
2780 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2781 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2782
2783 for aliasing checks. However, in some cases we can decrease
2784 the number of checks by combining two checks into one. For
2785 example, suppose we have another pair of data refs store_ptr_0
2786 and load_ptr_1, and if the following condition is satisfied:
2787
2788 load_ptr_0 < load_ptr_1 &&
2789 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2790
2791      (this condition means that, in each iteration of the vectorized loop,
2792      the memory accessed through store_ptr_0 cannot lie between the memory
2793      of load_ptr_0 and load_ptr_1.)
2794 
2795      then we can use only the following expression to finish the
2796      aliasing checks between store_ptr_0 & load_ptr_0 and
2797      store_ptr_0 & load_ptr_1:
2798
2799 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2800 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2801
2802      Note that we only consider the case where load_ptr_0 and load_ptr_1
2803      have the same base address.  */
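
  /* A purely illustrative instance: if load_ptr_1 - load_ptr_0 == 4 and
     load_segment_length_0 == store_segment_length_0 == 16, then
     4 - 16 < 16 holds, so the two checks against store_ptr_0 collapse
     into a single one.  */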
2804
2805 comp_alias_ddrs.create (may_alias_ddrs.length ());
2806
2807 /* First, we collect all data ref pairs for aliasing checks. */
2808 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2809 {
2810 struct data_reference *dr_a, *dr_b;
2811 gimple dr_group_first_a, dr_group_first_b;
2812 tree segment_length_a, segment_length_b;
2813 gimple stmt_a, stmt_b;
2814
2815 dr_a = DDR_A (ddr);
2816 stmt_a = DR_STMT (DDR_A (ddr));
2817 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2818 if (dr_group_first_a)
2819 {
2820 stmt_a = dr_group_first_a;
2821 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2822 }
2823
2824 dr_b = DDR_B (ddr);
2825 stmt_b = DR_STMT (DDR_B (ddr));
2826 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2827 if (dr_group_first_b)
2828 {
2829 stmt_b = dr_group_first_b;
2830 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2831 }
2832
2833 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2834 length_factor = scalar_loop_iters;
2835 else
2836 length_factor = size_int (vect_factor);
2837 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2838 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2839
2840 dr_with_seg_len_pair_t dr_with_seg_len_pair
2841 (dr_with_seg_len (dr_a, segment_length_a),
2842 dr_with_seg_len (dr_b, segment_length_b));
2843
2844 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2845 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2846
2847 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2848 }
2849
2850 /* Second, we sort the collected data ref pairs so that we can scan
2851 them once to combine all possible aliasing checks. */
2852 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2853
2854 /* Third, we scan the sorted dr pairs and check if we can combine
2855 alias checks of two neighbouring dr pairs. */
2856 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2857 {
2858 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2859 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2860 *dr_b1 = &comp_alias_ddrs[i-1].second,
2861 *dr_a2 = &comp_alias_ddrs[i].first,
2862 *dr_b2 = &comp_alias_ddrs[i].second;
2863
2864 /* Remove duplicate data ref pairs. */
2865 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2866 {
2867 if (dump_enabled_p ())
2868 {
2869 dump_printf_loc (MSG_NOTE, vect_location,
2870 "found equal ranges ");
2871 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2872 DR_REF (dr_a1->dr));
2873 dump_printf (MSG_NOTE, ", ");
2874 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2875 DR_REF (dr_b1->dr));
2876 dump_printf (MSG_NOTE, " and ");
2877 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2878 DR_REF (dr_a2->dr));
2879 dump_printf (MSG_NOTE, ", ");
2880 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2881 DR_REF (dr_b2->dr));
2882 dump_printf (MSG_NOTE, "\n");
2883 }
2884
2885 comp_alias_ddrs.ordered_remove (i--);
2886 continue;
2887 }
2888
2889 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2890 {
2891 	  /* We consider the case where DR_B1 and DR_B2 are the same memref,
2892 	     and DR_A1 and DR_A2 are two consecutive memrefs.  */
2893 if (*dr_a1 == *dr_a2)
2894 {
2895 std::swap (dr_a1, dr_b1);
2896 std::swap (dr_a2, dr_b2);
2897 }
2898
2899 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2900 DR_BASE_ADDRESS (dr_a2->dr),
2901 0)
2902 || !tree_fits_shwi_p (dr_a1->offset)
2903 || !tree_fits_shwi_p (dr_a2->offset))
2904 continue;
2905
2906 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2907 - tree_to_shwi (dr_a1->offset));
2908
2909
2910 /* Now we check if the following condition is satisfied:
2911
2912 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2913
2914 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2915 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2916 have to make a best estimation. We can get the minimum value
2917 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2918 then either of the following two conditions can guarantee the
2919 one above:
2920
2921 1: DIFF <= MIN_SEG_LEN_B
2922 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2923
2924 */
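	  /* Illustrative check (hypothetical values): DIFF == 4 with
	     MIN_SEG_LEN_B == 16 already satisfies condition 1, so the two
	     pairs below would be merged and dr_a1's segment length extended
	     by DIFF to cover dr_a2 as well.  */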
2925
2926 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2927 ? tree_to_shwi (dr_b1->seg_len)
2928 : vect_factor);
2929
2930 if (diff <= min_seg_len_b
2931 || (tree_fits_shwi_p (dr_a1->seg_len)
2932 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2933 {
2934 if (dump_enabled_p ())
2935 {
2936 dump_printf_loc (MSG_NOTE, vect_location,
2937 "merging ranges for ");
2938 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2939 DR_REF (dr_a1->dr));
2940 dump_printf (MSG_NOTE, ", ");
2941 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2942 DR_REF (dr_b1->dr));
2943 dump_printf (MSG_NOTE, " and ");
2944 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2945 DR_REF (dr_a2->dr));
2946 dump_printf (MSG_NOTE, ", ");
2947 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2948 DR_REF (dr_b2->dr));
2949 dump_printf (MSG_NOTE, "\n");
2950 }
2951
2952 dr_a1->seg_len = size_binop (PLUS_EXPR,
2953 dr_a2->seg_len, size_int (diff));
2954 comp_alias_ddrs.ordered_remove (i--);
2955 }
2956 }
2957 }
2958
2959 dump_printf_loc (MSG_NOTE, vect_location,
2960 "improved number of alias checks from %d to %d\n",
2961 may_alias_ddrs.length (), comp_alias_ddrs.length ());
2962 if ((int) comp_alias_ddrs.length () >
2963 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2964 return false;
2965
2966 return true;
2967 }
2968
2969 /* Check whether a non-affine read in STMT is suitable for a gather load
2970    and if so, return a builtin decl for that operation.  */
2971
2972 tree
2973 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2974 tree *offp, int *scalep)
2975 {
2976 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2977 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2978 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2979 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2980 tree offtype = NULL_TREE;
2981 tree decl, base, off;
2982 machine_mode pmode;
2983 int punsignedp, pvolatilep;
2984
2985 base = DR_REF (dr);
2986 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2987 see if we can use the def stmt of the address. */
2988 if (is_gimple_call (stmt)
2989 && gimple_call_internal_p (stmt)
2990 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2991 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2992 && TREE_CODE (base) == MEM_REF
2993 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
2994 && integer_zerop (TREE_OPERAND (base, 1))
2995 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
2996 {
2997 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
2998 if (is_gimple_assign (def_stmt)
2999 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3000 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3001 }
3002
3003 /* The gather builtins need address of the form
3004 loop_invariant + vector * {1, 2, 4, 8}
3005 or
3006 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3007 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3008 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3009 multiplications and additions in it. To get a vector, we need
3010 a single SSA_NAME that will be defined in the loop and will
3011 contain everything that is not loop invariant and that can be
3012    vectorized.  The following code attempts to find such a preexisting
3013 SSA_NAME OFF and put the loop invariants into a tree BASE
3014 that can be gimplified before the loop. */
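   /* For instance (a hypothetical example): for a read a[b[i]] with 4-byte
      elements, the decomposition below would be expected to produce
      BASE == &a, OFF == the SSA_NAME holding b[i] (possibly sign-extended)
      and SCALE == 4, matching the gather builtin's addressing form.  */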
3015 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3016 &pmode, &punsignedp, &pvolatilep, false);
3017 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3018
3019 if (TREE_CODE (base) == MEM_REF)
3020 {
3021 if (!integer_zerop (TREE_OPERAND (base, 1)))
3022 {
3023 if (off == NULL_TREE)
3024 {
3025 offset_int moff = mem_ref_offset (base);
3026 off = wide_int_to_tree (sizetype, moff);
3027 }
3028 else
3029 off = size_binop (PLUS_EXPR, off,
3030 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3031 }
3032 base = TREE_OPERAND (base, 0);
3033 }
3034 else
3035 base = build_fold_addr_expr (base);
3036
3037 if (off == NULL_TREE)
3038 off = size_zero_node;
3039
3040   /* If base is not loop invariant, then if off is 0 we start with just the
3041      constant offset in the loop invariant BASE and continue with base
3042      as OFF; otherwise give up.
3043      We could handle that case by gimplifying the addition of base + off
3044      into some SSA_NAME and using that as off, but for now punt.  */
3045 if (!expr_invariant_in_loop_p (loop, base))
3046 {
3047 if (!integer_zerop (off))
3048 return NULL_TREE;
3049 off = base;
3050 base = size_int (pbitpos / BITS_PER_UNIT);
3051 }
3052 /* Otherwise put base + constant offset into the loop invariant BASE
3053 and continue with OFF. */
3054 else
3055 {
3056 base = fold_convert (sizetype, base);
3057 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3058 }
3059
3060 /* OFF at this point may be either a SSA_NAME or some tree expression
3061 from get_inner_reference. Try to peel off loop invariants from it
3062 into BASE as long as possible. */
3063 STRIP_NOPS (off);
3064 while (offtype == NULL_TREE)
3065 {
3066 enum tree_code code;
3067 tree op0, op1, add = NULL_TREE;
3068
3069 if (TREE_CODE (off) == SSA_NAME)
3070 {
3071 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3072
3073 if (expr_invariant_in_loop_p (loop, off))
3074 return NULL_TREE;
3075
3076 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3077 break;
3078
3079 op0 = gimple_assign_rhs1 (def_stmt);
3080 code = gimple_assign_rhs_code (def_stmt);
3081 op1 = gimple_assign_rhs2 (def_stmt);
3082 }
3083 else
3084 {
3085 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3086 return NULL_TREE;
3087 code = TREE_CODE (off);
3088 extract_ops_from_tree (off, &code, &op0, &op1);
3089 }
3090 switch (code)
3091 {
3092 case POINTER_PLUS_EXPR:
3093 case PLUS_EXPR:
3094 if (expr_invariant_in_loop_p (loop, op0))
3095 {
3096 add = op0;
3097 off = op1;
3098 do_add:
3099 add = fold_convert (sizetype, add);
3100 if (scale != 1)
3101 add = size_binop (MULT_EXPR, add, size_int (scale));
3102 base = size_binop (PLUS_EXPR, base, add);
3103 continue;
3104 }
3105 if (expr_invariant_in_loop_p (loop, op1))
3106 {
3107 add = op1;
3108 off = op0;
3109 goto do_add;
3110 }
3111 break;
3112 case MINUS_EXPR:
3113 if (expr_invariant_in_loop_p (loop, op1))
3114 {
3115 add = fold_convert (sizetype, op1);
3116 add = size_binop (MINUS_EXPR, size_zero_node, add);
3117 off = op0;
3118 goto do_add;
3119 }
3120 break;
3121 case MULT_EXPR:
3122 if (scale == 1 && tree_fits_shwi_p (op1))
3123 {
3124 scale = tree_to_shwi (op1);
3125 off = op0;
3126 continue;
3127 }
3128 break;
3129 case SSA_NAME:
3130 off = op0;
3131 continue;
3132 CASE_CONVERT:
3133 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3134 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3135 break;
3136 if (TYPE_PRECISION (TREE_TYPE (op0))
3137 == TYPE_PRECISION (TREE_TYPE (off)))
3138 {
3139 off = op0;
3140 continue;
3141 }
3142 if (TYPE_PRECISION (TREE_TYPE (op0))
3143 < TYPE_PRECISION (TREE_TYPE (off)))
3144 {
3145 off = op0;
3146 offtype = TREE_TYPE (off);
3147 STRIP_NOPS (off);
3148 continue;
3149 }
3150 break;
3151 default:
3152 break;
3153 }
3154 break;
3155 }
3156
3157 /* If at the end OFF still isn't a SSA_NAME or isn't
3158 defined in the loop, punt. */
3159 if (TREE_CODE (off) != SSA_NAME
3160 || expr_invariant_in_loop_p (loop, off))
3161 return NULL_TREE;
3162
3163 if (offtype == NULL_TREE)
3164 offtype = TREE_TYPE (off);
3165
3166 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3167 offtype, scale);
3168 if (decl == NULL_TREE)
3169 return NULL_TREE;
3170
3171 if (basep)
3172 *basep = base;
3173 if (offp)
3174 *offp = off;
3175 if (scalep)
3176 *scalep = scale;
3177 return decl;
3178 }
3179
3180 /* Function vect_analyze_data_refs.
3181
3182 Find all the data references in the loop or basic block.
3183
3184 The general structure of the analysis of data refs in the vectorizer is as
3185 follows:
3186 1- vect_analyze_data_refs(loop/bb): call
3187 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3188 in the loop/bb and their dependences.
3189 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3190 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3191 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3192
3193 */
3194
3195 bool
3196 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3197 bb_vec_info bb_vinfo,
3198 int *min_vf, unsigned *n_stmts)
3199 {
3200 struct loop *loop = NULL;
3201 basic_block bb = NULL;
3202 unsigned int i;
3203 vec<data_reference_p> datarefs;
3204 struct data_reference *dr;
3205 tree scalar_type;
3206
3207 if (dump_enabled_p ())
3208 dump_printf_loc (MSG_NOTE, vect_location,
3209 "=== vect_analyze_data_refs ===\n");
3210
3211 if (loop_vinfo)
3212 {
3213 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3214
3215 loop = LOOP_VINFO_LOOP (loop_vinfo);
3216 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3217 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3218 {
3219 if (dump_enabled_p ())
3220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3221 "not vectorized: loop contains function calls"
3222 " or data references that cannot be analyzed\n");
3223 return false;
3224 }
3225
3226 for (i = 0; i < loop->num_nodes; i++)
3227 {
3228 gimple_stmt_iterator gsi;
3229
3230 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3231 {
3232 gimple stmt = gsi_stmt (gsi);
3233 if (is_gimple_debug (stmt))
3234 continue;
3235 ++*n_stmts;
3236 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3237 {
3238 if (is_gimple_call (stmt) && loop->safelen)
3239 {
3240 tree fndecl = gimple_call_fndecl (stmt), op;
3241 if (fndecl != NULL_TREE)
3242 {
3243 struct cgraph_node *node = cgraph_node::get (fndecl);
3244 if (node != NULL && node->simd_clones != NULL)
3245 {
3246 unsigned int j, n = gimple_call_num_args (stmt);
3247 for (j = 0; j < n; j++)
3248 {
3249 op = gimple_call_arg (stmt, j);
3250 if (DECL_P (op)
3251 || (REFERENCE_CLASS_P (op)
3252 && get_base_address (op)))
3253 break;
3254 }
3255 op = gimple_call_lhs (stmt);
3256 /* Ignore #pragma omp declare simd functions
3257 if they don't have data references in the
3258 call stmt itself. */
3259 if (j == n
3260 && !(op
3261 && (DECL_P (op)
3262 || (REFERENCE_CLASS_P (op)
3263 && get_base_address (op)))))
3264 continue;
3265 }
3266 }
3267 }
3268 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3269 if (dump_enabled_p ())
3270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3271 "not vectorized: loop contains function "
3272 "calls or data references that cannot "
3273 "be analyzed\n");
3274 return false;
3275 }
3276 }
3277 }
3278
3279 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3280 }
3281 else
3282 {
3283 gimple_stmt_iterator gsi;
3284
3285 bb = BB_VINFO_BB (bb_vinfo);
3286 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3287 {
3288 gimple stmt = gsi_stmt (gsi);
3289 if (is_gimple_debug (stmt))
3290 continue;
3291 ++*n_stmts;
3292 if (!find_data_references_in_stmt (NULL, stmt,
3293 &BB_VINFO_DATAREFS (bb_vinfo)))
3294 {
3295 /* Mark the rest of the basic-block as unvectorizable. */
3296 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3297 {
3298 stmt = gsi_stmt (gsi);
3299 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3300 }
3301 break;
3302 }
3303 }
3304
3305 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3306 }
3307
3308 /* Go through the data-refs, check that the analysis succeeded. Update
3309 pointer from stmt_vec_info struct to DR and vectype. */
3310
3311 FOR_EACH_VEC_ELT (datarefs, i, dr)
3312 {
3313 gimple stmt;
3314 stmt_vec_info stmt_info;
3315 tree base, offset, init;
3316 bool gather = false;
3317 bool simd_lane_access = false;
3318 int vf;
3319
3320 again:
3321 if (!dr || !DR_REF (dr))
3322 {
3323 if (dump_enabled_p ())
3324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3325 "not vectorized: unhandled data-ref\n");
3326 return false;
3327 }
3328
3329 stmt = DR_STMT (dr);
3330 stmt_info = vinfo_for_stmt (stmt);
3331
3332 /* Discard clobbers from the dataref vector. We will remove
3333 clobber stmts during vectorization. */
3334 if (gimple_clobber_p (stmt))
3335 {
3336 free_data_ref (dr);
3337 if (i == datarefs.length () - 1)
3338 {
3339 datarefs.pop ();
3340 break;
3341 }
3342 datarefs.ordered_remove (i);
3343 dr = datarefs[i];
3344 goto again;
3345 }
3346
3347 /* Check that analysis of the data-ref succeeded. */
3348 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3349 || !DR_STEP (dr))
3350 {
3351 bool maybe_gather
3352 = DR_IS_READ (dr)
3353 && !TREE_THIS_VOLATILE (DR_REF (dr))
3354 && targetm.vectorize.builtin_gather != NULL;
3355 bool maybe_simd_lane_access
3356 = loop_vinfo && loop->simduid;
3357
3358 	  /* If the target supports vector gather loads, or if this might be
3359 	     a SIMD lane access, see if they can be used.  */
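	  /* A SIMD lane access here means a reference whose offset is
	     GOMP_SIMD_LANE (loop->simduid) scaled by the element size, as
	     typically created for variables privatized in #pragma omp simd
	     loops; the pattern matching below checks exactly that shape.  */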
3360 if (loop_vinfo
3361 && (maybe_gather || maybe_simd_lane_access)
3362 && !nested_in_vect_loop_p (loop, stmt))
3363 {
3364 struct data_reference *newdr
3365 = create_data_ref (NULL, loop_containing_stmt (stmt),
3366 DR_REF (dr), stmt, true);
3367 gcc_assert (newdr != NULL && DR_REF (newdr));
3368 if (DR_BASE_ADDRESS (newdr)
3369 && DR_OFFSET (newdr)
3370 && DR_INIT (newdr)
3371 && DR_STEP (newdr)
3372 && integer_zerop (DR_STEP (newdr)))
3373 {
3374 if (maybe_simd_lane_access)
3375 {
3376 tree off = DR_OFFSET (newdr);
3377 STRIP_NOPS (off);
3378 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3379 && TREE_CODE (off) == MULT_EXPR
3380 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3381 {
3382 tree step = TREE_OPERAND (off, 1);
3383 off = TREE_OPERAND (off, 0);
3384 STRIP_NOPS (off);
3385 if (CONVERT_EXPR_P (off)
3386 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3387 0)))
3388 < TYPE_PRECISION (TREE_TYPE (off)))
3389 off = TREE_OPERAND (off, 0);
3390 if (TREE_CODE (off) == SSA_NAME)
3391 {
3392 gimple def = SSA_NAME_DEF_STMT (off);
3393 tree reft = TREE_TYPE (DR_REF (newdr));
3394 if (is_gimple_call (def)
3395 && gimple_call_internal_p (def)
3396 && (gimple_call_internal_fn (def)
3397 == IFN_GOMP_SIMD_LANE))
3398 {
3399 tree arg = gimple_call_arg (def, 0);
3400 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3401 arg = SSA_NAME_VAR (arg);
3402 if (arg == loop->simduid
3403 /* For now. */
3404 && tree_int_cst_equal
3405 (TYPE_SIZE_UNIT (reft),
3406 step))
3407 {
3408 DR_OFFSET (newdr) = ssize_int (0);
3409 DR_STEP (newdr) = step;
3410 DR_ALIGNED_TO (newdr)
3411 = size_int (BIGGEST_ALIGNMENT);
3412 dr = newdr;
3413 simd_lane_access = true;
3414 }
3415 }
3416 }
3417 }
3418 }
3419 if (!simd_lane_access && maybe_gather)
3420 {
3421 dr = newdr;
3422 gather = true;
3423 }
3424 }
3425 if (!gather && !simd_lane_access)
3426 free_data_ref (newdr);
3427 }
3428
3429 if (!gather && !simd_lane_access)
3430 {
3431 if (dump_enabled_p ())
3432 {
3433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3434 "not vectorized: data ref analysis "
3435 "failed ");
3436 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3437 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3438 }
3439
3440 if (bb_vinfo)
3441 break;
3442
3443 return false;
3444 }
3445 }
3446
3447 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3448 {
3449 if (dump_enabled_p ())
3450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3451 "not vectorized: base addr of dr is a "
3452 "constant\n");
3453
3454 if (bb_vinfo)
3455 break;
3456
3457 if (gather || simd_lane_access)
3458 free_data_ref (dr);
3459 return false;
3460 }
3461
3462 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3463 {
3464 if (dump_enabled_p ())
3465 {
3466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3467 "not vectorized: volatile type ");
3468 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3469 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3470 }
3471
3472 if (bb_vinfo)
3473 break;
3474
3475 return false;
3476 }
3477
3478 if (stmt_can_throw_internal (stmt))
3479 {
3480 if (dump_enabled_p ())
3481 {
3482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3483 "not vectorized: statement can throw an "
3484 "exception ");
3485 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3486 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3487 }
3488
3489 if (bb_vinfo)
3490 break;
3491
3492 if (gather || simd_lane_access)
3493 free_data_ref (dr);
3494 return false;
3495 }
3496
3497 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3498 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3499 {
3500 if (dump_enabled_p ())
3501 {
3502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3503 "not vectorized: statement is bitfield "
3504 "access ");
3505 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3506 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3507 }
3508
3509 if (bb_vinfo)
3510 break;
3511
3512 if (gather || simd_lane_access)
3513 free_data_ref (dr);
3514 return false;
3515 }
3516
3517 base = unshare_expr (DR_BASE_ADDRESS (dr));
3518 offset = unshare_expr (DR_OFFSET (dr));
3519 init = unshare_expr (DR_INIT (dr));
3520
3521 if (is_gimple_call (stmt)
3522 && (!gimple_call_internal_p (stmt)
3523 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3524 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3525 {
3526 if (dump_enabled_p ())
3527 {
3528 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3529 "not vectorized: dr in a call ");
3530 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3531 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3532 }
3533
3534 if (bb_vinfo)
3535 break;
3536
3537 if (gather || simd_lane_access)
3538 free_data_ref (dr);
3539 return false;
3540 }
3541
3542 /* Update DR field in stmt_vec_info struct. */
3543
3544       /* If the dataref is in an inner-loop of the loop that is considered
3545 	 for vectorization, we also want to analyze the access relative to
3546 the outer-loop (DR contains information only relative to the
3547 inner-most enclosing loop). We do that by building a reference to the
3548 first location accessed by the inner-loop, and analyze it relative to
3549 the outer-loop. */
3550 if (loop && nested_in_vect_loop_p (loop, stmt))
3551 {
3552 tree outer_step, outer_base, outer_init;
3553 HOST_WIDE_INT pbitsize, pbitpos;
3554 tree poffset;
3555 machine_mode pmode;
3556 int punsignedp, pvolatilep;
3557 affine_iv base_iv, offset_iv;
3558 tree dinit;
3559
3560 /* Build a reference to the first location accessed by the
3561 inner-loop: *(BASE+INIT). (The first location is actually
3562 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3563 tree inner_base = build_fold_indirect_ref
3564 (fold_build_pointer_plus (base, init));
3565
3566 if (dump_enabled_p ())
3567 {
3568 dump_printf_loc (MSG_NOTE, vect_location,
3569 "analyze in outer-loop: ");
3570 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3571 dump_printf (MSG_NOTE, "\n");
3572 }
3573
3574 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3575 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3576 gcc_assert (outer_base != NULL_TREE);
3577
3578 if (pbitpos % BITS_PER_UNIT != 0)
3579 {
3580 if (dump_enabled_p ())
3581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3582 "failed: bit offset alignment.\n");
3583 return false;
3584 }
3585
3586 outer_base = build_fold_addr_expr (outer_base);
3587 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3588 &base_iv, false))
3589 {
3590 if (dump_enabled_p ())
3591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3592 "failed: evolution of base is not affine.\n");
3593 return false;
3594 }
3595
3596 if (offset)
3597 {
3598 if (poffset)
3599 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3600 poffset);
3601 else
3602 poffset = offset;
3603 }
3604
3605 if (!poffset)
3606 {
3607 offset_iv.base = ssize_int (0);
3608 offset_iv.step = ssize_int (0);
3609 }
3610 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3611 &offset_iv, false))
3612 {
3613 if (dump_enabled_p ())
3614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3615 "evolution of offset is not affine.\n");
3616 return false;
3617 }
3618
3619 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3620 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3621 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3622 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3623 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3624
3625 outer_step = size_binop (PLUS_EXPR,
3626 fold_convert (ssizetype, base_iv.step),
3627 fold_convert (ssizetype, offset_iv.step));
3628
3629 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3630 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3631 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3632 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3633 STMT_VINFO_DR_OFFSET (stmt_info) =
3634 fold_convert (ssizetype, offset_iv.base);
3635 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3636 size_int (highest_pow2_factor (offset_iv.base));
3637
3638 if (dump_enabled_p ())
3639 {
3640 dump_printf_loc (MSG_NOTE, vect_location,
3641 "\touter base_address: ");
3642 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3643 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3644 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3645 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3646 STMT_VINFO_DR_OFFSET (stmt_info));
3647 dump_printf (MSG_NOTE,
3648 "\n\touter constant offset from base address: ");
3649 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3650 STMT_VINFO_DR_INIT (stmt_info));
3651 dump_printf (MSG_NOTE, "\n\touter step: ");
3652 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3653 STMT_VINFO_DR_STEP (stmt_info));
3654 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3655 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3656 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3657 dump_printf (MSG_NOTE, "\n");
3658 }
3659 }
3660
3661 if (STMT_VINFO_DATA_REF (stmt_info))
3662 {
3663 if (dump_enabled_p ())
3664 {
3665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3666 "not vectorized: more than one data ref "
3667 "in stmt: ");
3668 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3669 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3670 }
3671
3672 if (bb_vinfo)
3673 break;
3674
3675 if (gather || simd_lane_access)
3676 free_data_ref (dr);
3677 return false;
3678 }
3679
3680 STMT_VINFO_DATA_REF (stmt_info) = dr;
3681 if (simd_lane_access)
3682 {
3683 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3684 free_data_ref (datarefs[i]);
3685 datarefs[i] = dr;
3686 }
3687
3688 /* Set vectype for STMT. */
3689 scalar_type = TREE_TYPE (DR_REF (dr));
3690 STMT_VINFO_VECTYPE (stmt_info)
3691 = get_vectype_for_scalar_type (scalar_type);
3692 if (!STMT_VINFO_VECTYPE (stmt_info))
3693 {
3694 if (dump_enabled_p ())
3695 {
3696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3697 "not vectorized: no vectype for stmt: ");
3698 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3699 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3700 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3701 scalar_type);
3702 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3703 }
3704
3705 if (bb_vinfo)
3706 break;
3707
3708 if (gather || simd_lane_access)
3709 {
3710 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3711 if (gather)
3712 free_data_ref (dr);
3713 }
3714 return false;
3715 }
3716 else
3717 {
3718 if (dump_enabled_p ())
3719 {
3720 dump_printf_loc (MSG_NOTE, vect_location,
3721 "got vectype for stmt: ");
3722 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3723 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3724 STMT_VINFO_VECTYPE (stmt_info));
3725 dump_printf (MSG_NOTE, "\n");
3726 }
3727 }
3728
3729 /* Adjust the minimal vectorization factor according to the
3730 vector type. */
3731 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3732 if (vf > *min_vf)
3733 *min_vf = vf;
3734
3735 if (gather)
3736 {
3737 tree off;
3738
3739 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3740 if (gather
3741 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3742 gather = false;
3743 if (!gather)
3744 {
3745 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3746 free_data_ref (dr);
3747 if (dump_enabled_p ())
3748 {
3749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3750 "not vectorized: not suitable for gather "
3751 "load ");
3752 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3753 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3754 }
3755 return false;
3756 }
3757
3758 datarefs[i] = dr;
3759 STMT_VINFO_GATHER_P (stmt_info) = true;
3760 }
3761 else if (loop_vinfo
3762 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3763 {
3764 if (nested_in_vect_loop_p (loop, stmt))
3765 {
3766 if (dump_enabled_p ())
3767 {
3768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3769 "not vectorized: not suitable for strided "
3770 "load ");
3771 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3772 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3773 }
3774 return false;
3775 }
3776 STMT_VINFO_STRIDED_P (stmt_info) = true;
3777 }
3778 }
3779
3780 /* If we stopped analysis at the first dataref we could not analyze
3781      when trying to vectorize a basic-block, mark the rest of the datarefs
3782 as not vectorizable and truncate the vector of datarefs. That
3783 avoids spending useless time in analyzing their dependence. */
3784 if (i != datarefs.length ())
3785 {
3786 gcc_assert (bb_vinfo != NULL);
3787 for (unsigned j = i; j < datarefs.length (); ++j)
3788 {
3789 data_reference_p dr = datarefs[j];
3790 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3791 free_data_ref (dr);
3792 }
3793 datarefs.truncate (i);
3794 }
3795
3796 return true;
3797 }
3798
3799
3800 /* Function vect_get_new_vect_var.
3801
3802    Returns a name for a new variable.  The current naming scheme prepends the
3803    prefix "vect_", "stmp_" or "vectp_" (depending on the value of VAR_KIND)
3804    to NAME, if provided; otherwise the bare prefix is used to name the
3805    vectorizer-generated variable.  */
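
/* For example (purely illustrative), a call such as
     vect_get_new_vect_var (ptr_type, vect_pointer_var, "a")
   for a dataref whose base is named "a" creates a temporary whose name
   starts with "vectp_a".  */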
3806
3807 tree
3808 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3809 {
3810 const char *prefix;
3811 tree new_vect_var;
3812
3813 switch (var_kind)
3814 {
3815 case vect_simple_var:
3816 prefix = "vect";
3817 break;
3818 case vect_scalar_var:
3819 prefix = "stmp";
3820 break;
3821 case vect_pointer_var:
3822 prefix = "vectp";
3823 break;
3824 default:
3825 gcc_unreachable ();
3826 }
3827
3828 if (name)
3829 {
3830 char* tmp = concat (prefix, "_", name, NULL);
3831 new_vect_var = create_tmp_reg (type, tmp);
3832 free (tmp);
3833 }
3834 else
3835 new_vect_var = create_tmp_reg (type, prefix);
3836
3837 return new_vect_var;
3838 }
3839
3840 /* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */
3841
3842 static void
3843 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3844 stmt_vec_info stmt_info)
3845 {
3846 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3847 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3848 int misalign = DR_MISALIGNMENT (dr);
3849 if (misalign == -1)
3850 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3851 else
3852 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3853 }
3854
3855 /* Function vect_create_addr_base_for_vector_ref.
3856
3857 Create an expression that computes the address of the first memory location
3858 that will be accessed for a data reference.
3859
3860 Input:
3861 STMT: The statement containing the data reference.
3862 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3863    OFFSET: Optional.  If supplied, it is added to the initial address.
3864    LOOP:    Specify relative to which loop-nest the address should be computed.
3865 For example, when the dataref is in an inner-loop nested in an
3866 outer-loop that is now being vectorized, LOOP can be either the
3867 outer-loop, or the inner-loop. The first memory location accessed
3868 by the following dataref ('in' points to short):
3869
3870 for (i=0; i<N; i++)
3871 for (j=0; j<M; j++)
3872 s += in[i+j]
3873
3874 is as follows:
3875 if LOOP=i_loop: &in (relative to i_loop)
3876 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3877 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3878 	      initial address.  Unlike OFFSET, which is a number of elements to
3879 be added, BYTE_OFFSET is measured in bytes.
3880
3881 Output:
3882 1. Return an SSA_NAME whose value is the address of the memory location of
3883 the first vector of the data reference.
3884 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3885 these statement(s) which define the returned SSA_NAME.
3886
3887 FORNOW: We are only handling array accesses with step 1. */
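
/* Illustrative sketch only (the names and numbers are made up): for a
   dataref A[i] with element type int, DR_INIT 0 and an OFFSET argument of 4,
   the address built below is conceptually

     addr_base = &A + (DR_OFFSET + 0 + 4 * sizeof (int));

   i.e. OFFSET is scaled by the element size, whereas BYTE_OFFSET, if
   supplied, is added unscaled.  */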
3888
3889 tree
3890 vect_create_addr_base_for_vector_ref (gimple stmt,
3891 gimple_seq *new_stmt_list,
3892 tree offset,
3893 struct loop *loop,
3894 tree byte_offset)
3895 {
3896 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3897 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3898 tree data_ref_base;
3899 const char *base_name;
3900 tree addr_base;
3901 tree dest;
3902 gimple_seq seq = NULL;
3903 tree base_offset;
3904 tree init;
3905 tree vect_ptr_type;
3906 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3907 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3908
3909 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3910 {
3911 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3912
3913 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3914
3915 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3916 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3917 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3918 }
3919 else
3920 {
3921 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3922 base_offset = unshare_expr (DR_OFFSET (dr));
3923 init = unshare_expr (DR_INIT (dr));
3924 }
3925
3926 if (loop_vinfo)
3927 base_name = get_name (data_ref_base);
3928 else
3929 {
3930 base_offset = ssize_int (0);
3931 init = ssize_int (0);
3932 base_name = get_name (DR_REF (dr));
3933 }
3934
3935 /* Create base_offset */
3936 base_offset = size_binop (PLUS_EXPR,
3937 fold_convert (sizetype, base_offset),
3938 fold_convert (sizetype, init));
3939
3940 if (offset)
3941 {
3942 offset = fold_build2 (MULT_EXPR, sizetype,
3943 fold_convert (sizetype, offset), step);
3944 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3945 base_offset, offset);
3946 }
3947 if (byte_offset)
3948 {
3949 byte_offset = fold_convert (sizetype, byte_offset);
3950 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3951 base_offset, byte_offset);
3952 }
3953
3954 /* base + base_offset */
3955 if (loop_vinfo)
3956 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3957 else
3958 {
3959 addr_base = build1 (ADDR_EXPR,
3960 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3961 unshare_expr (DR_REF (dr)));
3962 }
3963
3964 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3965 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3966 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
3967 gimple_seq_add_seq (new_stmt_list, seq);
3968
3969 if (DR_PTR_INFO (dr)
3970 && TREE_CODE (addr_base) == SSA_NAME
3971 && !SSA_NAME_PTR_INFO (addr_base))
3972 {
3973 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
3974 if (offset || byte_offset)
3975 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3976 }
3977
3978 if (dump_enabled_p ())
3979 {
3980 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3981 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3982 dump_printf (MSG_NOTE, "\n");
3983 }
3984
3985 return addr_base;
3986 }
3987
3988
3989 /* Function vect_create_data_ref_ptr.
3990
3991 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3992 location accessed in the loop by STMT, along with the def-use update
3993 chain to appropriately advance the pointer through the loop iterations.
3994 Also set aliasing information for the pointer. This pointer is used by
3995 the callers to this function to create a memory reference expression for
3996 vector load/store access.
3997
3998 Input:
3999 1. STMT: a stmt that references memory. Expected to be of the form
4000 GIMPLE_ASSIGN <name, data-ref> or
4001 GIMPLE_ASSIGN <data-ref, name>.
4002 2. AGGR_TYPE: the type of the reference, which should be either a vector
4003 or an array.
4004 3. AT_LOOP: the loop where the vector memref is to be created.
4005 4. OFFSET (optional): an offset to be added to the initial address accessed
4006 by the data-ref in STMT.
4007    5. BSI: location where the new stmts are to be placed if there is no loop.
4008 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4009 pointing to the initial address.
4010 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4011 to the initial address accessed by the data-ref in STMT. This is
4012 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4013 in bytes.
4014
4015 Output:
4016 1. Declare a new ptr to vector_type, and have it point to the base of the
4017       data reference (the initial address accessed by the data reference).
4018 For example, for vector of type V8HI, the following code is generated:
4019
4020 v8hi *ap;
4021 ap = (v8hi *)initial_address;
4022
4023 if OFFSET is not supplied:
4024 initial_address = &a[init];
4025 if OFFSET is supplied:
4026 initial_address = &a[init + OFFSET];
4027 if BYTE_OFFSET is supplied:
4028 initial_address = &a[init] + BYTE_OFFSET;
4029
4030 Return the initial_address in INITIAL_ADDRESS.
4031
4032 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4033 update the pointer in each iteration of the loop.
4034
4035 Return the increment stmt that updates the pointer in PTR_INCR.
4036
4037 3. Set INV_P to true if the access pattern of the data reference in the
4038 vectorized loop is invariant. Set it to false otherwise.
4039
4040 4. Return the pointer. */
4041
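/* A minimal usage sketch (the caller shape is illustrative only, not a
   fixed contract):

     tree initial_addr;
     gimple ptr_incr;
     bool inv_p;
     tree dataref_ptr
       = vect_create_data_ref_ptr (stmt, vectype, at_loop, NULL_TREE,
                                   &initial_addr, gsi, &ptr_incr,
                                   false, &inv_p, NULL_TREE);

   The caller then builds a MEM_REF through DATAREF_PTR and advances it
   between vector accesses with bump_vector_ptr.  */
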
4042 tree
4043 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4044 tree offset, tree *initial_address,
4045 gimple_stmt_iterator *gsi, gimple *ptr_incr,
4046 bool only_init, bool *inv_p, tree byte_offset)
4047 {
4048 const char *base_name;
4049 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4050 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4051 struct loop *loop = NULL;
4052 bool nested_in_vect_loop = false;
4053 struct loop *containing_loop = NULL;
4054 tree aggr_ptr_type;
4055 tree aggr_ptr;
4056 tree new_temp;
4057 gimple_seq new_stmt_list = NULL;
4058 edge pe = NULL;
4059 basic_block new_bb;
4060 tree aggr_ptr_init;
4061 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4062 tree aptr;
4063 gimple_stmt_iterator incr_gsi;
4064 bool insert_after;
4065 tree indx_before_incr, indx_after_incr;
4066 gimple incr;
4067 tree step;
4068 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4069
4070 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4071 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4072
4073 if (loop_vinfo)
4074 {
4075 loop = LOOP_VINFO_LOOP (loop_vinfo);
4076 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4077 containing_loop = (gimple_bb (stmt))->loop_father;
4078 pe = loop_preheader_edge (loop);
4079 }
4080 else
4081 {
4082 gcc_assert (bb_vinfo);
4083 only_init = true;
4084 *ptr_incr = NULL;
4085 }
4086
4087 /* Check the step (evolution) of the load in LOOP, and record
4088 whether it's invariant. */
4089 if (nested_in_vect_loop)
4090 step = STMT_VINFO_DR_STEP (stmt_info);
4091 else
4092 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4093
4094 if (integer_zerop (step))
4095 *inv_p = true;
4096 else
4097 *inv_p = false;
4098
4099 /* Create an expression for the first address accessed by this load
4100 in LOOP. */
4101 base_name = get_name (DR_BASE_ADDRESS (dr));
4102
4103 if (dump_enabled_p ())
4104 {
4105 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4106 dump_printf_loc (MSG_NOTE, vect_location,
4107 "create %s-pointer variable to type: ",
4108 get_tree_code_name (TREE_CODE (aggr_type)));
4109 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4110 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4111 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4112 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4113 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4114 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4115 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4116 else
4117 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4118 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4119 dump_printf (MSG_NOTE, "\n");
4120 }
4121
4122 /* (1) Create the new aggregate-pointer variable.
4123 Vector and array types inherit the alias set of their component
4124 type by default so we need to use a ref-all pointer if the data
4125 reference does not conflict with the created aggregated data
4126 reference because it is not addressable. */
4127 bool need_ref_all = false;
4128 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4129 get_alias_set (DR_REF (dr))))
4130 need_ref_all = true;
4131 /* Likewise for any of the data references in the stmt group. */
4132 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4133 {
4134 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4135 do
4136 {
4137 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4138 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4139 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4140 get_alias_set (DR_REF (sdr))))
4141 {
4142 need_ref_all = true;
4143 break;
4144 }
4145 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4146 }
4147 while (orig_stmt);
4148 }
4149 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4150 need_ref_all);
4151 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4152
4153
4154 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4155 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4156 def-use update cycles for the pointer: one relative to the outer-loop
4157      (LOOP), which is what steps (2) and (3) below do.  The other is relative
4158      to the inner-loop (which is the inner-most loop containing the dataref),
4159      and this is done by step (4) below.
4160
4161 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4162      inner-most loop, and so steps (2),(3) work the same, and step (4) is
4163      redundant.  Steps (2),(3) create the following:
4164
4165 vp0 = &base_addr;
4166 LOOP: vp1 = phi(vp0,vp2)
4167 ...
4168 ...
4169 vp2 = vp1 + step
4170 goto LOOP
4171
4172      If there is an inner-loop nested in loop, then step (4) will also be
4173 applied, and an additional update in the inner-loop will be created:
4174
4175 vp0 = &base_addr;
4176 LOOP: vp1 = phi(vp0,vp2)
4177 ...
4178 inner: vp3 = phi(vp1,vp4)
4179 vp4 = vp3 + inner_step
4180 if () goto inner
4181 ...
4182 vp2 = vp1 + step
4183 if () goto LOOP */
4184
4185 /* (2) Calculate the initial address of the aggregate-pointer, and set
4186 the aggregate-pointer to point to it before the loop. */
4187
4188   /* Create: &(base[init_val + offset]) + byte_offset in the loop preheader.  */
4189
4190 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4191 offset, loop, byte_offset);
4192 if (new_stmt_list)
4193 {
4194 if (pe)
4195 {
4196 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4197 gcc_assert (!new_bb);
4198 }
4199 else
4200 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4201 }
4202
4203 *initial_address = new_temp;
4204 aggr_ptr_init = new_temp;
4205
4206 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4207 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4208 inner-loop nested in LOOP (during outer-loop vectorization). */
4209
4210 /* No update in loop is required. */
4211 if (only_init && (!loop_vinfo || at_loop == loop))
4212 aptr = aggr_ptr_init;
4213 else
4214 {
4215 /* The step of the aggregate pointer is the type size. */
4216 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4217 /* One exception to the above is when the scalar step of the load in
4218 LOOP is zero. In this case the step here is also zero. */
4219 if (*inv_p)
4220 iv_step = size_zero_node;
4221 else if (tree_int_cst_sgn (step) == -1)
4222 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4223
4224 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4225
4226 create_iv (aggr_ptr_init,
4227 fold_convert (aggr_ptr_type, iv_step),
4228 aggr_ptr, loop, &incr_gsi, insert_after,
4229 &indx_before_incr, &indx_after_incr);
4230 incr = gsi_stmt (incr_gsi);
4231 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4232
4233 /* Copy the points-to information if it exists. */
4234 if (DR_PTR_INFO (dr))
4235 {
4236 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4237 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4238 }
4239 if (ptr_incr)
4240 *ptr_incr = incr;
4241
4242 aptr = indx_before_incr;
4243 }
4244
4245 if (!nested_in_vect_loop || only_init)
4246 return aptr;
4247
4248
4249 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4250      nested in LOOP, if one exists.  */
4251
4252 gcc_assert (nested_in_vect_loop);
4253 if (!only_init)
4254 {
4255 standard_iv_increment_position (containing_loop, &incr_gsi,
4256 &insert_after);
4257 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4258 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4259 &indx_after_incr);
4260 incr = gsi_stmt (incr_gsi);
4261 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4262
4263 /* Copy the points-to information if it exists. */
4264 if (DR_PTR_INFO (dr))
4265 {
4266 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4267 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4268 }
4269 if (ptr_incr)
4270 *ptr_incr = incr;
4271
4272 return indx_before_incr;
4273 }
4274 else
4275 gcc_unreachable ();
4276 }
4277
4278
4279 /* Function bump_vector_ptr
4280
4281 Increment a pointer (to a vector type) by vector-size. If requested,
4282 i.e. if PTR-INCR is given, then also connect the new increment stmt
4283 to the existing def-use update-chain of the pointer, by modifying
4284 the PTR_INCR as illustrated below:
4285
4286 The pointer def-use update-chain before this function:
4287 DATAREF_PTR = phi (p_0, p_2)
4288 ....
4289 PTR_INCR: p_2 = DATAREF_PTR + step
4290
4291 The pointer def-use update-chain after this function:
4292 DATAREF_PTR = phi (p_0, p_2)
4293 ....
4294 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4295 ....
4296 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4297
4298 Input:
4299 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4300 in the loop.
4301 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4302 the loop. The increment amount across iterations is expected
4303 to be vector_size.
4304 BSI - location where the new update stmt is to be placed.
4305 STMT - the original scalar memory-access stmt that is being vectorized.
4306 BUMP - optional. The offset by which to bump the pointer. If not given,
4307 the offset is assumed to be vector_size.
4308
4309 Output: Return NEW_DATAREF_PTR as illustrated above.
4310
4311 */
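
/* For example (illustrative): for a V4SI vector type and no explicit BUMP,
   TYPE_SIZE_UNIT of the vector type is 16, so the stmt generated below is
   NEW_DATAREF_PTR = DATAREF_PTR + 16.  */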
4312
4313 tree
4314 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4315 gimple stmt, tree bump)
4316 {
4317 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4318 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4319 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4320 tree update = TYPE_SIZE_UNIT (vectype);
4321 gassign *incr_stmt;
4322 ssa_op_iter iter;
4323 use_operand_p use_p;
4324 tree new_dataref_ptr;
4325
4326 if (bump)
4327 update = bump;
4328
4329 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4330 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4331 else
4332 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4333 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4334 dataref_ptr, update);
4335 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4336
4337 /* Copy the points-to information if it exists. */
4338 if (DR_PTR_INFO (dr))
4339 {
4340 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4341 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4342 }
4343
4344 if (!ptr_incr)
4345 return new_dataref_ptr;
4346
4347 /* Update the vector-pointer's cross-iteration increment. */
4348 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4349 {
4350 tree use = USE_FROM_PTR (use_p);
4351
4352 if (use == dataref_ptr)
4353 SET_USE (use_p, new_dataref_ptr);
4354 else
4355 gcc_assert (tree_int_cst_compare (use, update) == 0);
4356 }
4357
4358 return new_dataref_ptr;
4359 }
4360
4361
4362 /* Function vect_create_destination_var.
4363
4364 Create a new temporary of type VECTYPE. */
4365
4366 tree
4367 vect_create_destination_var (tree scalar_dest, tree vectype)
4368 {
4369 tree vec_dest;
4370 const char *name;
4371 char *new_name;
4372 tree type;
4373 enum vect_var_kind kind;
4374
4375 kind = vectype ? vect_simple_var : vect_scalar_var;
4376 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4377
4378 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4379
4380 name = get_name (scalar_dest);
4381 if (name)
4382 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4383 else
4384 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4385 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4386 free (new_name);
4387
4388 return vec_dest;
4389 }
4390
4391 /* Function vect_grouped_store_supported.
4392
4393 Returns TRUE if interleave high and interleave low permutations
4394 are supported, and FALSE otherwise. */
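
/* For instance, for V4SI (nelt == 4) and a power-of-two group size, the two
   selectors checked below are {0, 4, 1, 5} (interleave high) and
   {2, 6, 3, 7} (interleave low).  */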
4395
4396 bool
4397 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4398 {
4399 machine_mode mode = TYPE_MODE (vectype);
4400
4401 /* vect_permute_store_chain requires the group size to be equal to 3 or
4402 be a power of two. */
4403 if (count != 3 && exact_log2 (count) == -1)
4404 {
4405 if (dump_enabled_p ())
4406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4407 "the size of the group of accesses"
4408 			 " is not a power of 2 or not equal to 3\n");
4409 return false;
4410 }
4411
4412 /* Check that the permutation is supported. */
4413 if (VECTOR_MODE_P (mode))
4414 {
4415 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4416 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4417
4418 if (count == 3)
4419 {
4420 unsigned int j0 = 0, j1 = 0, j2 = 0;
4421 unsigned int i, j;
4422
4423 for (j = 0; j < 3; j++)
4424 {
4425 int nelt0 = ((3 - j) * nelt) % 3;
4426 int nelt1 = ((3 - j) * nelt + 1) % 3;
4427 int nelt2 = ((3 - j) * nelt + 2) % 3;
4428 for (i = 0; i < nelt; i++)
4429 {
4430 if (3 * i + nelt0 < nelt)
4431 sel[3 * i + nelt0] = j0++;
4432 if (3 * i + nelt1 < nelt)
4433 sel[3 * i + nelt1] = nelt + j1++;
4434 if (3 * i + nelt2 < nelt)
4435 sel[3 * i + nelt2] = 0;
4436 }
4437 if (!can_vec_perm_p (mode, false, sel))
4438 {
4439 if (dump_enabled_p ())
4440 dump_printf (MSG_MISSED_OPTIMIZATION,
4441 			     "permutation op not supported by target.\n");
4442 return false;
4443 }
4444
4445 for (i = 0; i < nelt; i++)
4446 {
4447 if (3 * i + nelt0 < nelt)
4448 sel[3 * i + nelt0] = 3 * i + nelt0;
4449 if (3 * i + nelt1 < nelt)
4450 sel[3 * i + nelt1] = 3 * i + nelt1;
4451 if (3 * i + nelt2 < nelt)
4452 sel[3 * i + nelt2] = nelt + j2++;
4453 }
4454 if (!can_vec_perm_p (mode, false, sel))
4455 {
4456 if (dump_enabled_p ())
4457 dump_printf (MSG_MISSED_OPTIMIZATION,
4458 			     "permutation op not supported by target.\n");
4459 return false;
4460 }
4461 }
4462 return true;
4463 }
4464 else
4465 {
4466 /* If length is not equal to 3 then only power of 2 is supported. */
4467 gcc_assert (exact_log2 (count) != -1);
4468
4469 for (i = 0; i < nelt / 2; i++)
4470 {
4471 sel[i * 2] = i;
4472 sel[i * 2 + 1] = i + nelt;
4473 }
4474 if (can_vec_perm_p (mode, false, sel))
4475 {
4476 for (i = 0; i < nelt; i++)
4477 sel[i] += nelt / 2;
4478 if (can_vec_perm_p (mode, false, sel))
4479 return true;
4480 }
4481 }
4482 }
4483
4484 if (dump_enabled_p ())
4485 dump_printf (MSG_MISSED_OPTIMIZATION,
4486 		 "permutation op not supported by target.\n");
4487 return false;
4488 }
4489
4490
4491 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4492 type VECTYPE. */
4493
4494 bool
4495 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4496 {
4497 return vect_lanes_optab_supported_p ("vec_store_lanes",
4498 vec_store_lanes_optab,
4499 vectype, count);
4500 }
4501
4502
4503 /* Function vect_permute_store_chain.
4504
4505 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4506 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4507 the data correctly for the stores. Return the final references for stores
4508 in RESULT_CHAIN.
4509
4510 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4511 The input is 4 vectors each containing 8 elements. We assign a number to
4512 each element, the input sequence is:
4513
4514 1st vec: 0 1 2 3 4 5 6 7
4515 2nd vec: 8 9 10 11 12 13 14 15
4516 3rd vec: 16 17 18 19 20 21 22 23
4517 4th vec: 24 25 26 27 28 29 30 31
4518
4519 The output sequence should be:
4520
4521 1st vec: 0 8 16 24 1 9 17 25
4522 2nd vec: 2 10 18 26 3 11 19 27
4523    3rd vec:  4 12 20 28  5 13 21 29
4524 4th vec: 6 14 22 30 7 15 23 31
4525
4526 i.e., we interleave the contents of the four vectors in their order.
4527
4528 We use interleave_high/low instructions to create such output. The input of
4529 each interleave_high/low operation is two vectors:
4530 1st vec 2nd vec
4531 0 1 2 3 4 5 6 7
4532 the even elements of the result vector are obtained left-to-right from the
4533 high/low elements of the first vector. The odd elements of the result are
4534 obtained left-to-right from the high/low elements of the second vector.
4535 The output of interleave_high will be: 0 4 1 5
4536 and of interleave_low: 2 6 3 7
4537
4538
4539 The permutation is done in log LENGTH stages. In each stage interleave_high
4540 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4541 where the first argument is taken from the first half of DR_CHAIN and the
4542    second argument from its second half.
4543 In our example,
4544
4545 I1: interleave_high (1st vec, 3rd vec)
4546 I2: interleave_low (1st vec, 3rd vec)
4547 I3: interleave_high (2nd vec, 4th vec)
4548 I4: interleave_low (2nd vec, 4th vec)
4549
4550 The output for the first stage is:
4551
4552 I1: 0 16 1 17 2 18 3 19
4553 I2: 4 20 5 21 6 22 7 23
4554 I3: 8 24 9 25 10 26 11 27
4555 I4: 12 28 13 29 14 30 15 31
4556
4557 The output of the second stage, i.e. the final result is:
4558
4559 I1: 0 8 16 24 1 9 17 25
4560 I2: 2 10 18 26 3 11 19 27
4561    I3:  4 12 20 28  5 13 21 29
4562 I4: 6 14 22 30 7 15 23 31. */
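
/* For the power-of-two case with nelt == 8, the two masks computed below are

     perm_mask_high = {0, 8, 1, 9, 2, 10, 3, 11}
     perm_mask_low  = {4, 12, 5, 13, 6, 14, 7, 15}

   and stage 1 applies them to (1st vec, 3rd vec) and (2nd vec, 4th vec),
   which yields exactly I1..I4 above.  */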
4563
4564 void
4565 vect_permute_store_chain (vec<tree> dr_chain,
4566 unsigned int length,
4567 gimple stmt,
4568 gimple_stmt_iterator *gsi,
4569 vec<tree> *result_chain)
4570 {
4571 tree vect1, vect2, high, low;
4572 gimple perm_stmt;
4573 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4574 tree perm_mask_low, perm_mask_high;
4575 tree data_ref;
4576 tree perm3_mask_low, perm3_mask_high;
4577 unsigned int i, n, log_length = exact_log2 (length);
4578 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4579 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4580
4581 result_chain->quick_grow (length);
4582 memcpy (result_chain->address (), dr_chain.address (),
4583 length * sizeof (tree));
4584
4585 if (length == 3)
4586 {
4587 unsigned int j0 = 0, j1 = 0, j2 = 0;
4588
4589 for (j = 0; j < 3; j++)
4590 {
4591 int nelt0 = ((3 - j) * nelt) % 3;
4592 int nelt1 = ((3 - j) * nelt + 1) % 3;
4593 int nelt2 = ((3 - j) * nelt + 2) % 3;
4594
4595 for (i = 0; i < nelt; i++)
4596 {
4597 if (3 * i + nelt0 < nelt)
4598 sel[3 * i + nelt0] = j0++;
4599 if (3 * i + nelt1 < nelt)
4600 sel[3 * i + nelt1] = nelt + j1++;
4601 if (3 * i + nelt2 < nelt)
4602 sel[3 * i + nelt2] = 0;
4603 }
4604 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4605
4606 for (i = 0; i < nelt; i++)
4607 {
4608 if (3 * i + nelt0 < nelt)
4609 sel[3 * i + nelt0] = 3 * i + nelt0;
4610 if (3 * i + nelt1 < nelt)
4611 sel[3 * i + nelt1] = 3 * i + nelt1;
4612 if (3 * i + nelt2 < nelt)
4613 sel[3 * i + nelt2] = nelt + j2++;
4614 }
4615 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4616
4617 vect1 = dr_chain[0];
4618 vect2 = dr_chain[1];
4619
4620 /* Create interleaving stmt:
4621 low = VEC_PERM_EXPR <vect1, vect2,
4622 {j, nelt, *, j + 1, nelt + j + 1, *,
4623 j + 2, nelt + j + 2, *, ...}> */
4624 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4625 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4626 vect2, perm3_mask_low);
4627 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4628
4629 vect1 = data_ref;
4630 vect2 = dr_chain[2];
4631 /* Create interleaving stmt:
4632 low = VEC_PERM_EXPR <vect1, vect2,
4633 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4634 6, 7, nelt + j + 2, ...}> */
4635 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4636 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4637 vect2, perm3_mask_high);
4638 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4639 (*result_chain)[j] = data_ref;
4640 }
4641 }
4642 else
4643 {
4644 /* If length is not equal to 3 then only power of 2 is supported. */
4645 gcc_assert (exact_log2 (length) != -1);
4646
4647 for (i = 0, n = nelt / 2; i < n; i++)
4648 {
4649 sel[i * 2] = i;
4650 sel[i * 2 + 1] = i + nelt;
4651 }
4652 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4653
4654 for (i = 0; i < nelt; i++)
4655 sel[i] += nelt / 2;
4656 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4657
4658 for (i = 0, n = log_length; i < n; i++)
4659 {
4660 for (j = 0; j < length/2; j++)
4661 {
4662 vect1 = dr_chain[j];
4663 vect2 = dr_chain[j+length/2];
4664
4665 /* Create interleaving stmt:
4666 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4667 ...}> */
4668 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4669 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4670 vect2, perm_mask_high);
4671 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4672 (*result_chain)[2*j] = high;
4673
4674 /* Create interleaving stmt:
4675 low = VEC_PERM_EXPR <vect1, vect2,
4676 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4677 ...}> */
4678 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4679 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4680 vect2, perm_mask_low);
4681 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4682 (*result_chain)[2*j+1] = low;
4683 }
4684 memcpy (dr_chain.address (), result_chain->address (),
4685 length * sizeof (tree));
4686 }
4687 }
4688 }
4689
4690 /* Function vect_setup_realignment
4691
4692 This function is called when vectorizing an unaligned load using
4693 the dr_explicit_realign[_optimized] scheme.
4694 This function generates the following code at the loop prolog:
4695
4696 p = initial_addr;
4697 x msq_init = *(floor(p)); # prolog load
4698 realignment_token = call target_builtin;
4699 loop:
4700 x msq = phi (msq_init, ---)
4701
4702 The stmts marked with x are generated only for the case of
4703 dr_explicit_realign_optimized.
4704
4705 The code above sets up a new (vector) pointer, pointing to the first
4706 location accessed by STMT, and a "floor-aligned" load using that pointer.
4707 It also generates code to compute the "realignment-token" (if the relevant
4708 target hook was defined), and creates a phi-node at the loop-header bb
4709 whose arguments are the result of the prolog-load (created by this
4710 function) and the result of a load that takes place in the loop (to be
4711 created by the caller to this function).
4712
4713 For the case of dr_explicit_realign_optimized:
4714 The caller to this function uses the phi-result (msq) to create the
4715 realignment code inside the loop, and sets up the missing phi argument,
4716 as follows:
4717 loop:
4718 msq = phi (msq_init, lsq)
4719 lsq = *(floor(p')); # load in loop
4720 result = realign_load (msq, lsq, realignment_token);
4721
4722 For the case of dr_explicit_realign:
4723 loop:
4724 msq = *(floor(p)); # load in loop
4725 p' = p + (VS-1);
4726 lsq = *(floor(p')); # load in loop
4727 result = realign_load (msq, lsq, realignment_token);
4728
4729 Input:
4730 STMT - (scalar) load stmt to be vectorized. This load accesses
4731 a memory location that may be unaligned.
4732 BSI - place where new code is to be inserted.
4733 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4734 is used.
4735
4736 Output:
4737 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4738 target hook, if defined.
4739 Return value - the result of the loop-header phi node. */
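
/* As a concrete illustration (assuming 16-byte V4SI vectors and an access
   misaligned by 8 bytes): msq and lsq are the two aligned loads from
   floor(p) and floor(p + 15), and realign_load, guided by the realignment
   token, forms the desired vector from the last two elements of msq
   followed by the first two elements of lsq.  */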
4740
4741 tree
4742 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4743 tree *realignment_token,
4744 enum dr_alignment_support alignment_support_scheme,
4745 tree init_addr,
4746 struct loop **at_loop)
4747 {
4748 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4749 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4750 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4751 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4752 struct loop *loop = NULL;
4753 edge pe = NULL;
4754 tree scalar_dest = gimple_assign_lhs (stmt);
4755 tree vec_dest;
4756 gimple inc;
4757 tree ptr;
4758 tree data_ref;
4759 basic_block new_bb;
4760 tree msq_init = NULL_TREE;
4761 tree new_temp;
4762 gphi *phi_stmt;
4763 tree msq = NULL_TREE;
4764 gimple_seq stmts = NULL;
4765 bool inv_p;
4766 bool compute_in_loop = false;
4767 bool nested_in_vect_loop = false;
4768 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4769 struct loop *loop_for_initial_load = NULL;
4770
4771 if (loop_vinfo)
4772 {
4773 loop = LOOP_VINFO_LOOP (loop_vinfo);
4774 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4775 }
4776
4777 gcc_assert (alignment_support_scheme == dr_explicit_realign
4778 || alignment_support_scheme == dr_explicit_realign_optimized);
4779
4780 /* We need to generate three things:
4781 1. the misalignment computation
4782 2. the extra vector load (for the optimized realignment scheme).
4783 3. the phi node for the two vectors from which the realignment is
4784 done (for the optimized realignment scheme). */
4785
4786 /* 1. Determine where to generate the misalignment computation.
4787
4788 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4789 calculation will be generated by this function, outside the loop (in the
4790 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4791 caller, inside the loop.
4792
4793 Background: If the misalignment remains fixed throughout the iterations of
4794 the loop, then both realignment schemes are applicable, and also the
4795 misalignment computation can be done outside LOOP. This is because we are
4796 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4797 are a multiple of VS (the Vector Size), and therefore the misalignment in
4798 different vectorized LOOP iterations is always the same.
4799 The problem arises only if the memory access is in an inner-loop nested
4800 inside LOOP, which is now being vectorized using outer-loop vectorization.
4801 This is the only case when the misalignment of the memory access may not
4802 remain fixed throughout the iterations of the inner-loop (as explained in
4803 detail in vect_supportable_dr_alignment). In this case, not only is the
4804 optimized realignment scheme not applicable, but also the misalignment
4805 computation (and generation of the realignment token that is passed to
4806 REALIGN_LOAD) have to be done inside the loop.
4807
4808 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4809 or not, which in turn determines if the misalignment is computed inside
4810 the inner-loop, or outside LOOP. */
4811
4812 if (init_addr != NULL_TREE || !loop_vinfo)
4813 {
4814 compute_in_loop = true;
4815 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4816 }
4817
4818
4819 /* 2. Determine where to generate the extra vector load.
4820
4821 For the optimized realignment scheme, instead of generating two vector
4822 loads in each iteration, we generate a single extra vector load in the
4823 preheader of the loop, and in each iteration reuse the result of the
4824 vector load from the previous iteration. In case the memory access is in
4825 an inner-loop nested inside LOOP, which is now being vectorized using
4826 outer-loop vectorization, we need to determine whether this initial vector
4827 load should be generated at the preheader of the inner-loop, or can be
4828 generated at the preheader of LOOP. If the memory access has no evolution
4829 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4830 to be generated inside LOOP (in the preheader of the inner-loop). */
4831
4832 if (nested_in_vect_loop)
4833 {
4834 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4835 bool invariant_in_outerloop =
4836 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4837 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4838 }
4839 else
4840 loop_for_initial_load = loop;
4841 if (at_loop)
4842 *at_loop = loop_for_initial_load;
4843
4844 if (loop_for_initial_load)
4845 pe = loop_preheader_edge (loop_for_initial_load);
4846
4847 /* 3. For the case of the optimized realignment, create the first vector
4848 load at the loop preheader. */
4849
4850 if (alignment_support_scheme == dr_explicit_realign_optimized)
4851 {
4852 /* Create msq_init = *(floor(p1)) in the loop preheader */
4853 gassign *new_stmt;
4854
4855 gcc_assert (!compute_in_loop);
4856 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4857 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4858 NULL_TREE, &init_addr, NULL, &inc,
4859 true, &inv_p);
4860 new_temp = copy_ssa_name (ptr);
4861 new_stmt = gimple_build_assign
4862 (new_temp, BIT_AND_EXPR, ptr,
4863 build_int_cst (TREE_TYPE (ptr),
4864 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4865 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4866 gcc_assert (!new_bb);
4867 data_ref
4868 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4869 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4870 new_stmt = gimple_build_assign (vec_dest, data_ref);
4871 new_temp = make_ssa_name (vec_dest, new_stmt);
4872 gimple_assign_set_lhs (new_stmt, new_temp);
4873 if (pe)
4874 {
4875 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4876 gcc_assert (!new_bb);
4877 }
4878 else
4879 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4880
4881 msq_init = gimple_assign_lhs (new_stmt);
4882 }
4883
4884 /* 4. Create realignment token using a target builtin, if available.
4885 It is done either inside the containing loop, or before LOOP (as
4886 determined above). */
4887
4888 if (targetm.vectorize.builtin_mask_for_load)
4889 {
4890 gcall *new_stmt;
4891 tree builtin_decl;
4892
4893       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
4894 if (!init_addr)
4895 {
4896 /* Generate the INIT_ADDR computation outside LOOP. */
4897 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4898 NULL_TREE, loop);
4899 if (loop)
4900 {
4901 pe = loop_preheader_edge (loop);
4902 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4903 gcc_assert (!new_bb);
4904 }
4905 else
4906 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4907 }
4908
4909 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4910 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4911 vec_dest =
4912 vect_create_destination_var (scalar_dest,
4913 gimple_call_return_type (new_stmt));
4914 new_temp = make_ssa_name (vec_dest, new_stmt);
4915 gimple_call_set_lhs (new_stmt, new_temp);
4916
4917 if (compute_in_loop)
4918 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4919 else
4920 {
4921 /* Generate the misalignment computation outside LOOP. */
4922 pe = loop_preheader_edge (loop);
4923 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4924 gcc_assert (!new_bb);
4925 }
4926
4927 *realignment_token = gimple_call_lhs (new_stmt);
4928
4929 /* The result of the CALL_EXPR to this builtin is determined from
4930 the value of the parameter and no global variables are touched
4931 which makes the builtin a "const" function. Requiring the
4932 builtin to have the "const" attribute makes it unnecessary
4933 to call mark_call_clobbered. */
4934 gcc_assert (TREE_READONLY (builtin_decl));
4935 }
4936
4937 if (alignment_support_scheme == dr_explicit_realign)
4938 return msq;
4939
4940 gcc_assert (!compute_in_loop);
4941 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4942
4943
4944 /* 5. Create msq = phi <msq_init, lsq> in loop */
4945
4946 pe = loop_preheader_edge (containing_loop);
4947 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4948 msq = make_ssa_name (vec_dest);
4949 phi_stmt = create_phi_node (msq, containing_loop->header);
4950 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4951
4952 return msq;
4953 }
4954
4955
4956 /* Function vect_grouped_load_supported.
4957
4958 Returns TRUE if even and odd permutations are supported,
4959 and FALSE otherwise. */
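
/* For instance, for V4SI (nelt == 4) and a power-of-two group size, the two
   selectors checked below are {0, 2, 4, 6} (extract even) and
   {1, 3, 5, 7} (extract odd).  */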
4960
4961 bool
4962 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4963 {
4964 machine_mode mode = TYPE_MODE (vectype);
4965
4966 /* vect_permute_load_chain requires the group size to be equal to 3 or
4967 be a power of two. */
4968 if (count != 3 && exact_log2 (count) == -1)
4969 {
4970 if (dump_enabled_p ())
4971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4972 "the size of the group of accesses"
4973 " is not a power of 2 or not equal to 3\n");
4974 return false;
4975 }
4976
4977 /* Check that the permutation is supported. */
4978 if (VECTOR_MODE_P (mode))
4979 {
4980 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
4981 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4982
4983 if (count == 3)
4984 {
4985 unsigned int k;
4986 for (k = 0; k < 3; k++)
4987 {
4988 for (i = 0; i < nelt; i++)
4989 if (3 * i + k < 2 * nelt)
4990 sel[i] = 3 * i + k;
4991 else
4992 sel[i] = 0;
4993 if (!can_vec_perm_p (mode, false, sel))
4994 {
4995 if (dump_enabled_p ())
4996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4997 "shuffle of 3 loads is not supported by"
4998 " target\n");
4999 return false;
5000 }
5001 for (i = 0, j = 0; i < nelt; i++)
5002 if (3 * i + k < 2 * nelt)
5003 sel[i] = i;
5004 else
5005 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5006 if (!can_vec_perm_p (mode, false, sel))
5007 {
5008 if (dump_enabled_p ())
5009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5010 "shuffle of 3 loads is not supported by"
5011 " target\n");
5012 return false;
5013 }
5014 }
5015 return true;
5016 }
5017 else
5018 {
5019 /* If length is not equal to 3 then only power of 2 is supported. */
5020 gcc_assert (exact_log2 (count) != -1);
5021 for (i = 0; i < nelt; i++)
5022 sel[i] = i * 2;
5023 if (can_vec_perm_p (mode, false, sel))
5024 {
5025 for (i = 0; i < nelt; i++)
5026 sel[i] = i * 2 + 1;
5027 if (can_vec_perm_p (mode, false, sel))
5028 return true;
5029 }
5030 }
5031 }
5032
5033 if (dump_enabled_p ())
5034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5035 "extract even/odd not supported by target\n");
5036 return false;
5037 }
5038
5039 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5040 type VECTYPE. */
5041
5042 bool
5043 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5044 {
5045 return vect_lanes_optab_supported_p ("vec_load_lanes",
5046 vec_load_lanes_optab,
5047 vectype, count);
5048 }
5049
5050 /* Function vect_permute_load_chain.
5051
5052 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5053 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5054 the input data correctly. Return the final references for loads in
5055 RESULT_CHAIN.
5056
5057 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5058 The input is 4 vectors each containing 8 elements. We assign a number to each
5059 element, the input sequence is:
5060
5061 1st vec: 0 1 2 3 4 5 6 7
5062 2nd vec: 8 9 10 11 12 13 14 15
5063 3rd vec: 16 17 18 19 20 21 22 23
5064 4th vec: 24 25 26 27 28 29 30 31
5065
5066 The output sequence should be:
5067
5068 1st vec: 0 4 8 12 16 20 24 28
5069 2nd vec: 1 5 9 13 17 21 25 29
5070 3rd vec: 2 6 10 14 18 22 26 30
5071 4th vec: 3 7 11 15 19 23 27 31
5072
5073 i.e., the first output vector should contain the first elements of each
5074 interleaving group, etc.
5075
5076 We use extract_even/odd instructions to create such output. The input of
5077 each extract_even/odd operation is two vectors
5078 1st vec 2nd vec
5079 0 1 2 3 4 5 6 7
5080
5081 and the output is the vector of extracted even/odd elements. The output of
5082 extract_even will be: 0 2 4 6
5083 and of extract_odd: 1 3 5 7
5084
5085
5086 The permutation is done in log LENGTH stages. In each stage extract_even
5087 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5088 their order. In our example,
5089
5090 E1: extract_even (1st vec, 2nd vec)
5091 E2: extract_odd (1st vec, 2nd vec)
5092 E3: extract_even (3rd vec, 4th vec)
5093 E4: extract_odd (3rd vec, 4th vec)
5094
5095 The output for the first stage will be:
5096
5097 E1: 0 2 4 6 8 10 12 14
5098 E2: 1 3 5 7 9 11 13 15
5099 E3: 16 18 20 22 24 26 28 30
5100 E4: 17 19 21 23 25 27 29 31
5101
5102 In order to proceed and create the correct sequence for the next stage (or
5103 for the correct output, if the second stage is the last one, as in our
5104 example), we first put the output of extract_even operation and then the
5105 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5106 The input for the second stage is:
5107
5108 1st vec (E1): 0 2 4 6 8 10 12 14
5109 2nd vec (E3): 16 18 20 22 24 26 28 30
5110 3rd vec (E2): 1 3 5 7 9 11 13 15
5111 4th vec (E4): 17 19 21 23 25 27 29 31
5112
5113 The output of the second stage:
5114
5115 E1: 0 4 8 12 16 20 24 28
5116 E2: 2 6 10 14 18 22 26 30
5117 E3: 1 5 9 13 17 21 25 29
5118 E4: 3 7 11 15 19 23 27 31
5119
5120 And RESULT_CHAIN after reordering:
5121
5122 1st vec (E1): 0 4 8 12 16 20 24 28
5123 2nd vec (E3): 1 5 9 13 17 21 25 29
5124 3rd vec (E2): 2 6 10 14 18 22 26 30
5125 4th vec (E4): 3 7 11 15 19 23 27 31. */
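
/* A minimal standalone sketch (for illustration only; it is not built as
   part of GCC) that simulates the power-of-two case above for LENGTH == 4
   and nelt == 8 and reproduces the documented RESULT_CHAIN:

     #include <stdio.h>
     #include <string.h>
     #define NELT 8
     #define LEN 4

     int
     main (void)
     {
       int chain[LEN][NELT], res[LEN][NELT];
       // Number the elements 0..31 exactly as in the example above.
       for (int v = 0; v < LEN; v++)
         for (int i = 0; i < NELT; i++)
           chain[v][i] = v * NELT + i;
       for (int stage = 0; stage < 2; stage++)      // log2 (LEN) stages
         {
           for (int j = 0; j < LEN; j += 2)
             for (int i = 0; i < NELT; i++)
               {
                 // Even/odd extraction from the concatenation of the pair,
                 // mirroring perm_mask_even and perm_mask_odd.
                 int e = 2 * i, o = 2 * i + 1;
                 res[j / 2][i]
                   = e < NELT ? chain[j][e] : chain[j + 1][e - NELT];
                 res[j / 2 + LEN / 2][i]
                   = o < NELT ? chain[j][o] : chain[j + 1][o - NELT];
               }
           memcpy (chain, res, sizeof (chain));
         }
       // Prints 0 4 ... 28, then 1 5 ... 29, then 2 6 ... 30, then 3 7 ... 31.
       for (int v = 0; v < LEN; v++)
         {
           for (int i = 0; i < NELT; i++)
             printf ("%d ", chain[v][i]);
           printf ("\n");
         }
       return 0;
     }
*/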
5126
5127 static void
5128 vect_permute_load_chain (vec<tree> dr_chain,
5129 unsigned int length,
5130 gimple stmt,
5131 gimple_stmt_iterator *gsi,
5132 vec<tree> *result_chain)
5133 {
5134 tree data_ref, first_vect, second_vect;
5135 tree perm_mask_even, perm_mask_odd;
5136 tree perm3_mask_low, perm3_mask_high;
5137 gimple perm_stmt;
5138 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5139 unsigned int i, j, log_length = exact_log2 (length);
5140 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5141 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5142
5143 result_chain->quick_grow (length);
5144 memcpy (result_chain->address (), dr_chain.address (),
5145 length * sizeof (tree));
5146
5147 if (length == 3)
5148 {
5149 unsigned int k;
5150
5151 for (k = 0; k < 3; k++)
5152 {
5153 for (i = 0; i < nelt; i++)
5154 if (3 * i + k < 2 * nelt)
5155 sel[i] = 3 * i + k;
5156 else
5157 sel[i] = 0;
5158 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5159
5160 for (i = 0, j = 0; i < nelt; i++)
5161 if (3 * i + k < 2 * nelt)
5162 sel[i] = i;
5163 else
5164 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5165
5166 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5167
5168 first_vect = dr_chain[0];
5169 second_vect = dr_chain[1];
5170
5171 /* Create interleaving stmt (low part of):
5172 	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5173 ...}> */
5174 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5175 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5176 second_vect, perm3_mask_low);
5177 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5178
5179 /* Create interleaving stmt (high part of):
5180 	     high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5181 ...}> */
5182 first_vect = data_ref;
5183 second_vect = dr_chain[2];
5184 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5185 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5186 second_vect, perm3_mask_high);
5187 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5188 (*result_chain)[k] = data_ref;
5189 }
5190 }
5191 else
5192 {
5193 /* If length is not equal to 3 then only power of 2 is supported. */
5194 gcc_assert (exact_log2 (length) != -1);
5195
5196 for (i = 0; i < nelt; ++i)
5197 sel[i] = i * 2;
5198 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5199
5200 for (i = 0; i < nelt; ++i)
5201 sel[i] = i * 2 + 1;
5202 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5203
5204 for (i = 0; i < log_length; i++)
5205 {
5206 for (j = 0; j < length; j += 2)
5207 {
5208 first_vect = dr_chain[j];
5209 second_vect = dr_chain[j+1];
5210
5211 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5212 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5213 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5214 first_vect, second_vect,
5215 perm_mask_even);
5216 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5217 (*result_chain)[j/2] = data_ref;
5218
5219 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5220 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5221 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5222 first_vect, second_vect,
5223 perm_mask_odd);
5224 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5225 (*result_chain)[j/2+length/2] = data_ref;
5226 }
5227 memcpy (dr_chain.address (), result_chain->address (),
5228 length * sizeof (tree));
5229 }
5230 }
5231 }
5232
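/* The following is a standalone illustration, not part of the vectorizer:
   a minimal sketch that reproduces the even/odd extraction performed by
   vect_permute_load_chain above, assuming NELT == 8 and LENGTH == 4 as in
   the worked example in the function comment.  The helpers NELT, LENGTH
   and vec_perm are local to this sketch.  Guarded by "#if 0" because it is
   meant to be read (or compiled on its own), not built into GCC.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8
#define LENGTH 4

/* Emulate VEC_PERM_EXPR <a, b, sel> on integer "vectors".  */
static void
vec_perm (const int *a, const int *b, const unsigned char *sel, int *out)
{
  for (int i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  int chain[LENGTH][NELT], result[LENGTH][NELT];
  unsigned char sel_even[NELT], sel_odd[NELT];

  /* Element values are their original positions, 0 .. 31.  */
  for (int j = 0; j < LENGTH; j++)
    for (int i = 0; i < NELT; i++)
      chain[j][i] = j * NELT + i;

  for (int i = 0; i < NELT; i++)
    {
      sel_even[i] = 2 * i;      /* extract_even selector.  */
      sel_odd[i] = 2 * i + 1;   /* extract_odd selector.  */
    }

  /* log2 (LENGTH) stages, mirroring the loop in the function above.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < LENGTH; j += 2)
        {
          vec_perm (chain[j], chain[j + 1], sel_even, result[j / 2]);
          vec_perm (chain[j], chain[j + 1], sel_odd,
                    result[j / 2 + LENGTH / 2]);
        }
      memcpy (chain, result, sizeof (chain));
    }

  /* Prints 0 4 8 .. 28 / 1 5 9 .. 29 / 2 6 .. 30 / 3 7 .. 31, matching the
     RESULT_CHAIN ordering documented in the function comment above.  */
  for (int j = 0; j < LENGTH; j++)
    {
      for (int i = 0; i < NELT; i++)
        printf (" %2d", chain[j][i]);
      printf ("\n");
    }
  return 0;
}
#endif
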
5233 /* Function vect_shift_permute_load_chain.
5234
5235 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5236 a sequence of stmts to reorder the input data accordingly.
5237 Return the final references for loads in RESULT_CHAIN.
5238 Return true if the reordering succeeded, false otherwise.
5239
5240 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5241 The input is 3 vectors each containing 8 elements. We assign a
5242 number to each element, the input sequence is:
5243
5244 1st vec: 0 1 2 3 4 5 6 7
5245 2nd vec: 8 9 10 11 12 13 14 15
5246 3rd vec: 16 17 18 19 20 21 22 23
5247
5248 The output sequence should be:
5249
5250 1st vec: 0 3 6 9 12 15 18 21
5251 2nd vec: 1 4 7 10 13 16 19 22
5252 3rd vec: 2 5 8 11 14 17 20 23
5253
5254 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create this output.
5255
5256 First we shuffle all 3 vectors to get correct elements order:
5257
5258 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5259 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5260 3rd vec: (16 19 22) (17 20 23) (18 21)
5261
5262 Next we unite and shift the vectors 3 times:
5263
5264 1st step:
5265 shift right by 6 the concatenation of:
5266 "1st vec" and "2nd vec"
5267 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5268 "2nd vec" and "3rd vec"
5269 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5270 "3rd vec" and "1st vec"
5271 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5272 | New vectors |
5273
5274 So that now new vectors are:
5275
5276 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5277 2nd vec: (10 13) (16 19 22) (17 20 23)
5278 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5279
5280 2nd step:
5281 shift right by 5 the concatenation of:
5282 "1st vec" and "3rd vec"
5283 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5284 "2nd vec" and "1st vec"
5285 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5286 "3rd vec" and "2nd vec"
5287 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5288 | New vectors |
5289
5290 So that now new vectors are:
5291
5292 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5293 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5294 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5295
5296 3rd step:
5297 shift right by 5 the concatenation of:
5298 "1st vec" and "1st vec"
5299 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5300 shift right by 3 the concatenation of:
5301 "2nd vec" and "2nd vec"
5302 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5303 | New vectors |
5304
5305 So that now all vectors are READY:
5306 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5307 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5308 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5309
5310 This algorithm is faster than the one in vect_permute_load_chain if:
5311 1. "shift of a concatenation" is faster than general permutation.
5312 This is usually so.
5313 2. The TARGET machine can't execute vector instructions in parallel.
5314 This matters because each step of this algorithm depends on the previous
5315 one, whereas the algorithm in vect_permute_load_chain is much more parallel.
5316
5317 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5318 */
5319
5320 static bool
5321 vect_shift_permute_load_chain (vec<tree> dr_chain,
5322 unsigned int length,
5323 gimple stmt,
5324 gimple_stmt_iterator *gsi,
5325 vec<tree> *result_chain)
5326 {
5327 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5328 tree perm2_mask1, perm2_mask2, perm3_mask;
5329 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5330 gimple perm_stmt;
5331
5332 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5333 unsigned int i;
5334 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5335 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5336 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5337 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5338
5339 result_chain->quick_grow (length);
5340 memcpy (result_chain->address (), dr_chain.address (),
5341 length * sizeof (tree));
5342
5343 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5344 {
5345 unsigned int j, log_length = exact_log2 (length);
5346 for (i = 0; i < nelt / 2; ++i)
5347 sel[i] = i * 2;
5348 for (i = 0; i < nelt / 2; ++i)
5349 sel[nelt / 2 + i] = i * 2 + 1;
5350 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5351 {
5352 if (dump_enabled_p ())
5353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5354 "shuffle of 2 fields structure is not \
5355 supported by target\n");
5356 return false;
5357 }
5358 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5359
5360 for (i = 0; i < nelt / 2; ++i)
5361 sel[i] = i * 2 + 1;
5362 for (i = 0; i < nelt / 2; ++i)
5363 sel[nelt / 2 + i] = i * 2;
5364 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5365 {
5366 if (dump_enabled_p ())
5367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5368 "shuffle of 2 fields structure is not \
5369 supported by target\n");
5370 return false;
5371 }
5372 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5373
5374 /* Generating permutation constant to shift all elements.
5375 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5376 for (i = 0; i < nelt; i++)
5377 sel[i] = nelt / 2 + i;
5378 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5379 {
5380 if (dump_enabled_p ())
5381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5382 "shift permutation is not supported by target\n");
5383 return false;
5384 }
5385 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5386
5387 /* Generating permutation constant to select a vector from 2.
5388 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5389 for (i = 0; i < nelt / 2; i++)
5390 sel[i] = i;
5391 for (i = nelt / 2; i < nelt; i++)
5392 sel[i] = nelt + i;
5393 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5394 {
5395 if (dump_enabled_p ())
5396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5397 "select is not supported by target\n");
5398 return false;
5399 }
5400 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5401
5402 for (i = 0; i < log_length; i++)
5403 {
5404 for (j = 0; j < length; j += 2)
5405 {
5406 first_vect = dr_chain[j];
5407 second_vect = dr_chain[j + 1];
5408
5409 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5410 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5411 first_vect, first_vect,
5412 perm2_mask1);
5413 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5414 vect[0] = data_ref;
5415
5416 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5417 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5418 second_vect, second_vect,
5419 perm2_mask2);
5420 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5421 vect[1] = data_ref;
5422
5423 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5424 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5425 vect[0], vect[1], shift1_mask);
5426 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5427 (*result_chain)[j/2 + length/2] = data_ref;
5428
5429 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5430 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5431 vect[0], vect[1], select_mask);
5432 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5433 (*result_chain)[j/2] = data_ref;
5434 }
5435 memcpy (dr_chain.address (), result_chain->address (),
5436 length * sizeof (tree));
5437 }
5438 return true;
5439 }
5440 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5441 {
5442 unsigned int k = 0, l = 0;
5443
5444 /* Generating permutation constant to get all elements in the right order.
5445 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5446 for (i = 0; i < nelt; i++)
5447 {
5448 if (3 * k + (l % 3) >= nelt)
5449 {
5450 k = 0;
5451 l += (3 - (nelt % 3));
5452 }
5453 sel[i] = 3 * k + (l % 3);
5454 k++;
5455 }
5456 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5457 {
5458 if (dump_enabled_p ())
5459 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5460 "shuffle of 3 fields structure is not \
5461 supported by target\n");
5462 return false;
5463 }
5464 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5465
5466 /* Generating permutation constant to shift all elements.
5467 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5468 for (i = 0; i < nelt; i++)
5469 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5470 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5471 {
5472 if (dump_enabled_p ())
5473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5474 "shift permutation is not supported by target\n");
5475 return false;
5476 }
5477 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5478
5479 /* Generating permutation constant to shift all elements.
5480 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5481 for (i = 0; i < nelt; i++)
5482 sel[i] = 2 * (nelt / 3) + 1 + i;
5483 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5484 {
5485 if (dump_enabled_p ())
5486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5487 "shift permutation is not supported by target\n");
5488 return false;
5489 }
5490 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5491
5492 /* Generating permutation constant to shift all elements.
5493 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5494 for (i = 0; i < nelt; i++)
5495 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5496 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5497 {
5498 if (dump_enabled_p ())
5499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5500 "shift permutation is not supported by target\n");
5501 return false;
5502 }
5503 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5504
5505 /* Generating permutation constant to shift all elements.
5506 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5507 for (i = 0; i < nelt; i++)
5508 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5509 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5510 {
5511 if (dump_enabled_p ())
5512 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5513 "shift permutation is not supported by target\n");
5514 return false;
5515 }
5516 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5517
5518 for (k = 0; k < 3; k++)
5519 {
5520 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5521 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5522 dr_chain[k], dr_chain[k],
5523 perm3_mask);
5524 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5525 vect[k] = data_ref;
5526 }
5527
5528 for (k = 0; k < 3; k++)
5529 {
5530 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5531 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5532 vect[k % 3], vect[(k + 1) % 3],
5533 shift1_mask);
5534 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5535 vect_shift[k] = data_ref;
5536 }
5537
5538 for (k = 0; k < 3; k++)
5539 {
5540 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5541 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5542 vect_shift[(4 - k) % 3],
5543 vect_shift[(3 - k) % 3],
5544 shift2_mask);
5545 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5546 vect[k] = data_ref;
5547 }
5548
5549 (*result_chain)[3 - (nelt % 3)] = vect[2];
5550
5551 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5552 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5553 vect[0], shift3_mask);
5554 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5555 (*result_chain)[nelt % 3] = data_ref;
5556
5557 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5558 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5559 vect[1], shift4_mask);
5560 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5561 (*result_chain)[0] = data_ref;
5562 return true;
5563 }
5564 return false;
5565 }
5566
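/* Standalone sketch, not built into GCC: it recomputes, for NELT == 8, the
   selector masks that vect_shift_permute_load_chain above builds, so the
   constants quoted in its comments ({0 3 6 1 4 7 2 5}, {6 7 8 9 10 11 12 13},
   {0 1 2 3 12 13 14 15}, ...) can be checked by hand.  NELT and print_sel
   are local to this sketch.  */
#if 0
#include <stdio.h>

#define NELT 8

static void
print_sel (const char *name, const unsigned char *sel)
{
  printf ("%s:", name);
  for (int i = 0; i < NELT; i++)
    printf (" %d", sel[i]);
  printf ("\n");
}

int
main (void)
{
  unsigned char sel[NELT];
  int i, k = 0, l = 0;

  /* 2-field case: gather even elements, then odd ones (perm2_mask1).  */
  for (i = 0; i < NELT / 2; i++)
    sel[i] = i * 2;
  for (i = 0; i < NELT / 2; i++)
    sel[NELT / 2 + i] = i * 2 + 1;
  print_sel ("perm2_mask1", sel);               /* 0 2 4 6 1 3 5 7  */

  /* 2-field case: shift the concatenation by NELT / 2 (shift1_mask).  */
  for (i = 0; i < NELT; i++)
    sel[i] = NELT / 2 + i;
  print_sel ("shift1_mask (2 fields)", sel);    /* 4 5 6 7 8 9 10 11  */

  /* 2-field case: first half of vec1, second half of vec2 (select_mask).  */
  for (i = 0; i < NELT / 2; i++)
    sel[i] = i;
  for (i = NELT / 2; i < NELT; i++)
    sel[i] = NELT + i;
  print_sel ("select_mask", sel);               /* 0 1 2 3 12 13 14 15  */

  /* 3-field case: bring each field's elements together (perm3_mask).  */
  for (i = 0; i < NELT; i++)
    {
      if (3 * k + (l % 3) >= NELT)
        {
          k = 0;
          l += 3 - (NELT % 3);
        }
      sel[i] = 3 * k + (l % 3);
      k++;
    }
  print_sel ("perm3_mask", sel);                /* 0 3 6 1 4 7 2 5  */

  /* 3-field case: the shift masks.  */
  for (i = 0; i < NELT; i++)
    sel[i] = 2 * (NELT / 3) + (NELT % 3) + i;
  print_sel ("shift1_mask (3 fields)", sel);    /* 6 7 8 9 10 11 12 13  */
  for (i = 0; i < NELT; i++)
    sel[i] = 2 * (NELT / 3) + 1 + i;
  print_sel ("shift2_mask", sel);               /* 5 6 7 8 9 10 11 12  */
  for (i = 0; i < NELT; i++)
    sel[i] = (NELT / 3) + (NELT % 3) / 2 + i;
  print_sel ("shift3_mask", sel);               /* 3 4 5 6 7 8 9 10  */
  return 0;
}
#endif
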
5567 /* Function vect_transform_grouped_load.
5568
5569 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5570 to perform their permutation and ascribe the resulting vectorized statements to
5571 the scalar statements.
5572 */
5573
5574 void
5575 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5576 gimple_stmt_iterator *gsi)
5577 {
5578 machine_mode mode;
5579 vec<tree> result_chain = vNULL;
5580
5581 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5582 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5583 vectors that are ready for vector computation. */
5584 result_chain.create (size);
5585
5586 /* If the reassociation width for the vector type is 2 or greater, the target
5587 machine can execute 2 or more vector instructions in parallel.  Otherwise try
5588 to get the chain for the load group using vect_shift_permute_load_chain. */
5589 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5590 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5591 || exact_log2 (size) != -1
5592 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5593 gsi, &result_chain))
5594 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5595 vect_record_grouped_load_vectors (stmt, result_chain);
5596 result_chain.release ();
5597 }
5598
5599 /* RESULT_CHAIN contains the output of the grouped loads that were
5600 generated as part of the vectorization of STMT. Assign the statement
5601 for each vector to the associated scalar statement. */
5602
5603 void
5604 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5605 {
5606 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5607 gimple next_stmt, new_stmt;
5608 unsigned int i, gap_count;
5609 tree tmp_data_ref;
5610
5611 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5612 Since we scan the chain starting from its first node, their order
5613 corresponds to the order of data-refs in RESULT_CHAIN. */
5614 next_stmt = first_stmt;
5615 gap_count = 1;
5616 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5617 {
5618 if (!next_stmt)
5619 break;
5620
5621 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5622 code elimination pass later. No need to check for the first stmt in
5623 the group, since it always exists.
5624 GROUP_GAP is the number of steps in elements from the previous
5625 access (if there is no gap GROUP_GAP is 1). We skip loads that
5626 correspond to the gaps. */
5627 if (next_stmt != first_stmt
5628 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5629 {
5630 gap_count++;
5631 continue;
5632 }
5633
5634 while (next_stmt)
5635 {
5636 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5637 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5638 copies, and we put the new vector statement in the first available
5639 RELATED_STMT. */
5640 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5641 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5642 else
5643 {
5644 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5645 {
5646 gimple prev_stmt =
5647 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5648 gimple rel_stmt =
5649 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5650 while (rel_stmt)
5651 {
5652 prev_stmt = rel_stmt;
5653 rel_stmt =
5654 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5655 }
5656
5657 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5658 new_stmt;
5659 }
5660 }
5661
5662 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5663 gap_count = 1;
5664 /* If NEXT_STMT accesses the same DR as the previous statement,
5665 put the same TMP_DATA_REF as its vectorized statement; otherwise
5666 get the next data-ref from RESULT_CHAIN. */
5667 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5668 break;
5669 }
5670 }
5671 }
5672
5673 /* Function vect_can_force_dr_alignment_p.
5674
5675 Returns whether the alignment of a DECL can be forced to be aligned
5676 on an ALIGNMENT-bit boundary. */
5677
5678 bool
5679 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5680 {
5681 if (TREE_CODE (decl) != VAR_DECL)
5682 return false;
5683
5684 if (decl_in_symtab_p (decl)
5685 && !symtab_node::get (decl)->can_increase_alignment_p ())
5686 return false;
5687
5688 if (TREE_STATIC (decl))
5689 return (alignment <= MAX_OFILE_ALIGNMENT);
5690 else
5691 return (alignment <= MAX_STACK_ALIGNMENT);
5692 }
5693
5694
5695 /* Return whether the data reference DR is supported with respect to its
5696 alignment.
5697 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5698 if it is aligned, i.e., check if it is possible to vectorize it with different
5699 alignment. */
5700
5701 enum dr_alignment_support
5702 vect_supportable_dr_alignment (struct data_reference *dr,
5703 bool check_aligned_accesses)
5704 {
5705 gimple stmt = DR_STMT (dr);
5706 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5707 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5708 machine_mode mode = TYPE_MODE (vectype);
5709 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5710 struct loop *vect_loop = NULL;
5711 bool nested_in_vect_loop = false;
5712
5713 if (aligned_access_p (dr) && !check_aligned_accesses)
5714 return dr_aligned;
5715
5716 /* For now assume all conditional loads/stores support unaligned
5717 access without any special code. */
5718 if (is_gimple_call (stmt)
5719 && gimple_call_internal_p (stmt)
5720 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5721 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5722 return dr_unaligned_supported;
5723
5724 if (loop_vinfo)
5725 {
5726 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5727 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5728 }
5729
5730 /* Possibly unaligned access. */
5731
5732 /* We can choose between using the implicit realignment scheme (generating
5733 a misaligned_move stmt) and the explicit realignment scheme (generating
5734 aligned loads with a REALIGN_LOAD). There are two variants to the
5735 explicit realignment scheme: optimized, and unoptimized.
5736 We can optimize the realignment only if the step between consecutive
5737 vector loads is equal to the vector size. Since the vector memory
5738 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5739 is guaranteed that the misalignment amount remains the same throughout the
5740 execution of the vectorized loop. Therefore, we can create the
5741 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5742 at the loop preheader.
5743
5744 However, in the case of outer-loop vectorization, when vectorizing a
5745 memory access in the inner-loop nested within the LOOP that is now being
5746 vectorized, while it is guaranteed that the misalignment of the
5747 vectorized memory access will remain the same in different outer-loop
5748 iterations, it is *not* guaranteed that is will remain the same throughout
5749 the execution of the inner-loop. This is because the inner-loop advances
5750 with the original scalar step (and not in steps of VS). If the inner-loop
5751 step happens to be a multiple of VS, then the misalignment remains fixed
5752 and we can use the optimized realignment scheme. For example:
5753
5754 for (i=0; i<N; i++)
5755 for (j=0; j<M; j++)
5756 s += a[i+j];
5757
5758 When vectorizing the i-loop in the above example, the step between
5759 consecutive vector loads is 1, and so the misalignment does not remain
5760 fixed across the execution of the inner-loop, and the realignment cannot
5761 be optimized (as illustrated in the following pseudo vectorized loop):
5762
5763 for (i=0; i<N; i+=4)
5764 for (j=0; j<M; j++){
5765 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5766 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5767 // (assuming that we start from an aligned address).
5768 }
5769
5770 We therefore have to use the unoptimized realignment scheme:
5771
5772 for (i=0; i<N; i+=4)
5773 for (j=k; j<M; j+=4)
5774 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5775 // that the misalignment of the initial address is
5776 // 0).
5777
5778 The loop can then be vectorized as follows:
5779
5780 for (k=0; k<4; k++){
5781 rt = get_realignment_token (&vp[k]);
5782 for (i=0; i<N; i+=4){
5783 v1 = vp[i+k];
5784 for (j=k; j<M; j+=4){
5785 v2 = vp[i+j+VS-1];
5786 va = REALIGN_LOAD <v1,v2,rt>;
5787 vs += va;
5788 v1 = v2;
5789 }
5790 }
5791 } */
5792
5793 if (DR_IS_READ (dr))
5794 {
5795 bool is_packed = false;
5796 tree type = (TREE_TYPE (DR_REF (dr)));
5797
5798 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5799 && (!targetm.vectorize.builtin_mask_for_load
5800 || targetm.vectorize.builtin_mask_for_load ()))
5801 {
5802 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5803 if ((nested_in_vect_loop
5804 && (TREE_INT_CST_LOW (DR_STEP (dr))
5805 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5806 || !loop_vinfo)
5807 return dr_explicit_realign;
5808 else
5809 return dr_explicit_realign_optimized;
5810 }
5811 if (!known_alignment_for_access_p (dr))
5812 is_packed = not_size_aligned (DR_REF (dr));
5813
5814 if ((TYPE_USER_ALIGN (type) && !is_packed)
5815 || targetm.vectorize.
5816 support_vector_misalignment (mode, type,
5817 DR_MISALIGNMENT (dr), is_packed))
5818 /* Can't software pipeline the loads, but can at least do them. */
5819 return dr_unaligned_supported;
5820 }
5821 else
5822 {
5823 bool is_packed = false;
5824 tree type = (TREE_TYPE (DR_REF (dr)));
5825
5826 if (!known_alignment_for_access_p (dr))
5827 is_packed = not_size_aligned (DR_REF (dr));
5828
5829 if ((TYPE_USER_ALIGN (type) && !is_packed)
5830 || targetm.vectorize.
5831 support_vector_misalignment (mode, type,
5832 DR_MISALIGNMENT (dr), is_packed))
5833 return dr_unaligned_supported;
5834 }
5835
5836 /* Unsupported. */
5837 return dr_unaligned_unsupported;
5838 }
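
/* Standalone sketch, not part of GCC: a numeric check of the misalignment
   argument in the comment inside vect_supportable_dr_alignment above.  With a
   vector size of VS == 4 elements and an aligned base, the misalignment of
   &vp[i+j] cycles through {0,1,2,3,...} when the inner loop advances by the
   scalar step, but stays equal to k when it advances by VS starting at k.
   VS is an assumption local to this sketch.  */
#if 0
#include <stdio.h>

#define VS 4

int
main (void)
{
  int i = 0, j;

  printf ("scalar inner step:   ");
  for (j = 0; j < 2 * VS; j++)
    printf ("%d ", (i + j) % VS);       /* 0 1 2 3 0 1 2 3  */
  printf ("\n");

  for (int k = 0; k < VS; k++)
    {
      printf ("inner step VS, k=%d: ", k);
      for (j = k; j < k + 2 * VS * VS; j += VS)
        printf ("%d ", (i + j) % VS);   /* always k  */
      printf ("\n");
    }
  return 0;
}
#endif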