gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2014 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "tree.h"
  28 #include "stor-layout.h"
  29 #include "tm_p.h"
  30 #include "target.h"
  31 #include "basic-block.h"
  32 #include "gimple-pretty-print.h"
  33 #include "tree-ssa-alias.h"
  34 #include "internal-fn.h"
  35 #include "tree-eh.h"
  36 #include "gimple-expr.h"
  37 #include "is-a.h"
  38 #include "gimple.h"
  39 #include "gimplify.h"
  40 #include "gimple-iterator.h"
  41 #include "gimplify-me.h"
  42 #include "gimple-ssa.h"
  43 #include "tree-phinodes.h"
  44 #include "ssa-iterators.h"
  45 #include "stringpool.h"
  46 #include "tree-ssanames.h"
  47 #include "tree-ssa-loop-ivopts.h"
  48 #include "tree-ssa-loop-manip.h"
  49 #include "tree-ssa-loop.h"
  50 #include "dumpfile.h"
  51 #include "cfgloop.h"
  52 #include "tree-chrec.h"
  53 #include "tree-scalar-evolution.h"
  54 #include "tree-vectorizer.h"
  55 #include "diagnostic-core.h"
  56 #include "cgraph.h"
  57 /* Need to include rtl.h, expr.h, etc. for optabs.  */
  58 #include "expr.h"
  59 #include "optabs.h"
  60 #include "builtins.h"
  61 #include "varasm.h"
  62
  63 /* Return true if load- or store-lanes optab OPTAB is implemented for
  64    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  65
  66 static bool
  67 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  68                               tree vectype, unsigned HOST_WIDE_INT count)
  69 {
  70   enum machine_mode mode, array_mode;
  71   bool limit_p;
  72
  73   mode = TYPE_MODE (vectype);
  74   limit_p = !targetm.array_mode_supported_p (mode, count);
  75   array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
  76                               MODE_INT, limit_p);
  77
  78   if (array_mode == BLKmode)
  79     {
  80       if (dump_enabled_p ())
  81         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  82                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
  83                          GET_MODE_NAME (mode), count);
  84       return false;
  85     }
  86
  87   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  88     {
  89       if (dump_enabled_p ())
  90         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  91                          "cannot use %s<%s><%s>\n", name,
  92                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  93       return false;
  94     }
  95
  96   if (dump_enabled_p ())
  97     dump_printf_loc (MSG_NOTE, vect_location,
  98                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  99                      GET_MODE_NAME (mode));
 100
 101   return true;
 102 }
 103
 104
 105 /* Return the smallest scalar part of STMT.
 106    This is used to determine the vectype of the stmt.  We generally set the
 107    vectype according to the type of the result (lhs).  For stmts whose
 108    result-type is different than the type of the arguments (e.g., demotion,
 109    promotion), vectype will be reset appropriately (later).  Note that we have
 110    to visit the smallest datatype in this function, because that determines the
 111    VF.  If the smallest datatype in the loop is present only as the rhs of a
 112    promotion operation - we'd miss it.
 113    Such a case, where a variable of this datatype does not appear in the lhs
 114    anywhere in the loop, can only occur if it's an invariant: e.g.:
 115    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 116    invariant motion.  However, we cannot rely on invariant motion to always
 117    take invariants out of the loop, and so in the case of promotion we also
 118    have to check the rhs.
 119    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 120    types.  */
 121
 122 tree
 123 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
 124                                HOST_WIDE_INT *rhs_size_unit)
 125 {
 126   tree scalar_type = gimple_expr_type (stmt);
 127   HOST_WIDE_INT lhs, rhs;
 128
 129   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 130
 131   if (is_gimple_assign (stmt)
 132       && (gimple_assign_cast_p (stmt)
 133           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 134           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 135           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 136     {
 137       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 138
 139       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 140       if (rhs < lhs)
 141         scalar_type = rhs_type;
 142     }
 143
 144   *lhs_size_unit = lhs;
 145   *rhs_size_unit = rhs;
 146   return scalar_type;
 147 }
 148
 149
 150 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 151    tested at run-time.  Return TRUE if DDR was successfully inserted.
 152    Return false if versioning is not supported.  */
 153
 154 static bool
 155 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 156 {
 157   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 158
 159   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 160     return false;
 161
 162   if (dump_enabled_p ())
 163     {
 164       dump_printf_loc (MSG_NOTE, vect_location,
 165                        "mark for run-time aliasing test between ");
 166       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
 167       dump_printf (MSG_NOTE,  " and ");
 168       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
 169       dump_printf (MSG_NOTE, "\n");
 170     }
 171
 172   if (optimize_loop_nest_for_size_p (loop))
 173     {
 174       if (dump_enabled_p ())
 175         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 176                          "versioning not supported when optimizing"
 177                          " for size.\n");
 178       return false;
 179     }
 180
 181   /* FORNOW: We don't support versioning with outer-loop vectorization.  */
 182   if (loop->inner)
 183     {
 184       if (dump_enabled_p ())
 185         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 186                          "versioning not yet supported for outer-loops.\n");
 187       return false;
 188     }
 189
 190   /* FORNOW: We don't support creating runtime alias tests for non-constant
 191      step.  */
 192   if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
 193       || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
 194     {
 195       if (dump_enabled_p ())
 196         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 197                          "versioning not yet supported for non-constant "
 198                          "step\n");
 199       return false;
 200     }
 201
 202   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 203   return true;
 204 }
 205
 206
 207 /* Function vect_analyze_data_ref_dependence.
 208
 209    Return TRUE if there (might) exist a dependence between a memory-reference
 210    DRA and a memory-reference DRB.  When versioning for alias may check a
 211    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 212    the data dependence.  */
 213
 214 static bool
 215 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 216                                   loop_vec_info loop_vinfo, int *max_vf)
 217 {
 218   unsigned int i;
 219   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 220   struct data_reference *dra = DDR_A (ddr);
 221   struct data_reference *drb = DDR_B (ddr);
 222   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 223   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 224   lambda_vector dist_v;
 225   unsigned int loop_depth;
 226
 227   /* In loop analysis all data references should be vectorizable.  */
 228   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 229       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 230     gcc_unreachable ();
 231
 232   /* Independent data accesses.  */
 233   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 234     return false;
 235
 236   if (dra == drb
 237       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 238     return false;
 239
 240   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 241      least two scalar iterations, there is always also a true dependence.
 242      As the vectorizer does not re-order loads and stores we can ignore
 243      the anti-dependence if TBAA can disambiguate both DRs similar to the
 244      case with known negative distance anti-dependences (positive
 245      distance anti-dependences would violate TBAA constraints).  */
 246   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 247        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 248       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 249                                  get_alias_set (DR_REF (drb))))
 250     return false;
 251
 252   /* Unknown data dependence.  */
 253   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 254     {
 255       /* If user asserted safelen consecutive iterations can be
 256          executed concurrently, assume independence.  */
 257       if (loop->safelen >= 2)
 258         {
 259           if (loop->safelen < *max_vf)
 260             *max_vf = loop->safelen;
 261           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 262           return false;
 263         }
 264
 265       if (STMT_VINFO_GATHER_P (stmtinfo_a)
 266           || STMT_VINFO_GATHER_P (stmtinfo_b))
 267         {
 268           if (dump_enabled_p ())
 269             {
 270               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 271                                "versioning for alias not supported for: "
 272                                "can't determine dependence between ");
 273               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 274                                  DR_REF (dra));
 275               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 276               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 277                                  DR_REF (drb));
 278               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 279             }
 280           return true;
 281         }
 282
 283       if (dump_enabled_p ())
 284         {
 285           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 286                            "versioning for alias required: "
 287                            "can't determine dependence between ");
 288           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 289                              DR_REF (dra));
 290           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 291           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 292                              DR_REF (drb));
 293           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 294         }
 295
 296       /* Add to list of ddrs that need to be tested at run-time.  */
 297       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 298     }
 299
 300   /* Known data dependence.  */
 301   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 302     {
 303       /* If user asserted safelen consecutive iterations can be
 304          executed concurrently, assume independence.  */
 305       if (loop->safelen >= 2)
 306         {
 307           if (loop->safelen < *max_vf)
 308             *max_vf = loop->safelen;
 309           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 310           return false;
 311         }
 312
 313       if (STMT_VINFO_GATHER_P (stmtinfo_a)
 314           || STMT_VINFO_GATHER_P (stmtinfo_b))
 315         {
 316           if (dump_enabled_p ())
 317             {
 318               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 319                                "versioning for alias not supported for: "
 320                                "bad dist vector for ");
 321               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 322                                  DR_REF (dra));
 323               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 324               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 325                                  DR_REF (drb));
 326               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 327             }
 328           return true;
 329         }
 330
 331       if (dump_enabled_p ())
 332         {
 333           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 334                            "versioning for alias required: "
 335                            "bad dist vector for ");
 336           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 337           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 338           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 339           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 340         }
 341       /* Add to list of ddrs that need to be tested at run-time.  */
 342       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 343     }
 344
 345   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 346   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 347     {
 348       int dist = dist_v[loop_depth];
 349
 350       if (dump_enabled_p ())
 351         dump_printf_loc (MSG_NOTE, vect_location,
 352                          "dependence distance  = %d.\n", dist);
 353
 354       if (dist == 0)
 355         {
 356           if (dump_enabled_p ())
 357             {
 358               dump_printf_loc (MSG_NOTE, vect_location,
 359                                "dependence distance == 0 between ");
 360               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 361               dump_printf (MSG_NOTE, " and ");
 362               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 363               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 364             }
 365
 366           /* When we perform grouped accesses and perform implicit CSE
 367              by detecting equal accesses and doing disambiguation with
 368              runtime alias tests like for
 369                 .. = a[i];
 370                 .. = a[i+1];
 371                 a[i] = ..;
 372                 a[i+1] = ..;
 373                 *p = ..;
 374                 .. = a[i];
 375                 .. = a[i+1];
 376              where we will end up loading { a[i], a[i+1] } once, make
 377              sure that inserting group loads before the first load and
 378              stores after the last store will do the right thing.
 379              Similar for groups like
 380                 a[i] = ...;
 381                 ... = a[i];
 382                 a[i+1] = ...;
 383              where loads from the group interleave with the store.  */
 384           if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 385               || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 386             {
 387               gimple earlier_stmt;
 388               earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 389               if (DR_IS_WRITE
 390                     (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 391                 {
 392                   if (dump_enabled_p ())
 393                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 394                                      "READ_WRITE dependence in interleaving."
 395                                      "\n");
 396                   return true;
 397                 }
 398             }
 399
 400           continue;
 401         }
 402
 403       if (dist > 0 && DDR_REVERSED_P (ddr))
 404         {
 405           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 406              reversed (to make distance vector positive), and the actual
 407              distance is negative.  */
 408           if (dump_enabled_p ())
 409             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 410                              "dependence distance negative.\n");
 411           /* Record a negative dependence distance to later limit the
 412              amount of stmt copying / unrolling we can perform.
 413              Only need to handle read-after-write dependence.  */
 414           if (DR_IS_READ (drb)
 415               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 416                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 417             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 418           continue;
 419         }
 420
 421       if (abs (dist) >= 2
 422           && abs (dist) < *max_vf)
 423         {
 424           /* The dependence distance requires reduction of the maximal
 425              vectorization factor.  */
 426           *max_vf = abs (dist);
 427           if (dump_enabled_p ())
 428             dump_printf_loc (MSG_NOTE, vect_location,
 429                              "adjusting maximal vectorization factor to %i\n",
 430                              *max_vf);
 431         }
 432
 433       if (abs (dist) >= *max_vf)
 434         {
 435           /* Dependence distance does not create dependence, as far as
 436              vectorization is concerned, in this case.  */
 437           if (dump_enabled_p ())
 438             dump_printf_loc (MSG_NOTE, vect_location,
 439                              "dependence distance >= VF.\n");
 440           continue;
 441         }
 442
 443       if (dump_enabled_p ())
 444         {
 445           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 446                        "not vectorized, possible dependence "
 447                        "between data-refs ");
 448           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 449           dump_printf (MSG_NOTE,  " and ");
 450           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 451           dump_printf (MSG_NOTE,  "\n");
 452         }
 453
 454       return true;
 455     }
 456
 457   return false;
 458 }
 459
 460 /* Function vect_analyze_data_ref_dependences.
 461
 462    Examine all the data references in the loop, and make sure there do not
 463    exist any data dependences between them.  Set *MAX_VF according to
 464    the maximum vectorization factor the data dependences allow.  */
 465
 466 bool
 467 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
 468 {
 469   unsigned int i;
 470   struct data_dependence_relation *ddr;
 471
 472   if (dump_enabled_p ())
 473     dump_printf_loc (MSG_NOTE, vect_location,
 474                      "=== vect_analyze_data_ref_dependences ===\n");
 475
 476   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 477   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 478                                 &LOOP_VINFO_DDRS (loop_vinfo),
 479                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 480     return false;
 481
 482   FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 483     if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 484       return false;
 485
 486   return true;
 487 }
 488
 489
 490 /* Function vect_slp_analyze_data_ref_dependence.
 491
 492    Return TRUE if there (might) exist a dependence between a memory-reference
 493    DRA and a memory-reference DRB.  When versioning for alias may check a
 494    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 495    the data dependence.  */
 496
 497 static bool
 498 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 499 {
 500   struct data_reference *dra = DDR_A (ddr);
 501   struct data_reference *drb = DDR_B (ddr);
 502
 503   /* We need to check dependences of statements marked as unvectorizable
 504      as well, they still can prohibit vectorization.  */
 505
 506   /* Independent data accesses.  */
 507   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 508     return false;
 509
 510   if (dra == drb)
 511     return false;
 512
 513   /* Read-read is OK.  */
 514   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 515     return false;
 516
 517   /* If dra and drb are part of the same interleaving chain consider
 518      them independent.  */
 519   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 520       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 521           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 522     return false;
 523
 524   /* Unknown data dependence.  */
 525   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 526     {
 527       if  (dump_enabled_p ())
 528         {
 529           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 530                            "can't determine dependence between ");
 531           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 532           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 533           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 534           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 535         }
 536     }
 537   else if (dump_enabled_p ())
 538     {
 539       dump_printf_loc (MSG_NOTE, vect_location,
 540                        "determined dependence between ");
 541       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 542       dump_printf (MSG_NOTE, " and ");
 543       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 544       dump_printf (MSG_NOTE,  "\n");
 545     }
 546
 547   /* We do not vectorize basic blocks with write-write dependencies.  */
 548   if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
 549     return true;
 550
 551   /* If we have a read-write dependence check that the load is before the store.
 552      When we vectorize basic blocks, vector load can be only before
 553      corresponding scalar load, and vector store can be only after its
 554      corresponding scalar store.  So the order of the acceses is preserved in
 555      case the load is before the store.  */
 556   gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 557   if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 558     {
 559       /* That only holds for load-store pairs taking part in vectorization.  */
 560       if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
 561           && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
 562         return false;
 563     }
 564
 565   return true;
 566 }
 567
 568
 569 /* Function vect_analyze_data_ref_dependences.
 570
 571    Examine all the data references in the basic-block, and make sure there
 572    do not exist any data dependences between them.  Set *MAX_VF according to
 573    the maximum vectorization factor the data dependences allow.  */
 574
 575 bool
 576 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
 577 {
 578   struct data_dependence_relation *ddr;
 579   unsigned int i;
 580
 581   if (dump_enabled_p ())
 582     dump_printf_loc (MSG_NOTE, vect_location,
 583                      "=== vect_slp_analyze_data_ref_dependences ===\n");
 584
 585   if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
 586                                 &BB_VINFO_DDRS (bb_vinfo),
 587                                 vNULL, true))
 588     return false;
 589
 590   FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
 591     if (vect_slp_analyze_data_ref_dependence (ddr))
 592       return false;
 593
 594   return true;
 595 }
 596
 597
 598 /* Function vect_compute_data_ref_alignment
 599
 600    Compute the misalignment of the data reference DR.
 601
 602    Output:
 603    1. If during the misalignment computation it is found that the data reference
 604       cannot be vectorized then false is returned.
 605    2. DR_MISALIGNMENT (DR) is defined.
 606
 607    FOR NOW: No analysis is actually performed. Misalignment is calculated
 608    only for trivial cases. TODO.  */
 609
 610 static bool
 611 vect_compute_data_ref_alignment (struct data_reference *dr)
 612 {
 613   gimple stmt = DR_STMT (dr);
 614   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 615   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 616   struct loop *loop = NULL;
 617   tree ref = DR_REF (dr);
 618   tree vectype;
 619   tree base, base_addr;
 620   bool base_aligned;
 621   tree misalign;
 622   tree aligned_to, alignment;
 623
 624   if (dump_enabled_p ())
 625     dump_printf_loc (MSG_NOTE, vect_location,
 626                      "vect_compute_data_ref_alignment:\n");
 627
 628   if (loop_vinfo)
 629     loop = LOOP_VINFO_LOOP (loop_vinfo);
 630
 631   /* Initialize misalignment to unknown.  */
 632   SET_DR_MISALIGNMENT (dr, -1);
 633
 634   /* Strided loads perform only component accesses, misalignment information
 635      is irrelevant for them.  */
 636   if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
 637     return true;
 638
 639   misalign = DR_INIT (dr);
 640   aligned_to = DR_ALIGNED_TO (dr);
 641   base_addr = DR_BASE_ADDRESS (dr);
 642   vectype = STMT_VINFO_VECTYPE (stmt_info);
 643
 644   /* In case the dataref is in an inner-loop of the loop that is being
 645      vectorized (LOOP), we use the base and misalignment information
 646      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 647      stays the same throughout the execution of the inner-loop, which is why
 648      we have to check that the stride of the dataref in the inner-loop evenly
 649      divides by the vector size.  */
 650   if (loop && nested_in_vect_loop_p (loop, stmt))
 651     {
 652       tree step = DR_STEP (dr);
 653       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
 654
 655       if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
 656         {
 657           if (dump_enabled_p ())
 658             dump_printf_loc (MSG_NOTE, vect_location,
 659                              "inner step divides the vector-size.\n");
 660           misalign = STMT_VINFO_DR_INIT (stmt_info);
 661           aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
 662           base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
 663         }
 664       else
 665         {
 666           if (dump_enabled_p ())
 667             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 668                              "inner step doesn't divide the vector-size.\n");
 669           misalign = NULL_TREE;
 670         }
 671     }
 672
 673   /* Similarly, if we're doing basic-block vectorization, we can only use
 674      base and misalignment information relative to an innermost loop if the
 675      misalignment stays the same throughout the execution of the loop.
 676      As above, this is the case if the stride of the dataref evenly divides
 677      by the vector size.  */
 678   if (!loop)
 679     {
 680       tree step = DR_STEP (dr);
 681       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
 682
 683       if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
 684         {
 685           if (dump_enabled_p ())
 686             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 687                              "SLP: step doesn't divide the vector-size.\n");
 688           misalign = NULL_TREE;
 689         }
 690     }
 691
 692   base = build_fold_indirect_ref (base_addr);
 693   alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
 694
 695   if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
 696       || !misalign)
 697     {
 698       if (dump_enabled_p ())
 699         {
 700           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 701                            "Unknown alignment for access: ");
 702           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
 703           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 704         }
 705       return true;
 706     }
 707
 708   if ((DECL_P (base)
 709        && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
 710                                 alignment) >= 0)
 711       || (TREE_CODE (base_addr) == SSA_NAME
 712           && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
 713                                                       TREE_TYPE (base_addr)))),
 714                                    alignment) >= 0)
 715       || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
 716     base_aligned = true;
 717   else
 718     base_aligned = false;
 719
 720   if (!base_aligned)
 721     {
 722       /* Do not change the alignment of global variables here if
 723          flag_section_anchors is enabled as we already generated
 724          RTL for other functions.  Most global variables should
 725          have been aligned during the IPA increase_alignment pass.  */
 726       if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
 727           || (TREE_STATIC (base) && flag_section_anchors))
 728         {
 729           if (dump_enabled_p ())
 730             {
 731               dump_printf_loc (MSG_NOTE, vect_location,
 732                                "can't force alignment of ref: ");
 733               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 734               dump_printf (MSG_NOTE, "\n");
 735             }
 736           return true;
 737         }
 738
 739       /* Force the alignment of the decl.
 740          NOTE: This is the only change to the code we make during
 741          the analysis phase, before deciding to vectorize the loop.  */
 742       if (dump_enabled_p ())
 743         {
 744           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 745           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 746           dump_printf (MSG_NOTE, "\n");
 747         }
 748
 749       ((dataref_aux *)dr->aux)->base_decl = base;
 750       ((dataref_aux *)dr->aux)->base_misaligned = true;
 751     }
 752
 753   /* If this is a backward running DR then first access in the larger
 754      vectype actually is N-1 elements before the address in the DR.
 755      Adjust misalign accordingly.  */
 756   if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
 757     {
 758       tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
 759       /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
 760          otherwise we wouldn't be here.  */
 761       offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
 762       /* PLUS because DR_STEP was negative.  */
 763       misalign = size_binop (PLUS_EXPR, misalign, offset);
 764     }
 765
 766   /* Modulo alignment.  */
 767   misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
 768
 769   if (!tree_fits_uhwi_p (misalign))
 770     {
 771       /* Negative or overflowed misalignment value.  */
 772       if (dump_enabled_p ())
 773         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 774                          "unexpected misalign value\n");
 775       return false;
 776     }
 777
 778   SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));
 779
 780   if (dump_enabled_p ())
 781     {
 782       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 783                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
 784       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 785       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 786     }
 787
 788   return true;
 789 }
 790
 791
 792 /* Function vect_compute_data_refs_alignment
 793
 794    Compute the misalignment of data references in the loop.
 795    Return FALSE if a data reference is found that cannot be vectorized.  */
 796
 797 static bool
 798 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
 799                                   bb_vec_info bb_vinfo)
 800 {
 801   vec<data_reference_p> datarefs;
 802   struct data_reference *dr;
 803   unsigned int i;
 804
 805   if (loop_vinfo)
 806     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
 807   else
 808     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
 809
 810   FOR_EACH_VEC_ELT (datarefs, i, dr)
 811     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
 812         && !vect_compute_data_ref_alignment (dr))
 813       {
 814         if (bb_vinfo)
 815           {
 816             /* Mark unsupported statement as unvectorizable.  */
 817             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
 818             continue;
 819           }
 820         else
 821           return false;
 822       }
 823
 824   return true;
 825 }
 826
 827
 828 /* Function vect_update_misalignment_for_peel
 829
 830    DR - the data reference whose misalignment is to be adjusted.
 831    DR_PEEL - the data reference whose misalignment is being made
 832              zero in the vector loop by the peel.
 833    NPEEL - the number of iterations in the peel loop if the misalignment
 834            of DR_PEEL is known at compile time.  */
 835
 836 static void
 837 vect_update_misalignment_for_peel (struct data_reference *dr,
 838                                    struct data_reference *dr_peel, int npeel)
 839 {
 840   unsigned int i;
 841   vec<dr_p> same_align_drs;
 842   struct data_reference *current_dr;
 843   int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
 844   int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
 845   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
 846   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
 847
 848  /* For interleaved data accesses the step in the loop must be multiplied by
 849      the size of the interleaving group.  */
 850   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 851     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
 852   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
 853     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
 854
 855   /* It can be assumed that the data refs with the same alignment as dr_peel
 856      are aligned in the vector loop.  */
 857   same_align_drs
 858     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
 859   FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
 860     {
 861       if (current_dr != dr)
 862         continue;
 863       gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
 864                   DR_MISALIGNMENT (dr_peel) / dr_peel_size);
 865       SET_DR_MISALIGNMENT (dr, 0);
 866       return;
 867     }
 868
 869   if (known_alignment_for_access_p (dr)
 870       && known_alignment_for_access_p (dr_peel))
 871     {
 872       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
 873       int misal = DR_MISALIGNMENT (dr);
 874       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 875       misal += negative ? -npeel * dr_size : npeel * dr_size;
 876       misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
 877       SET_DR_MISALIGNMENT (dr, misal);
 878       return;
 879     }
 880
 881   if (dump_enabled_p ())
 882     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
 883   SET_DR_MISALIGNMENT (dr, -1);
 884 }
 885
 886
 887 /* Function vect_verify_datarefs_alignment
 888
 889    Return TRUE if all data references in the loop can be
 890    handled with respect to alignment.  */
 891
 892 bool
 893 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
 894 {
 895   vec<data_reference_p> datarefs;
 896   struct data_reference *dr;
 897   enum dr_alignment_support supportable_dr_alignment;
 898   unsigned int i;
 899
 900   if (loop_vinfo)
 901     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
 902   else
 903     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
 904
 905   FOR_EACH_VEC_ELT (datarefs, i, dr)
 906     {
 907       gimple stmt = DR_STMT (dr);
 908       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 909
 910       if (!STMT_VINFO_RELEVANT_P (stmt_info))
 911         continue;
 912
 913       /* For interleaving, only the alignment of the first access matters.
 914          Skip statements marked as not vectorizable.  */
 915       if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
 916            && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
 917           || !STMT_VINFO_VECTORIZABLE (stmt_info))
 918         continue;
 919
 920       /* Strided loads perform only component accesses, alignment is
 921          irrelevant for them.  */
 922       if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
 923         continue;
 924
 925       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
 926       if (!supportable_dr_alignment)
 927         {
 928           if (dump_enabled_p ())
 929             {
 930               if (DR_IS_READ (dr))
 931                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 932                                  "not vectorized: unsupported unaligned load.");
 933               else
 934                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 935                                  "not vectorized: unsupported unaligned "
 936                                  "store.");
 937
 938               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 939                                  DR_REF (dr));
 940               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 941             }
 942           return false;
 943         }
 944       if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
 945         dump_printf_loc (MSG_NOTE, vect_location,
 946                          "Vectorizing an unaligned access.\n");
 947     }
 948   return true;
 949 }
 950
 951 /* Given an memory reference EXP return whether its alignment is less
 952    than its size.  */
 953
 954 static bool
 955 not_size_aligned (tree exp)
 956 {
 957   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
 958     return true;
 959
 960   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
 961           > get_object_alignment (exp));
 962 }
 963
 964 /* Function vector_alignment_reachable_p
 965
 966    Return true if vector alignment for DR is reachable by peeling
 967    a few loop iterations.  Return false otherwise.  */
 968
 969 static bool
 970 vector_alignment_reachable_p (struct data_reference *dr)
 971 {
 972   gimple stmt = DR_STMT (dr);
 973   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 974   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 975
 976   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 977     {
 978       /* For interleaved access we peel only if number of iterations in
 979          the prolog loop ({VF - misalignment}), is a multiple of the
 980          number of the interleaved accesses.  */
 981       int elem_size, mis_in_elements;
 982       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
 983
 984       /* FORNOW: handle only known alignment.  */
 985       if (!known_alignment_for_access_p (dr))
 986         return false;
 987
 988       elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
 989       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
 990
 991       if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
 992         return false;
 993     }
 994
 995   /* If misalignment is known at the compile time then allow peeling
 996      only if natural alignment is reachable through peeling.  */
 997   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
 998     {
 999       HOST_WIDE_INT elmsize =
1000                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1001       if (dump_enabled_p ())
1002         {
1003           dump_printf_loc (MSG_NOTE, vect_location,
1004                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1005           dump_printf (MSG_NOTE,
1006                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1007         }
1008       if (DR_MISALIGNMENT (dr) % elmsize)
1009         {
1010           if (dump_enabled_p ())
1011             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1012                              "data size does not divide the misalignment.\n");
1013           return false;
1014         }
1015     }
1016
1017   if (!known_alignment_for_access_p (dr))
1018     {
1019       tree type = TREE_TYPE (DR_REF (dr));
1020       bool is_packed = not_size_aligned (DR_REF (dr));
1021       if (dump_enabled_p ())
1022         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1023                          "Unknown misalignment, is_packed = %d\n",is_packed);
1024       if ((TYPE_USER_ALIGN (type) && !is_packed)
1025           || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1026         return true;
1027       else
1028         return false;
1029     }
1030
1031   return true;
1032 }
1033
1034
1035 /* Calculate the cost of the memory access represented by DR.  */
1036
1037 static void
1038 vect_get_data_access_cost (struct data_reference *dr,
1039                            unsigned int *inside_cost,
1040                            unsigned int *outside_cost,
1041                            stmt_vector_for_cost *body_cost_vec)
1042 {
1043   gimple stmt = DR_STMT (dr);
1044   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1045   int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1046   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1047   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1048   int ncopies = vf / nunits;
1049
1050   if (DR_IS_READ (dr))
1051     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1052                         NULL, body_cost_vec, false);
1053   else
1054     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1055
1056   if (dump_enabled_p ())
1057     dump_printf_loc (MSG_NOTE, vect_location,
1058                      "vect_get_data_access_cost: inside_cost = %d, "
1059                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1060 }
1061
1062
1063 /* Insert DR into peeling hash table with NPEEL as key.  */
1064
1065 static void
1066 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1067                           int npeel)
1068 {
1069   struct _vect_peel_info elem, *slot;
1070   _vect_peel_info **new_slot;
1071   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1072
1073   elem.npeel = npeel;
1074   slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1075   if (slot)
1076     slot->count++;
1077   else
1078     {
1079       slot = XNEW (struct _vect_peel_info);
1080       slot->npeel = npeel;
1081       slot->dr = dr;
1082       slot->count = 1;
1083       new_slot
1084         = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1085       *new_slot = slot;
1086     }
1087
1088   if (!supportable_dr_alignment
1089       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1090     slot->count += VECT_MAX_COST;
1091 }
1092
1093
1094 /* Traverse peeling hash table to find peeling option that aligns maximum
1095    number of data accesses.  */
1096
1097 int
1098 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1099                                      _vect_peel_extended_info *max)
1100 {
1101   vect_peel_info elem = *slot;
1102
1103   if (elem->count > max->peel_info.count
1104       || (elem->count == max->peel_info.count
1105           && max->peel_info.npeel > elem->npeel))
1106     {
1107       max->peel_info.npeel = elem->npeel;
1108       max->peel_info.count = elem->count;
1109       max->peel_info.dr = elem->dr;
1110     }
1111
1112   return 1;
1113 }
1114
1115
1116 /* Traverse peeling hash table and calculate cost for each peeling option.
1117    Find the one with the lowest cost.  */
1118
1119 int
1120 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1121                                    _vect_peel_extended_info *min)
1122 {
1123   vect_peel_info elem = *slot;
1124   int save_misalignment, dummy;
1125   unsigned int inside_cost = 0, outside_cost = 0, i;
1126   gimple stmt = DR_STMT (elem->dr);
1127   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1128   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1129   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1130   struct data_reference *dr;
1131   stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1132   int single_iter_cost;
1133
1134   prologue_cost_vec.create (2);
1135   body_cost_vec.create (2);
1136   epilogue_cost_vec.create (2);
1137
1138   FOR_EACH_VEC_ELT (datarefs, i, dr)
1139     {
1140       stmt = DR_STMT (dr);
1141       stmt_info = vinfo_for_stmt (stmt);
1142       /* For interleaving, only the alignment of the first access
1143          matters.  */
1144       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1145           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1146         continue;
1147
1148       save_misalignment = DR_MISALIGNMENT (dr);
1149       vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1150       vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1151                                  &body_cost_vec);
1152       SET_DR_MISALIGNMENT (dr, save_misalignment);
1153     }
1154
1155   single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1156   outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1157                                                &dummy, single_iter_cost,
1158                                                &prologue_cost_vec,
1159                                                &epilogue_cost_vec);
1160
1161   /* Prologue and epilogue costs are added to the target model later.
1162      These costs depend only on the scalar iteration cost, the
1163      number of peeling iterations finally chosen, and the number of
1164      misaligned statements.  So discard the information found here.  */
1165   prologue_cost_vec.release ();
1166   epilogue_cost_vec.release ();
1167
1168   if (inside_cost < min->inside_cost
1169       || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1170     {
1171       min->inside_cost = inside_cost;
1172       min->outside_cost = outside_cost;
1173       min->body_cost_vec.release ();
1174       min->body_cost_vec = body_cost_vec;
1175       min->peel_info.dr = elem->dr;
1176       min->peel_info.npeel = elem->npeel;
1177     }
1178   else
1179     body_cost_vec.release ();
1180
1181   return 1;
1182 }
1183
1184
1185 /* Choose best peeling option by traversing peeling hash table and either
1186    choosing an option with the lowest cost (if cost model is enabled) or the
1187    option that aligns as many accesses as possible.  */
1188
1189 static struct data_reference *
1190 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1191                                        unsigned int *npeel,
1192                                        stmt_vector_for_cost *body_cost_vec)
1193 {
1194    struct _vect_peel_extended_info res;
1195
1196    res.peel_info.dr = NULL;
1197    res.body_cost_vec = stmt_vector_for_cost ();
1198
1199    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1200      {
1201        res.inside_cost = INT_MAX;
1202        res.outside_cost = INT_MAX;
1203        LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1204            ->traverse <_vect_peel_extended_info *,
1205                        vect_peeling_hash_get_lowest_cost> (&res);
1206      }
1207    else
1208      {
1209        res.peel_info.count = 0;
1210        LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1211            ->traverse <_vect_peel_extended_info *,
1212                        vect_peeling_hash_get_most_frequent> (&res);
1213      }
1214
1215    *npeel = res.peel_info.npeel;
1216    *body_cost_vec = res.body_cost_vec;
1217    return res.peel_info.dr;
1218 }
1219
1220
1221 /* Function vect_enhance_data_refs_alignment
1222
1223    This pass will use loop versioning and loop peeling in order to enhance
1224    the alignment of data references in the loop.
1225
1226    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1227    original loop is to be vectorized.  Any other loops that are created by
1228    the transformations performed in this pass - are not supposed to be
1229    vectorized.  This restriction will be relaxed.
1230
1231    This pass will require a cost model to guide it whether to apply peeling
1232    or versioning or a combination of the two.  For example, the scheme that
1233    Intel uses when given a loop with several memory accesses, is as follows:
1234    choose one memory access ('p') whose alignment you want to force by doing
1235    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1236    other accesses are not necessarily aligned, or (2) use loop versioning to
1237    generate one loop in which all accesses are aligned, and another loop in
1238    which only 'p' is necessarily aligned.
1239
1240    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1241    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1242    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1243
1244    Devising a cost model is the most critical aspect of this work.  It will
1245    guide us on which access to peel for, whether to use loop versioning, how
1246    many versions to create, etc.  The cost model will probably consist of
1247    generic considerations as well as target specific considerations (on
1248    powerpc for example, misaligned stores are more painful than misaligned
1249    loads).
1250
1251    Here are the general steps involved in alignment enhancements:
1252
1253      -- original loop, before alignment analysis:
1254         for (i=0; i<N; i++){
1255           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1256           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1257         }
1258
1259      -- After vect_compute_data_refs_alignment:
1260         for (i=0; i<N; i++){
1261           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1262           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1263         }
1264
1265      -- Possibility 1: we do loop versioning:
1266      if (p is aligned) {
1267         for (i=0; i<N; i++){    # loop 1A
1268           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1269           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1270         }
1271      }
1272      else {
1273         for (i=0; i<N; i++){    # loop 1B
1274           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1275           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1276         }
1277      }
1278
1279      -- Possibility 2: we do loop peeling:
1280      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1281         x = q[i];
1282         p[i] = y;
1283      }
1284      for (i = 3; i < N; i++){   # loop 2A
1285         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1286         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1287      }
1288
1289      -- Possibility 3: combination of loop peeling and versioning:
1290      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1291         x = q[i];
1292         p[i] = y;
1293      }
1294      if (p is aligned) {
1295         for (i = 3; i<N; i++){  # loop 3A
1296           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1297           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1298         }
1299      }
1300      else {
1301         for (i = 3; i<N; i++){  # loop 3B
1302           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1303           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1304         }
1305      }
1306
1307      These loops are later passed to loop_transform to be vectorized.  The
1308      vectorizer will use the alignment information to guide the transformation
1309      (whether to generate regular loads/stores, or with special handling for
1310      misalignment).  */
1311
1312 bool
1313 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1314 {
1315   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1316   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1317   enum dr_alignment_support supportable_dr_alignment;
1318   struct data_reference *dr0 = NULL, *first_store = NULL;
1319   struct data_reference *dr;
1320   unsigned int i, j;
1321   bool do_peeling = false;
1322   bool do_versioning = false;
1323   bool stat;
1324   gimple stmt;
1325   stmt_vec_info stmt_info;
1326   unsigned int npeel = 0;
1327   bool all_misalignments_unknown = true;
1328   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1329   unsigned possible_npeel_number = 1;
1330   tree vectype;
1331   unsigned int nelements, mis, same_align_drs_max = 0;
1332   stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1333
1334   if (dump_enabled_p ())
1335     dump_printf_loc (MSG_NOTE, vect_location,
1336                      "=== vect_enhance_data_refs_alignment ===\n");
1337
1338   /* While cost model enhancements are expected in the future, the high level
1339      view of the code at this time is as follows:
1340
1341      A) If there is a misaligned access then see if peeling to align
1342         this access can make all data references satisfy
1343         vect_supportable_dr_alignment.  If so, update data structures
1344         as needed and return true.
1345
1346      B) If peeling wasn't possible and there is a data reference with an
1347         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1348         then see if loop versioning checks can be used to make all data
1349         references satisfy vect_supportable_dr_alignment.  If so, update
1350         data structures as needed and return true.
1351
1352      C) If neither peeling nor versioning were successful then return false if
1353         any data reference does not satisfy vect_supportable_dr_alignment.
1354
1355      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1356
1357      Note, Possibility 3 above (which is peeling and versioning together) is not
1358      being done at this time.  */
1359
1360   /* (1) Peeling to force alignment.  */
1361
1362   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1363      Considerations:
1364      + How many accesses will become aligned due to the peeling
1365      - How many accesses will become unaligned due to the peeling,
1366        and the cost of misaligned accesses.
1367      - The cost of peeling (the extra runtime checks, the increase
1368        in code size).  */
1369
1370   FOR_EACH_VEC_ELT (datarefs, i, dr)
1371     {
1372       stmt = DR_STMT (dr);
1373       stmt_info = vinfo_for_stmt (stmt);
1374
1375       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1376         continue;
1377
1378       /* For interleaving, only the alignment of the first access
1379          matters.  */
1380       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1381           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1382         continue;
1383
1384       /* For invariant accesses there is nothing to enhance.  */
1385       if (integer_zerop (DR_STEP (dr)))
1386         continue;
1387
1388       /* Strided loads perform only component accesses, alignment is
1389          irrelevant for them.  */
1390       if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1391         continue;
1392
1393       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1394       do_peeling = vector_alignment_reachable_p (dr);
1395       if (do_peeling)
1396         {
1397           if (known_alignment_for_access_p (dr))
1398             {
1399               unsigned int npeel_tmp;
1400               bool negative = tree_int_cst_compare (DR_STEP (dr),
1401                                                     size_zero_node) < 0;
1402
1403               /* Save info about DR in the hash table.  */
1404               if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1405                 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1406                   = new hash_table<peel_info_hasher> (1);
1407
1408               vectype = STMT_VINFO_VECTYPE (stmt_info);
1409               nelements = TYPE_VECTOR_SUBPARTS (vectype);
1410               mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1411                                                 TREE_TYPE (DR_REF (dr))));
1412               npeel_tmp = (negative
1413                            ? (mis - nelements) : (nelements - mis))
1414                   & (nelements - 1);
1415
1416               /* For multiple types, it is possible that the bigger type access
1417                  will have more than one peeling option.  E.g., a loop with two
1418                  types: one of size (vector size / 4), and the other one of
1419                  size (vector size / 8).  Vectorization factor will be 8.  If
1420                  both accesses are misaligned by 3, the first one needs one
1421                  scalar iteration to be aligned, and the second one needs 5.
1422                  But the first one will be aligned also by peeling 5 scalar
1423                  iterations, and in that case both accesses will be aligned.
1424                  Hence, except for the immediate peeling amount, we also want
1425                  to try to add full vector size, while we don't exceed
1426                  vectorization factor.
1427                  We do this automatically for cost model, since we calculate cost
1428                  for every peeling option.  */
1429               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1430                 possible_npeel_number = vf /nelements;
1431
1432               /* Handle the aligned case. We may decide to align some other
1433                  access, making DR unaligned.  */
1434               if (DR_MISALIGNMENT (dr) == 0)
1435                 {
1436                   npeel_tmp = 0;
1437                   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1438                     possible_npeel_number++;
1439                 }
1440
1441               for (j = 0; j < possible_npeel_number; j++)
1442                 {
1443                   gcc_assert (npeel_tmp <= vf);
1444                   vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1445                   npeel_tmp += nelements;
1446                 }
1447
1448               all_misalignments_unknown = false;
1449               /* Data-ref that was chosen for the case that all the
1450                  misalignments are unknown is not relevant anymore, since we
1451                  have a data-ref with known alignment.  */
1452               dr0 = NULL;
1453             }
1454           else
1455             {
1456               /* If we don't know any misalignment values, we prefer
1457                  peeling for data-ref that has the maximum number of data-refs
1458                  with the same alignment, unless the target prefers to align
1459                  stores over load.  */
1460               if (all_misalignments_unknown)
1461                 {
1462                   unsigned same_align_drs
1463                     = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1464                   if (!dr0
1465                       || same_align_drs_max < same_align_drs)
1466                     {
1467                       same_align_drs_max = same_align_drs;
1468                       dr0 = dr;
1469                     }
1470                   /* For data-refs with the same number of related
1471                      accesses prefer the one where the misalign
1472                      computation will be invariant in the outermost loop.  */
1473                   else if (same_align_drs_max == same_align_drs)
1474                     {
1475                       struct loop *ivloop0, *ivloop;
1476                       ivloop0 = outermost_invariant_loop_for_expr
1477                           (loop, DR_BASE_ADDRESS (dr0));
1478                       ivloop = outermost_invariant_loop_for_expr
1479                           (loop, DR_BASE_ADDRESS (dr));
1480                       if ((ivloop && !ivloop0)
1481                           || (ivloop && ivloop0
1482                               && flow_loop_nested_p (ivloop, ivloop0)))
1483                         dr0 = dr;
1484                     }
1485
1486                   if (!first_store && DR_IS_WRITE (dr))
1487                     first_store = dr;
1488                 }
1489
1490               /* If there are both known and unknown misaligned accesses in the
1491                  loop, we choose peeling amount according to the known
1492                  accesses.  */
1493               if (!supportable_dr_alignment)
1494                 {
1495                   dr0 = dr;
1496                   if (!first_store && DR_IS_WRITE (dr))
1497                     first_store = dr;
1498                 }
1499             }
1500         }
1501       else
1502         {
1503           if (!aligned_access_p (dr))
1504             {
1505               if (dump_enabled_p ())
1506                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507                                  "vector alignment may not be reachable\n");
1508               break;
1509             }
1510         }
1511     }
1512
1513   /* Check if we can possibly peel the loop.  */
1514   if (!vect_can_advance_ivs_p (loop_vinfo)
1515       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1516     do_peeling = false;
1517
1518   /* If we don't know how many times the peeling loop will run
1519      assume it will run VF-1 times and disable peeling if the remaining
1520      iters are less than the vectorization factor.  */
1521   if (do_peeling
1522       && all_misalignments_unknown
1523       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1524       && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1525           < 2 * (unsigned) LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1))
1526     do_peeling = false;
1527
1528   if (do_peeling
1529       && all_misalignments_unknown
1530       && vect_supportable_dr_alignment (dr0, false))
1531     {
1532       /* Check if the target requires to prefer stores over loads, i.e., if
1533          misaligned stores are more expensive than misaligned loads (taking
1534          drs with same alignment into account).  */
1535       if (first_store && DR_IS_READ (dr0))
1536         {
1537           unsigned int load_inside_cost = 0, load_outside_cost = 0;
1538           unsigned int store_inside_cost = 0, store_outside_cost = 0;
1539           unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1540           unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1541           stmt_vector_for_cost dummy;
1542           dummy.create (2);
1543
1544           vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1545                                      &dummy);
1546           vect_get_data_access_cost (first_store, &store_inside_cost,
1547                                      &store_outside_cost, &dummy);
1548
1549           dummy.release ();
1550
1551           /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1552              aligning the load DR0).  */
1553           load_inside_penalty = store_inside_cost;
1554           load_outside_penalty = store_outside_cost;
1555           for (i = 0;
1556                STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1557                           DR_STMT (first_store))).iterate (i, &dr);
1558                i++)
1559             if (DR_IS_READ (dr))
1560               {
1561                 load_inside_penalty += load_inside_cost;
1562                 load_outside_penalty += load_outside_cost;
1563               }
1564             else
1565               {
1566                 load_inside_penalty += store_inside_cost;
1567                 load_outside_penalty += store_outside_cost;
1568               }
1569
1570           /* Calculate the penalty for leaving DR0 unaligned (by
1571              aligning the FIRST_STORE).  */
1572           store_inside_penalty = load_inside_cost;
1573           store_outside_penalty = load_outside_cost;
1574           for (i = 0;
1575                STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1576                       DR_STMT (dr0))).iterate (i, &dr);
1577                i++)
1578             if (DR_IS_READ (dr))
1579               {
1580                 store_inside_penalty += load_inside_cost;
1581                 store_outside_penalty += load_outside_cost;
1582               }
1583             else
1584               {
1585                 store_inside_penalty += store_inside_cost;
1586                 store_outside_penalty += store_outside_cost;
1587               }
1588
1589           if (load_inside_penalty > store_inside_penalty
1590               || (load_inside_penalty == store_inside_penalty
1591                   && load_outside_penalty > store_outside_penalty))
1592             dr0 = first_store;
1593         }
1594
1595       /* In case there are only loads with different unknown misalignments, use
1596          peeling only if it may help to align other accesses in the loop.  */
1597       if (!first_store
1598           && !STMT_VINFO_SAME_ALIGN_REFS (
1599                   vinfo_for_stmt (DR_STMT (dr0))).length ()
1600           && vect_supportable_dr_alignment (dr0, false)
1601               != dr_unaligned_supported)
1602         do_peeling = false;
1603     }
1604
1605   if (do_peeling && !dr0)
1606     {
1607       /* Peeling is possible, but there is no data access that is not supported
1608          unless aligned. So we try to choose the best possible peeling.  */
1609
1610       /* We should get here only if there are drs with known misalignment.  */
1611       gcc_assert (!all_misalignments_unknown);
1612
1613       /* Choose the best peeling from the hash table.  */
1614       dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1615                                                    &body_cost_vec);
1616       if (!dr0 || !npeel)
1617         do_peeling = false;
1618
1619       /* If peeling by npeel will result in a remaining loop not iterating
1620          enough to be vectorized then do not peel.  */
1621       if (do_peeling
1622           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1623           && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1624               < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + npeel))
1625         do_peeling = false;
1626     }
1627
1628   if (do_peeling)
1629     {
1630       stmt = DR_STMT (dr0);
1631       stmt_info = vinfo_for_stmt (stmt);
1632       vectype = STMT_VINFO_VECTYPE (stmt_info);
1633       nelements = TYPE_VECTOR_SUBPARTS (vectype);
1634
1635       if (known_alignment_for_access_p (dr0))
1636         {
1637           bool negative = tree_int_cst_compare (DR_STEP (dr0),
1638                                                 size_zero_node) < 0;
1639           if (!npeel)
1640             {
1641               /* Since it's known at compile time, compute the number of
1642                  iterations in the peeled loop (the peeling factor) for use in
1643                  updating DR_MISALIGNMENT values.  The peeling factor is the
1644                  vectorization factor minus the misalignment as an element
1645                  count.  */
1646               mis = DR_MISALIGNMENT (dr0);
1647               mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1648               npeel = ((negative ? mis - nelements : nelements - mis)
1649                        & (nelements - 1));
1650             }
1651
1652           /* For interleaved data access every iteration accesses all the
1653              members of the group, therefore we divide the number of iterations
1654              by the group size.  */
1655           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1656           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1657             npeel /= GROUP_SIZE (stmt_info);
1658
1659           if (dump_enabled_p ())
1660             dump_printf_loc (MSG_NOTE, vect_location,
1661                              "Try peeling by %d\n", npeel);
1662         }
1663
1664       /* Ensure that all data refs can be vectorized after the peel.  */
1665       FOR_EACH_VEC_ELT (datarefs, i, dr)
1666         {
1667           int save_misalignment;
1668
1669           if (dr == dr0)
1670             continue;
1671
1672           stmt = DR_STMT (dr);
1673           stmt_info = vinfo_for_stmt (stmt);
1674           /* For interleaving, only the alignment of the first access
1675             matters.  */
1676           if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1677               && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1678             continue;
1679
1680           /* Strided loads perform only component accesses, alignment is
1681              irrelevant for them.  */
1682           if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1683             continue;
1684
1685           save_misalignment = DR_MISALIGNMENT (dr);
1686           vect_update_misalignment_for_peel (dr, dr0, npeel);
1687           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1688           SET_DR_MISALIGNMENT (dr, save_misalignment);
1689
1690           if (!supportable_dr_alignment)
1691             {
1692               do_peeling = false;
1693               break;
1694             }
1695         }
1696
1697       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1698         {
1699           stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1700           if (!stat)
1701             do_peeling = false;
1702           else
1703             {
1704               body_cost_vec.release ();
1705               return stat;
1706             }
1707         }
1708
1709       if (do_peeling)
1710         {
1711           unsigned max_allowed_peel
1712             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1713           if (max_allowed_peel != (unsigned)-1)
1714             {
1715               unsigned max_peel = npeel;
1716               if (max_peel == 0)
1717                 {
1718                   gimple dr_stmt = DR_STMT (dr0);
1719                   stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1720                   tree vtype = STMT_VINFO_VECTYPE (vinfo);
1721                   max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1722                 }
1723               if (max_peel > max_allowed_peel)
1724                 {
1725                   do_peeling = false;
1726                   if (dump_enabled_p ())
1727                     dump_printf_loc (MSG_NOTE, vect_location,
1728                         "Disable peeling, max peels reached: %d\n", max_peel);
1729                 }
1730             }
1731         }
1732
1733       if (do_peeling)
1734         {
1735           stmt_info_for_cost *si;
1736           void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1737
1738           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1739              If the misalignment of DR_i is identical to that of dr0 then set
1740              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
1741              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1742              by the peeling factor times the element size of DR_i (MOD the
1743              vectorization factor times the size).  Otherwise, the
1744              misalignment of DR_i must be set to unknown.  */
1745           FOR_EACH_VEC_ELT (datarefs, i, dr)
1746             if (dr != dr0)
1747               vect_update_misalignment_for_peel (dr, dr0, npeel);
1748
1749           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1750           if (npeel)
1751             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1752           else
1753             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1754               = DR_MISALIGNMENT (dr0);
1755           SET_DR_MISALIGNMENT (dr0, 0);
1756           if (dump_enabled_p ())
1757             {
1758               dump_printf_loc (MSG_NOTE, vect_location,
1759                                "Alignment of access forced using peeling.\n");
1760               dump_printf_loc (MSG_NOTE, vect_location,
1761                                "Peeling for alignment will be applied.\n");
1762             }
1763           /* We've delayed passing the inside-loop peeling costs to the
1764              target cost model until we were sure peeling would happen.
1765              Do so now.  */
1766           if (body_cost_vec.exists ())
1767             {
1768               FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1769                 {
1770                   struct _stmt_vec_info *stmt_info
1771                     = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1772                   (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1773                                         si->misalign, vect_body);
1774                 }
1775               body_cost_vec.release ();
1776             }
1777
1778           stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1779           gcc_assert (stat);
1780           return stat;
1781         }
1782     }
1783
1784   body_cost_vec.release ();
1785
1786   /* (2) Versioning to force alignment.  */
1787
1788   /* Try versioning if:
1789      1) optimize loop for speed
1790      2) there is at least one unsupported misaligned data ref with an unknown
1791         misalignment, and
1792      3) all misaligned data refs with a known misalignment are supported, and
1793      4) the number of runtime alignment checks is within reason.  */
1794
1795   do_versioning =
1796         optimize_loop_nest_for_speed_p (loop)
1797         && (!loop->inner); /* FORNOW */
1798
1799   if (do_versioning)
1800     {
1801       FOR_EACH_VEC_ELT (datarefs, i, dr)
1802         {
1803           stmt = DR_STMT (dr);
1804           stmt_info = vinfo_for_stmt (stmt);
1805
1806           /* For interleaving, only the alignment of the first access
1807              matters.  */
1808           if (aligned_access_p (dr)
1809               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1810                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1811             continue;
1812
1813           /* Strided loads perform only component accesses, alignment is
1814              irrelevant for them.  */
1815           if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1816             continue;
1817
1818           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1819
1820           if (!supportable_dr_alignment)
1821             {
1822               gimple stmt;
1823               int mask;
1824               tree vectype;
1825
1826               if (known_alignment_for_access_p (dr)
1827                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1828                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1829                 {
1830                   do_versioning = false;
1831                   break;
1832                 }
1833
1834               stmt = DR_STMT (dr);
1835               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1836               gcc_assert (vectype);
1837
1838               /* The rightmost bits of an aligned address must be zeros.
1839                  Construct the mask needed for this test.  For example,
1840                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1841                  mask must be 15 = 0xf. */
1842               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1843
1844               /* FORNOW: use the same mask to test all potentially unaligned
1845                  references in the loop.  The vectorizer currently supports
1846                  a single vector size, see the reference to
1847                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1848                  vectorization factor is computed.  */
1849               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1850                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1851               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1852               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1853                       DR_STMT (dr));
1854             }
1855         }
1856
1857       /* Versioning requires at least one misaligned data reference.  */
1858       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1859         do_versioning = false;
1860       else if (!do_versioning)
1861         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1862     }
1863
1864   if (do_versioning)
1865     {
1866       vec<gimple> may_misalign_stmts
1867         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1868       gimple stmt;
1869
1870       /* It can now be assumed that the data references in the statements
1871          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1872          of the loop being vectorized.  */
1873       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1874         {
1875           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1876           dr = STMT_VINFO_DATA_REF (stmt_info);
1877           SET_DR_MISALIGNMENT (dr, 0);
1878           if (dump_enabled_p ())
1879             dump_printf_loc (MSG_NOTE, vect_location,
1880                              "Alignment of access forced using versioning.\n");
1881         }
1882
1883       if (dump_enabled_p ())
1884         dump_printf_loc (MSG_NOTE, vect_location,
1885                          "Versioning for alignment will be applied.\n");
1886
1887       /* Peeling and versioning can't be done together at this time.  */
1888       gcc_assert (! (do_peeling && do_versioning));
1889
1890       stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1891       gcc_assert (stat);
1892       return stat;
1893     }
1894
1895   /* This point is reached if neither peeling nor versioning is being done.  */
1896   gcc_assert (! (do_peeling || do_versioning));
1897
1898   stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1899   return stat;
1900 }
1901
1902
1903 /* Function vect_find_same_alignment_drs.
1904
1905    Update group and alignment relations according to the chosen
1906    vectorization factor.

        DDR is a dependence relation between the two data references DDR_A
        and DDR_B; LOOP_VINFO supplies the loop and its vectorization factor.
        For each distance vector of DDR, when the distance at this loop's
        index in the loop nest is zero, or is a multiple of the vectorization
        factor while both references have the same scalar mode size, each
        reference is pushed onto the other's STMT_VINFO_SAME_ALIGN_REFS.

        Nothing is recorded when the relation is chrec_known or
        chrec_dont_know, when both sides are the same reference, when there
        are no distance vectors, or when the two DR_STEPs differ.  */
1907
1908 static void
1909 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1910                               loop_vec_info loop_vinfo)
1911 {
1912   unsigned int i;
1913   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1914   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1915   struct data_reference *dra = DDR_A (ddr);
1916   struct data_reference *drb = DDR_B (ddr);
1917   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1918   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
       /* Mode size of the scalar type accessed through each reference.  */
1919   int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1920   int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1921   lambda_vector dist_v;
1922   unsigned int loop_depth;
1923
       /* The early exits below leave both same-alignment lists untouched.  */
1924   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1925     return;
1926
       /* A reference paired with itself carries no new information.  */
1927   if (dra == drb)
1928     return;
1929
1930   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1931     return;
1932
1933   /* Loop-based vectorization and known data dependence.  */
1934   if (DDR_NUM_DIST_VECTS (ddr) == 0)
1935     return;
1936
1937   /* Data-dependence analysis reports a distance vector of zero
1938      for data-references that overlap only in the first iteration
1939      but have different sign step (see PR45764).
1940      So as a sanity check require equal DR_STEP.  */
1941   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1942     return;
1943
       /* Index of this loop within the relation's loop nest; it selects the
          component of each distance vector that is examined below.  */
1944   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1945   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1946     {
1947       int dist = dist_v[loop_depth];
1948
1949       if (dump_enabled_p ())
1950         dump_printf_loc (MSG_NOTE, vect_location,
1951                          "dependence distance  = %d.\n", dist);
1952
1953       /* Same loop iteration, or iterations a whole number of
              vectorization factors apart with equally sized elements.  */
1954       if (dist == 0
1955           || (dist % vectorization_factor == 0 && dra_size == drb_size))
1956         {
1957           /* Two references with distance zero have the same alignment.
                  The relation is recorded symmetrically: each reference is
                  added to the other's same-alignment list.  */
1958           STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1959           STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1960           if (dump_enabled_p ())
1961             {
1962               dump_printf_loc (MSG_NOTE, vect_location,
1963                                "accesses have the same alignment.\n");
1964               dump_printf (MSG_NOTE,
1965                            "dependence distance modulo vf == 0 between ");
1966               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1967               dump_printf (MSG_NOTE,  " and ");
1968               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1969               dump_printf (MSG_NOTE, "\n");
1970             }
1971         }
1972     }
1973 }
1974
1975
1976 /* Function vect_analyze_data_refs_alignment
1977
1978    Analyze the alignment of the data-references in the loop.
1979    Return FALSE if a data reference is found that cannot be vectorized.

        LOOP_VINFO may be null: it is tested before use, and the
        same-alignment marking over LOOP_VINFO_DDRS is skipped in that case.
        BB_VINFO is only forwarded, together with LOOP_VINFO, to
        vect_compute_data_refs_alignment.  The result is FALSE exactly when
        that call fails, and TRUE otherwise.  */
1980
1981 bool
1982 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1983                                   bb_vec_info bb_vinfo)
1984 {
1985   if (dump_enabled_p ())
1986     dump_printf_loc (MSG_NOTE, vect_location,
1987                      "=== vect_analyze_data_refs_alignment ===\n");
1988
1989   /* Mark groups of data references with same alignment using
1990      data dependence information.  */
1991   if (loop_vinfo)
1992     {
1993       vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1994       struct data_dependence_relation *ddr;
1995       unsigned int i;
1996
           /* Every dependence relation of the loop is examined once.  */
1997       FOR_EACH_VEC_ELT (ddrs, i, ddr)
1998         vect_find_same_alignment_drs (ddr, loop_vinfo);
1999     }
2000
       /* Runs after the marking above, for both the loop and the
          basic-block case.  */
2001   if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2002     {
2003       if (dump_enabled_p ())
2004         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2005                          "not vectorized: can't calculate alignment "
2006                          "for data ref.\n");
2007       return false;
2008     }
2009
2010   return true;
2011 }
2012
2013
2014 /* Analyze groups of accesses: check that DR belongs to a group of
2015    accesses of legal size, step, etc.  Detect gaps, single element
2016    interleaving, and other special cases. Set grouped access info.
2017    Collect groups of strided stores for further use in SLP analysis.

        Three cases are distinguished by GROUP_FIRST_ELEMENT of DR's
        statement:
        - Unset: DR is accepted only as single element interleaving, i.e. a
          load whose step is a multiple of the type size and whose group size
          is a positive power of two.  Otherwise the access is rejected,
          except that for basic-block vectorization the statement is merely
          marked unvectorizable and TRUE is returned.
        - DR's own statement: the chain is walked through GROUP_NEXT_ELEMENT,
          GROUP_SAME_DR_STMT, GROUP_GAP and GROUP_SIZE are filled in, and a
          store group for which SLP was not ruled out is pushed onto
          LOOP_VINFO_GROUPED_STORES and/or BB_VINFO_GROUPED_STORES.
        - Some other statement: nothing is checked here and TRUE is returned.

        In the loop case a group with a trailing gap sets
        LOOP_VINFO_PEELING_FOR_GAPS, or fails when the loop has an inner
        loop.  Returns FALSE when the access cannot be handled.  */
2018
2019 static bool
2020 vect_analyze_group_access (struct data_reference *dr)
2021 {
2022   tree step = DR_STEP (dr);
2023   tree scalar_type = TREE_TYPE (DR_REF (dr));
2024   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2025   gimple stmt = DR_STMT (dr);
2026   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2027   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2028   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
       /* NOTE(review): STEP is read with TREE_INT_CST_LOW without being
          checked here; presumably callers only pass references whose step
          is an INTEGER_CST -- confirm.  */
2029   HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
       /* LAST_ACCESSED_ELEMENT starts at 1 to account for DR itself.  */
2030   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2031   bool slp_impossible = false;
2032   struct loop *loop = NULL;
2033
       /* LOOP stays NULL for basic-block vectorization.  */
2034   if (loop_vinfo)
2035     loop = LOOP_VINFO_LOOP (loop_vinfo);
2036
2037   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2038      size of the interleaving group (including gaps).  The division
          truncates; whether STEP is an exact multiple of the type size is
          tested separately further down.  */
2039   groupsize = absu_hwi (dr_step) / type_size;
2040
2041   /* Not consecutive access is possible only if it is a part of interleaving.  */
2042   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2043     {
2044       /* Check if this DR is a part of interleaving, and is a single
2045          element of the group that is accessed in the loop.  */
2046
2047       /* Gaps are supported only for loads. STEP must be a multiple of the type
2048          size.  The size of the group must be a power of 2.  */
2049       if (DR_IS_READ (dr)
2050           && (dr_step % type_size) == 0
2051           && groupsize > 0
2052           && exact_log2 (groupsize) != -1)
2053         {
               /* The statement becomes the head of its own group.  */
2054           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2055           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2056           if (dump_enabled_p ())
2057             {
2058               dump_printf_loc (MSG_NOTE, vect_location,
2059                                "Detected single element interleaving ");
2060               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2061               dump_printf (MSG_NOTE, " step ");
2062               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2063               dump_printf (MSG_NOTE, "\n");
2064             }
2065
               /* In a loop this kind of access needs peeling for gaps,
                  which is refused when the loop contains an inner loop.  */
2066           if (loop_vinfo)
2067             {
2068               if (dump_enabled_p ())
2069                 dump_printf_loc (MSG_NOTE, vect_location,
2070                                  "Data access with gaps requires scalar "
2071                                  "epilogue loop\n");
2072               if (loop->inner)
2073                 {
2074                   if (dump_enabled_p ())
2075                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2076                                      "Peeling for outer loop is not"
2077                                      " supported\n");
2078                   return false;
2079                 }
2080
2081               LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2082             }
2083
2084           return true;
2085         }
2086
           /* Not a single element interleaving: report it and reject.  */
2087       if (dump_enabled_p ())
2088         {
2089           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2090                            "not consecutive access ");
2091           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2092           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2093         }
2094
2095       if (bb_vinfo)
2096         {
2097           /* Mark the statement as unvectorizable.  TRUE is still
                  returned in this case.  */
2098           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2099           return true;
2100         }
2101
2102       return false;
2103     }
2104
       /* Only the first statement of a group validates the chain; any other
          member falls through to the final return.  */
2105   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2106     {
2107       /* First stmt in the interleaving chain. Check the chain.  */
2108       gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2109       struct data_reference *data_ref = dr;
           /* COUNT starts at 1 for STMT itself.  */
2110       unsigned int count = 1;
2111       tree prev_init = DR_INIT (data_ref);
2112       gimple prev = stmt;
2113       HOST_WIDE_INT diff, gaps = 0;
2114       unsigned HOST_WIDE_INT count_in_bytes;
2115
2116       while (next)
2117         {
2118           /* Skip same data-refs.  In case that two or more stmts share
2119              data-ref (supported only for loads), we vectorize only the first
2120              stmt, and the rest get their vectorized loads from the first
2121              one.  */
2122           if (!tree_int_cst_compare (DR_INIT (data_ref),
2123                                      DR_INIT (STMT_VINFO_DATA_REF (
2124                                                    vinfo_for_stmt (next)))))
2125             {
2126               if (DR_IS_WRITE (data_ref))
2127                 {
2128                   if (dump_enabled_p ())
2129                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2130                                      "Two store stmts share the same dr.\n");
2131                   return false;
2132                 }
2133
2134               /* For load use the same data-ref load.  */
2135               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2136
                   /* Duplicates are not counted: COUNT, PREV_INIT and
                      LAST_ACCESSED_ELEMENT are left unchanged.  */
2137               prev = next;
2138               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2139               continue;
2140             }
2141
2142           prev = next;
2143           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2144
2145           /* All group members have the same STEP by construction.  */
2146           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2147
2148           /* Check that the distance between two accesses is equal to the type
2149              size. Otherwise, we have gaps.
                  NOTE(review): DIFF comes from a truncating division and is
                  not validated as positive here; presumably the chain is
                  ordered by increasing DR_INIT -- confirm.  */
2150           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2151                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2152           if (diff != 1)
2153             {
2154               /* FORNOW: SLP of accesses with gaps is not supported.  */
2155               slp_impossible = true;
2156               if (DR_IS_WRITE (data_ref))
2157                 {
2158                   if (dump_enabled_p ())
2159                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2160                                      "interleaved store with gaps\n");
2161                   return false;
2162                 }
2163
                   /* Accumulate the number of skipped elements.  */
2164               gaps += diff - 1;
2165             }
2166
2167           last_accessed_element += diff;
2168
2169           /* Store the gap from the previous member of the group. If there is no
2170              gap in the access, GROUP_GAP is always 1.  */
2171           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2172
2173           prev_init = DR_INIT (data_ref);
2174           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2175           /* Count the number of data-refs in the chain.  */
2176           count++;
2177         }
2178
2179       /* COUNT is the number of accesses found, we multiply it by the size of
2180          the type to get COUNT_IN_BYTES.  */
2181       count_in_bytes = type_size * count;
2182
2183       /* Check that the size of the interleaving (including gaps) is not
2184          greater than STEP.  A zero STEP is exempt from this and from the
              two checks that follow.  */
2185       if (dr_step != 0
2186           && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2187         {
2188           if (dump_enabled_p ())
2189             {
2190               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2191                                "interleaving size is greater than step for ");
2192               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2193                                  DR_REF (dr));
2194               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2195             }
2196           return false;
2197         }
2198
2199       /* Check that the size of the interleaving is equal to STEP for stores,
2200          i.e., that there are no gaps.  For loads a mismatch is tolerated
              but rules out SLP.  */
2201       if (dr_step != 0
2202           && absu_hwi (dr_step) != count_in_bytes)
2203         {
2204           if (DR_IS_READ (dr))
2205             {
2206               slp_impossible = true;
2207               /* There is a gap after the last load in the group. This gap is a
2208                  difference between the groupsize and the number of elements.
2209                  When there is no gap, this difference should be 0.  */
2210               GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2211             }
2212           else
2213             {
2214               if (dump_enabled_p ())
2215                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2216                                  "interleaved store with gaps\n");
2217               return false;
2218             }
2219         }
2220
2221       /* Check that STEP is a multiple of type size.  */
2222       if (dr_step != 0
2223           && (dr_step % type_size) != 0)
2224         {
2225           if (dump_enabled_p ())
2226             {
2227               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2228                                "step is not a multiple of type size: step ");
2229               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2230               dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2231               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2232                                  TYPE_SIZE_UNIT (scalar_type));
2233               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2234             }
2235           return false;
2236         }
2237
           /* GROUPSIZE can only be zero here for a zero STEP, because a
              nonzero STEP smaller than the type size was rejected by the
              multiple-of-type-size check above.  Use the member count.  */
2238       if (groupsize == 0)
2239         groupsize = count;
2240
2241       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2242       if (dump_enabled_p ())
2243         dump_printf_loc (MSG_NOTE, vect_location,
2244                          "Detected interleaving of size %d\n", (int)groupsize);
2245
2246       /* SLP: create an SLP data structure for every interleaving group of
2247          stores for further analysis in vect_analyse_slp.  */
2248       if (DR_IS_WRITE (dr) && !slp_impossible)
2249         {
2250           if (loop_vinfo)
2251             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2252           if (bb_vinfo)
2253             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2254         }
2255
2256       /* There is a gap in the end of the group.  This is detected when
              the last accessed element lies before the end of the group, and
              is only acted on in the loop case.  */
2257       if (groupsize - last_accessed_element > 0 && loop_vinfo)
2258         {
2259           if (dump_enabled_p ())
2260             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2261                              "Data access with gaps requires scalar "
2262                              "epilogue loop\n");
2263           if (loop->inner)
2264             {
2265               if (dump_enabled_p ())
2266                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2267                                  "Peeling for outer loop is not supported\n");
2268               return false;
2269             }
2270
2271           LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2272         }
2273     }
2274
2275   return true;
2276 }
2277
2278
2279 /* Analyze the access pattern of the data-reference DR.
2280    In case of non-consecutive accesses call vect_analyze_group_access() to
2281    analyze groups of accesses.  */
2282
2283 static bool
2284 vect_analyze_data_ref_access (struct data_reference *dr)
2285 {
2286   tree step = DR_STEP (dr);
2287   tree scalar_type = TREE_TYPE (DR_REF (dr));
2288   gimple stmt = DR_STMT (dr);
2289   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2290   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2291   struct loop *loop = NULL;
2292
2293   if (loop_vinfo)
2294     loop = LOOP_VINFO_LOOP (loop_vinfo);
2295
2296   if (loop_vinfo && !step)
2297     {
2298       if (dump_enabled_p ())
2299         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2300                          "bad data-ref access in loop\n");
2301       return false;
2302     }
2303
2304   /* Allow invariant loads in not nested loops.  */
2305   if (loop_vinfo && integer_zerop (step))
2306     {
2307       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2308       if (nested_in_vect_loop_p (loop, stmt))
2309         {
2310           if (dump_enabled_p ())
2311             dump_printf_loc (MSG_NOTE, vect_location,
2312                              "zero step in inner loop of nest\n");
2313           return false;
2314         }
2315       return DR_IS_READ (dr);
2316     }
2317
2318   if (loop && nested_in_vect_loop_p (loop, stmt))
2319     {
2320       /* Interleaved accesses are not yet supported within outer-loop
2321         vectorization for references in the inner-loop.  */
2322       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2323
2324       /* For the rest of the analysis we use the outer-loop step.  */
2325       step = STMT_VINFO_DR_STEP (stmt_info);
2326       if (integer_zerop (step))
2327         {
2328           if (dump_enabled_p ())
2329             dump_printf_loc (MSG_NOTE, vect_location,
2330                              "zero step in outer loop.\n");
2331           if (DR_IS_READ (dr))
2332             return true;
2333           else
2334             return false;
2335         }
2336     }
2337
2338   /* Consecutive?  */
2339   if (TREE_CODE (step) == INTEGER_CST)
2340     {
2341       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2342       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2343           || (dr_step < 0
2344               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2345         {
2346           /* Mark that it is not interleaving.  */
2347           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2348           return true;
2349         }
2350     }
2351
2352   if (loop && nested_in_vect_loop_p (loop, stmt))
2353     {
2354       if (dump_enabled_p ())
2355         dump_printf_loc (MSG_NOTE, vect_location,
2356                          "grouped access in outer loop.\n");
2357       return false;
2358     }
2359
2360   /* Assume this is a DR handled by non-constant strided load case.  */
2361   if (TREE_CODE (step) != INTEGER_CST)
2362     return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2363
2364   /* Not consecutive access - check if it's a part of interleaving group.  */
2365   return vect_analyze_group_access (dr);
2366 }
2367
2368
2369
2370 /*  A helper function used in the comparator function to sort data
2371     references.  T1 and T2 are two data references to be compared.
2372     The function returns -1, 0, or 1.  */
2373
2374 static int
2375 compare_tree (tree t1, tree t2)
2376 {
2377   int i, cmp;
2378   enum tree_code code;
2379   char tclass;
2380
2381   if (t1 == t2)
2382     return 0;
2383   if (t1 == NULL)
2384     return -1;
2385   if (t2 == NULL)
2386     return 1;
2387
2388
2389   if (TREE_CODE (t1) != TREE_CODE (t2))
2390     return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2391
2392   code = TREE_CODE (t1);
2393   switch (code)
2394     {
2395     /* For const values, we can just use hash values for comparisons.  */
2396     case INTEGER_CST:
2397     case REAL_CST:
2398     case FIXED_CST:
2399     case STRING_CST:
2400     case COMPLEX_CST:
2401     case VECTOR_CST:
2402       {
2403         hashval_t h1 = iterative_hash_expr (t1, 0);
2404         hashval_t h2 = iterative_hash_expr (t2, 0);
2405         if (h1 != h2)
2406           return h1 < h2 ? -1 : 1;
2407         break;
2408       }
2409
2410     case SSA_NAME:
2411       cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2412       if (cmp != 0)
2413         return cmp;
2414
2415       if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2416         return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2417       break;
2418
2419     default:
2420       tclass = TREE_CODE_CLASS (code);
2421
2422       /* For var-decl, we could compare their UIDs.  */
2423       if (tclass == tcc_declaration)
2424         {
2425           if (DECL_UID (t1) != DECL_UID (t2))
2426             return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2427           break;
2428         }
2429
2430       /* For expressions with operands, compare their operands recursively.  */
2431       for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2432         {
2433           cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2434           if (cmp != 0)
2435             return cmp;
2436         }
2437     }
2438
2439   return 0;
2440 }
2441
2442
2443 /* Compare two data-references DRA and DRB to group them into chunks
2444    suitable for grouping.  */
2445
2446 static int
2447 dr_group_sort_cmp (const void *dra_, const void *drb_)
2448 {
2449   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2450   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2451   int cmp;
2452
2453   /* Stabilize sort.  */
2454   if (dra == drb)
2455     return 0;
2456
2457   /* Ordering of DRs according to base.  */
2458   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2459     {
2460       cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2461       if (cmp != 0)
2462         return cmp;
2463     }
2464
2465   /* And according to DR_OFFSET.  */
2466   if (!dr_equal_offsets_p (dra, drb))
2467     {
2468       cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2469       if (cmp != 0)
2470         return cmp;
2471     }
2472
2473   /* Put reads before writes.  */
2474   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2475     return DR_IS_READ (dra) ? -1 : 1;
2476
2477   /* Then sort after access size.  */
2478   if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2479                         TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2480     {
2481       cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2482                           TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2483       if (cmp != 0)
2484         return cmp;
2485     }
2486
2487   /* And after step.  */
2488   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2489     {
2490       cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2491       if (cmp != 0)
2492         return cmp;
2493     }
2494
2495   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2496   cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2497   if (cmp == 0)
2498     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2499   return cmp;
2500 }
2501
2502 /* Function vect_analyze_data_ref_accesses.
2503
2504    Analyze the access pattern of all the data references in the loop.
2505
2506    FORNOW: the only access pattern that is considered vectorizable is a
2507            simple step 1 (consecutive) access.
2508
2509    FORNOW: handle only arrays and pointer accesses.  */
2510
2511 bool
2512 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2513 {
2514   unsigned int i;
2515   vec<data_reference_p> datarefs;
2516   struct data_reference *dr;
2517
2518   if (dump_enabled_p ())
2519     dump_printf_loc (MSG_NOTE, vect_location,
2520                      "=== vect_analyze_data_ref_accesses ===\n");
2521
2522   if (loop_vinfo)
2523     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2524   else
2525     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2526
2527   if (datarefs.is_empty ())
2528     return true;
2529
2530   /* Sort the array of datarefs to make building the interleaving chains
2531      linear.  Don't modify the original vector's order, it is needed for
2532      determining what dependencies are reversed.  */
2533   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2534   datarefs_copy.qsort (dr_group_sort_cmp);
2535
2536   /* Build the interleaving chains.  */
2537   for (i = 0; i < datarefs_copy.length () - 1;)
2538     {
2539       data_reference_p dra = datarefs_copy[i];
2540       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2541       stmt_vec_info lastinfo = NULL;
2542       for (i = i + 1; i < datarefs_copy.length (); ++i)
2543         {
2544           data_reference_p drb = datarefs_copy[i];
2545           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2546
2547           /* ???  Imperfect sorting (non-compatible types, non-modulo
2548              accesses, same accesses) can lead to a group to be artificially
2549              split here as we don't just skip over those.  If it really
2550              matters we can push those to a worklist and re-iterate
2551              over them.  The we can just skip ahead to the next DR here.  */
2552
2553           /* Check that the data-refs have same first location (except init)
2554              and they are both either store or load (not load and store,
2555              not masked loads or stores).  */
2556           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2557               || !operand_equal_p (DR_BASE_ADDRESS (dra),
2558                                    DR_BASE_ADDRESS (drb), 0)
2559               || !dr_equal_offsets_p (dra, drb)
2560               || !gimple_assign_single_p (DR_STMT (dra))
2561               || !gimple_assign_single_p (DR_STMT (drb)))
2562             break;
2563
2564           /* Check that the data-refs have the same constant size and step.  */
2565           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2566           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2567           if (!tree_fits_uhwi_p (sza)
2568               || !tree_fits_uhwi_p (szb)
2569               || !tree_int_cst_equal (sza, szb)
2570               || !tree_fits_shwi_p (DR_STEP (dra))
2571               || !tree_fits_shwi_p (DR_STEP (drb))
2572               || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2573             break;
2574
2575           /* Do not place the same access in the interleaving chain twice.  */
2576           if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2577             break;
2578
2579           /* Check the types are compatible.
2580              ???  We don't distinguish this during sorting.  */
2581           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2582                                    TREE_TYPE (DR_REF (drb))))
2583             break;
2584
2585           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2586           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2587           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2588           gcc_assert (init_a < init_b);
2589
2590           /* If init_b == init_a + the size of the type * k, we have an
2591              interleaving, and DRA is accessed before DRB.  */
2592           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2593           if ((init_b - init_a) % type_size_a != 0)
2594             break;
2595
2596           /* The step (if not zero) is greater than the difference between
2597              data-refs' inits.  This splits groups into suitable sizes.  */
2598           HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2599           if (step != 0 && step <= (init_b - init_a))
2600             break;
2601
2602           if (dump_enabled_p ())
2603             {
2604               dump_printf_loc (MSG_NOTE, vect_location,
2605                                "Detected interleaving ");
2606               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2607               dump_printf (MSG_NOTE,  " and ");
2608               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2609               dump_printf (MSG_NOTE, "\n");
2610             }
2611
2612           /* Link the found element into the group list.  */
2613           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2614             {
2615               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2616               lastinfo = stmtinfo_a;
2617             }
2618           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2619           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2620           lastinfo = stmtinfo_b;
2621         }
2622     }
2623
2624   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2625     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2626         && !vect_analyze_data_ref_access (dr))
2627       {
2628         if (dump_enabled_p ())
2629           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2630                            "not vectorized: complicated access pattern.\n");
2631
2632         if (bb_vinfo)
2633           {
2634             /* Mark the statement as not vectorizable.  */
2635             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2636             continue;
2637           }
2638         else
2639           {
2640             datarefs_copy.release ();
2641             return false;
2642           }
2643       }
2644
2645   datarefs_copy.release ();
2646   return true;
2647 }
2648
2649
2650 /* Operator == between two dr_with_seg_len objects.
2651
2652    This equality operator is used to make sure two data refs
2653    are the same one so that we will consider to combine the
2654    aliasing checks of those two pairs of data dependent data
2655    refs.  */
2656
2657 static bool
2658 operator == (const dr_with_seg_len& d1,
2659              const dr_with_seg_len& d2)
2660 {
2661   return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2662                           DR_BASE_ADDRESS (d2.dr), 0)
2663            && compare_tree (d1.offset, d2.offset) == 0
2664            && compare_tree (d1.seg_len, d2.seg_len) == 0;
2665 }
2666
2667 /* Function comp_dr_with_seg_len_pair.
2668
2669    Comparison function for sorting objects of dr_with_seg_len_pair_t
2670    so that we can combine aliasing checks in one scan.  */
2671
2672 static int
2673 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2674 {
2675   const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2676   const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2677
2678   const dr_with_seg_len &p11 = p1->first,
2679                         &p12 = p1->second,
2680                         &p21 = p2->first,
2681                         &p22 = p2->second;
2682
2683   /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2684      if a and c have the same basic address snd step, and b and d have the same
2685      address and step.  Therefore, if any a&c or b&d don't have the same address
2686      and step, we don't care the order of those two pairs after sorting.  */
2687   int comp_res;
2688
2689   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2690                                 DR_BASE_ADDRESS (p21.dr))) != 0)
2691     return comp_res;
2692   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2693                                 DR_BASE_ADDRESS (p22.dr))) != 0)
2694     return comp_res;
2695   if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2696     return comp_res;
2697   if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2698     return comp_res;
2699   if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2700     return comp_res;
2701   if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2702     return comp_res;
2703
2704   return 0;
2705 }
2706
/* Exchange the values of A and B, using one copy construction and two
   assignments of T.  */

template <class T> static void
swap (T& a, T& b)
{
  T old_a (a);

  a = b;
  b = old_a;
}
2714
2715 /* Function vect_vfa_segment_size.
2716
2717    Create an expression that computes the size of segment
2718    that will be accessed for a data reference.  The functions takes into
2719    account that realignment loads may access one more vector.
2720
2721    Input:
2722      DR: The data reference.
2723      LENGTH_FACTOR: segment length to consider.
2724
2725    Return an expression whose value is the size of segment which will be
2726    accessed by DR.  */
2727
2728 static tree
2729 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2730 {
2731   tree segment_length;
2732
2733   if (integer_zerop (DR_STEP (dr)))
2734     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2735   else
2736     segment_length = size_binop (MULT_EXPR,
2737                                  fold_convert (sizetype, DR_STEP (dr)),
2738                                  fold_convert (sizetype, length_factor));
2739
2740   if (vect_supportable_dr_alignment (dr, false)
2741         == dr_explicit_realign_optimized)
2742     {
2743       tree vector_size = TYPE_SIZE_UNIT
2744                           (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2745
2746       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2747     }
2748   return segment_length;
2749 }
2750
2751 /* Function vect_prune_runtime_alias_test_list.
2752
2753    Prune a list of ddrs to be tested at run-time by versioning for alias.
2754    Merge several alias checks into one if possible.
2755    Return FALSE if resulting list of ddrs is longer then allowed by
2756    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
2757
2758 bool
2759 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2760 {
2761   vec<ddr_p> may_alias_ddrs =
2762     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2763   vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2764     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2765   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2766   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2767
2768   ddr_p ddr;
2769   unsigned int i;
2770   tree length_factor;
2771
2772   if (dump_enabled_p ())
2773     dump_printf_loc (MSG_NOTE, vect_location,
2774                      "=== vect_prune_runtime_alias_test_list ===\n");
2775
2776   if (may_alias_ddrs.is_empty ())
2777     return true;
2778
2779   /* Basically, for each pair of dependent data refs store_ptr_0
2780      and load_ptr_0, we create an expression:
2781
2782      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2783      || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2784
2785      for aliasing checks.  However, in some cases we can decrease
2786      the number of checks by combining two checks into one.  For
2787      example, suppose we have another pair of data refs store_ptr_0
2788      and load_ptr_1, and if the following condition is satisfied:
2789
2790      load_ptr_0 < load_ptr_1  &&
2791      load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2792
2793      (this condition means, in each iteration of vectorized loop,
2794      the accessed memory of store_ptr_0 cannot be between the memory
2795      of load_ptr_0 and load_ptr_1.)
2796
2797      we then can use only the following expression to finish the
2798      alising checks between store_ptr_0 & load_ptr_0 and
2799      store_ptr_0 & load_ptr_1:
2800
2801      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2802      || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2803
2804      Note that we only consider that load_ptr_0 and load_ptr_1 have the
2805      same basic address.  */
2806
2807   comp_alias_ddrs.create (may_alias_ddrs.length ());
2808
2809   /* First, we collect all data ref pairs for aliasing checks.  */
2810   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2811     {
2812       struct data_reference *dr_a, *dr_b;
2813       gimple dr_group_first_a, dr_group_first_b;
2814       tree segment_length_a, segment_length_b;
2815       gimple stmt_a, stmt_b;
2816
2817       dr_a = DDR_A (ddr);
2818       stmt_a = DR_STMT (DDR_A (ddr));
2819       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2820       if (dr_group_first_a)
2821         {
2822           stmt_a = dr_group_first_a;
2823           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2824         }
2825
2826       dr_b = DDR_B (ddr);
2827       stmt_b = DR_STMT (DDR_B (ddr));
2828       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2829       if (dr_group_first_b)
2830         {
2831           stmt_b = dr_group_first_b;
2832           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2833         }
2834
2835       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2836         length_factor = scalar_loop_iters;
2837       else
2838         length_factor = size_int (vect_factor);
2839       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2840       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2841
2842       dr_with_seg_len_pair_t dr_with_seg_len_pair
2843           (dr_with_seg_len (dr_a, segment_length_a),
2844            dr_with_seg_len (dr_b, segment_length_b));
2845
2846       if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2847         swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2848
2849       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2850     }
2851
2852   /* Second, we sort the collected data ref pairs so that we can scan
2853      them once to combine all possible aliasing checks.  */
2854   comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2855
2856   /* Third, we scan the sorted dr pairs and check if we can combine
2857      alias checks of two neighbouring dr pairs.  */
2858   for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2859     {
2860       /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2).  */
2861       dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2862                       *dr_b1 = &comp_alias_ddrs[i-1].second,
2863                       *dr_a2 = &comp_alias_ddrs[i].first,
2864                       *dr_b2 = &comp_alias_ddrs[i].second;
2865
2866       /* Remove duplicate data ref pairs.  */
2867       if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2868         {
2869           if (dump_enabled_p ())
2870             {
2871               dump_printf_loc (MSG_NOTE, vect_location,
2872                                "found equal ranges ");
2873               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2874                                  DR_REF (dr_a1->dr));
2875               dump_printf (MSG_NOTE,  ", ");
2876               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2877                                  DR_REF (dr_b1->dr));
2878               dump_printf (MSG_NOTE,  " and ");
2879               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2880                                  DR_REF (dr_a2->dr));
2881               dump_printf (MSG_NOTE,  ", ");
2882               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2883                                  DR_REF (dr_b2->dr));
2884               dump_printf (MSG_NOTE, "\n");
2885             }
2886
2887           comp_alias_ddrs.ordered_remove (i--);
2888           continue;
2889         }
2890
2891       if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2892         {
2893           /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2894              and DR_A1 and DR_A2 are two consecutive memrefs.  */
2895           if (*dr_a1 == *dr_a2)
2896             {
2897               swap (dr_a1, dr_b1);
2898               swap (dr_a2, dr_b2);
2899             }
2900
2901           if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2902                                 DR_BASE_ADDRESS (dr_a2->dr),
2903                                 0)
2904               || !tree_fits_shwi_p (dr_a1->offset)
2905               || !tree_fits_shwi_p (dr_a2->offset))
2906             continue;
2907
2908           HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2909                                 - tree_to_shwi (dr_a1->offset));
2910
2911
2912           /* Now we check if the following condition is satisfied:
2913
2914              DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2915
2916              where DIFF = DR_A2->OFFSET - DR_A1->OFFSET.  However,
2917              SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2918              have to make a best estimation.  We can get the minimum value
2919              of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2920              then either of the following two conditions can guarantee the
2921              one above:
2922
2923              1: DIFF <= MIN_SEG_LEN_B
2924              2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2925
2926              */
2927
2928           HOST_WIDE_INT  min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2929                                           ? tree_to_shwi (dr_b1->seg_len)
2930                                           : vect_factor);
2931
2932           if (diff <= min_seg_len_b
2933               || (tree_fits_shwi_p (dr_a1->seg_len)
2934                   && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2935             {
2936               if (dump_enabled_p ())
2937                 {
2938                   dump_printf_loc (MSG_NOTE, vect_location,
2939                                    "merging ranges for ");
2940                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2941                                      DR_REF (dr_a1->dr));
2942                   dump_printf (MSG_NOTE,  ", ");
2943                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2944                                      DR_REF (dr_b1->dr));
2945                   dump_printf (MSG_NOTE,  " and ");
2946                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2947                                      DR_REF (dr_a2->dr));
2948                   dump_printf (MSG_NOTE,  ", ");
2949                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2950                                      DR_REF (dr_b2->dr));
2951                   dump_printf (MSG_NOTE, "\n");
2952                 }
2953
2954               dr_a1->seg_len = size_binop (PLUS_EXPR,
2955                                            dr_a2->seg_len, size_int (diff));
2956               comp_alias_ddrs.ordered_remove (i--);
2957             }
2958         }
2959     }
2960
2961   dump_printf_loc (MSG_NOTE, vect_location,
2962                    "improved number of alias checks from %d to %d\n",
2963                    may_alias_ddrs.length (), comp_alias_ddrs.length ());
2964   if ((int) comp_alias_ddrs.length () >
2965       PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2966     return false;
2967
2968   return true;
2969 }
2970
/* Check whether a non-affine read in stmt is suitable for gather load
   and if so, return a builtin decl for that operation.

   STMT is the statement to check and LOOP_VINFO describes its loop.  The
   address is decomposed into a loop-invariant BASE plus an SSA_NAME OFF
   defined in the loop times a constant SCALE.  On success the decl
   returned by targetm.vectorize.builtin_gather is returned and, for each
   of BASEP, OFFP and SCALEP that is non-NULL, the corresponding component
   is stored through it.  NULL_TREE is returned when no such decomposition
   is found or the target hook provides no builtin; the output pointers are
   left untouched in that case.  */

tree
vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
                   tree *offp, int *scalep)
{
  HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  tree offtype = NULL_TREE;
  tree decl, base, off;
  enum machine_mode pmode;
  int punsignedp, pvolatilep;

  base = DR_REF (dr);
  /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
     see if we can use the def stmt of the address.  */
  if (is_gimple_call (stmt)
      && gimple_call_internal_p (stmt)
      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
          || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
      && TREE_CODE (base) == MEM_REF
      && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
      && integer_zerop (TREE_OPERAND (base, 1))
      && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
    {
      gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
      /* If the address is defined as &REF, continue with REF itself.  */
      if (is_gimple_assign (def_stmt)
          && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
        base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
    }

  /* The gather builtins need address of the form
     loop_invariant + vector * {1, 2, 4, 8}
     or
     loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
     Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
     of loop invariants/SSA_NAMEs defined in the loop, with casts,
     multiplications and additions in it.  To get a vector, we need
     a single SSA_NAME that will be defined in the loop and will
     contain everything that is not loop invariant and that can be
     vectorized.  The following code attempts to find such a preexisting
     SSA_NAME OFF and put the loop invariants into a tree BASE
     that can be gimplified before the loop.  */
  base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
                              &pmode, &punsignedp, &pvolatilep, false);
  gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);

  if (TREE_CODE (base) == MEM_REF)
    {
      /* Fold a non-zero MEM_REF offset into OFF and continue with the
         pointer operand as BASE.  */
      if (!integer_zerop (TREE_OPERAND (base, 1)))
        {
          if (off == NULL_TREE)
            {
              offset_int moff = mem_ref_offset (base);
              off = wide_int_to_tree (sizetype, moff);
            }
          else
            off = size_binop (PLUS_EXPR, off,
                              fold_convert (sizetype, TREE_OPERAND (base, 1)));
        }
      base = TREE_OPERAND (base, 0);
    }
  else
    /* Not a MEM_REF: use the address of the base object.  */
    base = build_fold_addr_expr (base);

  if (off == NULL_TREE)
    off = size_zero_node;

  /* If base is not loop invariant, either off is 0, then we start with just
     the constant offset in the loop invariant BASE and continue with base
     as OFF, otherwise give up.
     We could handle that case by gimplifying the addition of base + off
     into some SSA_NAME and use that as off, but for now punt.  */
  if (!expr_invariant_in_loop_p (loop, base))
    {
      if (!integer_zerop (off))
        return NULL_TREE;
      off = base;
      base = size_int (pbitpos / BITS_PER_UNIT);
    }
  /* Otherwise put base + constant offset into the loop invariant BASE
     and continue with OFF.  */
  else
    {
      base = fold_convert (sizetype, base);
      base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
    }

  /* OFF at this point may be either a SSA_NAME or some tree expression
     from get_inner_reference.  Try to peel off loop invariants from it
     into BASE as long as possible.  The loop stops once OFFTYPE has been
     fixed by a widening conversion, or when no further simplification
     applies.  */
  STRIP_NOPS (off);
  while (offtype == NULL_TREE)
    {
      enum tree_code code;
      tree op0, op1, add = NULL_TREE;

      if (TREE_CODE (off) == SSA_NAME)
        {
          gimple def_stmt = SSA_NAME_DEF_STMT (off);

          /* An OFF that is invariant in the loop is rejected.  */
          if (expr_invariant_in_loop_p (loop, off))
            return NULL_TREE;

          /* Only assignments can be looked through; anything else ends
             the search with OFF as it is.  */
          if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
            break;

          op0 = gimple_assign_rhs1 (def_stmt);
          code = gimple_assign_rhs_code (def_stmt);
          op1 = gimple_assign_rhs2 (def_stmt);
        }
      else
        {
          if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
            return NULL_TREE;
          code = TREE_CODE (off);
          extract_ops_from_tree (off, &code, &op0, &op1);
        }
      switch (code)
        {
        case POINTER_PLUS_EXPR:
        case PLUS_EXPR:
          /* Move a loop-invariant addend into BASE and continue with the
             other operand.  */
          if (expr_invariant_in_loop_p (loop, op0))
            {
              add = op0;
              off = op1;
            /* Shared tail, also entered by goto from the cases below:
               add ADD, multiplied by the SCALE found so far, to BASE.  */
            do_add:
              add = fold_convert (sizetype, add);
              if (scale != 1)
                add = size_binop (MULT_EXPR, add, size_int (scale));
              base = size_binop (PLUS_EXPR, base, add);
              continue;
            }
          if (expr_invariant_in_loop_p (loop, op1))
            {
              add = op1;
              off = op0;
              goto do_add;
            }
          break;
        case MINUS_EXPR:
          /* OFF - invariant is handled as OFF + (0 - invariant).  */
          if (expr_invariant_in_loop_p (loop, op1))
            {
              add = fold_convert (sizetype, op1);
              add = size_binop (MINUS_EXPR, size_zero_node, add);
              off = op0;
              goto do_add;
            }
          break;
        case MULT_EXPR:
          /* Only one constant multiplication is absorbed into SCALE.  */
          if (scale == 1 && tree_fits_shwi_p (op1))
            {
              scale = tree_to_shwi (op1);
              off = op0;
              continue;
            }
          break;
        case SSA_NAME:
          /* A plain copy: look through it.  */
          off = op0;
          continue;
        CASE_CONVERT:
          if (!POINTER_TYPE_P (TREE_TYPE (op0))
              && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
            break;
          /* A conversion that keeps the precision is looked through.  */
          if (TYPE_PRECISION (TREE_TYPE (op0))
              == TYPE_PRECISION (TREE_TYPE (off)))
            {
              off = op0;
              continue;
            }
          /* A widening conversion: remember the narrower type as OFFTYPE,
             which also terminates the loop.  */
          if (TYPE_PRECISION (TREE_TYPE (op0))
              < TYPE_PRECISION (TREE_TYPE (off)))
            {
              off = op0;
              offtype = TREE_TYPE (off);
              STRIP_NOPS (off);
              continue;
            }
          break;
        default:
          break;
        }
      /* Reached only when the switch made no progress.  */
      break;
    }

  /* If at the end OFF still isn't a SSA_NAME or isn't
     defined in the loop, punt.  */
  if (TREE_CODE (off) != SSA_NAME
      || expr_invariant_in_loop_p (loop, off))
    return NULL_TREE;

  if (offtype == NULL_TREE)
    offtype = TREE_TYPE (off);

  /* Ask the target for a gather builtin matching the vector type, the
     offset type and the scale.  */
  decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
                                           offtype, scale);
  if (decl == NULL_TREE)
    return NULL_TREE;

  if (basep)
    *basep = base;
  if (offp)
    *offp = off;
  if (scalep)
    *scalep = scale;
  return decl;
}
3181
3182 /* Function vect_analyze_data_refs.
3183
3184    Find all the data references in the loop (LOOP_VINFO) or basic block
3185    (BB_VINFO), check them, and record each one and its vector type in the
3186    stmt_vec_info of its statement.  *N_STMTS is incremented per non-debug
3187    stmt scanned; *MIN_VF is raised to the largest number of vector subparts
3188    seen.  Return false if a loop cannot be vectorized; for a basic block most
3189    failures just stop at the bad data-ref and drop it and all later ones.
3190
3191    The analysis of data refs in the vectorizer is structured as follows:
3192    1- vect_analyze_data_refs(loop/bb): find and check all data-refs.
3193    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3194    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3195    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.  */
3196
3197 bool
3198 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3199                         bb_vec_info bb_vinfo,
3200                         int *min_vf, unsigned *n_stmts)
3201 {
3202   struct loop *loop = NULL;
3203   basic_block bb = NULL;
3204   unsigned int i;
3205   vec<data_reference_p> datarefs;
3206   struct data_reference *dr;
3207   tree scalar_type;
3208
3209   if (dump_enabled_p ())
3210     dump_printf_loc (MSG_NOTE, vect_location,
3211                      "=== vect_analyze_data_refs ===\n");
3212
3213   if (loop_vinfo)
3214     {
3215       basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3216
3217       loop = LOOP_VINFO_LOOP (loop_vinfo);
3218       datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3219       if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3220         {
3221           if (dump_enabled_p ())
3222             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3223                              "not vectorized: loop contains function calls"
3224                              " or data references that cannot be analyzed\n");
3225           return false;
3226         }
3227       /* Collect the data-refs of every non-debug statement in the loop.  */
3228       for (i = 0; i < loop->num_nodes; i++)
3229         {
3230           gimple_stmt_iterator gsi;
3231
3232           for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3233             {
3234               gimple stmt = gsi_stmt (gsi);
3235               if (is_gimple_debug (stmt))
3236                 continue;
3237               ++*n_stmts;
3238               if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3239                 {
3240                   if (is_gimple_call (stmt) && loop->safelen)
3241                     {
3242                       tree fndecl = gimple_call_fndecl (stmt), op;
3243                       if (fndecl != NULL_TREE)
3244                         {
3245                           struct cgraph_node *node = cgraph_node::get (fndecl);
3246                           if (node != NULL && node->simd_clones != NULL)
3247                             {
3248                               unsigned int j, n = gimple_call_num_args (stmt);
3249                               for (j = 0; j < n; j++)
3250                                 {
3251                                   op = gimple_call_arg (stmt, j);
3252                                   if (DECL_P (op)
3253                                       || (REFERENCE_CLASS_P (op)
3254                                           && get_base_address (op)))
3255                                     break;
3256                                 }
3257                               op = gimple_call_lhs (stmt);
3258                               /* Ignore #pragma omp declare simd functions
3259                                  if they don't have data references in the
3260                                  call stmt itself.  */
3261                               if (j == n
3262                                   && !(op
3263                                        && (DECL_P (op)
3264                                            || (REFERENCE_CLASS_P (op)
3265                                                && get_base_address (op)))))
3266                                 continue;
3267                             }
3268                         }
3269                     }
3270                   LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3271                   if (dump_enabled_p ())
3272                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3273                                      "not vectorized: loop contains function "
3274                                      "calls or data references that cannot "
3275                                      "be analyzed\n");
3276                   return false;
3277                 }
3278             }
3279         }
3280
3281       LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3282     }
3283   else
3284     {
3285       gimple_stmt_iterator gsi;
3286
3287       bb = BB_VINFO_BB (bb_vinfo);
3288       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3289         {
3290           gimple stmt = gsi_stmt (gsi);
3291           if (is_gimple_debug (stmt))
3292             continue;
3293           ++*n_stmts;
3294           if (!find_data_references_in_stmt (NULL, stmt,
3295                                              &BB_VINFO_DATAREFS (bb_vinfo)))
3296             {
3297               /* Mark the rest of the basic-block as unvectorizable.  */
3298               for (; !gsi_end_p (gsi); gsi_next (&gsi))
3299                 {
3300                   stmt = gsi_stmt (gsi);
3301                   STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3302                 }
3303               break;
3304             }
3305         }
3306
3307       datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3308     }
3309
3310   /* Go through the data-refs, check that the analysis succeeded.  Update
3311      pointer from stmt_vec_info struct to DR and vectype.  */
3312
3313   FOR_EACH_VEC_ELT (datarefs, i, dr)
3314     {
3315       gimple stmt;
3316       stmt_vec_info stmt_info;
3317       tree base, offset, init;
3318       bool gather = false;
3319       bool simd_lane_access = false;
3320       int vf;
3321       /* Re-entered with the next DR after a clobber was removed.  */
3322 again:
3323       if (!dr || !DR_REF (dr))
3324         {
3325           if (dump_enabled_p ())
3326             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3327                              "not vectorized: unhandled data-ref\n");
3328           return false;
3329         }
3330
3331       stmt = DR_STMT (dr);
3332       stmt_info = vinfo_for_stmt (stmt);
3333
3334       /* Discard clobbers from the dataref vector.  We will remove
3335          clobber stmts during vectorization.  */
3336       if (gimple_clobber_p (stmt))
3337         {
3338           free_data_ref (dr);
3339           if (i == datarefs.length () - 1)
3340             {
3341               datarefs.pop ();
3342               break;
3343             }
3344           datarefs.ordered_remove (i);
3345           dr = datarefs[i];
3346           goto again;
3347         }
3348
3349       /* Check that analysis of the data-ref succeeded.  */
3350       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3351           || !DR_STEP (dr))
3352         {
3353           bool maybe_gather
3354             = DR_IS_READ (dr)
3355               && !TREE_THIS_VOLATILE (DR_REF (dr))
3356               && targetm.vectorize.builtin_gather != NULL;
3357           bool maybe_simd_lane_access
3358             = loop_vinfo && loop->simduid;
3359
3360           /* If target supports vector gather loads, or if this might be
3361              a SIMD lane access, see whether one of them can be used.  */
3362           if (loop_vinfo
3363               && (maybe_gather || maybe_simd_lane_access)
3364               && !nested_in_vect_loop_p (loop, stmt))
3365             {
3366               struct data_reference *newdr
3367                 = create_data_ref (NULL, loop_containing_stmt (stmt),
3368                                    DR_REF (dr), stmt, true);
3369               gcc_assert (newdr != NULL && DR_REF (newdr));
3370               if (DR_BASE_ADDRESS (newdr)
3371                   && DR_OFFSET (newdr)
3372                   && DR_INIT (newdr)
3373                   && DR_STEP (newdr)
3374                   && integer_zerop (DR_STEP (newdr)))
3375                 {
3376                   if (maybe_simd_lane_access)
3377                     {
3378                       tree off = DR_OFFSET (newdr);
3379                       STRIP_NOPS (off);
3380                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3381                           && TREE_CODE (off) == MULT_EXPR
3382                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3383                         {
3384                           tree step = TREE_OPERAND (off, 1);
3385                           off = TREE_OPERAND (off, 0);
3386                           STRIP_NOPS (off);
3387                           if (CONVERT_EXPR_P (off)
3388                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3389                                                                           0)))
3390                                  < TYPE_PRECISION (TREE_TYPE (off)))
3391                             off = TREE_OPERAND (off, 0);
3392                           if (TREE_CODE (off) == SSA_NAME)
3393                             {
3394                               gimple def = SSA_NAME_DEF_STMT (off);
3395                               tree reft = TREE_TYPE (DR_REF (newdr));
3396                               if (is_gimple_call (def)
3397                                   && gimple_call_internal_p (def)
3398                                   && (gimple_call_internal_fn (def)
3399                                       == IFN_GOMP_SIMD_LANE))
3400                                 {
3401                                   tree arg = gimple_call_arg (def, 0);
3402                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
3403                                   arg = SSA_NAME_VAR (arg);
3404                                   if (arg == loop->simduid
3405                                       /* For now.  */
3406                                       && tree_int_cst_equal
3407                                            (TYPE_SIZE_UNIT (reft),
3408                                             step))
3409                                     {
3410                                       DR_OFFSET (newdr) = ssize_int (0);
3411                                       DR_STEP (newdr) = step;
3412                                       DR_ALIGNED_TO (newdr)
3413                                         = size_int (BIGGEST_ALIGNMENT);
3414                                       dr = newdr;
3415                                       simd_lane_access = true;
3416                                     }
3417                                 }
3418                             }
3419                         }
3420                     }
3421                   if (!simd_lane_access && maybe_gather)
3422                     {
3423                       dr = newdr;
3424                       gather = true;
3425                     }
3426                 }
3427               if (!gather && !simd_lane_access)
3428                 free_data_ref (newdr);
3429             }
3430
3431           if (!gather && !simd_lane_access)
3432             {
3433               if (dump_enabled_p ())
3434                 {
3435                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3436                                    "not vectorized: data ref analysis "
3437                                    "failed ");
3438                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3439                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3440                 }
3441
3442               if (bb_vinfo)
3443                 break;
3444
3445               return false;
3446             }
3447         }
3448
3449       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3450         {
3451           if (dump_enabled_p ())
3452             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3453                              "not vectorized: base addr of dr is a "
3454                              "constant\n");
3455
3456           if (bb_vinfo)
3457             break;
3458
3459           if (gather || simd_lane_access)
3460             free_data_ref (dr);
3461           return false;
3462         }
3463
3464       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3465         {
3466           if (dump_enabled_p ())
3467             {
3468               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3469                                "not vectorized: volatile type ");
3470               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3471               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3472             }
3473           if (gather || simd_lane_access)
3474             free_data_ref (dr);
3475           if (bb_vinfo)
3476             break;
3477           return false;
3478         }
3479
3480       if (stmt_can_throw_internal (stmt))
3481         {
3482           if (dump_enabled_p ())
3483             {
3484               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3485                                "not vectorized: statement can throw an "
3486                                "exception ");
3487               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3488               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3489             }
3490
3491           if (bb_vinfo)
3492             break;
3493
3494           if (gather || simd_lane_access)
3495             free_data_ref (dr);
3496           return false;
3497         }
3498
3499       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3500           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3501         {
3502           if (dump_enabled_p ())
3503             {
3504               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3505                                "not vectorized: statement is bitfield "
3506                                "access ");
3507               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3508               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3509             }
3510
3511           if (bb_vinfo)
3512             break;
3513
3514           if (gather || simd_lane_access)
3515             free_data_ref (dr);
3516           return false;
3517         }
3518
3519       base = unshare_expr (DR_BASE_ADDRESS (dr));
3520       offset = unshare_expr (DR_OFFSET (dr));
3521       init = unshare_expr (DR_INIT (dr));
3522
3523       if (is_gimple_call (stmt)
3524           && (!gimple_call_internal_p (stmt)
3525               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3526                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3527         {
3528           if (dump_enabled_p ())
3529             {
3530               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3531                                "not vectorized: dr in a call ");
3532               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3533               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3534             }
3535
3536           if (bb_vinfo)
3537             break;
3538
3539           if (gather || simd_lane_access)
3540             free_data_ref (dr);
3541           return false;
3542         }
3543
3544       /* Update DR field in stmt_vec_info struct.  */
3545
3546       /* If the dataref is in an inner-loop of the loop that is considered
3547          for vectorization, we also want to analyze the access relative to
3548          the outer-loop (DR contains information only relative to the
3549          inner-most enclosing loop).  We do that by building a reference to the
3550          first location accessed by the inner-loop, and analyze it relative to
3551          the outer-loop.  */
3552       if (loop && nested_in_vect_loop_p (loop, stmt))
3553         {
3554           tree outer_step, outer_base, outer_init;
3555           HOST_WIDE_INT pbitsize, pbitpos;
3556           tree poffset;
3557           enum machine_mode pmode;
3558           int punsignedp, pvolatilep;
3559           affine_iv base_iv, offset_iv;
3560           tree dinit;
3561
3562           /* Build a reference to the first location accessed by the
3563              inner-loop: *(BASE+INIT).  (The first location is actually
3564              BASE+INIT+OFFSET, but we add OFFSET separately later).  */
3565           tree inner_base = build_fold_indirect_ref
3566                                 (fold_build_pointer_plus (base, init));
3567
3568           if (dump_enabled_p ())
3569             {
3570               dump_printf_loc (MSG_NOTE, vect_location,
3571                                "analyze in outer-loop: ");
3572               dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3573               dump_printf (MSG_NOTE, "\n");
3574             }
3575
3576           outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3577                           &poffset, &pmode, &punsignedp, &pvolatilep, false);
3578           gcc_assert (outer_base != NULL_TREE);
3579
3580           if (pbitpos % BITS_PER_UNIT != 0)
3581             {
3582               if (dump_enabled_p ())
3583                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3584                                  "failed: bit offset alignment.\n");
3585               return false;
3586             }
3587
3588           outer_base = build_fold_addr_expr (outer_base);
3589           if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3590                           &base_iv, false))
3591             {
3592               if (dump_enabled_p ())
3593                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3594                                  "failed: evolution of base is not affine.\n");
3595               return false;
3596             }
3597
3598           if (offset)
3599             {
3600               if (poffset)
3601                 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3602                                        poffset);
3603               else
3604                 poffset = offset;
3605             }
3606
3607           if (!poffset)
3608             {
3609               offset_iv.base = ssize_int (0);
3610               offset_iv.step = ssize_int (0);
3611             }
3612           else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3613                                &offset_iv, false))
3614             {
3615               if (dump_enabled_p ())
3616                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3617                                  "evolution of offset is not affine.\n");
3618               return false;
3619             }
3620
3621           outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3622           split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3623           outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3624           split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3625           outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3626
3627           outer_step = size_binop (PLUS_EXPR,
3628                                 fold_convert (ssizetype, base_iv.step),
3629                                 fold_convert (ssizetype, offset_iv.step));
3630
3631           STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3632           /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3633           STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3634           STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3635           STMT_VINFO_DR_OFFSET (stmt_info) =
3636                                 fold_convert (ssizetype, offset_iv.base);
3637           STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3638                                 size_int (highest_pow2_factor (offset_iv.base));
3639
3640           if (dump_enabled_p ())
3641             {
3642               dump_printf_loc (MSG_NOTE, vect_location,
3643                                "\touter base_address: ");
3644               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3645                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3646               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3647               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3648                                  STMT_VINFO_DR_OFFSET (stmt_info));
3649               dump_printf (MSG_NOTE,
3650                            "\n\touter constant offset from base address: ");
3651               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3652                                  STMT_VINFO_DR_INIT (stmt_info));
3653               dump_printf (MSG_NOTE, "\n\touter step: ");
3654               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3655                                  STMT_VINFO_DR_STEP (stmt_info));
3656               dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3657               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3658                                  STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3659               dump_printf (MSG_NOTE, "\n");
3660             }
3661         }
3662       /* A statement may have only one data-ref recorded for it.  */
3663       if (STMT_VINFO_DATA_REF (stmt_info))
3664         {
3665           if (dump_enabled_p ())
3666             {
3667               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3668                                "not vectorized: more than one data ref "
3669                                "in stmt: ");
3670               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3671               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3672             }
3673
3674           if (bb_vinfo)
3675             break;
3676
3677           if (gather || simd_lane_access)
3678             free_data_ref (dr);
3679           return false;
3680         }
3681       /* Record DR; a SIMD lane access DR also takes over DATAREFS[I].  */
3682       STMT_VINFO_DATA_REF (stmt_info) = dr;
3683       if (simd_lane_access)
3684         {
3685           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3686           free_data_ref (datarefs[i]);
3687           datarefs[i] = dr;
3688         }
3689
3690       /* Set vectype for STMT.  */
3691       scalar_type = TREE_TYPE (DR_REF (dr));
3692       STMT_VINFO_VECTYPE (stmt_info)
3693         = get_vectype_for_scalar_type (scalar_type);
3694       if (!STMT_VINFO_VECTYPE (stmt_info))
3695         {
3696           if (dump_enabled_p ())
3697             {
3698               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3699                                "not vectorized: no vectype for stmt: ");
3700               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3701               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3702               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3703                                  scalar_type);
3704               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3705             }
3706
3707           if (bb_vinfo)
3708             break;
3709
3710           if (gather || simd_lane_access)
3711             {
3712               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3713               if (gather)
3714                 free_data_ref (dr);
3715             }
3716           return false;
3717         }
3718       else
3719         {
3720           if (dump_enabled_p ())
3721             {
3722               dump_printf_loc (MSG_NOTE, vect_location,
3723                                "got vectype for stmt: ");
3724               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3725               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3726                                  STMT_VINFO_VECTYPE (stmt_info));
3727               dump_printf (MSG_NOTE, "\n");
3728             }
3729         }
3730
3731       /* Adjust the minimal vectorization factor according to the
3732          vector type.  */
3733       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3734       if (vf > *min_vf)
3735         *min_vf = vf;
3736       /* A tentative gather must still be confirmed by vect_check_gather.  */
3737       if (gather)
3738         {
3739           tree off;
3740
3741           gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3742           if (gather
3743               && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3744             gather = false;
3745           if (!gather)
3746             {
3747               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3748               free_data_ref (dr);
3749               if (dump_enabled_p ())
3750                 {
3751                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3752                                    "not vectorized: not suitable for gather "
3753                                    "load ");
3754                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3755                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3756                 }
3757               return false;
3758             }
3759           /* DR replaces DATAREFS[I]; free the entry it supersedes.  */
3760           free_data_ref (datarefs[i]);
3761           datarefs[i] = dr;
3762           STMT_VINFO_GATHER_P (stmt_info) = true;
3763         }
3764       else if (loop_vinfo && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3765         {
3766           if (nested_in_vect_loop_p (loop, stmt)
3767               || !DR_IS_READ (dr))
3768             {
3769               if (dump_enabled_p ())
3770                 {
3771                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3772                                    "not vectorized: not suitable for strided "
3773                                    "load ");
3774                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3775                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3776                 }
3777               return false;
3778             }
3779           STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3780         }
3781     }
3782
3783   /* If we stopped analysis at the first dataref we could not analyze
3784      when trying to vectorize a basic-block mark the rest of the datarefs
3785      as not vectorizable and truncate the vector of datarefs.  That
3786      avoids spending useless time in analyzing their dependence.  */
3787   if (i != datarefs.length ())
3788     {
3789       gcc_assert (bb_vinfo != NULL);
3790       for (unsigned j = i; j < datarefs.length (); ++j)
3791         {
3792           data_reference_p dr = datarefs[j];
3793           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3794           free_data_ref (dr);
3795         }
3796       datarefs.truncate (i);
3797     }
3798
3799   return true;
3800 }
3801
3802
3803 /* Function vect_get_new_vect_var.
3804
3805    Return a new variable of TYPE obtained from create_tmp_reg.  VAR_KIND
3806    picks the name stem: "vect" (vect_simple_var), "stmp" (vect_scalar_var)
3807    or "vectp" (vect_pointer_var).  If NAME is non-NULL the name passed on
3808    is "<stem>_<NAME>", otherwise it is the bare stem.  */
3809
3810 tree
3811 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3812 {
3813   const char *stem;
3814
3815   switch (var_kind)
3816     {
3817     case vect_pointer_var:
3818       stem = "vectp";
3819       break;
3820     case vect_scalar_var:
3821       stem = "stmp";
3822       break;
3823     case vect_simple_var:
3824       stem = "vect";
3825       break;
3826     default:
3827       gcc_unreachable ();
3828     }
3829
3830   /* Without NAME the stem alone is handed to create_tmp_reg.  */
3831   if (!name)
3832     return create_tmp_reg (type, stem);
3833
3834   /* The string built by concat is only needed for the call below and
3835      is freed again before returning.  */
3836   char *full_name = concat (stem, "_", name, NULL);
3837   tree named_var = create_tmp_reg (type, full_name);
3838
3839   free (full_name);
3840   return named_var;
3841 }
3842
3843
3844 /* Function vect_create_addr_base_for_vector_ref.
3845
3846    Create an expression that computes the address of the first memory location
3847    that will be accessed for a data reference.
3848
3849    Input:
3850    STMT: The statement containing the data reference.
3851    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3852    OFFSET: Optional. If supplied, it is added to the initial address.
3853    LOOP:    Specify relative to which loop-nest should the address be computed.
3854             For example, when the dataref is in an inner-loop nested in an
3855             outer-loop that is now being vectorized, LOOP can be either the
3856             outer-loop, or the inner-loop.  The first memory location accessed
3857             by the following dataref ('in' points to short):
3858
3859                 for (i=0; i<N; i++)
3860                    for (j=0; j<M; j++)
3861                      s += in[i+j]
3862
3863             is as follows:
3864             if LOOP=i_loop:     &in             (relative to i_loop)
3865             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
3866    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
3867             initial address.  Unlike OFFSET, which is number of elements to
3868             be added, BYTE_OFFSET is measured in bytes.
3869
3870    Output:
3871    1. Return an SSA_NAME whose value is the address of the memory location of
3872       the first vector of the data reference.
3873    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3874       these statement(s) which define the returned SSA_NAME.
3875
3876    FORNOW: We are only handling array accesses with step 1.  */
3877
3878 tree
3879 vect_create_addr_base_for_vector_ref (gimple stmt,
3880                                       gimple_seq *new_stmt_list,
3881                                       tree offset,
3882                                       struct loop *loop,
3883                                       tree byte_offset)
3884 {
3885   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3886   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3887   tree data_ref_base;
3888   const char *base_name;
3889   tree addr_base;
3890   tree dest;
3891   gimple_seq seq = NULL;
3892   tree base_offset;
3893   tree init;
3894   tree vect_ptr_type;
3895   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3896   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3897   /* Select the base address, offset and init relative to LOOP.  */
3898   if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3899     {
3900       struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3901
3902       gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3903
3904       data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3905       base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3906       init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3907     }
3908   else
3909     {
3910       data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3911       base_offset = unshare_expr (DR_OFFSET (dr));
3912       init = unshare_expr (DR_INIT (dr));
3913     }
3914   /* Without a loop_vinfo the offset and init are taken as zero.  */
3915   if (loop_vinfo)
3916     base_name = get_name (data_ref_base);
3917   else
3918     {
3919       base_offset = ssize_int (0);
3920       init = ssize_int (0);
3921       base_name = get_name (DR_REF (dr));
3922     }
3923
3924   /* base_offset = base_offset + init, both converted to sizetype.  */
3925   base_offset = size_binop (PLUS_EXPR,
3926                             fold_convert (sizetype, base_offset),
3927                             fold_convert (sizetype, init));
3928   /* OFFSET counts elements, so scale it by the element size STEP.  */
3929   if (offset)
3930     {
3931       offset = fold_build2 (MULT_EXPR, sizetype,
3932                             fold_convert (sizetype, offset), step);
3933       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3934                                  base_offset, offset);
3935     }
3936   if (byte_offset)
3937     {
3938       byte_offset = fold_convert (sizetype, byte_offset);
3939       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3940                                  base_offset, byte_offset);
3941     }
3942
3943   /* base + base_offset; without a loop_vinfo, the address of DR_REF.  */
3944   if (loop_vinfo)
3945     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3946   else
3947     {
3948       addr_base = build1 (ADDR_EXPR,
3949                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
3950                           unshare_expr (DR_REF (dr)));
3951     }
3952   /* Convert to pointer-to-vectype and gimplify into NEW_STMT_LIST.  */
3953   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3954   addr_base = fold_convert (vect_ptr_type, addr_base);
3955   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3956   addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3957   gimple_seq_add_seq (new_stmt_list, seq);
3958   /* Copy DR's points-to info.  A supplied OFFSET or BYTE_OFFSET displaces
3959      the address, so the alignment recorded for DR must not be kept.  */
3960   if (DR_PTR_INFO (dr) && TREE_CODE (addr_base) == SSA_NAME)
3961     {
3962       duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3963       if (offset || byte_offset)
3964         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3965     }
3966
3967   if (dump_enabled_p ())
3968     {
3969       dump_printf_loc (MSG_NOTE, vect_location, "created ");
3970       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3971       dump_printf (MSG_NOTE, "\n");
3972     }
3973
3974   return addr_base;
3975 }
3976
3977
3978 /* Function vect_create_data_ref_ptr.
3979
3980    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3981    location accessed in the loop by STMT, along with the def-use update
3982    chain to appropriately advance the pointer through the loop iterations.
3983    Also set aliasing information for the pointer.  This pointer is used by
3984    the callers to this function to create a memory reference expression for
3985    vector load/store access.
3986
3987    Input:
3988    1. STMT: a stmt that references memory. Expected to be of the form
3989          GIMPLE_ASSIGN <name, data-ref> or
3990          GIMPLE_ASSIGN <data-ref, name>.
3991    2. AGGR_TYPE: the type of the reference, which should be either a vector
3992         or an array.
3993    3. AT_LOOP: the loop where the vector memref is to be created.
3994    4. OFFSET (optional): an offset to be added to the initial address accessed
3995         by the data-ref in STMT.
3996    5. GSI: location where the new stmts are to be placed if there is no loop
3997    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3998         pointing to the initial address.
3999    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4000         to the initial address accessed by the data-ref in STMT.  This is
4001         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4002         in bytes.
4003
4004    Output:
4005    1. Declare a new ptr to vector_type, and have it point to the base of the
4006       data reference (initial address accessed by the data reference).
4007       For example, for vector of type V8HI, the following code is generated:
4008
4009       v8hi *ap;
4010       ap = (v8hi *)initial_address;
4011
4012       if OFFSET is not supplied:
4013          initial_address = &a[init];
4014       if OFFSET is supplied:
4015          initial_address = &a[init + OFFSET];
4016       if BYTE_OFFSET is supplied:
4017          initial_address = &a[init] + BYTE_OFFSET;
4018
4019       Return the initial_address in INITIAL_ADDRESS.
4020
4021    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4022       update the pointer in each iteration of the loop.
4023
4024       Return the increment stmt that updates the pointer in PTR_INCR.
4025
4026    3. Set INV_P to true if the access pattern of the data reference in the
4027       vectorized loop is invariant.  Set it to false otherwise.
4028
4029    4. Return the pointer.  */
4030
4031 tree
4032 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4033                           tree offset, tree *initial_address,
4034                           gimple_stmt_iterator *gsi, gimple *ptr_incr,
4035                           bool only_init, bool *inv_p, tree byte_offset)
4036 {
4037   const char *base_name;
4038   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4039   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4040   struct loop *loop = NULL;
4041   bool nested_in_vect_loop = false;
4042   struct loop *containing_loop = NULL;
4043   tree aggr_ptr_type;
4044   tree aggr_ptr;
4045   tree new_temp;
4046   gimple vec_stmt;
4047   gimple_seq new_stmt_list = NULL;
4048   edge pe = NULL;
4049   basic_block new_bb;
4050   tree aggr_ptr_init;
4051   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4052   tree aptr;
4053   gimple_stmt_iterator incr_gsi;
4054   bool insert_after;
4055   tree indx_before_incr, indx_after_incr;
4056   gimple incr;
4057   tree step;
4058   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4059
4060   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4061               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4062
4063   if (loop_vinfo)
4064     {
4065       loop = LOOP_VINFO_LOOP (loop_vinfo);
4066       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4067       containing_loop = (gimple_bb (stmt))->loop_father;
4068       pe = loop_preheader_edge (loop);
4069     }
4070   else
4071     {
4072       gcc_assert (bb_vinfo);
4073       only_init = true;
4074       *ptr_incr = NULL;  /* NOTE(review): no NULL check, unlike below.  */
4075     }
4076
4077   /* Check the step (evolution) of the load in LOOP, and record
4078      whether it's invariant.  */
4079   if (nested_in_vect_loop)
4080     step = STMT_VINFO_DR_STEP (stmt_info);
4081   else
4082     step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4083
4084   if (integer_zerop (step))
4085     *inv_p = true;
4086   else
4087     *inv_p = false;
4088
4089   /* Create an expression for the first address accessed by this load
4090      in LOOP.  */
4091   base_name = get_name (DR_BASE_ADDRESS (dr));
4092
4093   if (dump_enabled_p ())
4094     {
4095       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4096       dump_printf_loc (MSG_NOTE, vect_location,
4097                        "create %s-pointer variable to type: ",
4098                        get_tree_code_name (TREE_CODE (aggr_type)));
4099       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4100       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4101         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4102       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4103         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4104       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4105         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4106       else
4107         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4108       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4109       dump_printf (MSG_NOTE, "\n");
4110     }
4111
4112   /* (1) Create the new aggregate-pointer variable.
4113      Vector and array types inherit the alias set of their component
4114      type by default so we need to use a ref-all pointer if the data
4115      reference does not conflict with the created aggregated data
4116      reference because it is not addressable.  */
4117   bool need_ref_all = false;
4118   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4119                               get_alias_set (DR_REF (dr))))
4120     need_ref_all = true;
4121   /* Likewise for any of the data references in the stmt group.  */
4122   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4123     {
4124       gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4125       do
4126         {
4127           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4128           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4129           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4130                                       get_alias_set (DR_REF (sdr))))
4131             {
4132               need_ref_all = true;
4133               break;
4134             }
4135           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4136         }
4137       while (orig_stmt);
4138     }
4139   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4140                                                need_ref_all);
4141   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4142
4143
4144   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4145      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4146      def-use update cycles for the pointer: one relative to the outer-loop
4147      (LOOP), which is what steps (2) and (3) below do.  The other is relative
4148      to the inner-loop (which is the inner-most loop containing the dataref),
4149      and this is done by step (4) below.
4150
4151      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4152      inner-most loop, and so steps (2),(3) work the same, and step (4) is
4153      redundant.  Steps (2),(3) create the following:
4154
4155         vp0 = &base_addr;
4156         LOOP:   vp1 = phi(vp0,vp2)
4157                 ...
4158                 ...
4159                 vp2 = vp1 + step
4160                 goto LOOP
4161
4162      If there is an inner-loop nested in loop, then step (4) will also be
4163      applied, and an additional update in the inner-loop will be created:
4164
4165         vp0 = &base_addr;
4166         LOOP:   vp1 = phi(vp0,vp2)
4167                 ...
4168         inner:     vp3 = phi(vp1,vp4)
4169                    vp4 = vp3 + inner_step
4170                    if () goto inner
4171                 ...
4172                 vp2 = vp1 + step
4173                 if () goto LOOP   */
4174
4175   /* (2) Calculate the initial address of the aggregate-pointer, and set
4176      the aggregate-pointer to point to it before the loop.  */
4177
4178   /* Create: &(base[init_val+offset]) + byte_offset, on PE or before GSI.  */
4179
4180   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4181                                                    offset, loop, byte_offset);
4182   if (new_stmt_list)
4183     {
4184       if (pe)
4185         {
4186           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4187           gcc_assert (!new_bb);
4188         }
4189       else
4190         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4191     }
4192
4193   *initial_address = new_temp;
4194
4195   /* Create: p = (aggr_type *) initial_base  */
4196   if (TREE_CODE (new_temp) != SSA_NAME
4197       || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4198     {
4199       vec_stmt = gimple_build_assign (aggr_ptr,
4200                                       fold_convert (aggr_ptr_type, new_temp));
4201       aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4202       /* Copy the points-to information if it exists. */
4203       if (DR_PTR_INFO (dr))
4204         duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
4205       gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4206       if (pe)
4207         {
4208           new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4209           gcc_assert (!new_bb);
4210         }
4211       else
4212         gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4213     }
4214   else
4215     aggr_ptr_init = new_temp;
4216
4217   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4218      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4219      inner-loop nested in LOOP (during outer-loop vectorization).  */
4220
4221   /* No update in loop is required.  */
4222   if (only_init && (!loop_vinfo || at_loop == loop))
4223     aptr = aggr_ptr_init;
4224   else
4225     {
4226       /* The step of the aggregate pointer is the type size.  */
4227       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4228       /* One exception to the above is when the scalar step of the load in
4229          LOOP is zero. In this case the step here is also zero.  */
4230       if (*inv_p)
4231         iv_step = size_zero_node;
4232       else if (tree_int_cst_sgn (step) == -1)
4233         iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4234
4235       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4236
4237       create_iv (aggr_ptr_init,
4238                  fold_convert (aggr_ptr_type, iv_step),
4239                  aggr_ptr, loop, &incr_gsi, insert_after,
4240                  &indx_before_incr, &indx_after_incr);
4241       incr = gsi_stmt (incr_gsi);
4242       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4243
4244       /* Copy the points-to information if it exists. */
4245       if (DR_PTR_INFO (dr))
4246         {
4247           duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4248           duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4249         }
4250       if (ptr_incr)
4251         *ptr_incr = incr;
4252
4253       aptr = indx_before_incr;
4254     }
4255
4256   if (!nested_in_vect_loop || only_init)
4257     return aptr;
4258
4259
4260   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4261      nested in LOOP.  Only reached when nested_in_vect_loop && !only_init.  */
4262
4263   gcc_assert (nested_in_vect_loop);
4264   if (!only_init)
4265     {
4266       standard_iv_increment_position (containing_loop, &incr_gsi,
4267                                       &insert_after);
4268       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4269                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4270                  &indx_after_incr);
4271       incr = gsi_stmt (incr_gsi);
4272       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4273
4274       /* Copy the points-to information if it exists. */
4275       if (DR_PTR_INFO (dr))
4276         {
4277           duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4278           duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4279         }
4280       if (ptr_incr)
4281         *ptr_incr = incr;
4282
4283       return indx_before_incr;
4284     }
4285   else
4286     gcc_unreachable ();
4287 }
4288
4289
4290 /* Function bump_vector_ptr
4291
4292    Increment a pointer (to a vector type) by vector-size, or by BUMP if
4293    given.  If PTR_INCR is given, then also connect the new increment stmt
4294    to the existing def-use update-chain of the pointer, by modifying
4295    the PTR_INCR as illustrated below:
4296
4297    The pointer def-use update-chain before this function:
4298                         DATAREF_PTR = phi (p_0, p_2)
4299                         ....
4300         PTR_INCR:       p_2 = DATAREF_PTR + step
4301
4302    The pointer def-use update-chain after this function:
4303                         DATAREF_PTR = phi (p_0, p_2)
4304                         ....
4305                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4306                         ....
4307         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4308
4309    Input:
4310    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4311                  in the loop.
4312    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4313               the loop.  The increment amount across iterations is expected
4314               to be vector_size.
4315    GSI - location where the new update stmt is to be placed.
4316    STMT - the original scalar memory-access stmt that is being vectorized.
4317    BUMP - optional. The offset by which to bump the pointer. If not given,
4318           the offset is assumed to be vector_size.
4319
4320    Output: Return NEW_DATAREF_PTR as illustrated above.
4321
4322 */
4323
4324 tree
4325 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4326                  gimple stmt, tree bump)
4327 {
4328   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4329   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4330   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4331   tree update = TYPE_SIZE_UNIT (vectype);
4332   gimple incr_stmt;
4333   ssa_op_iter iter;
4334   use_operand_p use_p;
4335   tree new_dataref_ptr;
4336
4337   if (bump)
4338     update = bump;
4339   /* Build NEW_DATAREF_PTR = DATAREF_PTR + UPDATE.  */
4340   new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
4341   incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
4342                                             dataref_ptr, update);
4343   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4344
4345   /* Copy the points-to information if it exists, dropping its alignment.  */
4346   if (DR_PTR_INFO (dr))
4347     {
4348       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4349       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4350     }
4351
4352   if (!ptr_incr)
4353     return new_dataref_ptr;
4354
4355   /* Redirect PTR_INCR's use of DATAREF_PTR to the bumped pointer.  */
4356   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4357     {
4358       tree use = USE_FROM_PTR (use_p);
4359
4360       if (use == dataref_ptr)
4361         SET_USE (use_p, new_dataref_ptr);
4362       else
4363         gcc_assert (tree_int_cst_compare (use, update) == 0);
4364     }
4365
4366   return new_dataref_ptr;
4367 }
4368
4369
4370 /* Function vect_create_destination_var.
4371    Create a new temporary named after SCALAR_DEST (an SSA_NAME), of type
4372    VECTYPE, or of SCALAR_DEST's own type when VECTYPE is NULL.  */
4373
4374 tree
4375 vect_create_destination_var (tree scalar_dest, tree vectype)
4376 {
4377   tree vec_dest;
4378   const char *name;
4379   char *new_name;
4380   tree type;
4381   enum vect_var_kind kind;
4382
4383   kind = vectype ? vect_simple_var : vect_scalar_var;
4384   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4385
4386   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4387   /* NOTE(review): the asprintf results below are not checked.  */
4388   name = get_name (scalar_dest);
4389   if (name)
4390     asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4391   else
4392     asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
4393   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4394   free (new_name);
4395
4396   return vec_dest;
4397 }
4398
4399 /* Function vect_grouped_store_supported.
4400
4401    Returns TRUE if the permutations used to interleave a group of COUNT
4402    stores of type VECTYPE are supported, and FALSE otherwise.  */
4403
4404 bool
4405 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4406 {
4407   enum machine_mode mode = TYPE_MODE (vectype);
4408
4409   /* vect_permute_store_chain requires the group size to be equal to 3 or
4410      be a power of two.  */
4411   if (count != 3 && exact_log2 (count) == -1)
4412     {
4413       if (dump_enabled_p ())
4414         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4415                          "the size of the group of accesses"
4416                          " is not a power of 2 or not eqaul to 3\n");
4417       return false;
4418     }
4419
4420   /* Check that the permutation is supported.  */
4421   if (VECTOR_MODE_P (mode))
4422     {
4423       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4424       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4425
4426       if (count == 3)
4427         {
4428           unsigned int j0 = 0, j1 = 0, j2 = 0;
4429           unsigned int i, j;
4430           /* For each J, check both selectors vect_permute_store_chain builds.  */
4431           for (j = 0; j < 3; j++)
4432             {
4433               int nelt0 = ((3 - j) * nelt) % 3;
4434               int nelt1 = ((3 - j) * nelt + 1) % 3;
4435               int nelt2 = ((3 - j) * nelt + 2) % 3;
4436               for (i = 0; i < nelt; i++)
4437                 {
4438                   if (3 * i + nelt0 < nelt)
4439                     sel[3 * i + nelt0] = j0++;
4440                   if (3 * i + nelt1 < nelt)
4441                     sel[3 * i + nelt1] = nelt + j1++;
4442                   if (3 * i + nelt2 < nelt)
4443                     sel[3 * i + nelt2] = 0;
4444                 }
4445               if (!can_vec_perm_p (mode, false, sel))
4446                 {
4447                   if (dump_enabled_p ())
4448                     dump_printf (MSG_MISSED_OPTIMIZATION,
4449                                  "permutaion op not supported by target.\n");
4450                   return false;
4451                 }
4452
4453               for (i = 0; i < nelt; i++)
4454                 {
4455                   if (3 * i + nelt0 < nelt)
4456                     sel[3 * i + nelt0] = 3 * i + nelt0;
4457                   if (3 * i + nelt1 < nelt)
4458                     sel[3 * i + nelt1] = 3 * i + nelt1;
4459                   if (3 * i + nelt2 < nelt)
4460                     sel[3 * i + nelt2] = nelt + j2++;
4461                 }
4462               if (!can_vec_perm_p (mode, false, sel))
4463                 {
4464                   if (dump_enabled_p ())
4465                     dump_printf (MSG_MISSED_OPTIMIZATION,
4466                                  "permutaion op not supported by target.\n");
4467                   return false;
4468                 }
4469             }
4470           return true;
4471         }
4472       else
4473         {
4474           /* If length is not equal to 3 then only power of 2 is supported.  */
4475           gcc_assert (exact_log2 (count) != -1);
4476           /* Selector {0, nelt, 1, nelt + 1, ...}, then each entry + nelt / 2.  */
4477           for (i = 0; i < nelt / 2; i++)
4478             {
4479               sel[i * 2] = i;
4480               sel[i * 2 + 1] = i + nelt;
4481             }
4482             if (can_vec_perm_p (mode, false, sel))
4483               {
4484                 for (i = 0; i < nelt; i++)
4485                   sel[i] += nelt / 2;
4486                 if (can_vec_perm_p (mode, false, sel))
4487                   return true;
4488               }
4489         }
4490     }
4491
4492   if (dump_enabled_p ())
4493     dump_printf (MSG_MISSED_OPTIMIZATION,
4494                  "permutaion op not supported by target.\n");
4495   return false;
4496 }
4497
4498
4499 /* Return TRUE if the vec_store_lanes optab is available for COUNT vectors
4500    of type VECTYPE, as reported by vect_lanes_optab_supported_p.  */
4501
4502 bool
4503 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4504 {
4505   return vect_lanes_optab_supported_p ("vec_store_lanes",
4506                                        vec_store_lanes_optab,
4507                                        vectype, count);
4508 }
4509
4510
4511 /* Function vect_permute_store_chain.
4512
4513    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4514    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4515    the data correctly for the stores.  Return the final references for stores
4516    in RESULT_CHAIN.
4517
4518    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4519    The input is 4 vectors each containing 8 elements.  We assign a number to
4520    each element, the input sequence is:
4521
4522    1st vec:   0  1  2  3  4  5  6  7
4523    2nd vec:   8  9 10 11 12 13 14 15
4524    3rd vec:  16 17 18 19 20 21 22 23
4525    4th vec:  24 25 26 27 28 29 30 31
4526
4527    The output sequence should be:
4528
4529    1st vec:  0  8 16 24  1  9 17 25
4530    2nd vec:  2 10 18 26  3 11 19 27
4531    3rd vec:  4 12 20 28  5 13 21 29
4532    4th vec:  6 14 22 30  7 15 23 31
4533
4534    i.e., we interleave the contents of the four vectors in their order.
4535
4536    We use interleave_high/low instructions to create such output.  The input of
4537    each interleave_high/low operation is two vectors:
4538    1st vec    2nd vec
4539    0 1 2 3    4 5 6 7
4540    the even elements of the result vector are obtained left-to-right from the
4541    high/low elements of the first vector.  The odd elements of the result are
4542    obtained left-to-right from the high/low elements of the second vector.
4543    The output of interleave_high will be:   0 4 1 5
4544    and of interleave_low:                   2 6 3 7
4545
4546
4547    The permutation is done in log LENGTH stages.  In each stage interleave_high
4548    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4549    where the first argument is taken from the first half of DR_CHAIN and the
4550    second argument from its second half.
4551    In our example,
4552
4553    I1: interleave_high (1st vec, 3rd vec)
4554    I2: interleave_low (1st vec, 3rd vec)
4555    I3: interleave_high (2nd vec, 4th vec)
4556    I4: interleave_low (2nd vec, 4th vec)
4557
4558    The output for the first stage is:
4559
4560    I1:  0 16  1 17  2 18  3 19
4561    I2:  4 20  5 21  6 22  7 23
4562    I3:  8 24  9 25 10 26 11 27
4563    I4: 12 28 13 29 14 30 15 31
4564
4565    The output of the second stage, i.e. the final result is:
4566
4567    I1:  0  8 16 24  1  9 17 25
4568    I2:  2 10 18 26  3 11 19 27
4569    I3:  4 12 20 28  5 13 21 29
4570    I4:  6 14 22 30  7 15 23 31.  */
4571
4572 void
4573 vect_permute_store_chain (vec<tree> dr_chain,
4574                           unsigned int length,
4575                           gimple stmt,
4576                           gimple_stmt_iterator *gsi,
4577                           vec<tree> *result_chain)
4578 {
4579   tree vect1, vect2, high, low;
4580   gimple perm_stmt;
4581   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4582   tree perm_mask_low, perm_mask_high;
4583   tree data_ref;
4584   tree perm3_mask_low, perm3_mask_high;
4585   unsigned int i, n, log_length = exact_log2 (length);
4586   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4587   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4588   /* Start RESULT_CHAIN as a copy of the first LENGTH entries of DR_CHAIN.  */
4589   result_chain->quick_grow (length);
4590   memcpy (result_chain->address (), dr_chain.address (),
4591           length * sizeof (tree));
4592
4593   if (length == 3)
4594     {
4595       unsigned int j0 = 0, j1 = 0, j2 = 0;
4596       /* Build output vector J from two chained permutes of the inputs.  */
4597       for (j = 0; j < 3; j++)
4598         {
4599           int nelt0 = ((3 - j) * nelt) % 3;
4600           int nelt1 = ((3 - j) * nelt + 1) % 3;
4601           int nelt2 = ((3 - j) * nelt + 2) % 3;
4602
4603           for (i = 0; i < nelt; i++)
4604             {
4605               if (3 * i + nelt0 < nelt)
4606                 sel[3 * i + nelt0] = j0++;
4607               if (3 * i + nelt1 < nelt)
4608                 sel[3 * i + nelt1] = nelt + j1++;
4609               if (3 * i + nelt2 < nelt)
4610                 sel[3 * i + nelt2] = 0;
4611             }
4612           perm3_mask_low = vect_gen_perm_mask (vectype, sel);
4613           gcc_assert (perm3_mask_low != NULL);
4614
4615           for (i = 0; i < nelt; i++)
4616             {
4617               if (3 * i + nelt0 < nelt)
4618                 sel[3 * i + nelt0] = 3 * i + nelt0;
4619               if (3 * i + nelt1 < nelt)
4620                 sel[3 * i + nelt1] = 3 * i + nelt1;
4621               if (3 * i + nelt2 < nelt)
4622                 sel[3 * i + nelt2] = nelt + j2++;
4623             }
4624           perm3_mask_high = vect_gen_perm_mask (vectype, sel);
4625           gcc_assert (perm3_mask_high != NULL);
4626
4627           vect1 = dr_chain[0];
4628           vect2 = dr_chain[1];
4629
4630           /* Create interleaving stmt:
4631              low = VEC_PERM_EXPR <vect1, vect2,
4632                                   {j, nelt, *, j + 1, nelt + j + 1, *,
4633                                    j + 2, nelt + j + 2, *, ...}>  */
4634           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4635           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4636                                                     vect1, vect2,
4637                                                     perm3_mask_low);
4638           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4639
4640           vect1 = data_ref;
4641           vect2 = dr_chain[2];
4642           /* Create interleaving stmt:
4643              high = VEC_PERM_EXPR <vect1, vect2,
4644                                    {0, 1, nelt + j, 3, 4, nelt + j + 1,
4645                                     6, 7, nelt + j + 2, ...}>  */
4646           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4647           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4648                                                     vect1, vect2,
4649                                                     perm3_mask_high);
4650           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4651           (*result_chain)[j] = data_ref;
4652         }
4653     }
4654   else
4655     {
4656       /* If length is not equal to 3 then only power of 2 is supported.  */
4657       gcc_assert (exact_log2 (length) != -1);
4658       /* Build the interleave-high selector, then derive the low one.  */
4659       for (i = 0, n = nelt / 2; i < n; i++)
4660         {
4661           sel[i * 2] = i;
4662           sel[i * 2 + 1] = i + nelt;
4663         }
4664         perm_mask_high = vect_gen_perm_mask (vectype, sel);
4665         gcc_assert (perm_mask_high != NULL);
4666
4667         for (i = 0; i < nelt; i++)
4668           sel[i] += nelt / 2;
4669         perm_mask_low = vect_gen_perm_mask (vectype, sel);
4670         gcc_assert (perm_mask_low != NULL);
4671
4672         for (i = 0, n = log_length; i < n; i++)
4673           {
4674             for (j = 0; j < length/2; j++)
4675               {
4676                 vect1 = dr_chain[j];
4677                 vect2 = dr_chain[j+length/2];
4678
4679                 /* Create interleaving stmt:
4680                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4681                                                         ...}>  */
4682                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4683                 perm_stmt
4684                   = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4685                                                   vect1, vect2, perm_mask_high);
4686                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4687                 (*result_chain)[2*j] = high;
4688
4689                 /* Create interleaving stmt:
4690                    low = VEC_PERM_EXPR <vect1, vect2,
4691                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4692                                          ...}>  */
4693                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4694                 perm_stmt
4695                   = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4696                                                   vect1, vect2, perm_mask_low);
4697                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4698                 (*result_chain)[2*j+1] = low;
4699               }
4700             memcpy (dr_chain.address (), result_chain->address (),
4701                     length * sizeof (tree));
4702           }
4703     }
4704 }
4705
4706 /* Function vect_setup_realignment
4707
4708    This function is called when vectorizing an unaligned load using
4709    the dr_explicit_realign[_optimized] scheme.
4710    This function generates the following code at the loop prolog:
4711
4712       p = initial_addr;
4713    x  msq_init = *(floor(p));   # prolog load
4714       realignment_token = call target_builtin;
4715     loop:
4716    x  msq = phi (msq_init, ---)
4717
4718    The stmts marked with x are generated only for the case of
4719    dr_explicit_realign_optimized.
4720
4721    The code above sets up a new (vector) pointer, pointing to the first
4722    location accessed by STMT, and a "floor-aligned" load using that pointer.
4723    It also generates code to compute the "realignment-token" (if the relevant
4724    target hook was defined), and creates a phi-node at the loop-header bb
4725    whose arguments are the result of the prolog-load (created by this
4726    function) and the result of a load that takes place in the loop (to be
4727    created by the caller to this function).
4728
4729    For the case of dr_explicit_realign_optimized:
4730    The caller to this function uses the phi-result (msq) to create the
4731    realignment code inside the loop, and sets up the missing phi argument,
4732    as follows:
4733     loop:
4734       msq = phi (msq_init, lsq)
4735       lsq = *(floor(p'));        # load in loop
4736       result = realign_load (msq, lsq, realignment_token);
4737
4738    For the case of dr_explicit_realign:
4739     loop:
4740       msq = *(floor(p));        # load in loop
4741       p' = p + (VS-1);
4742       lsq = *(floor(p'));       # load in loop
4743       result = realign_load (msq, lsq, realignment_token);
4744
4745    Input:
4746    STMT - (scalar) load stmt to be vectorized.  This load accesses
4747           a memory location that may be unaligned.
4748    GSI - place where new code is to be inserted.
4749    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4750                               is used.
        INIT_ADDR - if not NULL_TREE, the initial address of the access, already
                    computed by the caller inside the loop.  If NULL_TREE, this
                    function generates the computation itself (in the preheader
                    of LOOP when vectorizing a loop).
4751
4752    Output:
4753    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4754                        target hook, if defined.
        AT_LOOP - if non-NULL, set to the loop whose preheader is chosen for the
                  initial vector load.
4755    Return value - the result of the loop-header phi node (MSQ) for
        dr_explicit_realign_optimized; NULL_TREE for dr_explicit_realign.  */
4756
4757 tree
4758 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4759                         tree *realignment_token,
4760                         enum dr_alignment_support alignment_support_scheme,
4761                         tree init_addr,
4762                         struct loop **at_loop)
4763 {
4764   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4765   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4766   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4767   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4768   struct loop *loop = NULL;
4769   edge pe = NULL;
4770   tree scalar_dest = gimple_assign_lhs (stmt);
4771   tree vec_dest;
4772   gimple inc;
4773   tree ptr;
4774   tree data_ref;
4775   gimple new_stmt;
4776   basic_block new_bb;
4777   tree msq_init = NULL_TREE;
4778   tree new_temp;
4779   gimple phi_stmt;
4780   tree msq = NULL_TREE;
4781   gimple_seq stmts = NULL;
4782   bool inv_p;
4783   bool compute_in_loop = false;
4784   bool nested_in_vect_loop = false;
4785   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4786   struct loop *loop_for_initial_load = NULL;
4787
       /* LOOP and NESTED_IN_VECT_LOOP keep their NULL/false defaults when STMT
          has no loop_vec_info.  */
4788   if (loop_vinfo)
4789     {
4790       loop = LOOP_VINFO_LOOP (loop_vinfo);
4791       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4792     }
4793
4794   gcc_assert (alignment_support_scheme == dr_explicit_realign
4795               || alignment_support_scheme == dr_explicit_realign_optimized);
4796
4797   /* We need to generate three things:
4798      1. the misalignment computation
4799      2. the extra vector load (for the optimized realignment scheme).
4800      3. the phi node for the two vectors from which the realignment is
4801       done (for the optimized realignment scheme).  */
4802
4803   /* 1. Determine where to generate the misalignment computation.
4804
4805      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4806      calculation will be generated by this function, outside the loop (in the
4807      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4808      caller, inside the loop.
4809
4810      Background: If the misalignment remains fixed throughout the iterations of
4811      the loop, then both realignment schemes are applicable, and also the
4812      misalignment computation can be done outside LOOP.  This is because we are
4813      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4814      are a multiple of VS (the Vector Size), and therefore the misalignment in
4815      different vectorized LOOP iterations is always the same.
4816      The problem arises only if the memory access is in an inner-loop nested
4817      inside LOOP, which is now being vectorized using outer-loop vectorization.
4818      This is the only case when the misalignment of the memory access may not
4819      remain fixed throughout the iterations of the inner-loop (as explained in
4820      detail in vect_supportable_dr_alignment).  In this case, not only is the
4821      optimized realignment scheme not applicable, but also the misalignment
4822      computation (and generation of the realignment token that is passed to
4823      REALIGN_LOAD) have to be done inside the loop.
4824
4825      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4826      or not, which in turn determines if the misalignment is computed inside
4827      the inner-loop, or outside LOOP.  */
4828
       /* The absence of a loop_vec_info is treated like a caller-supplied
          INIT_ADDR: everything is emitted at GSI and only the non-optimized
          scheme is allowed.  */
4829   if (init_addr != NULL_TREE || !loop_vinfo)
4830     {
4831       compute_in_loop = true;
4832       gcc_assert (alignment_support_scheme == dr_explicit_realign);
4833     }
4834
4835
4836   /* 2. Determine where to generate the extra vector load.
4837
4838      For the optimized realignment scheme, instead of generating two vector
4839      loads in each iteration, we generate a single extra vector load in the
4840      preheader of the loop, and in each iteration reuse the result of the
4841      vector load from the previous iteration.  In case the memory access is in
4842      an inner-loop nested inside LOOP, which is now being vectorized using
4843      outer-loop vectorization, we need to determine whether this initial vector
4844      load should be generated at the preheader of the inner-loop, or can be
4845      generated at the preheader of LOOP.  If the memory access has no evolution
4846      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4847      to be generated inside LOOP (in the preheader of the inner-loop).  */
4848
4849   if (nested_in_vect_loop)
4850     {
           /* A zero DR step is what "no evolution in LOOP" means here.  */
4851       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4852       bool invariant_in_outerloop =
4853             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4854       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4855     }
4856   else
4857     loop_for_initial_load = loop;
4858   if (at_loop)
4859     *at_loop = loop_for_initial_load;
4860
       /* PE stays NULL when there is no loop to take a preheader edge from.  */
4861   if (loop_for_initial_load)
4862     pe = loop_preheader_edge (loop_for_initial_load);
4863
4864   /* 3. For the case of the optimized realignment, create the first vector
4865       load at the loop preheader.  */
4866
4867   if (alignment_support_scheme == dr_explicit_realign_optimized)
4868     {
4869       /* Create msq_init = *(floor(p1)) in the loop preheader  */
4870
4871       gcc_assert (!compute_in_loop);
4872       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4873       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4874                                       NULL_TREE, &init_addr, NULL, &inc,
4875                                       true, &inv_p);
           /* new_temp = ptr & -TYPE_ALIGN_UNIT (vectype), i.e. PTR with its low
              bits cleared: the "floor" address used for the aligned load.  */
4876       new_temp = copy_ssa_name (ptr, NULL);
4877       new_stmt = gimple_build_assign_with_ops
4878                    (BIT_AND_EXPR, new_temp, ptr,
4879                     build_int_cst (TREE_TYPE (ptr),
4880                                    -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4881       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4882       gcc_assert (!new_bb);
4883       data_ref
4884         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4885                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4886       new_stmt = gimple_build_assign (vec_dest, data_ref);
4887       new_temp = make_ssa_name (vec_dest, new_stmt);
4888       gimple_assign_set_lhs (new_stmt, new_temp);
           /* NOTE(review): PE was already passed unconditionally to
              gsi_insert_on_edge_immediate just above, so the else arm below
              looks unreachable on this path -- confirm.  */
4889       if (pe)
4890         {
4891           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4892           gcc_assert (!new_bb);
4893         }
4894       else
4895          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4896
4897       msq_init = gimple_assign_lhs (new_stmt);
4898     }
4899
4900   /* 4. Create realignment token using a target builtin, if available.
4901       It is done either inside the containing loop, or before LOOP (as
4902       determined above).  */
4903
4904   if (targetm.vectorize.builtin_mask_for_load)
4905     {
4906       tree builtin_decl;
4907
4908       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
4909       if (!init_addr)
4910         {
4911           /* Generate the INIT_ADDR computation outside LOOP.  */
4912           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4913                                                         NULL_TREE, loop);
4914           if (loop)
4915             {
4916               pe = loop_preheader_edge (loop);
4917               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4918               gcc_assert (!new_bb);
4919             }
4920           else
4921              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4922         }
4923
           /* Build "token = builtin (init_addr)"; the destination variable
              takes the call's return type.  */
4924       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4925       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4926       vec_dest =
4927         vect_create_destination_var (scalar_dest,
4928                                      gimple_call_return_type (new_stmt));
4929       new_temp = make_ssa_name (vec_dest, new_stmt);
4930       gimple_call_set_lhs (new_stmt, new_temp);
4931
4932       if (compute_in_loop)
4933         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4934       else
4935         {
4936           /* Generate the misalignment computation outside LOOP.  */
4937           pe = loop_preheader_edge (loop);
4938           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4939           gcc_assert (!new_bb);
4940         }
4941
4942       *realignment_token = gimple_call_lhs (new_stmt);
4943
4944       /* The result of the CALL_EXPR to this builtin is determined from
4945          the value of the parameter and no global variables are touched
4946          which makes the builtin a "const" function.  Requiring the
4947          builtin to have the "const" attribute makes it unnecessary
4948          to call mark_call_clobbered.  */
4949       gcc_assert (TREE_READONLY (builtin_decl));
4950     }
4951
       /* No phi node is created for dr_explicit_realign; MSQ is still
          NULL_TREE at this point.  */
4952   if (alignment_support_scheme == dr_explicit_realign)
4953     return msq;
4954
4955   gcc_assert (!compute_in_loop);
4956   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4957
4958
4959   /* 5. Create msq = phi <msq_init, lsq> in loop  */
4960
       /* Only the preheader argument (MSQ_INIT) is added here; per the
          function comment, the caller supplies the LSQ argument.  */
4961   pe = loop_preheader_edge (containing_loop);
4962   vec_dest = vect_create_destination_var (scalar_dest, vectype);
4963   msq = make_ssa_name (vec_dest, NULL);
4964   phi_stmt = create_phi_node (msq, containing_loop->header);
4965   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4966
4967   return msq;
4968 }
4969
4970
4971 /* Function vect_grouped_load_supported.
4972
4973    Return TRUE if the target supports the permutations needed to reorder
4974    a group of COUNT interleaved loads of type VECTYPE, and FALSE otherwise.
        COUNT must be 3 or a power of two.  For COUNT == 3 the two shuffle
        masks built for each of the three output vectors are checked; for a
        power of two the extract-even and extract-odd masks are checked.
        When the answer is FALSE a reason is dumped if dumping is enabled.  */
4975
4976 bool
4977 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4978 {
4979   enum machine_mode mode = TYPE_MODE (vectype);
4980
4981   /* vect_permute_load_chain requires the group size to be equal to 3 or
4982      be a power of two.  */
4983   if (count != 3 && exact_log2 (count) == -1)
4984     {
4985       if (dump_enabled_p ())
4986         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4987                          "the size of the group of accesses"
4988                          " is not a power of 2 or not equal to 3\n");
4989       return false;
4990     }
4991
4992   /* Check that the permutation is supported.  */
4993   if (VECTOR_MODE_P (mode))
4994     {
4995       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
4996       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4997
4998       if (count == 3)
4999         {
5000           unsigned int k;
               /* K is the index of the output vector; both of its masks must
                  be supported.  */
5001           for (k = 0; k < 3; k++)
5002             {
                   /* First mask: every third element starting at K, as long as
                      the index stays below 2 * NELT; the remaining slots get
                      a placeholder 0.  */
5003               for (i = 0; i < nelt; i++)
5004                 if (3 * i + k < 2 * nelt)
5005                   sel[i] = 3 * i + k;
5006                 else
5007                   sel[i] = 0;
5008               if (!can_vec_perm_p (mode, false, sel))
5009                 {
5010                   if (dump_enabled_p ())
5011                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5012                                      "shuffle of 3 loads is not supported by"
5013                                      " target\n");
5014                     return false;
5015                 }
                   /* Second mask: keep the slots the first mask filled
                      (identity indices) and give the placeholder slots
                      indices >= NELT, stepping by 3.  J counts the
                      placeholder slots seen so far.  */
5016               for (i = 0, j = 0; i < nelt; i++)
5017                 if (3 * i + k < 2 * nelt)
5018                   sel[i] = i;
5019                 else
5020                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5021               if (!can_vec_perm_p (mode, false, sel))
5022                 {
5023                   if (dump_enabled_p ())
5024                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5025                                      "shuffle of 3 loads is not supported by"
5026                                      " target\n");
5027                   return false;
5028                 }
5029             }
5030           return true;
5031         }
5032       else
5033         {
5034           /* If length is not equal to 3 then only power of 2 is supported.  */
5035           gcc_assert (exact_log2 (count) != -1);
               /* Extract-even mask {0, 2, 4, ...}, then extract-odd mask
                  {1, 3, 5, ...}; both must be supported.  */
5036           for (i = 0; i < nelt; i++)
5037             sel[i] = i * 2;
5038           if (can_vec_perm_p (mode, false, sel))
5039             {
5040               for (i = 0; i < nelt; i++)
5041                 sel[i] = i * 2 + 1;
5042               if (can_vec_perm_p (mode, false, sel))
5043                 return true;
5044             }
5045         }
5046     }
5047
       /* Reached when MODE is not a vector mode, or when one of the even/odd
          masks above was rejected.  */
5048   if (dump_enabled_p ())
5049     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5050                      "extract even/odd not supported by target\n");
5051   return false;
5052 }
5053
5054 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5055    type VECTYPE.  The query is delegated to vect_lanes_optab_supported_p
        using the vec_load_lanes name and vec_load_lanes_optab.  */
5056
5057 bool
5058 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5059 {
5060   return vect_lanes_optab_supported_p ("vec_load_lanes",
5061                                        vec_load_lanes_optab,
5062                                        vectype, count);
5063 }
5064
5065 /* Function vect_permute_load_chain.
5066
5067    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5068    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5069    the input data correctly.  Return the final references for loads in
5070    RESULT_CHAIN.
5071
5072    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5073    The input is 4 vectors each containing 8 elements. We assign a number to each
5074    element, the input sequence is:
5075
5076    1st vec:   0  1  2  3  4  5  6  7
5077    2nd vec:   8  9 10 11 12 13 14 15
5078    3rd vec:  16 17 18 19 20 21 22 23
5079    4th vec:  24 25 26 27 28 29 30 31
5080
5081    The output sequence should be:
5082
5083    1st vec:  0 4  8 12 16 20 24 28
5084    2nd vec:  1 5  9 13 17 21 25 29
5085    3rd vec:  2 6 10 14 18 22 26 30
5086    4th vec:  3 7 11 15 19 23 27 31
5087
5088    i.e., the first output vector should contain the first elements of each
5089    interleaving group, etc.
5090
5091    We use extract_even/odd instructions to create such output.  The input of
5092    each extract_even/odd operation is two vectors
5093    1st vec    2nd vec
5094    0 1 2 3    4 5 6 7
5095
5096    and the output is the vector of extracted even/odd elements.  The output of
5097    extract_even will be:   0 2 4 6
5098    and of extract_odd:     1 3 5 7
5099
5100
5101    The permutation is done in log LENGTH stages.  In each stage extract_even
5102    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5103    their order.  In our example,
5104
5105    E1: extract_even (1st vec, 2nd vec)
5106    E2: extract_odd (1st vec, 2nd vec)
5107    E3: extract_even (3rd vec, 4th vec)
5108    E4: extract_odd (3rd vec, 4th vec)
5109
5110    The output for the first stage will be:
5111
5112    E1:  0  2  4  6  8 10 12 14
5113    E2:  1  3  5  7  9 11 13 15
5114    E3: 16 18 20 22 24 26 28 30
5115    E4: 17 19 21 23 25 27 29 31
5116
5117    In order to proceed and create the correct sequence for the next stage (or
5118    for the correct output, if the second stage is the last one, as in our
5119    example), we first put the output of extract_even operation and then the
5120    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5121    The input for the second stage is:
5122
5123    1st vec (E1):  0  2  4  6  8 10 12 14
5124    2nd vec (E3): 16 18 20 22 24 26 28 30
5125    3rd vec (E2):  1  3  5  7  9 11 13 15
5126    4th vec (E4): 17 19 21 23 25 27 29 31
5127
5128    The output of the second stage:
5129
5130    E1: 0 4  8 12 16 20 24 28
5131    E2: 2 6 10 14 18 22 26 30
5132    E3: 1 5  9 13 17 21 25 29
5133    E4: 3 7 11 15 19 23 27 31
5134
5135    And RESULT_CHAIN after reordering:
5136
5137    1st vec (E1):  0 4  8 12 16 20 24 28
5138    2nd vec (E3):  1 5  9 13 17 21 25 29
5139    3rd vec (E2):  2 6 10 14 18 22 26 30
5140    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5141
5142 static void
5143 vect_permute_load_chain (vec<tree> dr_chain,
5144                          unsigned int length,
5145                          gimple stmt,
5146                          gimple_stmt_iterator *gsi,
5147                          vec<tree> *result_chain)
5148 {
5149   tree data_ref, first_vect, second_vect;
5150   tree perm_mask_even, perm_mask_odd;
5151   tree perm3_mask_low, perm3_mask_high;
5152   gimple perm_stmt;
5153   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
       /* LOG_LENGTH is only used on the power-of-two path below.  */
5154   unsigned int i, j, log_length = exact_log2 (length);
5155   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5156   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5157
       /* Start RESULT_CHAIN as a copy of the LENGTH input vectors; the
          entries are overwritten below as permuted vectors are created.  */
5158   result_chain->quick_grow (length);
5159   memcpy (result_chain->address (), dr_chain.address (),
5160           length * sizeof (tree));
5161
5162   if (length == 3)
5163     {
5164       unsigned int k;
5165
           /* Output vector K is built with two shuffles: the first combines
              dr_chain[0] and dr_chain[1], the second combines that result
              with dr_chain[2].  */
5166       for (k = 0; k < 3; k++)
5167         {
               /* First mask: every third element starting at K while the index
                  stays below 2 * NELT; remaining slots get a placeholder 0.  */
5168           for (i = 0; i < nelt; i++)
5169             if (3 * i + k < 2 * nelt)
5170               sel[i] = 3 * i + k;
5171             else
5172               sel[i] = 0;
5173           perm3_mask_low = vect_gen_perm_mask (vectype, sel);
5174           gcc_assert (perm3_mask_low != NULL);
5175
               /* Second mask: identity for the slots already filled, indices
                  >= NELT (stepping by 3) for the placeholder slots.  */
5176           for (i = 0, j = 0; i < nelt; i++)
5177             if (3 * i + k < 2 * nelt)
5178               sel[i] = i;
5179             else
5180               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5181
5182           perm3_mask_high = vect_gen_perm_mask (vectype, sel);
5183           gcc_assert (perm3_mask_high != NULL);
5184
5185           first_vect = dr_chain[0];
5186           second_vect = dr_chain[1];
5187
5188           /* Create interleaving stmt (low part of):
5189              low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5190                                                             ...}>  */
5191           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5192           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5193                                                     first_vect, second_vect,
5194                                                     perm3_mask_low);
5195           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5196
5197           /* Create interleaving stmt (high part of):
5198              high = VEC_PERM_EXPR <low, third_vect, {0, 1, ..., followed by
5199                     indices >= nelt for the slots LOW left as placeholders}>  */
5200           first_vect = data_ref;
5201           second_vect = dr_chain[2];
5202           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5203           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5204                                                     first_vect, second_vect,
5205                                                     perm3_mask_high);
5206           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5207           (*result_chain)[k] = data_ref;
5208         }
5209     }
5210   else
5211     {
5212       /* If length is not equal to 3 then only power of 2 is supported.  */
5213       gcc_assert (exact_log2 (length) != -1);
5214
           /* Extract-even mask {0, 2, 4, ...}.  */
5215       for (i = 0; i < nelt; ++i)
5216         sel[i] = i * 2;
5217       perm_mask_even = vect_gen_perm_mask (vectype, sel);
5218       gcc_assert (perm_mask_even != NULL);
5219
           /* Extract-odd mask {1, 3, 5, ...}.  */
5220       for (i = 0; i < nelt; ++i)
5221         sel[i] = i * 2 + 1;
5222       perm_mask_odd = vect_gen_perm_mask (vectype, sel);
5223       gcc_assert (perm_mask_odd != NULL);
5224
           /* One pass per stage; each pass consumes adjacent pairs of DR_CHAIN
              and stores the even results in the first half of RESULT_CHAIN and
              the odd results in the second half.  */
5225       for (i = 0; i < log_length; i++)
5226         {
5227           for (j = 0; j < length; j += 2)
5228             {
5229               first_vect = dr_chain[j];
5230               second_vect = dr_chain[j+1];
5231
5232               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5233               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5234               perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5235                                                         first_vect, second_vect,
5236                                                         perm_mask_even);
5237               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5238               (*result_chain)[j/2] = data_ref;
5239
5240               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5241               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5242               perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5243                                                         first_vect, second_vect,
5244                                                         perm_mask_odd);
5245               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5246               (*result_chain)[j/2+length/2] = data_ref;
5247             }
               /* Feed this stage's output back into DR_CHAIN so the next stage
                  reads the reordered vectors.  */
5248           memcpy (dr_chain.address (), result_chain->address (),
5249                   length * sizeof (tree));
5250         }
5251     }
5252 }
5253
5254 /* Function vect_shift_permute_load_chain.
5255
5256    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5257    sequence of stmts to reorder the input data accordingly.
5258    Return the final references for loads in RESULT_CHAIN.
5259    Return true if successful, false otherwise.
5260
5261    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5262    The input is 3 vectors each containing 8 elements.  We assign a
5263    number to each element, the input sequence is:
5264
5265    1st vec:   0  1  2  3  4  5  6  7
5266    2nd vec:   8  9 10 11 12 13 14 15
5267    3rd vec:  16 17 18 19 20 21 22 23
5268
5269    The output sequence should be:
5270
5271    1st vec:  0 3 6  9 12 15 18 21
5272    2nd vec:  1 4 7 10 13 16 19 22
5273    3rd vec:  2 5 8 11 14 17 20 23
5274
5275    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5276
5277    First we shuffle all 3 vectors to get correct elements order:
5278
5279    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5280    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5281    3rd vec:  (16 19 22) (17 20 23) (18 21)
5282
5283    Next we unite and shift vector 3 times:
5284
5285    1st step:
5286      shift right by 6 the concatenation of:
5287      "1st vec" and  "2nd vec"
5288        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5289      "2nd vec" and  "3rd vec"
5290        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5291      "3rd vec" and  "1st vec"
5292        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5293                              | New vectors                   |
5294
5295      So that now new vectors are:
5296
5297      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5298      2nd vec:  (10 13) (16 19 22) (17 20 23)
5299      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5300
5301    2nd step:
5302      shift right by 5 the concatenation of:
5303      "1st vec" and  "3rd vec"
5304        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5305      "2nd vec" and  "1st vec"
5306        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5307      "3rd vec" and  "2nd vec"
5308        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5309                           | New vectors                   |
5310
5311      So that now new vectors are:
5312
5313      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5314      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5315      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5316
5317    3rd step:
5318      shift right by 5 the concatenation of:
5319      "1st vec" and  "1st vec"
5320        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5321      shift right by 3 the concatenation of:
5322      "2nd vec" and  "2nd vec"
5323                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5324                           | New vectors                   |
5325
5326      So that now all vectors are READY:
5327      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5328      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5329      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5330
5331    This algorithm is faster than one in vect_permute_load_chain if:
5332      1.  "shift of a concatenation" is faster than general permutation.
5333          This is usually so.
5334      2.  The TARGET machine can't execute vector instructions in parallel.
5335          This is because each step of the algorithm depends on previous.
5336          The algorithm in vect_permute_load_chain is much more parallel.
5337
5338    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5339 */
5340
5341 static bool
5342 vect_shift_permute_load_chain (vec<tree> dr_chain,
5343                                unsigned int length,
5344                                gimple stmt,
5345                                gimple_stmt_iterator *gsi,
5346                                vec<tree> *result_chain)
5347 {
5348   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5349   tree perm2_mask1, perm2_mask2, perm3_mask;
5350   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5351   gimple perm_stmt;
5352
5353   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5354   unsigned int i;
5355   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5356   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5357   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5358   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5359
5360   result_chain->quick_grow (length);
5361   memcpy (result_chain->address (), dr_chain.address (),
5362           length * sizeof (tree));
5363
5364   if (length == 2 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5365     {
5366       for (i = 0; i < nelt / 2; ++i)
5367         sel[i] = i * 2;
5368       for (i = 0; i < nelt / 2; ++i)
5369         sel[nelt / 2 + i] = i * 2 + 1;
5370       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5371         {
5372           if (dump_enabled_p ())
5373             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5374                              "shuffle of 2 fields structure is not \
5375                               supported by target\n");
5376           return false;
5377         }
5378       perm2_mask1 = vect_gen_perm_mask (vectype, sel);
5379       gcc_assert (perm2_mask1 != NULL);
5380
5381       for (i = 0; i < nelt / 2; ++i)
5382         sel[i] = i * 2 + 1;
5383       for (i = 0; i < nelt / 2; ++i)
5384         sel[nelt / 2 + i] = i * 2;
5385       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5386         {
5387           if (dump_enabled_p ())
5388             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5389                              "shuffle of 2 fields structure is not \
5390                               supported by target\n");
5391           return false;
5392         }
5393       perm2_mask2 = vect_gen_perm_mask (vectype, sel);
5394       gcc_assert (perm2_mask2 != NULL);
5395
5396       /* Generating permutation constant to shift all elements.
5397          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5398       for (i = 0; i < nelt; i++)
5399         sel[i] = nelt / 2 + i;
5400       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5401         {
5402           if (dump_enabled_p ())
5403             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5404                              "shift permutation is not supported by target\n");
5405           return false;
5406         }
5407       shift1_mask = vect_gen_perm_mask (vectype, sel);
5408       gcc_assert (shift1_mask != NULL);
5409
5410       /* Generating permutation constant to select vector from 2.
5411          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5412       for (i = 0; i < nelt / 2; i++)
5413         sel[i] = i;
5414       for (i = nelt / 2; i < nelt; i++)
5415         sel[i] = nelt + i;
5416       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5417         {
5418           if (dump_enabled_p ())
5419             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5420                              "select is not supported by target\n");
5421           return false;
5422         }
5423       select_mask = vect_gen_perm_mask (vectype, sel);
5424       gcc_assert (select_mask != NULL);
5425
5426       first_vect = dr_chain[0];
5427       second_vect = dr_chain[1];
5428
5429       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5430       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5431                                                 first_vect, first_vect,
5432                                                 perm2_mask1);
5433       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5434       vect[0] = data_ref;
5435
5436       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5437       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5438                                                 second_vect, second_vect,
5439                                                 perm2_mask2);
5440       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5441       vect[1] = data_ref;
5442
5443       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5444       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5445                                                 vect[0], vect[1],
5446                                                 shift1_mask);
5447       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5448       (*result_chain)[1] = data_ref;
5449
5450       data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5451       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5452                                                 vect[0], vect[1],
5453                                                 select_mask);
5454       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5455       (*result_chain)[0] = data_ref;
5456
5457       return true;
5458     }
5459   if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5460     {
5461       unsigned int k = 0, l = 0;
5462
5463       /* Generating permutation constant to get all elements in right order.
5464          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5465       for (i = 0; i < nelt; i++)
5466         {
5467           if (3 * k + (l % 3) >= nelt)
5468             {
5469               k = 0;
5470               l += (3 - (nelt % 3));
5471             }
5472           sel[i] = 3 * k + (l % 3);
5473           k++;
5474         }
5475       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5476         {
5477           if (dump_enabled_p ())
5478             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5479                              "shuffle of 3 fields structure is not \
5480                               supported by target\n");
5481           return false;
5482         }
5483       perm3_mask = vect_gen_perm_mask (vectype, sel);
5484       gcc_assert (perm3_mask != NULL);
5485
5486       /* Generating permutation constant to shift all elements.
5487          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5488       for (i = 0; i < nelt; i++)
5489         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5490       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5491         {
5492           if (dump_enabled_p ())
5493             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5494                              "shift permutation is not supported by target\n");
5495           return false;
5496         }
5497       shift1_mask = vect_gen_perm_mask (vectype, sel);
5498       gcc_assert (shift1_mask != NULL);
5499
5500       /* Generating permutation constant to shift all elements.
5501          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5502       for (i = 0; i < nelt; i++)
5503         sel[i] = 2 * (nelt / 3) + 1 + i;
5504       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5505         {
5506           if (dump_enabled_p ())
5507             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5508                              "shift permutation is not supported by target\n");
5509           return false;
5510         }
5511       shift2_mask = vect_gen_perm_mask (vectype, sel);
5512       gcc_assert (shift2_mask != NULL);
5513
5514       /* Generating permutation constant to shift all elements.
5515          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5516       for (i = 0; i < nelt; i++)
5517         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5518       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5519         {
5520           if (dump_enabled_p ())
5521             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5522                              "shift permutation is not supported by target\n");
5523           return false;
5524         }
5525       shift3_mask = vect_gen_perm_mask (vectype, sel);
5526       gcc_assert (shift3_mask != NULL);
5527
5528       /* Generating permutation constant to shift all elements.
5529          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5530       for (i = 0; i < nelt; i++)
5531         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5532       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5533         {
5534           if (dump_enabled_p ())
5535             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5536                              "shift permutation is not supported by target\n");
5537           return false;
5538         }
5539       shift4_mask = vect_gen_perm_mask (vectype, sel);
5540       gcc_assert (shift4_mask != NULL);
5541
5542       for (k = 0; k < 3; k++)
5543         {
5544           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5545           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5546                                                     dr_chain[k], dr_chain[k],
5547                                                     perm3_mask);
5548           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5549           vect[k] = data_ref;
5550         }
5551
5552       for (k = 0; k < 3; k++)
5553         {
5554           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5555           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5556                                                     vect[k % 3],
5557                                                     vect[(k + 1) % 3],
5558                                                     shift1_mask);
5559           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5560           vect_shift[k] = data_ref;
5561         }
5562
5563       for (k = 0; k < 3; k++)
5564         {
5565           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5566           perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5567                                                     vect_shift[(4 - k) % 3],
5568                                                     vect_shift[(3 - k) % 3],
5569                                                     shift2_mask);
5570           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5571           vect[k] = data_ref;
5572         }
5573
5574       (*result_chain)[3 - (nelt % 3)] = vect[2];
5575
5576       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5577       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5578                                                 vect[0], vect[0],
5579                                                 shift3_mask);
5580       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5581       (*result_chain)[nelt % 3] = data_ref;
5582
5583       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5584       perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5585                                                 vect[1], vect[1],
5586                                                 shift4_mask);
5587       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5588       (*result_chain)[0] = data_ref;
5589       return true;
5590     }
5591   return false;
5592 }
5593
5594 /* Function vect_transform_grouped_load.
5595
5596    DR_CHAIN holds the interleaved input data-refs of the load group of STMT.
5597    Build the statements that permute them (SIZE is handed to the permutation
5598    routines and reserves their output vector; statements are emitted through
5599    GSI) and ascribe the resulting vectorized statements to the scalar ones.  */
5600
5601 void
5602 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5603                              gimple_stmt_iterator *gsi)
5604 {
5605   /* PERMUTED receives the vectors produced by the permutation routines.  */
5606   vec<tree> permuted = vNULL;
5607   permuted.create (size);
5608
5609   /* The shift-based permutation is only attempted when the target reports a
5610      reassociation width of at most 1 for VEC_PERM_EXPR in the vector mode
5611      of STMT; wider targets go straight to the generic permutation.  */
5612   enum machine_mode vmode
5613     = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5614   bool try_shift
5615     = targetm.sched.reassociation_width (VEC_PERM_EXPR, vmode) <= 1;
5616
5617   /* Fall back to the generic permutation if the shift variant fails.  */
5618   if (!try_shift
5619       || !vect_shift_permute_load_chain (dr_chain, size, stmt, gsi, &permuted))
5620     vect_permute_load_chain (dr_chain, size, stmt, gsi, &permuted);
5621   vect_record_grouped_load_vectors (stmt, permuted);
5622   permuted.release ();
5623 }
5624
5625 /* RESULT_CHAIN contains the vectors generated for the group of grouped loads
5626    that STMT belongs to.  Walk the group from its first element and record the
5627    defining statement of each vector with its scalar load, skipping gaps.  */
5628
5629 void
5630 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5631 {
5632   gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5633   gimple next_stmt, new_stmt;
5634   unsigned int i, gap_count;
5635   tree tmp_data_ref;
5636
5637   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5638      Since we scan the chain starting from its first node, their order
5639      corresponds to the order of data-refs in RESULT_CHAIN.  */
5640   next_stmt = first_stmt;
5641   gap_count = 1;
5642   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5643     {
5644       if (!next_stmt)
5645         break;
5646
5647       /* Skip the gaps.  Loads created for the gaps will be removed by dead
5648          code elimination pass later.  No need to check for the first stmt in
5649          the group, since it always exists.
5650          GROUP_GAP is the number of steps in elements from the previous
5651          access (if there is no gap GROUP_GAP is 1).  We skip loads that
5652          correspond to the gaps.  */
5653       if (next_stmt != first_stmt
5654           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5655       {
5656         gap_count++;
5657         continue;
5658       }
5659
5660       while (next_stmt)
5661         {
5662           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5663           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5664              copies, and we put the new vector statement in the first available
5665              RELATED_STMT.  */
5666           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5667             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5668           else
5669             {
5670               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5671                 {
5672                   gimple prev_stmt =
5673                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5674                   gimple rel_stmt =
5675                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5676                   while (rel_stmt)
5677                     {
5678                       prev_stmt = rel_stmt;
5679                       rel_stmt =
5680                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5681                     }
5682                   /* Append NEW_STMT after the last statement of the chain.  */
5683                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5684                     new_stmt;
5685                 }
5686             }
5687
5688           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5689           gap_count = 1;
5690           /* If NEXT_STMT accesses the same DR as the previous statement,
5691              put the same TMP_DATA_REF as its vectorized statement; otherwise
5692              get the next data-ref from RESULT_CHAIN.  */
5693           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5694             break;
5695         }
5696     }
5697 }
5698
5699 /* Function vect_can_force_dr_alignment_p.
5700
5701    Return true if the alignment of DECL may be raised to an ALIGNMENT-bit
5702    boundary, and false as soon as one of the checks below forbids it.
5703    Anything that is not a VAR_DECL is rejected.  */
5704
5705 bool
5706 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5707 {
5708   /* Only variables qualify.  With -fno-toplevel-reorder we may have already
5709      output the constant, and constant pool entries may be shared and not
5710      properly merged by LTO.  */
5711   if (TREE_CODE (decl) != VAR_DECL
5712       || TREE_ASM_WRITTEN (decl)
5713       || DECL_IN_CONSTANT_POOL (decl))
5714     return false;
5715
5716   if (TREE_PUBLIC (decl) || DECL_EXTERNAL (decl))
5717     {
5718       /* We cannot change alignment of symbols that may bind to symbols
5719          in other translation unit that may contain a definition with lower
5720          alignment.  */
5721       if (!decl_binds_to_current_def_p (decl))
5722         return false;
5723
5724       /* When compiling a partition, be sure the symbol is not output by
5725          another partition.  */
5726       symtab_node *node = symtab_node::get (decl);
5727       if (flag_ltrans)
5728         {
5729           if (node->in_other_partition)
5730             return false;
5731           if (node->get_partitioning_class () == SYMBOL_DUPLICATE)
5732             return false;
5733         }
5734     }
5735
5736   /* Do not override the alignment as specified by the ABI when the used
5737      attribute is set.  */
5738   if (DECL_PRESERVE_P (decl))
5739     return false;
5740
5741   if (TREE_STATIC (decl))
5742     {
5743       /* Do not override explicit alignment set by the user when an explicit
5744          section name is also used.  This is a common idiom used by many
5745          software projects.  */
5746       if (DECL_SECTION_NAME (decl) != NULL
5747           && !symtab_node::get (decl)->implicit_section)
5748         return false;
5749
5750       /* If the symbol is an alias, the alias target has to be OK as well;
5751          from here on the target is what gets checked.  */
5752       tree target = symtab_node::get (decl)->ultimate_alias_target ()->decl;
5753       if (target != decl)
5754         {
5755           if (DECL_PRESERVE_P (target))
5756             return false;
5757           decl = target;
5758         }
5759     }
5760
5761   /* Static storage is limited by MAX_OFILE_ALIGNMENT, everything else by
5762      MAX_STACK_ALIGNMENT.  */
5763   return (TREE_STATIC (decl)
5764           ? alignment <= MAX_OFILE_ALIGNMENT
5765           : alignment <= MAX_STACK_ALIGNMENT);
5766 }
5767
5768
5769 /* Function vect_supportable_dr_alignment.
5770
5771    Return how the data reference DR is supported with respect to its
5772    alignment:
5773      dr_aligned - DR is aligned and CHECK_ALIGNED_ACCESSES is false;
5774      dr_unaligned_supported - IFN_MASK_LOAD/IFN_MASK_STORE calls, and accesses
5775        that pass the TYPE_USER_ALIGN / support_vector_misalignment test;
5776      dr_explicit_realign, dr_explicit_realign_optimized - loads for which
5777        the target provides the realignment support tested below;
5778      dr_unaligned_unsupported - everything else.
5779    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5780    if it is aligned, i.e., check if it is possible to vectorize it with
5781    different alignment.  */
5782
5783 enum dr_alignment_support
5784 vect_supportable_dr_alignment (struct data_reference *dr,
5785                                bool check_aligned_accesses)
5786 {
5787   gimple stmt = DR_STMT (dr);
5788   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5789   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5790   enum machine_mode vmode = TYPE_MODE (vectype);
5791   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5792
5793   if (!check_aligned_accesses && aligned_access_p (dr))
5794     return dr_aligned;
5795
5796   /* For now assume all conditional loads/stores support unaligned
5797      access without any special code.  */
5798   if (is_gimple_call (stmt) && gimple_call_internal_p (stmt))
5799     switch (gimple_call_internal_fn (stmt))
5800       {
5801       case IFN_MASK_LOAD:
5802       case IFN_MASK_STORE:
5803         return dr_unaligned_supported;
5804       default:
5805         break;
5806       }
5807
5808   /* Without a LOOP_VINFO the access is never treated as nested.  */
5809   bool nested_in_vect_loop
5810     = (loop_vinfo
5811        && nested_in_vect_loop_p (LOOP_VINFO_LOOP (loop_vinfo), stmt));
5812
5813   /* Possibly unaligned access.  */
5814
5815   /* We can choose between using the implicit realignment scheme (generating
5816      a misaligned_move stmt) and the explicit realignment scheme (generating
5817      aligned loads with a REALIGN_LOAD).  There are two variants to the
5818      explicit realignment scheme: optimized, and unoptimized.
5819      We can optimize the realignment only if the step between consecutive
5820      vector loads is equal to the vector size.  Since the vector memory
5821      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5822      is guaranteed that the misalignment amount remains the same throughout the
5823      execution of the vectorized loop.  Therefore, we can create the
5824      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5825      at the loop preheader.
5826
5827      However, in the case of outer-loop vectorization, when vectorizing a
5828      memory access in the inner-loop nested within the LOOP that is now being
5829      vectorized, while it is guaranteed that the misalignment of the
5830      vectorized memory access will remain the same in different outer-loop
5831      iterations, it is *not* guaranteed that it will remain the same throughout
5832      the execution of the inner-loop.  This is because the inner-loop advances
5833      with the original scalar step (and not in steps of VS).  If the inner-loop
5834      step happens to be a multiple of VS, then the misalignment remains fixed
5835      and we can use the optimized realignment scheme.  For example:
5836
5837       for (i=0; i<N; i++)
5838         for (j=0; j<M; j++)
5839           s += a[i+j];
5840
5841      When vectorizing the i-loop in the above example, the step between
5842      consecutive vector loads is 1, and so the misalignment does not remain
5843      fixed across the execution of the inner-loop, and the realignment cannot
5844      be optimized (as illustrated in the following pseudo vectorized loop):
5845
5846       for (i=0; i<N; i+=4)
5847         for (j=0; j<M; j++){
5848           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5849                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
5850                          // (assuming that we start from an aligned address).
5851           }
5852
5853      We therefore have to use the unoptimized realignment scheme:
5854
5855       for (i=0; i<N; i+=4)
5856           for (j=k; j<M; j+=4)
5857           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5858                            // that the misalignment of the initial address is
5859                            // 0).
5860
5861      The loop can then be vectorized as follows:
5862
5863       for (k=0; k<4; k++){
5864         rt = get_realignment_token (&vp[k]);
5865         for (i=0; i<N; i+=4){
5866           v1 = vp[i+k];
5867           for (j=k; j<M; j+=4){
5868             v2 = vp[i+j+VS-1];
5869             va = REALIGN_LOAD <v1,v2,rt>;
5870             vs += va;
5871             v1 = v2;
5872           }
5873         }
5874     } */
5875
5876   if (DR_IS_READ (dr)
5877       && optab_handler (vec_realign_load_optab, vmode) != CODE_FOR_nothing
5878       && (!targetm.vectorize.builtin_mask_for_load
5879           || targetm.vectorize.builtin_mask_for_load ()))
5880     {
5881       /* Explicit realignment is available for this load.  The optimized
5882          variant additionally needs a vectorized loop and, for a nested
5883          access, a step equal to the vector size (see above).  */
5884       if (!loop_vinfo)
5885         return dr_explicit_realign;
5886       if (nested_in_vect_loop
5887           && TREE_INT_CST_LOW (DR_STEP (dr)) != GET_MODE_SIZE (vmode))
5888         return dr_explicit_realign;
5889       return dr_explicit_realign_optimized;
5890     }
5891
5892   /* Loads without explicit realignment and all stores: check whether the
5893      misaligned access itself is supported.  */
5894   tree type = TREE_TYPE (DR_REF (dr));
5895   bool is_packed = false;
5896   if (!known_alignment_for_access_p (dr))
5897     is_packed = not_size_aligned (DR_REF (dr));
5898
5899   bool misalign_ok = TYPE_USER_ALIGN (type) && !is_packed;
5900   if (!misalign_ok)
5901     misalign_ok
5902       = targetm.vectorize.support_vector_misalignment (vmode, type,
5903                                                        DR_MISALIGNMENT (dr),
5904                                                        is_packed);
5905
5906   /* Can't software pipeline the accesses, but can at least do them.  */
5907   if (misalign_ok)
5908     return dr_unaligned_supported;
5909
5910   /* Unsupported.  */
5911   return dr_unaligned_unsupported;
5912 }