re PR tree-optimization/65450 (Unaligned access with -O3 -mtune=k8)
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "hash-set.h"
28 #include "machmode.h"
29 #include "vec.h"
30 #include "double-int.h"
31 #include "input.h"
32 #include "alias.h"
33 #include "symtab.h"
34 #include "wide-int.h"
35 #include "inchash.h"
36 #include "tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tm_p.h"
40 #include "target.h"
41 #include "predict.h"
42 #include "hard-reg-set.h"
43 #include "function.h"
44 #include "dominance.h"
45 #include "cfg.h"
46 #include "basic-block.h"
47 #include "gimple-pretty-print.h"
48 #include "tree-ssa-alias.h"
49 #include "internal-fn.h"
50 #include "tree-eh.h"
51 #include "gimple-expr.h"
52 #include "is-a.h"
53 #include "gimple.h"
54 #include "gimplify.h"
55 #include "gimple-iterator.h"
56 #include "gimplify-me.h"
57 #include "gimple-ssa.h"
58 #include "tree-phinodes.h"
59 #include "ssa-iterators.h"
60 #include "stringpool.h"
61 #include "tree-ssanames.h"
62 #include "tree-ssa-loop-ivopts.h"
63 #include "tree-ssa-loop-manip.h"
64 #include "tree-ssa-loop.h"
65 #include "cfgloop.h"
66 #include "tree-chrec.h"
67 #include "tree-scalar-evolution.h"
68 #include "tree-vectorizer.h"
69 #include "diagnostic-core.h"
70 #include "hash-map.h"
71 #include "plugin-api.h"
72 #include "ipa-ref.h"
73 #include "cgraph.h"
74 /* Need to include rtl.h, expr.h, etc. for optabs. */
75 #include "hashtab.h"
76 #include "rtl.h"
77 #include "flags.h"
78 #include "statistics.h"
79 #include "real.h"
80 #include "fixed-value.h"
81 #include "insn-config.h"
82 #include "expmed.h"
83 #include "dojump.h"
84 #include "explow.h"
85 #include "calls.h"
86 #include "emit-rtl.h"
87 #include "varasm.h"
88 #include "stmt.h"
89 #include "expr.h"
90 #include "insn-codes.h"
91 #include "optabs.h"
92 #include "builtins.h"
93
94 /* Return true if load- or store-lanes optab OPTAB is implemented for
95 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
96
97 static bool
98 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
99 tree vectype, unsigned HOST_WIDE_INT count)
100 {
101 machine_mode mode, array_mode;
102 bool limit_p;
103
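  /* Find an integer mode wide enough to hold COUNT vectors of MODE.  Only
     limit the search to MAX_FIXED_MODE_SIZE when the target does not
     provide a dedicated array mode for this vector mode and count.  */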
104 mode = TYPE_MODE (vectype);
105 limit_p = !targetm.array_mode_supported_p (mode, count);
106 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
107 MODE_INT, limit_p);
108
109 if (array_mode == BLKmode)
110 {
111 if (dump_enabled_p ())
112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
113 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
114 GET_MODE_NAME (mode), count);
115 return false;
116 }
117
118 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
119 {
120 if (dump_enabled_p ())
121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
122 "cannot use %s<%s><%s>\n", name,
123 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
124 return false;
125 }
126
127 if (dump_enabled_p ())
128 dump_printf_loc (MSG_NOTE, vect_location,
129 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
130 GET_MODE_NAME (mode));
131
132 return true;
133 }
134
135
136 /* Return the smallest scalar part of STMT.
137 This is used to determine the vectype of the stmt. We generally set the
138 vectype according to the type of the result (lhs). For stmts whose
139 result-type is different than the type of the arguments (e.g., demotion,
140 promotion), vectype will be reset appropriately (later). Note that we have
141 to visit the smallest datatype in this function, because that determines the
142 VF. If the smallest datatype in the loop is present only as the rhs of a
143 promotion operation - we'd miss it.
144 Such a case, where a variable of this datatype does not appear in the lhs
145 anywhere in the loop, can only occur if it's an invariant: e.g.:
146 'int_x = (int) short_inv', which we'd expect to have been optimized away by
147 invariant motion. However, we cannot rely on invariant motion to always
148 take invariants out of the loop, and so in the case of promotion we also
149 have to check the rhs.
150 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
151 types. */
152
153 tree
154 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
155 HOST_WIDE_INT *rhs_size_unit)
156 {
157 tree scalar_type = gimple_expr_type (stmt);
158 HOST_WIDE_INT lhs, rhs;
159
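  /* Start from the size of the statement's result type; for widening
     operations the right-hand side may use a smaller type, which is
     checked below.  */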
160 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
161
162 if (is_gimple_assign (stmt)
163 && (gimple_assign_cast_p (stmt)
164 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
165 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
166 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
167 {
168 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
169
170 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
171 if (rhs < lhs)
172 scalar_type = rhs_type;
173 }
174
175 *lhs_size_unit = lhs;
176 *rhs_size_unit = rhs;
177 return scalar_type;
178 }
179
180
181 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
182 tested at run-time. Return TRUE if DDR was successfully inserted.
183 Return false if versioning is not supported. */
184
185 static bool
186 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
187 {
188 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
189
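  /* Setting --param vect-max-version-for-alias-checks to zero disables
     run-time alias versioning altogether.  */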
190 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
191 return false;
192
193 if (dump_enabled_p ())
194 {
195 dump_printf_loc (MSG_NOTE, vect_location,
196 "mark for run-time aliasing test between ");
197 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
198 dump_printf (MSG_NOTE, " and ");
199 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
200 dump_printf (MSG_NOTE, "\n");
201 }
202
203 if (optimize_loop_nest_for_size_p (loop))
204 {
205 if (dump_enabled_p ())
206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
207 "versioning not supported when optimizing"
208 " for size.\n");
209 return false;
210 }
211
212 /* FORNOW: We don't support versioning with outer-loop vectorization. */
213 if (loop->inner)
214 {
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
217 "versioning not yet supported for outer-loops.\n");
218 return false;
219 }
220
221 /* FORNOW: We don't support creating runtime alias tests for non-constant
222 step. */
223 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
224 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
225 {
226 if (dump_enabled_p ())
227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
228 "versioning not yet supported for non-constant "
229 "step\n");
230 return false;
231 }
232
233 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
234 return true;
235 }
236
237
238 /* Function vect_analyze_data_ref_dependence.
239
240 Return TRUE if there (might) exist a dependence between a memory-reference
241 DRA and a memory-reference DRB. When versioning for alias may check a
242 dependence at run-time, return FALSE. Adjust *MAX_VF according to
243 the data dependence. */
244
245 static bool
246 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
247 loop_vec_info loop_vinfo, int *max_vf)
248 {
249 unsigned int i;
250 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
251 struct data_reference *dra = DDR_A (ddr);
252 struct data_reference *drb = DDR_B (ddr);
253 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
254 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
255 lambda_vector dist_v;
256 unsigned int loop_depth;
257
258 /* In loop analysis all data references should be vectorizable. */
259 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
260 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
261 gcc_unreachable ();
262
263 /* Independent data accesses. */
264 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
265 return false;
266
267 if (dra == drb
268 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
269 return false;
270
271 /* Even if we have an anti-dependence then, as the vectorized loop covers at
272 least two scalar iterations, there is always also a true dependence.
273 As the vectorizer does not re-order loads and stores we can ignore
274 the anti-dependence if TBAA can disambiguate both DRs similar to the
275 case with known negative distance anti-dependences (positive
276 distance anti-dependences would violate TBAA constraints). */
277 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
278 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
279 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
280 get_alias_set (DR_REF (drb))))
281 return false;
282
283 /* Unknown data dependence. */
284 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
285 {
286 /* If user asserted safelen consecutive iterations can be
287 executed concurrently, assume independence. */
288 if (loop->safelen >= 2)
289 {
290 if (loop->safelen < *max_vf)
291 *max_vf = loop->safelen;
292 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
293 return false;
294 }
295
296 if (STMT_VINFO_GATHER_P (stmtinfo_a)
297 || STMT_VINFO_GATHER_P (stmtinfo_b))
298 {
299 if (dump_enabled_p ())
300 {
301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
302 "versioning for alias not supported for: "
303 "can't determine dependence between ");
304 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
305 DR_REF (dra));
306 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
307 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
308 DR_REF (drb));
309 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
310 }
311 return true;
312 }
313
314 if (dump_enabled_p ())
315 {
316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
317 "versioning for alias required: "
318 "can't determine dependence between ");
319 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
320 DR_REF (dra));
321 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
322 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
323 DR_REF (drb));
324 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
325 }
326
327 /* Add to list of ddrs that need to be tested at run-time. */
328 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
329 }
330
331 /* Known data dependence. */
332 if (DDR_NUM_DIST_VECTS (ddr) == 0)
333 {
334 /* If user asserted safelen consecutive iterations can be
335 executed concurrently, assume independence. */
336 if (loop->safelen >= 2)
337 {
338 if (loop->safelen < *max_vf)
339 *max_vf = loop->safelen;
340 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
341 return false;
342 }
343
344 if (STMT_VINFO_GATHER_P (stmtinfo_a)
345 || STMT_VINFO_GATHER_P (stmtinfo_b))
346 {
347 if (dump_enabled_p ())
348 {
349 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
350 "versioning for alias not supported for: "
351 "bad dist vector for ");
352 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
353 DR_REF (dra));
354 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
355 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
356 DR_REF (drb));
357 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
358 }
359 return true;
360 }
361
362 if (dump_enabled_p ())
363 {
364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
365 "versioning for alias required: "
366 "bad dist vector for ");
367 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
368 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
369 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
370 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
371 }
372 /* Add to list of ddrs that need to be tested at run-time. */
373 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
374 }
375
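  /* Inspect the distance in the dimension of LOOP for every recorded
     distance vector.  */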
376 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
377 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
378 {
379 int dist = dist_v[loop_depth];
380
381 if (dump_enabled_p ())
382 dump_printf_loc (MSG_NOTE, vect_location,
383 "dependence distance = %d.\n", dist);
384
385 if (dist == 0)
386 {
387 if (dump_enabled_p ())
388 {
389 dump_printf_loc (MSG_NOTE, vect_location,
390 "dependence distance == 0 between ");
391 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
392 dump_printf (MSG_NOTE, " and ");
393 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
394 dump_printf (MSG_NOTE, "\n");
395 }
396
397 /* When we perform grouped accesses and perform implicit CSE
398 by detecting equal accesses and doing disambiguation with
399 runtime alias tests like for
400 .. = a[i];
401 .. = a[i+1];
402 a[i] = ..;
403 a[i+1] = ..;
404 *p = ..;
405 .. = a[i];
406 .. = a[i+1];
407 where we will end up loading { a[i], a[i+1] } once, make
408 sure that inserting group loads before the first load and
409 stores after the last store will do the right thing.
410 Similar for groups like
411 a[i] = ...;
412 ... = a[i];
413 a[i+1] = ...;
414 where loads from the group interleave with the store. */
415 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
416 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
417 {
418 gimple earlier_stmt;
419 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
420 if (DR_IS_WRITE
421 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
422 {
423 if (dump_enabled_p ())
424 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
425 "READ_WRITE dependence in interleaving."
426 "\n");
427 return true;
428 }
429 }
430
431 continue;
432 }
433
434 if (dist > 0 && DDR_REVERSED_P (ddr))
435 {
436 /* If DDR_REVERSED_P the order of the data-refs in DDR was
437 reversed (to make distance vector positive), and the actual
438 distance is negative. */
439 if (dump_enabled_p ())
440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
441 "dependence distance negative.\n");
442 /* Record a negative dependence distance to later limit the
443 amount of stmt copying / unrolling we can perform.
444 Only need to handle read-after-write dependence. */
445 if (DR_IS_READ (drb)
446 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
447 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
448 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
449 continue;
450 }
451
452 if (abs (dist) >= 2
453 && abs (dist) < *max_vf)
454 {
455 /* The dependence distance requires reduction of the maximal
456 vectorization factor. */
457 *max_vf = abs (dist);
458 if (dump_enabled_p ())
459 dump_printf_loc (MSG_NOTE, vect_location,
460 "adjusting maximal vectorization factor to %i\n",
461 *max_vf);
462 }
463
464 if (abs (dist) >= *max_vf)
465 {
466 /* Dependence distance does not create dependence, as far as
467 vectorization is concerned, in this case. */
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "dependence distance >= VF.\n");
471 continue;
472 }
473
474 if (dump_enabled_p ())
475 {
476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477 "not vectorized, possible dependence "
478 "between data-refs ");
479 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
480 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
481 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
482 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
483 }
484
485 return true;
486 }
487
488 return false;
489 }
490
491 /* Function vect_analyze_data_ref_dependences.
492
493 Examine all the data references in the loop, and make sure there do not
494 exist any data dependences between them. Set *MAX_VF according to
495 the maximum vectorization factor the data dependences allow. */
496
497 bool
498 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
499 {
500 unsigned int i;
501 struct data_dependence_relation *ddr;
502
503 if (dump_enabled_p ())
504 dump_printf_loc (MSG_NOTE, vect_location,
505 "=== vect_analyze_data_ref_dependences ===\n");
506
507 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
508 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
509 &LOOP_VINFO_DDRS (loop_vinfo),
510 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
511 return false;
512
513 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
514 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
515 return false;
516
517 return true;
518 }
519
520
521 /* Function vect_slp_analyze_data_ref_dependence.
522
523 Return TRUE if there (might) exist a dependence between a memory-reference
524 DRA and a memory-reference DRB that prohibits SLP vectorization of the
525 containing basic-block; return FALSE if the accesses are independent or
526 the dependence is harmless for basic-block vectorization. */
527
528 static bool
529 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
530 {
531 struct data_reference *dra = DDR_A (ddr);
532 struct data_reference *drb = DDR_B (ddr);
533
534 /* We need to check dependences of statements marked as unvectorizable
535 as well; they can still prohibit vectorization. */
536
537 /* Independent data accesses. */
538 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
539 return false;
540
541 if (dra == drb)
542 return false;
543
544 /* Read-read is OK. */
545 if (DR_IS_READ (dra) && DR_IS_READ (drb))
546 return false;
547
548 /* If dra and drb are part of the same interleaving chain consider
549 them independent. */
550 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
551 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
552 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
553 return false;
554
555 /* Unknown data dependence. */
556 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
557 {
558 if (dump_enabled_p ())
559 {
560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
561 "can't determine dependence between ");
562 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
563 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
564 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
565 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
566 }
567 }
568 else if (dump_enabled_p ())
569 {
570 dump_printf_loc (MSG_NOTE, vect_location,
571 "determined dependence between ");
572 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
573 dump_printf (MSG_NOTE, " and ");
574 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
575 dump_printf (MSG_NOTE, "\n");
576 }
577
578 /* We do not vectorize basic blocks with write-write dependencies. */
579 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
580 return true;
581
582 /* If we have a read-write dependence, check that the load is before the store.
583 When we vectorize basic blocks, the vector load can only be placed before the
584 corresponding scalar load, and the vector store can only be placed after its
585 corresponding scalar store. So the order of the accesses is preserved in
586 case the load is before the store. */
587 gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
588 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
589 {
590 /* That only holds for load-store pairs taking part in vectorization. */
591 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
592 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
593 return false;
594 }
595
596 return true;
597 }
598
599
600 /* Function vect_slp_analyze_data_ref_dependences.
601 
602 Examine all the data references in the basic-block, and make sure there
603 do not exist any data dependences between them. Return TRUE if that is
604 the case, FALSE otherwise. */
605
606 bool
607 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
608 {
609 struct data_dependence_relation *ddr;
610 unsigned int i;
611
612 if (dump_enabled_p ())
613 dump_printf_loc (MSG_NOTE, vect_location,
614 "=== vect_slp_analyze_data_ref_dependences ===\n");
615
616 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
617 &BB_VINFO_DDRS (bb_vinfo),
618 vNULL, true))
619 return false;
620
621 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
622 if (vect_slp_analyze_data_ref_dependence (ddr))
623 return false;
624
625 return true;
626 }
627
628
629 /* Function vect_compute_data_ref_alignment
630
631 Compute the misalignment of the data reference DR.
632
633 Output:
634 1. If during the misalignment computation it is found that the data reference
635 cannot be vectorized then false is returned.
636 2. DR_MISALIGNMENT (DR) is defined.
637
638 FOR NOW: No analysis is actually performed. Misalignment is calculated
639 only for trivial cases. TODO. */
640
641 static bool
642 vect_compute_data_ref_alignment (struct data_reference *dr)
643 {
644 gimple stmt = DR_STMT (dr);
645 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
646 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
647 struct loop *loop = NULL;
648 tree ref = DR_REF (dr);
649 tree vectype;
650 tree base, base_addr;
651 bool base_aligned;
652 tree misalign;
653 tree aligned_to;
654 unsigned HOST_WIDE_INT alignment;
655
656 if (dump_enabled_p ())
657 dump_printf_loc (MSG_NOTE, vect_location,
658 "vect_compute_data_ref_alignment:\n");
659
660 if (loop_vinfo)
661 loop = LOOP_VINFO_LOOP (loop_vinfo);
662
663 /* Initialize misalignment to unknown. */
664 SET_DR_MISALIGNMENT (dr, -1);
665
666 /* Strided loads perform only component accesses, misalignment information
667 is irrelevant for them. */
668 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
669 return true;
670
671 misalign = DR_INIT (dr);
672 aligned_to = DR_ALIGNED_TO (dr);
673 base_addr = DR_BASE_ADDRESS (dr);
674 vectype = STMT_VINFO_VECTYPE (stmt_info);
675
676 /* In case the dataref is in an inner-loop of the loop that is being
677 vectorized (LOOP), we use the base and misalignment information
678 relative to the outer-loop (LOOP). This is ok only if the misalignment
679 stays the same throughout the execution of the inner-loop, which is why
680 we have to check that the stride of the dataref in the inner-loop is
681 divisible by the vector size. */
682 if (loop && nested_in_vect_loop_p (loop, stmt))
683 {
684 tree step = DR_STEP (dr);
685 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
686
687 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
688 {
689 if (dump_enabled_p ())
690 dump_printf_loc (MSG_NOTE, vect_location,
691 "inner step divides the vector-size.\n");
692 misalign = STMT_VINFO_DR_INIT (stmt_info);
693 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
694 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
695 }
696 else
697 {
698 if (dump_enabled_p ())
699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
700 "inner step doesn't divide the vector-size.\n");
701 misalign = NULL_TREE;
702 }
703 }
704
705 /* Similarly, if we're doing basic-block vectorization, we can only use
706 base and misalignment information relative to an innermost loop if the
707 misalignment stays the same throughout the execution of the loop.
708 As above, this is the case if the stride of the dataref is divisible
709 by the vector size. */
710 if (!loop)
711 {
712 tree step = DR_STEP (dr);
713 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
714
715 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
716 {
717 if (dump_enabled_p ())
718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
719 "SLP: step doesn't divide the vector-size.\n");
720 misalign = NULL_TREE;
721 }
722 }
723
724 alignment = TYPE_ALIGN_UNIT (vectype);
725
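  /* If the access is not known to be aligned to at least the vector
     alignment, or the constant offset part is unknown, leave the
     misalignment as unknown.  */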
726 if ((compare_tree_int (aligned_to, alignment) < 0)
727 || !misalign)
728 {
729 if (dump_enabled_p ())
730 {
731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
732 "Unknown alignment for access: ");
733 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
734 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
735 }
736 return true;
737 }
738
739 /* To look at alignment of the base we have to preserve an inner MEM_REF
740 as that carries alignment information of the actual access. */
741 base = ref;
742 while (handled_component_p (base))
743 base = TREE_OPERAND (base, 0);
744 if (TREE_CODE (base) == MEM_REF)
745 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
746 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
747
748 if (get_object_alignment (base) >= TYPE_ALIGN (vectype))
749 base_aligned = true;
750 else
751 base_aligned = false;
752
753 if (!base_aligned)
754 {
755 /* Strip an inner MEM_REF to a bare decl if possible. */
756 if (TREE_CODE (base) == MEM_REF
757 && integer_zerop (TREE_OPERAND (base, 1))
758 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
759 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
760
761 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
762 {
763 if (dump_enabled_p ())
764 {
765 dump_printf_loc (MSG_NOTE, vect_location,
766 "can't force alignment of ref: ");
767 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
768 dump_printf (MSG_NOTE, "\n");
769 }
770 return true;
771 }
772
773 /* Force the alignment of the decl.
774 NOTE: This is the only change to the code we make during
775 the analysis phase, before deciding to vectorize the loop. */
776 if (dump_enabled_p ())
777 {
778 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
779 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
780 dump_printf (MSG_NOTE, "\n");
781 }
782
783 ((dataref_aux *)dr->aux)->base_decl = base;
784 ((dataref_aux *)dr->aux)->base_misaligned = true;
785 }
786
787 /* If this is a backward running DR then first access in the larger
788 vectype actually is N-1 elements before the address in the DR.
789 Adjust misalign accordingly. */
790 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
791 {
792 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
793 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
794 otherwise we wouldn't be here. */
795 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
796 /* PLUS because DR_STEP was negative. */
797 misalign = size_binop (PLUS_EXPR, misalign, offset);
798 }
799
800 SET_DR_MISALIGNMENT (dr,
801 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
802
803 if (dump_enabled_p ())
804 {
805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
806 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
807 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
808 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
809 }
810
811 return true;
812 }
813
814
815 /* Function vect_compute_data_refs_alignment
816
817 Compute the misalignment of data references in the loop.
818 Return FALSE if a data reference is found that cannot be vectorized. */
819
820 static bool
821 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
822 bb_vec_info bb_vinfo)
823 {
824 vec<data_reference_p> datarefs;
825 struct data_reference *dr;
826 unsigned int i;
827
828 if (loop_vinfo)
829 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
830 else
831 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
832
833 FOR_EACH_VEC_ELT (datarefs, i, dr)
834 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
835 && !vect_compute_data_ref_alignment (dr))
836 {
837 if (bb_vinfo)
838 {
839 /* Mark unsupported statement as unvectorizable. */
840 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
841 continue;
842 }
843 else
844 return false;
845 }
846
847 return true;
848 }
849
850
851 /* Function vect_update_misalignment_for_peel
852
853 DR - the data reference whose misalignment is to be adjusted.
854 DR_PEEL - the data reference whose misalignment is being made
855 zero in the vector loop by the peel.
856 NPEEL - the number of iterations in the peel loop if the misalignment
857 of DR_PEEL is known at compile time. */
858
859 static void
860 vect_update_misalignment_for_peel (struct data_reference *dr,
861 struct data_reference *dr_peel, int npeel)
862 {
863 unsigned int i;
864 vec<dr_p> same_align_drs;
865 struct data_reference *current_dr;
866 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
867 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
868 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
869 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
870
871 /* For interleaved data accesses the step in the loop must be multiplied by
872 the size of the interleaving group. */
873 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
874 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
875 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
876 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
877
878 /* It can be assumed that the data refs with the same alignment as dr_peel
879 are aligned in the vector loop. */
880 same_align_drs
881 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
882 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
883 {
884 if (current_dr != dr)
885 continue;
886 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
887 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
888 SET_DR_MISALIGNMENT (dr, 0);
889 return;
890 }
891
892 if (known_alignment_for_access_p (dr)
893 && known_alignment_for_access_p (dr_peel))
894 {
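      /* Both misalignments are known at compile time: adjust DR's
	 misalignment by the NPEEL peeled scalar iterations, modulo the
	 vector alignment.  */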
895 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
896 int misal = DR_MISALIGNMENT (dr);
897 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
898 misal += negative ? -npeel * dr_size : npeel * dr_size;
899 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
900 SET_DR_MISALIGNMENT (dr, misal);
901 return;
902 }
903
904 if (dump_enabled_p ())
905 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
906 SET_DR_MISALIGNMENT (dr, -1);
907 }
908
909
910 /* Function vect_verify_datarefs_alignment
911
912 Return TRUE if all data references in the loop can be
913 handled with respect to alignment. */
914
915 bool
916 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
917 {
918 vec<data_reference_p> datarefs;
919 struct data_reference *dr;
920 enum dr_alignment_support supportable_dr_alignment;
921 unsigned int i;
922
923 if (loop_vinfo)
924 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
925 else
926 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
927
928 FOR_EACH_VEC_ELT (datarefs, i, dr)
929 {
930 gimple stmt = DR_STMT (dr);
931 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
932
933 if (!STMT_VINFO_RELEVANT_P (stmt_info))
934 continue;
935
936 /* For interleaving, only the alignment of the first access matters.
937 Skip statements marked as not vectorizable. */
938 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
939 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
940 || !STMT_VINFO_VECTORIZABLE (stmt_info))
941 continue;
942
943 /* Strided loads perform only component accesses, alignment is
944 irrelevant for them. */
945 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
946 continue;
947
948 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
949 if (!supportable_dr_alignment)
950 {
951 if (dump_enabled_p ())
952 {
953 if (DR_IS_READ (dr))
954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
955 "not vectorized: unsupported unaligned load.");
956 else
957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
958 "not vectorized: unsupported unaligned "
959 "store.");
960
961 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
962 DR_REF (dr));
963 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
964 }
965 return false;
966 }
967 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
968 dump_printf_loc (MSG_NOTE, vect_location,
969 "Vectorizing an unaligned access.\n");
970 }
971 return true;
972 }
973
974 /* Given a memory reference EXP, return whether its alignment is less
975 than its size. */
976
977 static bool
978 not_size_aligned (tree exp)
979 {
980 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
981 return true;
982
983 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
984 > get_object_alignment (exp));
985 }
986
987 /* Function vector_alignment_reachable_p
988
989 Return true if vector alignment for DR is reachable by peeling
990 a few loop iterations. Return false otherwise. */
991
992 static bool
993 vector_alignment_reachable_p (struct data_reference *dr)
994 {
995 gimple stmt = DR_STMT (dr);
996 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
997 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
998
999 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1000 {
1001 /* For interleaved access we peel only if the number of iterations in
1002 the prolog loop ({VF - misalignment}) is a multiple of the
1003 number of interleaved accesses. */
1004 int elem_size, mis_in_elements;
1005 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1006
1007 /* FORNOW: handle only known alignment. */
1008 if (!known_alignment_for_access_p (dr))
1009 return false;
1010
1011 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1012 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1013
1014 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1015 return false;
1016 }
1017
1018 /* If misalignment is known at compile time then allow peeling
1019 only if natural alignment is reachable through peeling. */
1020 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1021 {
1022 HOST_WIDE_INT elmsize =
1023 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1024 if (dump_enabled_p ())
1025 {
1026 dump_printf_loc (MSG_NOTE, vect_location,
1027 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1028 dump_printf (MSG_NOTE,
1029 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1030 }
1031 if (DR_MISALIGNMENT (dr) % elmsize)
1032 {
1033 if (dump_enabled_p ())
1034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1035 "data size does not divide the misalignment.\n");
1036 return false;
1037 }
1038 }
1039
1040 if (!known_alignment_for_access_p (dr))
1041 {
1042 tree type = TREE_TYPE (DR_REF (dr));
1043 bool is_packed = not_size_aligned (DR_REF (dr));
1044 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1046 "Unknown misalignment, is_packed = %d\n",is_packed);
1047 if ((TYPE_USER_ALIGN (type) && !is_packed)
1048 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1049 return true;
1050 else
1051 return false;
1052 }
1053
1054 return true;
1055 }
1056
1057
1058 /* Calculate the cost of the memory access represented by DR. */
1059
1060 static void
1061 vect_get_data_access_cost (struct data_reference *dr,
1062 unsigned int *inside_cost,
1063 unsigned int *outside_cost,
1064 stmt_vector_for_cost *body_cost_vec)
1065 {
1066 gimple stmt = DR_STMT (dr);
1067 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1068 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1069 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1070 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1071 int ncopies = vf / nunits;
1072
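  /* Each scalar access in the loop becomes VF / NUNITS vector accesses;
     add the corresponding load or store cost.  */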
1073 if (DR_IS_READ (dr))
1074 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1075 NULL, body_cost_vec, false);
1076 else
1077 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1078
1079 if (dump_enabled_p ())
1080 dump_printf_loc (MSG_NOTE, vect_location,
1081 "vect_get_data_access_cost: inside_cost = %d, "
1082 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1083 }
1084
1085
1086 /* Insert DR into peeling hash table with NPEEL as key. */
1087
1088 static void
1089 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1090 int npeel)
1091 {
1092 struct _vect_peel_info elem, *slot;
1093 _vect_peel_info **new_slot;
1094 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1095
1096 elem.npeel = npeel;
1097 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1098 if (slot)
1099 slot->count++;
1100 else
1101 {
1102 slot = XNEW (struct _vect_peel_info);
1103 slot->npeel = npeel;
1104 slot->dr = dr;
1105 slot->count = 1;
1106 new_slot
1107 = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1108 *new_slot = slot;
1109 }
1110
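  /* If this misalignment is unsupported, heavily weight the entry so that
     peeling for this data reference is preferred when no cost model
     limits the choice.  */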
1111 if (!supportable_dr_alignment
1112 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1113 slot->count += VECT_MAX_COST;
1114 }
1115
1116
1117 /* Traverse peeling hash table to find peeling option that aligns maximum
1118 number of data accesses. */
1119
1120 int
1121 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1122 _vect_peel_extended_info *max)
1123 {
1124 vect_peel_info elem = *slot;
1125
1126 if (elem->count > max->peel_info.count
1127 || (elem->count == max->peel_info.count
1128 && max->peel_info.npeel > elem->npeel))
1129 {
1130 max->peel_info.npeel = elem->npeel;
1131 max->peel_info.count = elem->count;
1132 max->peel_info.dr = elem->dr;
1133 }
1134
1135 return 1;
1136 }
1137
1138
1139 /* Traverse peeling hash table and calculate cost for each peeling option.
1140 Find the one with the lowest cost. */
1141
1142 int
1143 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1144 _vect_peel_extended_info *min)
1145 {
1146 vect_peel_info elem = *slot;
1147 int save_misalignment, dummy;
1148 unsigned int inside_cost = 0, outside_cost = 0, i;
1149 gimple stmt = DR_STMT (elem->dr);
1150 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1151 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1152 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1153 struct data_reference *dr;
1154 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1155 int single_iter_cost;
1156
1157 prologue_cost_vec.create (2);
1158 body_cost_vec.create (2);
1159 epilogue_cost_vec.create (2);
1160
1161 FOR_EACH_VEC_ELT (datarefs, i, dr)
1162 {
1163 stmt = DR_STMT (dr);
1164 stmt_info = vinfo_for_stmt (stmt);
1165 /* For interleaving, only the alignment of the first access
1166 matters. */
1167 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1168 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1169 continue;
1170
1171 save_misalignment = DR_MISALIGNMENT (dr);
1172 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1173 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1174 &body_cost_vec);
1175 SET_DR_MISALIGNMENT (dr, save_misalignment);
1176 }
1177
1178 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1179 outside_cost += vect_get_known_peeling_cost
1180 (loop_vinfo, elem->npeel, &dummy,
1181 /* ??? We use this cost as number of stmts with scalar_stmt cost,
1182 thus divide by that. This introduces rounding errors, thus better
1183 introduce a new cost kind (raw_cost? scalar_iter_cost?). */
1184 single_iter_cost / vect_get_stmt_cost (scalar_stmt),
1185 &prologue_cost_vec, &epilogue_cost_vec);
1186
1187 /* Prologue and epilogue costs are added to the target model later.
1188 These costs depend only on the scalar iteration cost, the
1189 number of peeling iterations finally chosen, and the number of
1190 misaligned statements. So discard the information found here. */
1191 prologue_cost_vec.release ();
1192 epilogue_cost_vec.release ();
1193
1194 if (inside_cost < min->inside_cost
1195 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1196 {
1197 min->inside_cost = inside_cost;
1198 min->outside_cost = outside_cost;
1199 min->body_cost_vec.release ();
1200 min->body_cost_vec = body_cost_vec;
1201 min->peel_info.dr = elem->dr;
1202 min->peel_info.npeel = elem->npeel;
1203 }
1204 else
1205 body_cost_vec.release ();
1206
1207 return 1;
1208 }
1209
1210
1211 /* Choose best peeling option by traversing peeling hash table and either
1212 choosing an option with the lowest cost (if cost model is enabled) or the
1213 option that aligns as many accesses as possible. */
1214
1215 static struct data_reference *
1216 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1217 unsigned int *npeel,
1218 stmt_vector_for_cost *body_cost_vec)
1219 {
1220 struct _vect_peel_extended_info res;
1221
1222 res.peel_info.dr = NULL;
1223 res.body_cost_vec = stmt_vector_for_cost ();
1224
1225 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1226 {
1227 res.inside_cost = INT_MAX;
1228 res.outside_cost = INT_MAX;
1229 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1230 ->traverse <_vect_peel_extended_info *,
1231 vect_peeling_hash_get_lowest_cost> (&res);
1232 }
1233 else
1234 {
1235 res.peel_info.count = 0;
1236 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1237 ->traverse <_vect_peel_extended_info *,
1238 vect_peeling_hash_get_most_frequent> (&res);
1239 }
1240
1241 *npeel = res.peel_info.npeel;
1242 *body_cost_vec = res.body_cost_vec;
1243 return res.peel_info.dr;
1244 }
1245
1246
1247 /* Function vect_enhance_data_refs_alignment
1248
1249 This pass will use loop versioning and loop peeling in order to enhance
1250 the alignment of data references in the loop.
1251
1252 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1253 original loop is to be vectorized. Any other loops that are created by
1254 the transformations performed in this pass are not supposed to be
1255 vectorized. This restriction will be relaxed.
1256
1257 This pass will require a cost model to guide it whether to apply peeling
1258 or versioning or a combination of the two. For example, the scheme that
1259 Intel uses when given a loop with several memory accesses is as follows:
1260 choose one memory access ('p') whose alignment you want to force by
1261 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1262 other accesses are not necessarily aligned, or (2) use loop versioning to
1263 generate one loop in which all accesses are aligned, and another loop in
1264 which only 'p' is necessarily aligned.
1265
1266 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1267 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1268 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1269
1270 Devising a cost model is the most critical aspect of this work. It will
1271 guide us on which access to peel for, whether to use loop versioning, how
1272 many versions to create, etc. The cost model will probably consist of
1273 generic considerations as well as target specific considerations (on
1274 powerpc for example, misaligned stores are more painful than misaligned
1275 loads).
1276
1277 Here are the general steps involved in alignment enhancements:
1278
1279 -- original loop, before alignment analysis:
1280 for (i=0; i<N; i++){
1281 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1282 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1283 }
1284
1285 -- After vect_compute_data_refs_alignment:
1286 for (i=0; i<N; i++){
1287 x = q[i]; # DR_MISALIGNMENT(q) = 3
1288 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1289 }
1290
1291 -- Possibility 1: we do loop versioning:
1292 if (p is aligned) {
1293 for (i=0; i<N; i++){ # loop 1A
1294 x = q[i]; # DR_MISALIGNMENT(q) = 3
1295 p[i] = y; # DR_MISALIGNMENT(p) = 0
1296 }
1297 }
1298 else {
1299 for (i=0; i<N; i++){ # loop 1B
1300 x = q[i]; # DR_MISALIGNMENT(q) = 3
1301 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1302 }
1303 }
1304
1305 -- Possibility 2: we do loop peeling:
1306 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1307 x = q[i];
1308 p[i] = y;
1309 }
1310 for (i = 3; i < N; i++){ # loop 2A
1311 x = q[i]; # DR_MISALIGNMENT(q) = 0
1312 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1313 }
1314
1315 -- Possibility 3: combination of loop peeling and versioning:
1316 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1317 x = q[i];
1318 p[i] = y;
1319 }
1320 if (p is aligned) {
1321 for (i = 3; i<N; i++){ # loop 3A
1322 x = q[i]; # DR_MISALIGNMENT(q) = 0
1323 p[i] = y; # DR_MISALIGNMENT(p) = 0
1324 }
1325 }
1326 else {
1327 for (i = 3; i<N; i++){ # loop 3B
1328 x = q[i]; # DR_MISALIGNMENT(q) = 0
1329 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1330 }
1331 }
1332
1333 These loops are later passed to loop_transform to be vectorized. The
1334 vectorizer will use the alignment information to guide the transformation
1335 (whether to generate regular loads/stores, or with special handling for
1336 misalignment). */
1337
1338 bool
1339 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1340 {
1341 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1342 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1343 enum dr_alignment_support supportable_dr_alignment;
1344 struct data_reference *dr0 = NULL, *first_store = NULL;
1345 struct data_reference *dr;
1346 unsigned int i, j;
1347 bool do_peeling = false;
1348 bool do_versioning = false;
1349 bool stat;
1350 gimple stmt;
1351 stmt_vec_info stmt_info;
1352 unsigned int npeel = 0;
1353 bool all_misalignments_unknown = true;
1354 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1355 unsigned possible_npeel_number = 1;
1356 tree vectype;
1357 unsigned int nelements, mis, same_align_drs_max = 0;
1358 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1359
1360 if (dump_enabled_p ())
1361 dump_printf_loc (MSG_NOTE, vect_location,
1362 "=== vect_enhance_data_refs_alignment ===\n");
1363
1364 /* While cost model enhancements are expected in the future, the high level
1365 view of the code at this time is as follows:
1366
1367 A) If there is a misaligned access then see if peeling to align
1368 this access can make all data references satisfy
1369 vect_supportable_dr_alignment. If so, update data structures
1370 as needed and return true.
1371
1372 B) If peeling wasn't possible and there is a data reference with an
1373 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1374 then see if loop versioning checks can be used to make all data
1375 references satisfy vect_supportable_dr_alignment. If so, update
1376 data structures as needed and return true.
1377
1378 C) If neither peeling nor versioning were successful then return false if
1379 any data reference does not satisfy vect_supportable_dr_alignment.
1380
1381 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1382
1383 Note, Possibility 3 above (which is peeling and versioning together) is not
1384 being done at this time. */
1385
1386 /* (1) Peeling to force alignment. */
1387
1388 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1389 Considerations:
1390 + How many accesses will become aligned due to the peeling
1391 - How many accesses will become unaligned due to the peeling,
1392 and the cost of misaligned accesses.
1393 - The cost of peeling (the extra runtime checks, the increase
1394 in code size). */
1395
1396 FOR_EACH_VEC_ELT (datarefs, i, dr)
1397 {
1398 stmt = DR_STMT (dr);
1399 stmt_info = vinfo_for_stmt (stmt);
1400
1401 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1402 continue;
1403
1404 /* For interleaving, only the alignment of the first access
1405 matters. */
1406 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1407 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1408 continue;
1409
1410 /* For invariant accesses there is nothing to enhance. */
1411 if (integer_zerop (DR_STEP (dr)))
1412 continue;
1413
1414 /* Strided loads perform only component accesses, alignment is
1415 irrelevant for them. */
1416 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1417 continue;
1418
1419 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1420 do_peeling = vector_alignment_reachable_p (dr);
1421 if (do_peeling)
1422 {
1423 if (known_alignment_for_access_p (dr))
1424 {
1425 unsigned int npeel_tmp;
1426 bool negative = tree_int_cst_compare (DR_STEP (dr),
1427 size_zero_node) < 0;
1428
1429 /* Save info about DR in the hash table. */
1430 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1431 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1432 = new hash_table<peel_info_hasher> (1);
1433
1434 vectype = STMT_VINFO_VECTYPE (stmt_info);
1435 nelements = TYPE_VECTOR_SUBPARTS (vectype);
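	  /* Express the misalignment in elements and compute how many
	     scalar iterations are needed to reach an aligned address
	     (for a negative step the access walks backwards through
	     memory).  */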
1436 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1437 TREE_TYPE (DR_REF (dr))));
1438 npeel_tmp = (negative
1439 ? (mis - nelements) : (nelements - mis))
1440 & (nelements - 1);
1441
1442 /* For multiple types, it is possible that the bigger type access
1443 will have more than one peeling option. E.g., a loop with two
1444 types: one of size (vector size / 4), and the other one of
1445 size (vector size / 8). The vectorization factor will be 8. If both
1446 accesses are misaligned by 3, the first one needs one scalar
1447 iteration to be aligned, and the second one needs 5. But the
1448 first one will also be aligned by peeling 5 scalar
1449 iterations, and in that case both accesses will be aligned.
1450 Hence, except for the immediate peeling amount, we also want
1451 to try to add a full vector size, as long as we don't exceed the
1452 vectorization factor.
1453 We do this automatically for the cost model, since we calculate the
1454 cost for every peeling option. */
1455 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1456 possible_npeel_number = vf / nelements;
1457
1458 /* Handle the aligned case. We may decide to align some other
1459 access, making DR unaligned. */
1460 if (DR_MISALIGNMENT (dr) == 0)
1461 {
1462 npeel_tmp = 0;
1463 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1464 possible_npeel_number++;
1465 }
1466
1467 for (j = 0; j < possible_npeel_number; j++)
1468 {
1469 gcc_assert (npeel_tmp <= vf);
1470 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1471 npeel_tmp += nelements;
1472 }
1473
1474 all_misalignments_unknown = false;
1475 /* Data-ref that was chosen for the case that all the
1476 misalignments are unknown is not relevant anymore, since we
1477 have a data-ref with known alignment. */
1478 dr0 = NULL;
1479 }
1480 else
1481 {
1482 /* If we don't know any misalignment values, we prefer
1483 peeling for the data-ref that has the maximum number of data-refs
1484 with the same alignment, unless the target prefers to align
1485 stores over loads. */
1486 if (all_misalignments_unknown)
1487 {
1488 unsigned same_align_drs
1489 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1490 if (!dr0
1491 || same_align_drs_max < same_align_drs)
1492 {
1493 same_align_drs_max = same_align_drs;
1494 dr0 = dr;
1495 }
1496 /* For data-refs with the same number of related
1497 accesses prefer the one where the misalign
1498 computation will be invariant in the outermost loop. */
1499 else if (same_align_drs_max == same_align_drs)
1500 {
1501 struct loop *ivloop0, *ivloop;
1502 ivloop0 = outermost_invariant_loop_for_expr
1503 (loop, DR_BASE_ADDRESS (dr0));
1504 ivloop = outermost_invariant_loop_for_expr
1505 (loop, DR_BASE_ADDRESS (dr));
1506 if ((ivloop && !ivloop0)
1507 || (ivloop && ivloop0
1508 && flow_loop_nested_p (ivloop, ivloop0)))
1509 dr0 = dr;
1510 }
1511
1512 if (!first_store && DR_IS_WRITE (dr))
1513 first_store = dr;
1514 }
1515
1516 /* If there are both known and unknown misaligned accesses in the
1517 loop, we choose peeling amount according to the known
1518 accesses. */
1519 if (!supportable_dr_alignment)
1520 {
1521 dr0 = dr;
1522 if (!first_store && DR_IS_WRITE (dr))
1523 first_store = dr;
1524 }
1525 }
1526 }
1527 else
1528 {
1529 if (!aligned_access_p (dr))
1530 {
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533 "vector alignment may not be reachable\n");
1534 break;
1535 }
1536 }
1537 }
1538
1539 /* Check if we can possibly peel the loop. */
1540 if (!vect_can_advance_ivs_p (loop_vinfo)
1541 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1542 do_peeling = false;
1543
1544 /* If we don't know how many times the peeling loop will run,
1545 assume it will run VF-1 times and disable peeling if the remaining
1546 iters are less than the vectorization factor. */
1547 if (do_peeling
1548 && all_misalignments_unknown
1549 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1550 && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1551 < 2 * (unsigned) LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1))
1552 do_peeling = false;
1553
1554 if (do_peeling
1555 && all_misalignments_unknown
1556 && vect_supportable_dr_alignment (dr0, false))
1557 {
1558 /* Check if the target requires us to prefer stores over loads, i.e., if
1559 misaligned stores are more expensive than misaligned loads (taking
1560 drs with the same alignment into account). */
1561 if (first_store && DR_IS_READ (dr0))
1562 {
1563 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1564 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1565 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1566 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1567 stmt_vector_for_cost dummy;
1568 dummy.create (2);
1569
1570 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1571 &dummy);
1572 vect_get_data_access_cost (first_store, &store_inside_cost,
1573 &store_outside_cost, &dummy);
1574
1575 dummy.release ();
1576
1577 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1578 aligning the load DR0). */
1579 load_inside_penalty = store_inside_cost;
1580 load_outside_penalty = store_outside_cost;
1581 for (i = 0;
1582 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1583 DR_STMT (first_store))).iterate (i, &dr);
1584 i++)
1585 if (DR_IS_READ (dr))
1586 {
1587 load_inside_penalty += load_inside_cost;
1588 load_outside_penalty += load_outside_cost;
1589 }
1590 else
1591 {
1592 load_inside_penalty += store_inside_cost;
1593 load_outside_penalty += store_outside_cost;
1594 }
1595
1596 /* Calculate the penalty for leaving DR0 unaligned (by
1597 aligning the FIRST_STORE). */
1598 store_inside_penalty = load_inside_cost;
1599 store_outside_penalty = load_outside_cost;
1600 for (i = 0;
1601 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1602 DR_STMT (dr0))).iterate (i, &dr);
1603 i++)
1604 if (DR_IS_READ (dr))
1605 {
1606 store_inside_penalty += load_inside_cost;
1607 store_outside_penalty += load_outside_cost;
1608 }
1609 else
1610 {
1611 store_inside_penalty += store_inside_cost;
1612 store_outside_penalty += store_outside_cost;
1613 }
1614
1615 if (load_inside_penalty > store_inside_penalty
1616 || (load_inside_penalty == store_inside_penalty
1617 && load_outside_penalty > store_outside_penalty))
1618 dr0 = first_store;
1619 }
1620
1621 /* In case there are only loads with different unknown misalignments, use
1622 peeling only if it may help to align other accesses in the loop. */
1623 if (!first_store
1624 && !STMT_VINFO_SAME_ALIGN_REFS (
1625 vinfo_for_stmt (DR_STMT (dr0))).length ()
1626 && vect_supportable_dr_alignment (dr0, false)
1627 != dr_unaligned_supported)
1628 do_peeling = false;
1629 }
1630
1631 if (do_peeling && !dr0)
1632 {
1633 /* Peeling is possible, but no data access strictly requires alignment
1634 to be vectorizable. So we try to choose the best possible peeling. */
1635
1636 /* We should get here only if there are drs with known misalignment. */
1637 gcc_assert (!all_misalignments_unknown);
1638
1639 /* Choose the best peeling from the hash table. */
1640 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1641 &body_cost_vec);
1642 if (!dr0 || !npeel)
1643 do_peeling = false;
1644
1645 /* If peeling by npeel will result in a remaining loop not iterating
1646 enough to be vectorized then do not peel. */
1647 if (do_peeling
1648 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1649 && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1650 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + npeel))
1651 do_peeling = false;
1652 }
1653
1654 if (do_peeling)
1655 {
1656 stmt = DR_STMT (dr0);
1657 stmt_info = vinfo_for_stmt (stmt);
1658 vectype = STMT_VINFO_VECTYPE (stmt_info);
1659 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1660
1661 if (known_alignment_for_access_p (dr0))
1662 {
1663 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1664 size_zero_node) < 0;
1665 if (!npeel)
1666 {
1667 /* Since it's known at compile time, compute the number of
1668 iterations in the peeled loop (the peeling factor) for use in
1669 updating DR_MISALIGNMENT values. The peeling factor is the
1670 vectorization factor minus the misalignment as an element
1671 count. */
1672 mis = DR_MISALIGNMENT (dr0);
1673 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1674 npeel = ((negative ? mis - nelements : nelements - mis)
1675 & (nelements - 1));
1676 }
1677
1678 /* For interleaved data access every iteration accesses all the
1679 members of the group, therefore we divide the number of iterations
1680 by the group size. */
1681 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1682 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1683 npeel /= GROUP_SIZE (stmt_info);
1684
1685 if (dump_enabled_p ())
1686 dump_printf_loc (MSG_NOTE, vect_location,
1687 "Try peeling by %d\n", npeel);
1688 }
1689
1690 /* Ensure that all data refs can be vectorized after the peel. */
1691 FOR_EACH_VEC_ELT (datarefs, i, dr)
1692 {
1693 int save_misalignment;
1694
1695 if (dr == dr0)
1696 continue;
1697
1698 stmt = DR_STMT (dr);
1699 stmt_info = vinfo_for_stmt (stmt);
1700 /* For interleaving, only the alignment of the first access
1701 matters. */
1702 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1703 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1704 continue;
1705
1706 /* Strided loads perform only component accesses; alignment is
1707 irrelevant for them. */
1708 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1709 continue;
1710
1711 save_misalignment = DR_MISALIGNMENT (dr);
1712 vect_update_misalignment_for_peel (dr, dr0, npeel);
1713 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1714 SET_DR_MISALIGNMENT (dr, save_misalignment);
1715
1716 if (!supportable_dr_alignment)
1717 {
1718 do_peeling = false;
1719 break;
1720 }
1721 }
1722
1723 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1724 {
1725 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1726 if (!stat)
1727 do_peeling = false;
1728 else
1729 {
1730 body_cost_vec.release ();
1731 return stat;
1732 }
1733 }
1734
1735 if (do_peeling)
1736 {
1737 unsigned max_allowed_peel
1738 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1739 if (max_allowed_peel != (unsigned)-1)
1740 {
1741 unsigned max_peel = npeel;
1742 if (max_peel == 0)
1743 {
1744 gimple dr_stmt = DR_STMT (dr0);
1745 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1746 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1747 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1748 }
1749 if (max_peel > max_allowed_peel)
1750 {
1751 do_peeling = false;
1752 if (dump_enabled_p ())
1753 dump_printf_loc (MSG_NOTE, vect_location,
1754 "Disable peeling, max peels reached: %d\n", max_peel);
1755 }
1756 }
1757 }
1758
1759 if (do_peeling)
1760 {
1761 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1762 If the misalignment of DR_i is identical to that of dr0 then set
1763 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1764 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1765 by the peeling factor times the element size of DR_i (MOD the
1766 vectorization factor times the size). Otherwise, the
1767 misalignment of DR_i must be set to unknown. */
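/* Sketch of the rule above with assumed numbers (not from the source): if
   the vectorization factor times the element size is 16 bytes, npeel == 2
   and DR_i has 4-byte elements with a known misalignment of 4, the updated
   misalignment is (4 + 2 * 4) % 16 == 12.  */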
1768 FOR_EACH_VEC_ELT (datarefs, i, dr)
1769 if (dr != dr0)
1770 vect_update_misalignment_for_peel (dr, dr0, npeel);
1771
1772 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1773 if (npeel)
1774 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1775 else
1776 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1777 = DR_MISALIGNMENT (dr0);
1778 SET_DR_MISALIGNMENT (dr0, 0);
1779 if (dump_enabled_p ())
1780 {
1781 dump_printf_loc (MSG_NOTE, vect_location,
1782 "Alignment of access forced using peeling.\n");
1783 dump_printf_loc (MSG_NOTE, vect_location,
1784 "Peeling for alignment will be applied.\n");
1785 }
1786 /* The inside-loop cost will be accounted for in vectorizable_load
1787 and vectorizable_store correctly with adjusted alignments.
1788 Drop the body_cost_vec on the floor here. */
1789 body_cost_vec.release ();
1790
1791 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1792 gcc_assert (stat);
1793 return stat;
1794 }
1795 }
1796
1797 body_cost_vec.release ();
1798
1799 /* (2) Versioning to force alignment. */
1800
1801 /* Try versioning if:
1802 1) optimize loop for speed
1803 2) there is at least one unsupported misaligned data ref with an unknown
1804 misalignment, and
1805 3) all misaligned data refs with a known misalignment are supported, and
1806 4) the number of runtime alignment checks is within reason. */
1807
1808 do_versioning =
1809 optimize_loop_nest_for_speed_p (loop)
1810 && (!loop->inner); /* FORNOW */
1811
1812 if (do_versioning)
1813 {
1814 FOR_EACH_VEC_ELT (datarefs, i, dr)
1815 {
1816 stmt = DR_STMT (dr);
1817 stmt_info = vinfo_for_stmt (stmt);
1818
1819 /* For interleaving, only the alignment of the first access
1820 matters. */
1821 if (aligned_access_p (dr)
1822 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1823 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1824 continue;
1825
1826 /* Strided loads perform only component accesses; alignment is
1827 irrelevant for them. */
1828 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1829 continue;
1830
1831 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1832
1833 if (!supportable_dr_alignment)
1834 {
1835 gimple stmt;
1836 int mask;
1837 tree vectype;
1838
1839 if (known_alignment_for_access_p (dr)
1840 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1841 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1842 {
1843 do_versioning = false;
1844 break;
1845 }
1846
1847 stmt = DR_STMT (dr);
1848 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1849 gcc_assert (vectype);
1850
1851 /* The rightmost bits of an aligned address must be zeros.
1852 Construct the mask needed for this test. For example,
1853 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1854 mask must be 15 = 0xf. */
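/* Sketch of how the mask is used later (an assumption about the runtime
   guard emitted elsewhere, not code in this function): the versioned loop
   is guarded by a test equivalent to
     ((addr_1 | addr_2 | ... | addr_n) & mask) == 0
   so one check covers all recorded may-misalign references.  */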
1855 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1856
1857 /* FORNOW: use the same mask to test all potentially unaligned
1858 references in the loop. The vectorizer currently supports
1859 a single vector size, see the reference to
1860 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1861 vectorization factor is computed. */
1862 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1863 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1864 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1865 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1866 DR_STMT (dr));
1867 }
1868 }
1869
1870 /* Versioning requires at least one misaligned data reference. */
1871 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1872 do_versioning = false;
1873 else if (!do_versioning)
1874 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1875 }
1876
1877 if (do_versioning)
1878 {
1879 vec<gimple> may_misalign_stmts
1880 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1881 gimple stmt;
1882
1883 /* It can now be assumed that the data references in the statements
1884 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1885 of the loop being vectorized. */
1886 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1887 {
1888 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1889 dr = STMT_VINFO_DATA_REF (stmt_info);
1890 SET_DR_MISALIGNMENT (dr, 0);
1891 if (dump_enabled_p ())
1892 dump_printf_loc (MSG_NOTE, vect_location,
1893 "Alignment of access forced using versioning.\n");
1894 }
1895
1896 if (dump_enabled_p ())
1897 dump_printf_loc (MSG_NOTE, vect_location,
1898 "Versioning for alignment will be applied.\n");
1899
1900 /* Peeling and versioning can't be done together at this time. */
1901 gcc_assert (! (do_peeling && do_versioning));
1902
1903 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1904 gcc_assert (stat);
1905 return stat;
1906 }
1907
1908 /* This point is reached if neither peeling nor versioning is being done. */
1909 gcc_assert (! (do_peeling || do_versioning));
1910
1911 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1912 return stat;
1913 }
1914
1915
1916 /* Function vect_find_same_alignment_drs.
1917
1918 Update group and alignment relations according to the chosen
1919 vectorization factor. */
1920
1921 static void
1922 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1923 loop_vec_info loop_vinfo)
1924 {
1925 unsigned int i;
1926 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1927 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1928 struct data_reference *dra = DDR_A (ddr);
1929 struct data_reference *drb = DDR_B (ddr);
1930 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1931 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1932 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1933 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1934 lambda_vector dist_v;
1935 unsigned int loop_depth;
1936
1937 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1938 return;
1939
1940 if (dra == drb)
1941 return;
1942
1943 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1944 return;
1945
1946 /* Loop-based vectorization and known data dependence. */
1947 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1948 return;
1949
1950 /* Data-dependence analysis reports a distance vector of zero
1951 for data-references that overlap only in the first iteration
1952 but have different sign step (see PR45764).
1953 So as a sanity check require equal DR_STEP. */
1954 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1955 return;
1956
1957 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1958 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1959 {
1960 int dist = dist_v[loop_depth];
1961
1962 if (dump_enabled_p ())
1963 dump_printf_loc (MSG_NOTE, vect_location,
1964 "dependence distance = %d.\n", dist);
1965
1966 /* Same loop iteration. */
1967 if (dist == 0
1968 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1969 {
1970 /* Two references with distance zero have the same alignment. */
1971 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1972 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1973 if (dump_enabled_p ())
1974 {
1975 dump_printf_loc (MSG_NOTE, vect_location,
1976 "accesses have the same alignment.\n");
1977 dump_printf (MSG_NOTE,
1978 "dependence distance modulo vf == 0 between ");
1979 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1980 dump_printf (MSG_NOTE, " and ");
1981 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1982 dump_printf (MSG_NOTE, "\n");
1983 }
1984 }
1985 }
1986 }
1987
1988
1989 /* Function vect_analyze_data_refs_alignment
1990
1991 Analyze the alignment of the data-references in the loop.
1992 Return FALSE if a data reference is found that cannot be vectorized. */
1993
1994 bool
1995 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1996 bb_vec_info bb_vinfo)
1997 {
1998 if (dump_enabled_p ())
1999 dump_printf_loc (MSG_NOTE, vect_location,
2000 "=== vect_analyze_data_refs_alignment ===\n");
2001
2002 /* Mark groups of data references with same alignment using
2003 data dependence information. */
2004 if (loop_vinfo)
2005 {
2006 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2007 struct data_dependence_relation *ddr;
2008 unsigned int i;
2009
2010 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2011 vect_find_same_alignment_drs (ddr, loop_vinfo);
2012 }
2013
2014 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2015 {
2016 if (dump_enabled_p ())
2017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018 "not vectorized: can't calculate alignment "
2019 "for data ref.\n");
2020 return false;
2021 }
2022
2023 return true;
2024 }
2025
2026
2027 /* Analyze groups of accesses: check that DR belongs to a group of
2028 accesses of legal size, step, etc. Detect gaps, single element
2029 interleaving, and other special cases. Set grouped access info.
2030 Collect groups of strided stores for further use in SLP analysis. */
2031
2032 static bool
2033 vect_analyze_group_access (struct data_reference *dr)
2034 {
2035 tree step = DR_STEP (dr);
2036 tree scalar_type = TREE_TYPE (DR_REF (dr));
2037 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2038 gimple stmt = DR_STMT (dr);
2039 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2040 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2041 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2042 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2043 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2044 bool slp_impossible = false;
2045 struct loop *loop = NULL;
2046
2047 if (loop_vinfo)
2048 loop = LOOP_VINFO_LOOP (loop_vinfo);
2049
2050 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2051 size of the interleaving group (including gaps). */
2052 groupsize = absu_hwi (dr_step) / type_size;
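/* For example (illustrative only): a load of 4-byte elements with
   DR_STEP == 16 gives groupsize == 4, i.e. an interleaving group of four
   elements per scalar iteration, whether or not every slot is accessed.  */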
2053
2054 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2055 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2056 {
2057 /* Check if this DR is part of an interleaving group, and is a single
2058 element of the group that is accessed in the loop. */
2059
2060 /* Gaps are supported only for loads. STEP must be a multiple of the type
2061 size. The size of the group must be a power of 2. */
2062 if (DR_IS_READ (dr)
2063 && (dr_step % type_size) == 0
2064 && groupsize > 0
2065 && exact_log2 (groupsize) != -1)
2066 {
2067 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2068 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2069 if (dump_enabled_p ())
2070 {
2071 dump_printf_loc (MSG_NOTE, vect_location,
2072 "Detected single element interleaving ");
2073 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2074 dump_printf (MSG_NOTE, " step ");
2075 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2076 dump_printf (MSG_NOTE, "\n");
2077 }
2078
2079 if (loop_vinfo)
2080 {
2081 if (dump_enabled_p ())
2082 dump_printf_loc (MSG_NOTE, vect_location,
2083 "Data access with gaps requires scalar "
2084 "epilogue loop\n");
2085 if (loop->inner)
2086 {
2087 if (dump_enabled_p ())
2088 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2089 "Peeling for outer loop is not"
2090 " supported\n");
2091 return false;
2092 }
2093
2094 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2095 }
2096
2097 return true;
2098 }
2099
2100 if (dump_enabled_p ())
2101 {
2102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2103 "not consecutive access ");
2104 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2105 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2106 }
2107
2108 if (bb_vinfo)
2109 {
2110 /* Mark the statement as unvectorizable. */
2111 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2112 return true;
2113 }
2114
2115 return false;
2116 }
2117
2118 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2119 {
2120 /* First stmt in the interleaving chain. Check the chain. */
2121 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2122 struct data_reference *data_ref = dr;
2123 unsigned int count = 1;
2124 tree prev_init = DR_INIT (data_ref);
2125 gimple prev = stmt;
2126 HOST_WIDE_INT diff, gaps = 0;
2127 unsigned HOST_WIDE_INT count_in_bytes;
2128
2129 while (next)
2130 {
2131 /* Skip same data-refs. In case that two or more stmts share
2132 data-ref (supported only for loads), we vectorize only the first
2133 stmt, and the rest get their vectorized loads from the first
2134 one. */
2135 if (!tree_int_cst_compare (DR_INIT (data_ref),
2136 DR_INIT (STMT_VINFO_DATA_REF (
2137 vinfo_for_stmt (next)))))
2138 {
2139 if (DR_IS_WRITE (data_ref))
2140 {
2141 if (dump_enabled_p ())
2142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2143 "Two store stmts share the same dr.\n");
2144 return false;
2145 }
2146
2147 /* For load use the same data-ref load. */
2148 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2149
2150 prev = next;
2151 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2152 continue;
2153 }
2154
2155 prev = next;
2156 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2157
2158 /* All group members have the same STEP by construction. */
2159 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2160
2161 /* Check that the distance between two accesses is equal to the type
2162 size. Otherwise, we have gaps. */
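/* E.g. (illustrative): with 4-byte elements and DR_INITs 0 and 8, the
   diff computed below is (8 - 0) / 4 == 2, i.e. one element of gap
   between the two group members.  */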
2163 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2164 - TREE_INT_CST_LOW (prev_init)) / type_size;
2165 if (diff != 1)
2166 {
2167 /* FORNOW: SLP of accesses with gaps is not supported. */
2168 slp_impossible = true;
2169 if (DR_IS_WRITE (data_ref))
2170 {
2171 if (dump_enabled_p ())
2172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2173 "interleaved store with gaps\n");
2174 return false;
2175 }
2176
2177 gaps += diff - 1;
2178 }
2179
2180 last_accessed_element += diff;
2181
2182 /* Store the gap from the previous member of the group. If there is no
2183 gap in the access, GROUP_GAP is always 1. */
2184 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2185
2186 prev_init = DR_INIT (data_ref);
2187 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2188 /* Count the number of data-refs in the chain. */
2189 count++;
2190 }
2191
2192 /* COUNT is the number of accesses found, we multiply it by the size of
2193 the type to get COUNT_IN_BYTES. */
2194 count_in_bytes = type_size * count;
2195
2196 /* Check that the size of the interleaving (including gaps) is not
2197 greater than STEP. */
2198 if (dr_step != 0
2199 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2200 {
2201 if (dump_enabled_p ())
2202 {
2203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2204 "interleaving size is greater than step for ");
2205 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2206 DR_REF (dr));
2207 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2208 }
2209 return false;
2210 }
2211
2212 /* Check that the size of the interleaving is equal to STEP for stores,
2213 i.e., that there are no gaps. */
2214 if (dr_step != 0
2215 && absu_hwi (dr_step) != count_in_bytes)
2216 {
2217 if (DR_IS_READ (dr))
2218 {
2219 slp_impossible = true;
2220 /* There is a gap after the last load in the group. This gap is a
2221 difference between the groupsize and the number of elements.
2222 When there is no gap, this difference should be 0. */
2223 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2224 }
2225 else
2226 {
2227 if (dump_enabled_p ())
2228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2229 "interleaved store with gaps\n");
2230 return false;
2231 }
2232 }
2233
2234 /* Check that STEP is a multiple of type size. */
2235 if (dr_step != 0
2236 && (dr_step % type_size) != 0)
2237 {
2238 if (dump_enabled_p ())
2239 {
2240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2241 "step is not a multiple of type size: step ");
2242 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2243 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2244 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2245 TYPE_SIZE_UNIT (scalar_type));
2246 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2247 }
2248 return false;
2249 }
2250
2251 if (groupsize == 0)
2252 groupsize = count;
2253
2254 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2255 if (dump_enabled_p ())
2256 dump_printf_loc (MSG_NOTE, vect_location,
2257 "Detected interleaving of size %d\n", (int)groupsize);
2258
2259 /* SLP: create an SLP data structure for every interleaving group of
2260 stores for further analysis in vect_analyze_slp. */
2261 if (DR_IS_WRITE (dr) && !slp_impossible)
2262 {
2263 if (loop_vinfo)
2264 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2265 if (bb_vinfo)
2266 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2267 }
2268
2269 /* There is a gap at the end of the group. */
2270 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2271 {
2272 if (dump_enabled_p ())
2273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2274 "Data access with gaps requires scalar "
2275 "epilogue loop\n");
2276 if (loop->inner)
2277 {
2278 if (dump_enabled_p ())
2279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2280 "Peeling for outer loop is not supported\n");
2281 return false;
2282 }
2283
2284 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2285 }
2286 }
2287
2288 return true;
2289 }
2290
2291
2292 /* Analyze the access pattern of the data-reference DR.
2293 In case of non-consecutive accesses call vect_analyze_group_access() to
2294 analyze groups of accesses. */
2295
2296 static bool
2297 vect_analyze_data_ref_access (struct data_reference *dr)
2298 {
2299 tree step = DR_STEP (dr);
2300 tree scalar_type = TREE_TYPE (DR_REF (dr));
2301 gimple stmt = DR_STMT (dr);
2302 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2303 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2304 struct loop *loop = NULL;
2305
2306 if (loop_vinfo)
2307 loop = LOOP_VINFO_LOOP (loop_vinfo);
2308
2309 if (loop_vinfo && !step)
2310 {
2311 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2313 "bad data-ref access in loop\n");
2314 return false;
2315 }
2316
2317 /* Allow invariant loads in loops that are not nested. */
2318 if (loop_vinfo && integer_zerop (step))
2319 {
2320 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2321 if (nested_in_vect_loop_p (loop, stmt))
2322 {
2323 if (dump_enabled_p ())
2324 dump_printf_loc (MSG_NOTE, vect_location,
2325 "zero step in inner loop of nest\n");
2326 return false;
2327 }
2328 return DR_IS_READ (dr);
2329 }
2330
2331 if (loop && nested_in_vect_loop_p (loop, stmt))
2332 {
2333 /* Interleaved accesses are not yet supported within outer-loop
2334 vectorization for references in the inner-loop. */
2335 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2336
2337 /* For the rest of the analysis we use the outer-loop step. */
2338 step = STMT_VINFO_DR_STEP (stmt_info);
2339 if (integer_zerop (step))
2340 {
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_NOTE, vect_location,
2343 "zero step in outer loop.\n");
2344 if (DR_IS_READ (dr))
2345 return true;
2346 else
2347 return false;
2348 }
2349 }
2350
2351 /* Consecutive? */
2352 if (TREE_CODE (step) == INTEGER_CST)
2353 {
2354 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2355 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2356 || (dr_step < 0
2357 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2358 {
2359 /* Mark that it is not interleaving. */
2360 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2361 return true;
2362 }
2363 }
2364
2365 if (loop && nested_in_vect_loop_p (loop, stmt))
2366 {
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "grouped access in outer loop.\n");
2370 return false;
2371 }
2372
2373 /* Assume this is a DR handled by the non-constant strided load case. */
2374 if (TREE_CODE (step) != INTEGER_CST)
2375 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2376
2377 /* Non-consecutive access - check whether it is part of an interleaving group. */
2378 return vect_analyze_group_access (dr);
2379 }
2380
2381
2382
2383 /* A helper function used in the comparator function to sort data
2384 references. T1 and T2 are two data references to be compared.
2385 The function returns -1, 0, or 1. */
2386
2387 static int
2388 compare_tree (tree t1, tree t2)
2389 {
2390 int i, cmp;
2391 enum tree_code code;
2392 char tclass;
2393
2394 if (t1 == t2)
2395 return 0;
2396 if (t1 == NULL)
2397 return -1;
2398 if (t2 == NULL)
2399 return 1;
2400
2401
2402 if (TREE_CODE (t1) != TREE_CODE (t2))
2403 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2404
2405 code = TREE_CODE (t1);
2406 switch (code)
2407 {
2408 /* For const values, we can just use hash values for comparisons. */
2409 case INTEGER_CST:
2410 case REAL_CST:
2411 case FIXED_CST:
2412 case STRING_CST:
2413 case COMPLEX_CST:
2414 case VECTOR_CST:
2415 {
2416 hashval_t h1 = iterative_hash_expr (t1, 0);
2417 hashval_t h2 = iterative_hash_expr (t2, 0);
2418 if (h1 != h2)
2419 return h1 < h2 ? -1 : 1;
2420 break;
2421 }
2422
2423 case SSA_NAME:
2424 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2425 if (cmp != 0)
2426 return cmp;
2427
2428 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2429 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2430 break;
2431
2432 default:
2433 tclass = TREE_CODE_CLASS (code);
2434
2435 /* For var-decl, we could compare their UIDs. */
2436 if (tclass == tcc_declaration)
2437 {
2438 if (DECL_UID (t1) != DECL_UID (t2))
2439 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2440 break;
2441 }
2442
2443 /* For expressions with operands, compare their operands recursively. */
2444 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2445 {
2446 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2447 if (cmp != 0)
2448 return cmp;
2449 }
2450 }
2451
2452 return 0;
2453 }
2454
2455
2456 /* Compare two data-references DRA and DRB to sort them into chunks
2457 suitable for grouping. */
2458
2459 static int
2460 dr_group_sort_cmp (const void *dra_, const void *drb_)
2461 {
2462 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2463 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2464 int cmp;
2465
2466 /* Stabilize sort. */
2467 if (dra == drb)
2468 return 0;
2469
2470 /* Ordering of DRs according to base. */
2471 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2472 {
2473 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2474 if (cmp != 0)
2475 return cmp;
2476 }
2477
2478 /* And according to DR_OFFSET. */
2479 if (!dr_equal_offsets_p (dra, drb))
2480 {
2481 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2482 if (cmp != 0)
2483 return cmp;
2484 }
2485
2486 /* Put reads before writes. */
2487 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2488 return DR_IS_READ (dra) ? -1 : 1;
2489
2490 /* Then sort after access size. */
2491 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2492 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2493 {
2494 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2495 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2496 if (cmp != 0)
2497 return cmp;
2498 }
2499
2500 /* And after step. */
2501 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2502 {
2503 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2504 if (cmp != 0)
2505 return cmp;
2506 }
2507
2508 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2509 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2510 if (cmp == 0)
2511 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2512 return cmp;
2513 }
2514
2515 /* Function vect_analyze_data_ref_accesses.
2516
2517 Analyze the access pattern of all the data references in the loop.
2518
2519 FORNOW: the only access pattern that is considered vectorizable is a
2520 simple step 1 (consecutive) access.
2521
2522 FORNOW: handle only arrays and pointer accesses. */
2523
2524 bool
2525 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2526 {
2527 unsigned int i;
2528 vec<data_reference_p> datarefs;
2529 struct data_reference *dr;
2530
2531 if (dump_enabled_p ())
2532 dump_printf_loc (MSG_NOTE, vect_location,
2533 "=== vect_analyze_data_ref_accesses ===\n");
2534
2535 if (loop_vinfo)
2536 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2537 else
2538 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2539
2540 if (datarefs.is_empty ())
2541 return true;
2542
2543 /* Sort the array of datarefs to make building the interleaving chains
2544 linear. Don't modify the original vector's order, it is needed for
2545 determining what dependencies are reversed. */
2546 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2547 datarefs_copy.qsort (dr_group_sort_cmp);
2548
2549 /* Build the interleaving chains. */
2550 for (i = 0; i < datarefs_copy.length () - 1;)
2551 {
2552 data_reference_p dra = datarefs_copy[i];
2553 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2554 stmt_vec_info lastinfo = NULL;
2555 for (i = i + 1; i < datarefs_copy.length (); ++i)
2556 {
2557 data_reference_p drb = datarefs_copy[i];
2558 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2559
2560 /* ??? Imperfect sorting (non-compatible types, non-modulo
2561 accesses, same accesses) can lead to a group being artificially
2562 split here as we don't just skip over those. If it really
2563 matters we can push those to a worklist and re-iterate
2564 over them. Then we can just skip ahead to the next DR here. */
2565
2566 /* Check that the data-refs have the same first location (except init)
2567 and they are both either store or load (not load and store,
2568 not masked loads or stores). */
2569 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2570 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2571 DR_BASE_ADDRESS (drb), 0)
2572 || !dr_equal_offsets_p (dra, drb)
2573 || !gimple_assign_single_p (DR_STMT (dra))
2574 || !gimple_assign_single_p (DR_STMT (drb)))
2575 break;
2576
2577 /* Check that the data-refs have the same constant size and step. */
2578 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2579 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2580 if (!tree_fits_uhwi_p (sza)
2581 || !tree_fits_uhwi_p (szb)
2582 || !tree_int_cst_equal (sza, szb)
2583 || !tree_fits_shwi_p (DR_STEP (dra))
2584 || !tree_fits_shwi_p (DR_STEP (drb))
2585 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2586 break;
2587
2588 /* Do not place the same access in the interleaving chain twice. */
2589 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2590 break;
2591
2592 /* Check the types are compatible.
2593 ??? We don't distinguish this during sorting. */
2594 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2595 TREE_TYPE (DR_REF (drb))))
2596 break;
2597
2598 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2599 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2600 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2601 gcc_assert (init_a < init_b);
2602
2603 /* If init_b == init_a + the size of the type * k, we have an
2604 interleaving, and DRA is accessed before DRB. */
2605 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2606 if ((init_b - init_a) % type_size_a != 0)
2607 break;
2608
2609 /* The step (if not zero) is greater than the difference between
2610 data-refs' inits. This splits groups into suitable sizes. */
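/* Illustrative instance: with DR_STEP == 16 and inits 0 and 16,
   init_b - init_a == 16 is not smaller than the step, so DRB starts a
   new group below instead of joining DRA's chain.  */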
2611 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2612 if (step != 0 && step <= (init_b - init_a))
2613 break;
2614
2615 if (dump_enabled_p ())
2616 {
2617 dump_printf_loc (MSG_NOTE, vect_location,
2618 "Detected interleaving ");
2619 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2620 dump_printf (MSG_NOTE, " and ");
2621 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2622 dump_printf (MSG_NOTE, "\n");
2623 }
2624
2625 /* Link the found element into the group list. */
2626 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2627 {
2628 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2629 lastinfo = stmtinfo_a;
2630 }
2631 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2632 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2633 lastinfo = stmtinfo_b;
2634 }
2635 }
2636
2637 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2638 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2639 && !vect_analyze_data_ref_access (dr))
2640 {
2641 if (dump_enabled_p ())
2642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2643 "not vectorized: complicated access pattern.\n");
2644
2645 if (bb_vinfo)
2646 {
2647 /* Mark the statement as not vectorizable. */
2648 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2649 continue;
2650 }
2651 else
2652 {
2653 datarefs_copy.release ();
2654 return false;
2655 }
2656 }
2657
2658 datarefs_copy.release ();
2659 return true;
2660 }
2661
2662
2663 /* Operator == between two dr_with_seg_len objects.
2664
2665 This equality operator is used to check whether two data refs
2666 are the same, so that we can consider combining the
2667 aliasing checks of those two pairs of data-dependent data
2668 refs. */
2669
2670 static bool
2671 operator == (const dr_with_seg_len& d1,
2672 const dr_with_seg_len& d2)
2673 {
2674 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2675 DR_BASE_ADDRESS (d2.dr), 0)
2676 && compare_tree (d1.offset, d2.offset) == 0
2677 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2678 }
2679
2680 /* Function comp_dr_with_seg_len_pair.
2681
2682 Comparison function for sorting objects of dr_with_seg_len_pair_t
2683 so that we can combine aliasing checks in one scan. */
2684
2685 static int
2686 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2687 {
2688 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2689 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2690
2691 const dr_with_seg_len &p11 = p1->first,
2692 &p12 = p1->second,
2693 &p21 = p2->first,
2694 &p22 = p2->second;
2695
2696 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2697 if a and c have the same base address and step, and b and d have the same
2698 address and step. Therefore, if either a&c or b&d do not have the same address
2699 and step, we do not care about the order of those two pairs after sorting. */
2700 int comp_res;
2701
2702 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2703 DR_BASE_ADDRESS (p21.dr))) != 0)
2704 return comp_res;
2705 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2706 DR_BASE_ADDRESS (p22.dr))) != 0)
2707 return comp_res;
2708 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2709 return comp_res;
2710 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2711 return comp_res;
2712 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2713 return comp_res;
2714 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2715 return comp_res;
2716
2717 return 0;
2718 }
2719
2720 /* Function vect_vfa_segment_size.
2721
2722 Create an expression that computes the size of the segment
2723 that will be accessed for a data reference. The function takes into
2724 account that realignment loads may access one more vector.
2725
2726 Input:
2727 DR: The data reference.
2728 LENGTH_FACTOR: segment length to consider.
2729
2730 Return an expression whose value is the size of segment which will be
2731 accessed by DR. */
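/* For example (a sketch with assumed values): a data reference with
   DR_STEP 4 and a LENGTH_FACTOR of 4 gets a segment length of 16 bytes;
   for a dr_explicit_realign_optimized access one extra vector size is
   added below.  */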
2732
2733 static tree
2734 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2735 {
2736 tree segment_length;
2737
2738 if (integer_zerop (DR_STEP (dr)))
2739 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2740 else
2741 segment_length = size_binop (MULT_EXPR,
2742 fold_convert (sizetype, DR_STEP (dr)),
2743 fold_convert (sizetype, length_factor));
2744
2745 if (vect_supportable_dr_alignment (dr, false)
2746 == dr_explicit_realign_optimized)
2747 {
2748 tree vector_size = TYPE_SIZE_UNIT
2749 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2750
2751 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2752 }
2753 return segment_length;
2754 }
2755
2756 /* Function vect_prune_runtime_alias_test_list.
2757
2758 Prune a list of ddrs to be tested at run-time by versioning for alias.
2759 Merge several alias checks into one if possible.
2760 Return FALSE if the resulting list of ddrs is longer than allowed by
2761 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2762
2763 bool
2764 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2765 {
2766 vec<ddr_p> may_alias_ddrs =
2767 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2768 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2769 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2770 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2771 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2772
2773 ddr_p ddr;
2774 unsigned int i;
2775 tree length_factor;
2776
2777 if (dump_enabled_p ())
2778 dump_printf_loc (MSG_NOTE, vect_location,
2779 "=== vect_prune_runtime_alias_test_list ===\n");
2780
2781 if (may_alias_ddrs.is_empty ())
2782 return true;
2783
2784 /* Basically, for each pair of dependent data refs store_ptr_0
2785 and load_ptr_0, we create an expression:
2786
2787 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2788 || ((load_ptr_0 + load_segment_length_0) <= store_ptr_0)
2789
2790 for aliasing checks. However, in some cases we can decrease
2791 the number of checks by combining two checks into one. For
2792 example, suppose we have another pair of data refs store_ptr_0
2793 and load_ptr_1, and if the following condition is satisfied:
2794
2795 load_ptr_0 < load_ptr_1 &&
2796 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2797
2798 (this condition means that, in each iteration of the vectorized loop,
2799 the accessed memory of store_ptr_0 cannot be between the memory
2800 of load_ptr_0 and load_ptr_1.)
2801
2802 we can then use only the following expression to finish the
2803 aliasing checks between store_ptr_0 & load_ptr_0 and
2804 store_ptr_0 & load_ptr_1:
2805
2806 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2807 || ((load_ptr_1 + load_segment_length_1) <= store_ptr_0)
2808
2809 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2810 same base address. */
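/* Concrete instance of the merge above (illustrative numbers): if
   load_ptr_1 == load_ptr_0 + 16 and load_segment_length_0 == 16, then
   load_ptr_1 - load_ptr_0 - load_segment_length_0 == 0, which is smaller
   than any positive store_segment_length_0, so the two checks can be
   combined into the single expression shown above.  */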
2811
2812 comp_alias_ddrs.create (may_alias_ddrs.length ());
2813
2814 /* First, we collect all data ref pairs for aliasing checks. */
2815 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2816 {
2817 struct data_reference *dr_a, *dr_b;
2818 gimple dr_group_first_a, dr_group_first_b;
2819 tree segment_length_a, segment_length_b;
2820 gimple stmt_a, stmt_b;
2821
2822 dr_a = DDR_A (ddr);
2823 stmt_a = DR_STMT (DDR_A (ddr));
2824 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2825 if (dr_group_first_a)
2826 {
2827 stmt_a = dr_group_first_a;
2828 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2829 }
2830
2831 dr_b = DDR_B (ddr);
2832 stmt_b = DR_STMT (DDR_B (ddr));
2833 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2834 if (dr_group_first_b)
2835 {
2836 stmt_b = dr_group_first_b;
2837 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2838 }
2839
2840 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2841 length_factor = scalar_loop_iters;
2842 else
2843 length_factor = size_int (vect_factor);
2844 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2845 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2846
2847 dr_with_seg_len_pair_t dr_with_seg_len_pair
2848 (dr_with_seg_len (dr_a, segment_length_a),
2849 dr_with_seg_len (dr_b, segment_length_b));
2850
2851 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2852 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2853
2854 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2855 }
2856
2857 /* Second, we sort the collected data ref pairs so that we can scan
2858 them once to combine all possible aliasing checks. */
2859 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2860
2861 /* Third, we scan the sorted dr pairs and check if we can combine
2862 alias checks of two neighbouring dr pairs. */
2863 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2864 {
2865 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2866 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2867 *dr_b1 = &comp_alias_ddrs[i-1].second,
2868 *dr_a2 = &comp_alias_ddrs[i].first,
2869 *dr_b2 = &comp_alias_ddrs[i].second;
2870
2871 /* Remove duplicate data ref pairs. */
2872 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2873 {
2874 if (dump_enabled_p ())
2875 {
2876 dump_printf_loc (MSG_NOTE, vect_location,
2877 "found equal ranges ");
2878 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2879 DR_REF (dr_a1->dr));
2880 dump_printf (MSG_NOTE, ", ");
2881 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2882 DR_REF (dr_b1->dr));
2883 dump_printf (MSG_NOTE, " and ");
2884 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2885 DR_REF (dr_a2->dr));
2886 dump_printf (MSG_NOTE, ", ");
2887 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2888 DR_REF (dr_b2->dr));
2889 dump_printf (MSG_NOTE, "\n");
2890 }
2891
2892 comp_alias_ddrs.ordered_remove (i--);
2893 continue;
2894 }
2895
2896 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2897 {
2898 /* We consider the case that DR_B1 and DR_B2 are the same memrefs,
2899 and DR_A1 and DR_A2 are two consecutive memrefs. */
2900 if (*dr_a1 == *dr_a2)
2901 {
2902 std::swap (dr_a1, dr_b1);
2903 std::swap (dr_a2, dr_b2);
2904 }
2905
2906 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2907 DR_BASE_ADDRESS (dr_a2->dr),
2908 0)
2909 || !tree_fits_shwi_p (dr_a1->offset)
2910 || !tree_fits_shwi_p (dr_a2->offset))
2911 continue;
2912
2913 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2914 - tree_to_shwi (dr_a1->offset));
2915
2916
2917 /* Now we check if the following condition is satisfied:
2918
2919 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2920
2921 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2922 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2923 have to make a best estimation. We can get the minimum value
2924 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2925 then either of the following two conditions can guarantee the
2926 one above:
2927
2928 1: DIFF <= MIN_SEG_LEN_B
2929 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2930
2931 */
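/* Illustrative numbers for the conditions above (assumed values): with
   DR_A1->OFFSET == 0, DR_A2->OFFSET == 16 and MIN_SEG_LEN_B == 16,
   DIFF == 16 satisfies condition 1, so the two checks are merged below
   and DR_A1's segment length becomes DR_A2's segment length plus 16.  */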
2932
2933 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2934 ? tree_to_shwi (dr_b1->seg_len)
2935 : vect_factor);
2936
2937 if (diff <= min_seg_len_b
2938 || (tree_fits_shwi_p (dr_a1->seg_len)
2939 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2940 {
2941 if (dump_enabled_p ())
2942 {
2943 dump_printf_loc (MSG_NOTE, vect_location,
2944 "merging ranges for ");
2945 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2946 DR_REF (dr_a1->dr));
2947 dump_printf (MSG_NOTE, ", ");
2948 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2949 DR_REF (dr_b1->dr));
2950 dump_printf (MSG_NOTE, " and ");
2951 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2952 DR_REF (dr_a2->dr));
2953 dump_printf (MSG_NOTE, ", ");
2954 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2955 DR_REF (dr_b2->dr));
2956 dump_printf (MSG_NOTE, "\n");
2957 }
2958
2959 dr_a1->seg_len = size_binop (PLUS_EXPR,
2960 dr_a2->seg_len, size_int (diff));
2961 comp_alias_ddrs.ordered_remove (i--);
2962 }
2963 }
2964 }
2965
2966 dump_printf_loc (MSG_NOTE, vect_location,
2967 "improved number of alias checks from %d to %d\n",
2968 may_alias_ddrs.length (), comp_alias_ddrs.length ());
2969 if ((int) comp_alias_ddrs.length () >
2970 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2971 return false;
2972
2973 return true;
2974 }
2975
2976 /* Check whether a non-affine read in stmt is suitable for gather load
2977 and if so, return a builtin decl for that operation. */
2978
2979 tree
2980 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2981 tree *offp, int *scalep)
2982 {
2983 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2984 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2985 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2986 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2987 tree offtype = NULL_TREE;
2988 tree decl, base, off;
2989 machine_mode pmode;
2990 int punsignedp, pvolatilep;
2991
2992 base = DR_REF (dr);
2993 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2994 see if we can use the def stmt of the address. */
2995 if (is_gimple_call (stmt)
2996 && gimple_call_internal_p (stmt)
2997 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2998 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2999 && TREE_CODE (base) == MEM_REF
3000 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3001 && integer_zerop (TREE_OPERAND (base, 1))
3002 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3003 {
3004 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3005 if (is_gimple_assign (def_stmt)
3006 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3007 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3008 }
3009
3010 /* The gather builtins need address of the form
3011 loop_invariant + vector * {1, 2, 4, 8}
3012 or
3013 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3014 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3015 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3016 multiplications and additions in it. To get a vector, we need
3017 a single SSA_NAME that will be defined in the loop and will
3018 contain everything that is not loop invariant and that can be
3019 vectorized. The following code attempts to find such a preexisting
3020 SSA_NAME OFF and put the loop invariants into a tree BASE
3021 that can be gimplified before the loop. */
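/* Illustrative decomposition (a sketch; the names p and idx are assumed
   for exposition): for a loop-invariant pointer p and an index idx defined
   in the loop, a read from *(p + (sizetype) idx * 4) would be split by the
   code below into BASE == p, OFF == idx and SCALE == 4, matching the
   gather address form above.  */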
3022 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3023 &pmode, &punsignedp, &pvolatilep, false);
3024 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3025
3026 if (TREE_CODE (base) == MEM_REF)
3027 {
3028 if (!integer_zerop (TREE_OPERAND (base, 1)))
3029 {
3030 if (off == NULL_TREE)
3031 {
3032 offset_int moff = mem_ref_offset (base);
3033 off = wide_int_to_tree (sizetype, moff);
3034 }
3035 else
3036 off = size_binop (PLUS_EXPR, off,
3037 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3038 }
3039 base = TREE_OPERAND (base, 0);
3040 }
3041 else
3042 base = build_fold_addr_expr (base);
3043
3044 if (off == NULL_TREE)
3045 off = size_zero_node;
3046
3047 /* If base is not loop invariant, then if off is 0 we start with just
3048 the constant offset in the loop invariant BASE and continue with base
3049 as OFF; otherwise give up.
3050 We could handle that case by gimplifying the addition of base + off
3051 into some SSA_NAME and use that as off, but for now punt. */
3052 if (!expr_invariant_in_loop_p (loop, base))
3053 {
3054 if (!integer_zerop (off))
3055 return NULL_TREE;
3056 off = base;
3057 base = size_int (pbitpos / BITS_PER_UNIT);
3058 }
3059 /* Otherwise put base + constant offset into the loop invariant BASE
3060 and continue with OFF. */
3061 else
3062 {
3063 base = fold_convert (sizetype, base);
3064 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3065 }
3066
3067 /* OFF at this point may be either a SSA_NAME or some tree expression
3068 from get_inner_reference. Try to peel off loop invariants from it
3069 into BASE as long as possible. */
3070 STRIP_NOPS (off);
3071 while (offtype == NULL_TREE)
3072 {
3073 enum tree_code code;
3074 tree op0, op1, add = NULL_TREE;
3075
3076 if (TREE_CODE (off) == SSA_NAME)
3077 {
3078 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3079
3080 if (expr_invariant_in_loop_p (loop, off))
3081 return NULL_TREE;
3082
3083 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3084 break;
3085
3086 op0 = gimple_assign_rhs1 (def_stmt);
3087 code = gimple_assign_rhs_code (def_stmt);
3088 op1 = gimple_assign_rhs2 (def_stmt);
3089 }
3090 else
3091 {
3092 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3093 return NULL_TREE;
3094 code = TREE_CODE (off);
3095 extract_ops_from_tree (off, &code, &op0, &op1);
3096 }
3097 switch (code)
3098 {
3099 case POINTER_PLUS_EXPR:
3100 case PLUS_EXPR:
3101 if (expr_invariant_in_loop_p (loop, op0))
3102 {
3103 add = op0;
3104 off = op1;
3105 do_add:
3106 add = fold_convert (sizetype, add);
3107 if (scale != 1)
3108 add = size_binop (MULT_EXPR, add, size_int (scale));
3109 base = size_binop (PLUS_EXPR, base, add);
3110 continue;
3111 }
3112 if (expr_invariant_in_loop_p (loop, op1))
3113 {
3114 add = op1;
3115 off = op0;
3116 goto do_add;
3117 }
3118 break;
3119 case MINUS_EXPR:
3120 if (expr_invariant_in_loop_p (loop, op1))
3121 {
3122 add = fold_convert (sizetype, op1);
3123 add = size_binop (MINUS_EXPR, size_zero_node, add);
3124 off = op0;
3125 goto do_add;
3126 }
3127 break;
3128 case MULT_EXPR:
3129 if (scale == 1 && tree_fits_shwi_p (op1))
3130 {
3131 scale = tree_to_shwi (op1);
3132 off = op0;
3133 continue;
3134 }
3135 break;
3136 case SSA_NAME:
3137 off = op0;
3138 continue;
3139 CASE_CONVERT:
3140 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3141 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3142 break;
3143 if (TYPE_PRECISION (TREE_TYPE (op0))
3144 == TYPE_PRECISION (TREE_TYPE (off)))
3145 {
3146 off = op0;
3147 continue;
3148 }
3149 if (TYPE_PRECISION (TREE_TYPE (op0))
3150 < TYPE_PRECISION (TREE_TYPE (off)))
3151 {
3152 off = op0;
3153 offtype = TREE_TYPE (off);
3154 STRIP_NOPS (off);
3155 continue;
3156 }
3157 break;
3158 default:
3159 break;
3160 }
3161 break;
3162 }
3163
3164 /* If at the end OFF still isn't a SSA_NAME or isn't
3165 defined in the loop, punt. */
3166 if (TREE_CODE (off) != SSA_NAME
3167 || expr_invariant_in_loop_p (loop, off))
3168 return NULL_TREE;
3169
3170 if (offtype == NULL_TREE)
3171 offtype = TREE_TYPE (off);
3172
3173 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3174 offtype, scale);
3175 if (decl == NULL_TREE)
3176 return NULL_TREE;
3177
3178 if (basep)
3179 *basep = base;
3180 if (offp)
3181 *offp = off;
3182 if (scalep)
3183 *scalep = scale;
3184 return decl;
3185 }
3186
3187 /* Function vect_analyze_data_refs.
3188
3189 Find all the data references in the loop or basic block.
3190
3191 The general structure of the analysis of data refs in the vectorizer is as
3192 follows:
3193 1- vect_analyze_data_refs(loop/bb): call
3194 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3195 in the loop/bb and their dependences.
3196 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3197 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3198 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3199
3200 */
3201
3202 bool
3203 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3204 bb_vec_info bb_vinfo,
3205 int *min_vf, unsigned *n_stmts)
3206 {
3207 struct loop *loop = NULL;
3208 basic_block bb = NULL;
3209 unsigned int i;
3210 vec<data_reference_p> datarefs;
3211 struct data_reference *dr;
3212 tree scalar_type;
3213
3214 if (dump_enabled_p ())
3215 dump_printf_loc (MSG_NOTE, vect_location,
3216 "=== vect_analyze_data_refs ===\n");
3217
3218 if (loop_vinfo)
3219 {
3220 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3221
3222 loop = LOOP_VINFO_LOOP (loop_vinfo);
3223 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3224 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3225 {
3226 if (dump_enabled_p ())
3227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3228 "not vectorized: loop contains function calls"
3229 " or data references that cannot be analyzed\n");
3230 return false;
3231 }
3232
3233 for (i = 0; i < loop->num_nodes; i++)
3234 {
3235 gimple_stmt_iterator gsi;
3236
3237 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3238 {
3239 gimple stmt = gsi_stmt (gsi);
3240 if (is_gimple_debug (stmt))
3241 continue;
3242 ++*n_stmts;
3243 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3244 {
3245 if (is_gimple_call (stmt) && loop->safelen)
3246 {
3247 tree fndecl = gimple_call_fndecl (stmt), op;
3248 if (fndecl != NULL_TREE)
3249 {
3250 struct cgraph_node *node = cgraph_node::get (fndecl);
3251 if (node != NULL && node->simd_clones != NULL)
3252 {
3253 unsigned int j, n = gimple_call_num_args (stmt);
3254 for (j = 0; j < n; j++)
3255 {
3256 op = gimple_call_arg (stmt, j);
3257 if (DECL_P (op)
3258 || (REFERENCE_CLASS_P (op)
3259 && get_base_address (op)))
3260 break;
3261 }
3262 op = gimple_call_lhs (stmt);
3263 /* Ignore #pragma omp declare simd functions
3264 if they don't have data references in the
3265 call stmt itself. */
3266 if (j == n
3267 && !(op
3268 && (DECL_P (op)
3269 || (REFERENCE_CLASS_P (op)
3270 && get_base_address (op)))))
3271 continue;
3272 }
3273 }
3274 }
3275 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3276 if (dump_enabled_p ())
3277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3278 "not vectorized: loop contains function "
3279 "calls or data references that cannot "
3280 "be analyzed\n");
3281 return false;
3282 }
3283 }
3284 }
3285
3286 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3287 }
3288 else
3289 {
3290 gimple_stmt_iterator gsi;
3291
3292 bb = BB_VINFO_BB (bb_vinfo);
3293 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3294 {
3295 gimple stmt = gsi_stmt (gsi);
3296 if (is_gimple_debug (stmt))
3297 continue;
3298 ++*n_stmts;
3299 if (!find_data_references_in_stmt (NULL, stmt,
3300 &BB_VINFO_DATAREFS (bb_vinfo)))
3301 {
3302 /* Mark the rest of the basic-block as unvectorizable. */
3303 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3304 {
3305 stmt = gsi_stmt (gsi);
3306 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3307 }
3308 break;
3309 }
3310 }
3311
3312 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3313 }
3314
3315 /* Go through the data-refs, check that the analysis succeeded. Update
3316 pointer from stmt_vec_info struct to DR and vectype. */
3317
3318 FOR_EACH_VEC_ELT (datarefs, i, dr)
3319 {
3320 gimple stmt;
3321 stmt_vec_info stmt_info;
3322 tree base, offset, init;
3323 bool gather = false;
3324 bool simd_lane_access = false;
3325 int vf;
3326
3327 again:
3328 if (!dr || !DR_REF (dr))
3329 {
3330 if (dump_enabled_p ())
3331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3332 "not vectorized: unhandled data-ref\n");
3333 return false;
3334 }
3335
3336 stmt = DR_STMT (dr);
3337 stmt_info = vinfo_for_stmt (stmt);
3338
3339 /* Discard clobbers from the dataref vector. We will remove
3340 clobber stmts during vectorization. */
3341 if (gimple_clobber_p (stmt))
3342 {
3343 free_data_ref (dr);
3344 if (i == datarefs.length () - 1)
3345 {
3346 datarefs.pop ();
3347 break;
3348 }
3349 datarefs.ordered_remove (i);
3350 dr = datarefs[i];
3351 goto again;
3352 }
3353
3354 /* Check that analysis of the data-ref succeeded. */
3355 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3356 || !DR_STEP (dr))
3357 {
3358 bool maybe_gather
3359 = DR_IS_READ (dr)
3360 && !TREE_THIS_VOLATILE (DR_REF (dr))
3361 && targetm.vectorize.builtin_gather != NULL;
3362 bool maybe_simd_lane_access
3363 = loop_vinfo && loop->simduid;
3364
3365 /* If the target supports vector gather loads, or if this might be
3366 a SIMD lane access, see if we can use them. */
3367 if (loop_vinfo
3368 && (maybe_gather || maybe_simd_lane_access)
3369 && !nested_in_vect_loop_p (loop, stmt))
3370 {
3371 struct data_reference *newdr
3372 = create_data_ref (NULL, loop_containing_stmt (stmt),
3373 DR_REF (dr), stmt, true);
3374 gcc_assert (newdr != NULL && DR_REF (newdr));
3375 if (DR_BASE_ADDRESS (newdr)
3376 && DR_OFFSET (newdr)
3377 && DR_INIT (newdr)
3378 && DR_STEP (newdr)
3379 && integer_zerop (DR_STEP (newdr)))
3380 {
3381 if (maybe_simd_lane_access)
3382 {
3383 tree off = DR_OFFSET (newdr);
3384 STRIP_NOPS (off);
3385 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3386 && TREE_CODE (off) == MULT_EXPR
3387 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3388 {
3389 tree step = TREE_OPERAND (off, 1);
3390 off = TREE_OPERAND (off, 0);
3391 STRIP_NOPS (off);
3392 if (CONVERT_EXPR_P (off)
3393 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3394 0)))
3395 < TYPE_PRECISION (TREE_TYPE (off)))
3396 off = TREE_OPERAND (off, 0);
3397 if (TREE_CODE (off) == SSA_NAME)
3398 {
3399 gimple def = SSA_NAME_DEF_STMT (off);
3400 tree reft = TREE_TYPE (DR_REF (newdr));
3401 if (is_gimple_call (def)
3402 && gimple_call_internal_p (def)
3403 && (gimple_call_internal_fn (def)
3404 == IFN_GOMP_SIMD_LANE))
3405 {
3406 tree arg = gimple_call_arg (def, 0);
3407 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3408 arg = SSA_NAME_VAR (arg);
3409 if (arg == loop->simduid
3410 /* For now. */
3411 && tree_int_cst_equal
3412 (TYPE_SIZE_UNIT (reft),
3413 step))
3414 {
3415 DR_OFFSET (newdr) = ssize_int (0);
3416 DR_STEP (newdr) = step;
3417 DR_ALIGNED_TO (newdr)
3418 = size_int (BIGGEST_ALIGNMENT);
3419 dr = newdr;
3420 simd_lane_access = true;
3421 }
3422 }
3423 }
3424 }
3425 }
3426 if (!simd_lane_access && maybe_gather)
3427 {
3428 dr = newdr;
3429 gather = true;
3430 }
3431 }
3432 if (!gather && !simd_lane_access)
3433 free_data_ref (newdr);
3434 }
3435
3436 if (!gather && !simd_lane_access)
3437 {
3438 if (dump_enabled_p ())
3439 {
3440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3441 "not vectorized: data ref analysis "
3442 "failed ");
3443 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3444 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3445 }
3446
3447 if (bb_vinfo)
3448 break;
3449
3450 return false;
3451 }
3452 }
3453
3454 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3455 {
3456 if (dump_enabled_p ())
3457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3458 "not vectorized: base addr of dr is a "
3459 "constant\n");
3460
3461 if (bb_vinfo)
3462 break;
3463
3464 if (gather || simd_lane_access)
3465 free_data_ref (dr);
3466 return false;
3467 }
3468
3469 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3470 {
3471 if (dump_enabled_p ())
3472 {
3473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3474 "not vectorized: volatile type ");
3475 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3476 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3477 }
3478
3479 if (bb_vinfo)
3480 break;
3481
3482 return false;
3483 }
3484
3485 if (stmt_can_throw_internal (stmt))
3486 {
3487 if (dump_enabled_p ())
3488 {
3489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3490 "not vectorized: statement can throw an "
3491 "exception ");
3492 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3493 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3494 }
3495
3496 if (bb_vinfo)
3497 break;
3498
3499 if (gather || simd_lane_access)
3500 free_data_ref (dr);
3501 return false;
3502 }
3503
3504 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3505 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3506 {
3507 if (dump_enabled_p ())
3508 {
3509 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3510 "not vectorized: statement is bitfield "
3511 "access ");
3512 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3513 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3514 }
3515
3516 if (bb_vinfo)
3517 break;
3518
3519 if (gather || simd_lane_access)
3520 free_data_ref (dr);
3521 return false;
3522 }
3523
3524 base = unshare_expr (DR_BASE_ADDRESS (dr));
3525 offset = unshare_expr (DR_OFFSET (dr));
3526 init = unshare_expr (DR_INIT (dr));
3527
3528 if (is_gimple_call (stmt)
3529 && (!gimple_call_internal_p (stmt)
3530 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3531 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3532 {
3533 if (dump_enabled_p ())
3534 {
3535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3536 "not vectorized: dr in a call ");
3537 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3538 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3539 }
3540
3541 if (bb_vinfo)
3542 break;
3543
3544 if (gather || simd_lane_access)
3545 free_data_ref (dr);
3546 return false;
3547 }
3548
3549 /* Update DR field in stmt_vec_info struct. */
3550
3551 /* If the dataref is in an inner-loop of the loop that is considered
3552 for vectorization, we also want to analyze the access relative to
3553 the outer-loop (DR contains information only relative to the
3554 inner-most enclosing loop). We do that by building a reference to the
3555 first location accessed by the inner-loop, and analyze it relative to
3556 the outer-loop. */
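 /* For example (a sketch): for a 'short' access in[i+j] inside the
 j-loop of an i/j loop nest, DR_* describes the access relative to
 the j-loop (step 2 bytes per j iteration), while the STMT_VINFO_DR_*
 fields computed below describe the first location &in[i] relative to
 the i-loop: outer base &in, outer init 0 and outer step 2 bytes per
 i iteration.  */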
3557 if (loop && nested_in_vect_loop_p (loop, stmt))
3558 {
3559 tree outer_step, outer_base, outer_init;
3560 HOST_WIDE_INT pbitsize, pbitpos;
3561 tree poffset;
3562 machine_mode pmode;
3563 int punsignedp, pvolatilep;
3564 affine_iv base_iv, offset_iv;
3565 tree dinit;
3566
3567 /* Build a reference to the first location accessed by the
3568 inner-loop: *(BASE+INIT). (The first location is actually
3569 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3570 tree inner_base = build_fold_indirect_ref
3571 (fold_build_pointer_plus (base, init));
3572
3573 if (dump_enabled_p ())
3574 {
3575 dump_printf_loc (MSG_NOTE, vect_location,
3576 "analyze in outer-loop: ");
3577 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3578 dump_printf (MSG_NOTE, "\n");
3579 }
3580
3581 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3582 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3583 gcc_assert (outer_base != NULL_TREE);
3584
3585 if (pbitpos % BITS_PER_UNIT != 0)
3586 {
3587 if (dump_enabled_p ())
3588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3589 "failed: bit offset alignment.\n");
3590 return false;
3591 }
3592
3593 outer_base = build_fold_addr_expr (outer_base);
3594 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3595 &base_iv, false))
3596 {
3597 if (dump_enabled_p ())
3598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3599 "failed: evolution of base is not affine.\n");
3600 return false;
3601 }
3602
3603 if (offset)
3604 {
3605 if (poffset)
3606 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3607 poffset);
3608 else
3609 poffset = offset;
3610 }
3611
3612 if (!poffset)
3613 {
3614 offset_iv.base = ssize_int (0);
3615 offset_iv.step = ssize_int (0);
3616 }
3617 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3618 &offset_iv, false))
3619 {
3620 if (dump_enabled_p ())
3621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3622 "evolution of offset is not affine.\n");
3623 return false;
3624 }
3625
3626 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3627 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3628 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3629 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3630 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3631
3632 outer_step = size_binop (PLUS_EXPR,
3633 fold_convert (ssizetype, base_iv.step),
3634 fold_convert (ssizetype, offset_iv.step));
3635
3636 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3637 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3638 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3639 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3640 STMT_VINFO_DR_OFFSET (stmt_info) =
3641 fold_convert (ssizetype, offset_iv.base);
3642 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3643 size_int (highest_pow2_factor (offset_iv.base));
3644
3645 if (dump_enabled_p ())
3646 {
3647 dump_printf_loc (MSG_NOTE, vect_location,
3648 "\touter base_address: ");
3649 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3650 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3651 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3652 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3653 STMT_VINFO_DR_OFFSET (stmt_info));
3654 dump_printf (MSG_NOTE,
3655 "\n\touter constant offset from base address: ");
3656 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3657 STMT_VINFO_DR_INIT (stmt_info));
3658 dump_printf (MSG_NOTE, "\n\touter step: ");
3659 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3660 STMT_VINFO_DR_STEP (stmt_info));
3661 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3662 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3663 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3664 dump_printf (MSG_NOTE, "\n");
3665 }
3666 }
3667
3668 if (STMT_VINFO_DATA_REF (stmt_info))
3669 {
3670 if (dump_enabled_p ())
3671 {
3672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3673 "not vectorized: more than one data ref "
3674 "in stmt: ");
3675 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3676 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3677 }
3678
3679 if (bb_vinfo)
3680 break;
3681
3682 if (gather || simd_lane_access)
3683 free_data_ref (dr);
3684 return false;
3685 }
3686
3687 STMT_VINFO_DATA_REF (stmt_info) = dr;
3688 if (simd_lane_access)
3689 {
3690 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3691 free_data_ref (datarefs[i]);
3692 datarefs[i] = dr;
3693 }
3694
3695 /* Set vectype for STMT. */
3696 scalar_type = TREE_TYPE (DR_REF (dr));
3697 STMT_VINFO_VECTYPE (stmt_info)
3698 = get_vectype_for_scalar_type (scalar_type);
3699 if (!STMT_VINFO_VECTYPE (stmt_info))
3700 {
3701 if (dump_enabled_p ())
3702 {
3703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3704 "not vectorized: no vectype for stmt: ");
3705 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3706 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3707 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3708 scalar_type);
3709 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3710 }
3711
3712 if (bb_vinfo)
3713 break;
3714
3715 if (gather || simd_lane_access)
3716 {
3717 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3718 if (gather)
3719 free_data_ref (dr);
3720 }
3721 return false;
3722 }
3723 else
3724 {
3725 if (dump_enabled_p ())
3726 {
3727 dump_printf_loc (MSG_NOTE, vect_location,
3728 "got vectype for stmt: ");
3729 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3730 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3731 STMT_VINFO_VECTYPE (stmt_info));
3732 dump_printf (MSG_NOTE, "\n");
3733 }
3734 }
3735
3736 /* Adjust the minimal vectorization factor according to the
3737 vector type. */
3738 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3739 if (vf > *min_vf)
3740 *min_vf = vf;
3741
3742 if (gather)
3743 {
3744 tree off;
3745
3746 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3747 if (gather
3748 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3749 gather = false;
3750 if (!gather)
3751 {
3752 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3753 free_data_ref (dr);
3754 if (dump_enabled_p ())
3755 {
3756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3757 "not vectorized: not suitable for gather "
3758 "load ");
3759 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3760 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3761 }
3762 return false;
3763 }
3764
3765 datarefs[i] = dr;
3766 STMT_VINFO_GATHER_P (stmt_info) = true;
3767 }
3768 else if (loop_vinfo
3769 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3770 {
3771 if (nested_in_vect_loop_p (loop, stmt)
3772 || !DR_IS_READ (dr))
3773 {
3774 if (dump_enabled_p ())
3775 {
3776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3777 "not vectorized: not suitable for strided "
3778 "load ");
3779 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3780 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3781 }
3782 return false;
3783 }
3784 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3785 }
3786 }
3787
3788 /* If we stopped the analysis at the first dataref we could not analyze
3789 when trying to vectorize a basic-block, mark the rest of the datarefs
3790 as not vectorizable and truncate the vector of datarefs. That
3791 avoids spending useless time analyzing their dependences. */
3792 if (i != datarefs.length ())
3793 {
3794 gcc_assert (bb_vinfo != NULL);
3795 for (unsigned j = i; j < datarefs.length (); ++j)
3796 {
3797 data_reference_p dr = datarefs[j];
3798 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3799 free_data_ref (dr);
3800 }
3801 datarefs.truncate (i);
3802 }
3803
3804 return true;
3805 }
3806
3807
3808 /* Function vect_get_new_vect_var.
3809
3810 Returns a name for a new variable. The current naming scheme uses the
3811 prefix "vect", "stmp" or "vectp" (depending on the value of VAR_KIND)
3812 for vectorizer generated variables, and appends "_" and NAME to it if
3813 provided. */
3814
3815 tree
3816 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3817 {
3818 const char *prefix;
3819 tree new_vect_var;
3820
3821 switch (var_kind)
3822 {
3823 case vect_simple_var:
3824 prefix = "vect";
3825 break;
3826 case vect_scalar_var:
3827 prefix = "stmp";
3828 break;
3829 case vect_pointer_var:
3830 prefix = "vectp";
3831 break;
3832 default:
3833 gcc_unreachable ();
3834 }
3835
3836 if (name)
3837 {
3838 char* tmp = concat (prefix, "_", name, NULL);
3839 new_vect_var = create_tmp_reg (type, tmp);
3840 free (tmp);
3841 }
3842 else
3843 new_vect_var = create_tmp_reg (type, prefix);
3844
3845 return new_vect_var;
3846 }
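 /* For example, vect_get_new_vect_var (ptr_type, vect_pointer_var, "in")
 creates a temporary whose name is based on "vectp_in", while a NULL
 NAME yields a name based on the bare prefix.  */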
3847
3848 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3849
3850 static void
3851 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3852 stmt_vec_info stmt_info)
3853 {
3854 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3855 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3856 int misalign = DR_MISALIGNMENT (dr);
3857 if (misalign == -1)
3858 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3859 else
3860 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3861 }
3862
3863 /* Function vect_create_addr_base_for_vector_ref.
3864
3865 Create an expression that computes the address of the first memory location
3866 that will be accessed for a data reference.
3867
3868 Input:
3869 STMT: The statement containing the data reference.
3870 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3871 OFFSET: Optional. If supplied, it will be added to the initial address.
3872 LOOP: Specify relative to which loop-nest the address should be computed.
3873 For example, when the dataref is in an inner-loop nested in an
3874 outer-loop that is now being vectorized, LOOP can be either the
3875 outer-loop, or the inner-loop. The first memory location accessed
3876 by the following dataref ('in' points to short):
3877
3878 for (i=0; i<N; i++)
3879 for (j=0; j<M; j++)
3880 s += in[i+j]
3881
3882 is as follows:
3883 if LOOP=i_loop: &in (relative to i_loop)
3884 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3885 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3886 initial address. Unlike OFFSET, which is a number of elements to
3887 be added, BYTE_OFFSET is measured in bytes.
3888
3889 Output:
3890 1. Return an SSA_NAME whose value is the address of the memory location of
3891 the first vector of the data reference.
3892 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3893 these statement(s) which define the returned SSA_NAME.
3894
3895 FORNOW: We are only handling array accesses with step 1. */
3896
3897 tree
3898 vect_create_addr_base_for_vector_ref (gimple stmt,
3899 gimple_seq *new_stmt_list,
3900 tree offset,
3901 struct loop *loop,
3902 tree byte_offset)
3903 {
3904 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3905 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3906 tree data_ref_base;
3907 const char *base_name;
3908 tree addr_base;
3909 tree dest;
3910 gimple_seq seq = NULL;
3911 tree base_offset;
3912 tree init;
3913 tree vect_ptr_type;
3914 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3915 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3916
3917 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3918 {
3919 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3920
3921 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3922
3923 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3924 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3925 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3926 }
3927 else
3928 {
3929 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3930 base_offset = unshare_expr (DR_OFFSET (dr));
3931 init = unshare_expr (DR_INIT (dr));
3932 }
3933
3934 if (loop_vinfo)
3935 base_name = get_name (data_ref_base);
3936 else
3937 {
3938 base_offset = ssize_int (0);
3939 init = ssize_int (0);
3940 base_name = get_name (DR_REF (dr));
3941 }
3942
3943 /* Create base_offset */
3944 base_offset = size_binop (PLUS_EXPR,
3945 fold_convert (sizetype, base_offset),
3946 fold_convert (sizetype, init));
3947
3948 if (offset)
3949 {
3950 offset = fold_build2 (MULT_EXPR, sizetype,
3951 fold_convert (sizetype, offset), step);
3952 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3953 base_offset, offset);
3954 }
3955 if (byte_offset)
3956 {
3957 byte_offset = fold_convert (sizetype, byte_offset);
3958 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3959 base_offset, byte_offset);
3960 }
3961
3962 /* base + base_offset */
3963 if (loop_vinfo)
3964 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3965 else
3966 {
3967 addr_base = build1 (ADDR_EXPR,
3968 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3969 unshare_expr (DR_REF (dr)));
3970 }
3971
3972 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3973 addr_base = fold_convert (vect_ptr_type, addr_base);
3974 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3975 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3976 gimple_seq_add_seq (new_stmt_list, seq);
3977
3978 if (DR_PTR_INFO (dr)
3979 && TREE_CODE (addr_base) == SSA_NAME)
3980 {
3981 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
3982 if (offset || byte_offset)
3983 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3984 }
3985
3986 if (dump_enabled_p ())
3987 {
3988 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3989 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3990 dump_printf (MSG_NOTE, "\n");
3991 }
3992
3993 return addr_base;
3994 }
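 /* For example (a sketch): for an 'int' data-ref with DR_BASE_ADDRESS &a,
 DR_OFFSET 0, DR_INIT 16 and OFFSET 2, the address built above is

   (vectype *) (&a + (16 + 2 * 4))

 i.e. &a[6], forced into an SSA name with the defining statements
 appended to NEW_STMT_LIST.  */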
3995
3996
3997 /* Function vect_create_data_ref_ptr.
3998
3999 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4000 location accessed in the loop by STMT, along with the def-use update
4001 chain to appropriately advance the pointer through the loop iterations.
4002 Also set aliasing information for the pointer. This pointer is used by
4003 the callers to this function to create a memory reference expression for
4004 vector load/store access.
4005
4006 Input:
4007 1. STMT: a stmt that references memory. Expected to be of the form
4008 GIMPLE_ASSIGN <name, data-ref> or
4009 GIMPLE_ASSIGN <data-ref, name>.
4010 2. AGGR_TYPE: the type of the reference, which should be either a vector
4011 or an array.
4012 3. AT_LOOP: the loop where the vector memref is to be created.
4013 4. OFFSET (optional): an offset to be added to the initial address accessed
4014 by the data-ref in STMT.
4015 5. BSI: location where the new stmts are to be placed if there is no loop
4016 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4017 pointing to the initial address.
4018 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4019 to the initial address accessed by the data-ref in STMT. This is
4020 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4021 in bytes.
4022
4023 Output:
4024 1. Declare a new ptr to vector_type, and have it point to the base of the
4025 data reference (initial address accessed by the data reference).
4026 For example, for vector of type V8HI, the following code is generated:
4027
4028 v8hi *ap;
4029 ap = (v8hi *)initial_address;
4030
4031 if OFFSET is not supplied:
4032 initial_address = &a[init];
4033 if OFFSET is supplied:
4034 initial_address = &a[init + OFFSET];
4035 if BYTE_OFFSET is supplied:
4036 initial_address = &a[init] + BYTE_OFFSET;
4037
4038 Return the initial_address in INITIAL_ADDRESS.
4039
4040 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4041 update the pointer in each iteration of the loop.
4042
4043 Return the increment stmt that updates the pointer in PTR_INCR.
4044
4045 3. Set INV_P to true if the access pattern of the data reference in the
4046 vectorized loop is invariant. Set it to false otherwise.
4047
4048 4. Return the pointer. */
4049
4050 tree
4051 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4052 tree offset, tree *initial_address,
4053 gimple_stmt_iterator *gsi, gimple *ptr_incr,
4054 bool only_init, bool *inv_p, tree byte_offset)
4055 {
4056 const char *base_name;
4057 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4058 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4059 struct loop *loop = NULL;
4060 bool nested_in_vect_loop = false;
4061 struct loop *containing_loop = NULL;
4062 tree aggr_ptr_type;
4063 tree aggr_ptr;
4064 tree new_temp;
4065 gimple vec_stmt;
4066 gimple_seq new_stmt_list = NULL;
4067 edge pe = NULL;
4068 basic_block new_bb;
4069 tree aggr_ptr_init;
4070 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4071 tree aptr;
4072 gimple_stmt_iterator incr_gsi;
4073 bool insert_after;
4074 tree indx_before_incr, indx_after_incr;
4075 gimple incr;
4076 tree step;
4077 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4078
4079 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4080 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4081
4082 if (loop_vinfo)
4083 {
4084 loop = LOOP_VINFO_LOOP (loop_vinfo);
4085 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4086 containing_loop = (gimple_bb (stmt))->loop_father;
4087 pe = loop_preheader_edge (loop);
4088 }
4089 else
4090 {
4091 gcc_assert (bb_vinfo);
4092 only_init = true;
4093 *ptr_incr = NULL;
4094 }
4095
4096 /* Check the step (evolution) of the load in LOOP, and record
4097 whether it's invariant. */
4098 if (nested_in_vect_loop)
4099 step = STMT_VINFO_DR_STEP (stmt_info);
4100 else
4101 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4102
4103 if (integer_zerop (step))
4104 *inv_p = true;
4105 else
4106 *inv_p = false;
4107
4108 /* Create an expression for the first address accessed by this load
4109 in LOOP. */
4110 base_name = get_name (DR_BASE_ADDRESS (dr));
4111
4112 if (dump_enabled_p ())
4113 {
4114 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4115 dump_printf_loc (MSG_NOTE, vect_location,
4116 "create %s-pointer variable to type: ",
4117 get_tree_code_name (TREE_CODE (aggr_type)));
4118 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4119 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4120 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4121 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4122 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4123 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4124 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4125 else
4126 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4127 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4128 dump_printf (MSG_NOTE, "\n");
4129 }
4130
4131 /* (1) Create the new aggregate-pointer variable.
4132 Vector and array types inherit the alias set of their component
4133 type by default, so we need to use a ref-all pointer if the data
4134 reference does not conflict with the created aggregate data
4135 reference because it is not addressable. */
4136 bool need_ref_all = false;
4137 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4138 get_alias_set (DR_REF (dr))))
4139 need_ref_all = true;
4140 /* Likewise for any of the data references in the stmt group. */
4141 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4142 {
4143 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4144 do
4145 {
4146 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4147 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4148 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4149 get_alias_set (DR_REF (sdr))))
4150 {
4151 need_ref_all = true;
4152 break;
4153 }
4154 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4155 }
4156 while (orig_stmt);
4157 }
4158 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4159 need_ref_all);
4160 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4161
4162
4163 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4164 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4165 def-use update cycles for the pointer: one relative to the outer-loop
4166 (LOOP), which is what steps (3) and (4) below do. The other is relative
4167 to the inner-loop (which is the inner-most loop containing the dataref),
4168 and this is done by step (5) below.
4169
4170 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4171 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4172 redundant. Steps (3),(4) create the following:
4173
4174 vp0 = &base_addr;
4175 LOOP: vp1 = phi(vp0,vp2)
4176 ...
4177 ...
4178 vp2 = vp1 + step
4179 goto LOOP
4180
4181 If there is an inner-loop nested in loop, then step (5) will also be
4182 applied, and an additional update in the inner-loop will be created:
4183
4184 vp0 = &base_addr;
4185 LOOP: vp1 = phi(vp0,vp2)
4186 ...
4187 inner: vp3 = phi(vp1,vp4)
4188 vp4 = vp3 + inner_step
4189 if () goto inner
4190 ...
4191 vp2 = vp1 + step
4192 if () goto LOOP */
4193
4194 /* (2) Calculate the initial address of the aggregate-pointer, and set
4195 the aggregate-pointer to point to it before the loop. */
4196
4197 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4198
4199 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4200 offset, loop, byte_offset);
4201 if (new_stmt_list)
4202 {
4203 if (pe)
4204 {
4205 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4206 gcc_assert (!new_bb);
4207 }
4208 else
4209 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4210 }
4211
4212 *initial_address = new_temp;
4213
4214 /* Create: p = (aggr_type *) initial_base */
4215 if (TREE_CODE (new_temp) != SSA_NAME
4216 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4217 {
4218 vec_stmt = gimple_build_assign (aggr_ptr,
4219 fold_convert (aggr_ptr_type, new_temp));
4220 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4221 /* Copy the points-to information if it exists. */
4222 if (DR_PTR_INFO (dr))
4223 vect_duplicate_ssa_name_ptr_info (aggr_ptr_init, dr, stmt_info);
4224 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4225 if (pe)
4226 {
4227 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4228 gcc_assert (!new_bb);
4229 }
4230 else
4231 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4232 }
4233 else
4234 aggr_ptr_init = new_temp;
4235
4236 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4237 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4238 inner-loop nested in LOOP (during outer-loop vectorization). */
4239
4240 /* No update in loop is required. */
4241 if (only_init && (!loop_vinfo || at_loop == loop))
4242 aptr = aggr_ptr_init;
4243 else
4244 {
4245 /* The step of the aggregate pointer is the type size. */
4246 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4247 /* One exception to the above is when the scalar step of the load in
4248 LOOP is zero. In this case the step here is also zero. */
4249 if (*inv_p)
4250 iv_step = size_zero_node;
4251 else if (tree_int_cst_sgn (step) == -1)
4252 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4253
4254 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4255
4256 create_iv (aggr_ptr_init,
4257 fold_convert (aggr_ptr_type, iv_step),
4258 aggr_ptr, loop, &incr_gsi, insert_after,
4259 &indx_before_incr, &indx_after_incr);
4260 incr = gsi_stmt (incr_gsi);
4261 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4262
4263 /* Copy the points-to information if it exists. */
4264 if (DR_PTR_INFO (dr))
4265 {
4266 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4267 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4268 }
4269 if (ptr_incr)
4270 *ptr_incr = incr;
4271
4272 aptr = indx_before_incr;
4273 }
4274
4275 if (!nested_in_vect_loop || only_init)
4276 return aptr;
4277
4278
4279 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4280 nested in LOOP, if it exists. */
4281
4282 gcc_assert (nested_in_vect_loop);
4283 if (!only_init)
4284 {
4285 standard_iv_increment_position (containing_loop, &incr_gsi,
4286 &insert_after);
4287 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4288 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4289 &indx_after_incr);
4290 incr = gsi_stmt (incr_gsi);
4291 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4292
4293 /* Copy the points-to information if it exists. */
4294 if (DR_PTR_INFO (dr))
4295 {
4296 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4297 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4298 }
4299 if (ptr_incr)
4300 *ptr_incr = incr;
4301
4302 return indx_before_incr;
4303 }
4304 else
4305 gcc_unreachable ();
4306 }
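 /* Typical use (a sketch; the actual callers are in tree-vect-stmts.c):

      dataref_ptr = vect_create_data_ref_ptr (stmt, vectype, loop, NULL_TREE,
					      &dummy, gsi, &ptr_incr,
					      false, &inv_p, NULL_TREE);
      ... build a MEM_REF through dataref_ptr for each vector copy ...
      dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
				     NULL_TREE);
 */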
4307
4308
4309 /* Function bump_vector_ptr
4310
4311 Increment a pointer (to a vector type) by vector-size. If requested,
4312 i.e. if PTR_INCR is given, then also connect the new increment stmt
4313 to the existing def-use update-chain of the pointer, by modifying
4314 the PTR_INCR as illustrated below:
4315
4316 The pointer def-use update-chain before this function:
4317 DATAREF_PTR = phi (p_0, p_2)
4318 ....
4319 PTR_INCR: p_2 = DATAREF_PTR + step
4320
4321 The pointer def-use update-chain after this function:
4322 DATAREF_PTR = phi (p_0, p_2)
4323 ....
4324 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4325 ....
4326 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4327
4328 Input:
4329 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4330 in the loop.
4331 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4332 the loop. The increment amount across iterations is expected
4333 to be vector_size.
4334 BSI - location where the new update stmt is to be placed.
4335 STMT - the original scalar memory-access stmt that is being vectorized.
4336 BUMP - optional. The offset by which to bump the pointer. If not given,
4337 the offset is assumed to be vector_size.
4338
4339 Output: Return NEW_DATAREF_PTR as illustrated above.
4340
4341 */
4342
4343 tree
4344 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4345 gimple stmt, tree bump)
4346 {
4347 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4348 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4349 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4350 tree update = TYPE_SIZE_UNIT (vectype);
4351 gassign *incr_stmt;
4352 ssa_op_iter iter;
4353 use_operand_p use_p;
4354 tree new_dataref_ptr;
4355
4356 if (bump)
4357 update = bump;
4358
4359 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4360 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4361 dataref_ptr, update);
4362 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4363
4364 /* Copy the points-to information if it exists. */
4365 if (DR_PTR_INFO (dr))
4366 {
4367 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4368 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4369 }
4370
4371 if (!ptr_incr)
4372 return new_dataref_ptr;
4373
4374 /* Update the vector-pointer's cross-iteration increment. */
4375 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4376 {
4377 tree use = USE_FROM_PTR (use_p);
4378
4379 if (use == dataref_ptr)
4380 SET_USE (use_p, new_dataref_ptr);
4381 else
4382 gcc_assert (tree_int_cst_compare (use, update) == 0);
4383 }
4384
4385 return new_dataref_ptr;
4386 }
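 /* E.g. for a V8HI data reference and a NULL BUMP the pointer above is
 advanced by TYPE_SIZE_UNIT (V8HI), i.e. 16 bytes, on each call.  */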
4387
4388
4389 /* Function vect_create_destination_var.
4390
4391 Create a new temporary of type VECTYPE. */
4392
4393 tree
4394 vect_create_destination_var (tree scalar_dest, tree vectype)
4395 {
4396 tree vec_dest;
4397 const char *name;
4398 char *new_name;
4399 tree type;
4400 enum vect_var_kind kind;
4401
4402 kind = vectype ? vect_simple_var : vect_scalar_var;
4403 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4404
4405 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4406
4407 name = get_name (scalar_dest);
4408 if (name)
4409 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4410 else
4411 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4412 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4413 free (new_name);
4414
4415 return vec_dest;
4416 }
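 /* E.g. for a scalar destination x_7 and a V4SI VECTYPE this creates a
 V4SI temporary named after "vect_x_7"; with a NULL VECTYPE a scalar
 temporary named after "stmp_x_7" is created instead.  */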
4417
4418 /* Function vect_grouped_store_supported.
4419
4420 Returns TRUE if interleave high and interleave low permutations
4421 are supported, and FALSE otherwise. */
4422
4423 bool
4424 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4425 {
4426 machine_mode mode = TYPE_MODE (vectype);
4427
4428 /* vect_permute_store_chain requires the group size to be equal to 3 or
4429 be a power of two. */
4430 if (count != 3 && exact_log2 (count) == -1)
4431 {
4432 if (dump_enabled_p ())
4433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4434 "the size of the group of accesses"
4435 " is not a power of 2 or not eqaul to 3\n");
4436 return false;
4437 }
4438
4439 /* Check that the permutation is supported. */
4440 if (VECTOR_MODE_P (mode))
4441 {
4442 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4443 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4444
4445 if (count == 3)
4446 {
4447 unsigned int j0 = 0, j1 = 0, j2 = 0;
4448 unsigned int i, j;
4449
4450 for (j = 0; j < 3; j++)
4451 {
4452 int nelt0 = ((3 - j) * nelt) % 3;
4453 int nelt1 = ((3 - j) * nelt + 1) % 3;
4454 int nelt2 = ((3 - j) * nelt + 2) % 3;
4455 for (i = 0; i < nelt; i++)
4456 {
4457 if (3 * i + nelt0 < nelt)
4458 sel[3 * i + nelt0] = j0++;
4459 if (3 * i + nelt1 < nelt)
4460 sel[3 * i + nelt1] = nelt + j1++;
4461 if (3 * i + nelt2 < nelt)
4462 sel[3 * i + nelt2] = 0;
4463 }
4464 if (!can_vec_perm_p (mode, false, sel))
4465 {
4466 if (dump_enabled_p ())
4467 dump_printf (MSG_MISSED_OPTIMIZATION,
4468 "permutaion op not supported by target.\n");
4469 return false;
4470 }
4471
4472 for (i = 0; i < nelt; i++)
4473 {
4474 if (3 * i + nelt0 < nelt)
4475 sel[3 * i + nelt0] = 3 * i + nelt0;
4476 if (3 * i + nelt1 < nelt)
4477 sel[3 * i + nelt1] = 3 * i + nelt1;
4478 if (3 * i + nelt2 < nelt)
4479 sel[3 * i + nelt2] = nelt + j2++;
4480 }
4481 if (!can_vec_perm_p (mode, false, sel))
4482 {
4483 if (dump_enabled_p ())
4484 dump_printf (MSG_MISSED_OPTIMIZATION,
4485 "permutaion op not supported by target.\n");
4486 return false;
4487 }
4488 }
4489 return true;
4490 }
4491 else
4492 {
4493 /* If length is not equal to 3 then only power of 2 is supported. */
4494 gcc_assert (exact_log2 (count) != -1);
4495
4496 for (i = 0; i < nelt / 2; i++)
4497 {
4498 sel[i * 2] = i;
4499 sel[i * 2 + 1] = i + nelt;
4500 }
4501 if (can_vec_perm_p (mode, false, sel))
4502 {
4503 for (i = 0; i < nelt; i++)
4504 sel[i] += nelt / 2;
4505 if (can_vec_perm_p (mode, false, sel))
4506 return true;
4507 }
4508 }
4509 }
4510
4511 if (dump_enabled_p ())
4512 dump_printf (MSG_MISSED_OPTIMIZATION,
4513 "permutaion op not supported by target.\n");
4514 return false;
4515 }
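 /* E.g. for V4SI (nelt == 4) the two power-of-two masks checked above are
 {0, 4, 1, 5} (interleave high) and {2, 6, 3, 7} (interleave low).  */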
4516
4517
4518 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4519 type VECTYPE. */
4520
4521 bool
4522 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4523 {
4524 return vect_lanes_optab_supported_p ("vec_store_lanes",
4525 vec_store_lanes_optab,
4526 vectype, count);
4527 }
4528
4529
4530 /* Function vect_permute_store_chain.
4531
4532 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4533 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4534 the data correctly for the stores. Return the final references for stores
4535 in RESULT_CHAIN.
4536
4537 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4538 The input is 4 vectors each containing 8 elements. We assign a number to
4539 each element, the input sequence is:
4540
4541 1st vec: 0 1 2 3 4 5 6 7
4542 2nd vec: 8 9 10 11 12 13 14 15
4543 3rd vec: 16 17 18 19 20 21 22 23
4544 4th vec: 24 25 26 27 28 29 30 31
4545
4546 The output sequence should be:
4547
4548 1st vec: 0 8 16 24 1 9 17 25
4549 2nd vec: 2 10 18 26 3 11 19 27
4550 3rd vec: 4 12 20 28 5 13 21 29
4551 4th vec: 6 14 22 30 7 15 23 31
4552
4553 i.e., we interleave the contents of the four vectors in their order.
4554
4555 We use interleave_high/low instructions to create such output. The input of
4556 each interleave_high/low operation is two vectors:
4557 1st vec 2nd vec
4558 0 1 2 3 4 5 6 7
4559 the even elements of the result vector are obtained left-to-right from the
4560 high/low elements of the first vector. The odd elements of the result are
4561 obtained left-to-right from the high/low elements of the second vector.
4562 The output of interleave_high will be: 0 4 1 5
4563 and of interleave_low: 2 6 3 7
4564
4565
4566 The permutation is done in log LENGTH stages. In each stage interleave_high
4567 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4568 where the first argument is taken from the first half of DR_CHAIN and the
4569 second argument from its second half.
4570 In our example,
4571
4572 I1: interleave_high (1st vec, 3rd vec)
4573 I2: interleave_low (1st vec, 3rd vec)
4574 I3: interleave_high (2nd vec, 4th vec)
4575 I4: interleave_low (2nd vec, 4th vec)
4576
4577 The output for the first stage is:
4578
4579 I1: 0 16 1 17 2 18 3 19
4580 I2: 4 20 5 21 6 22 7 23
4581 I3: 8 24 9 25 10 26 11 27
4582 I4: 12 28 13 29 14 30 15 31
4583
4584 The output of the second stage, i.e. the final result is:
4585
4586 I1: 0 8 16 24 1 9 17 25
4587 I2: 2 10 18 26 3 11 19 27
4588 I3: 4 12 20 28 5 13 21 29
4589 I4: 6 14 22 30 7 15 23 31. */
4590
4591 void
4592 vect_permute_store_chain (vec<tree> dr_chain,
4593 unsigned int length,
4594 gimple stmt,
4595 gimple_stmt_iterator *gsi,
4596 vec<tree> *result_chain)
4597 {
4598 tree vect1, vect2, high, low;
4599 gimple perm_stmt;
4600 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4601 tree perm_mask_low, perm_mask_high;
4602 tree data_ref;
4603 tree perm3_mask_low, perm3_mask_high;
4604 unsigned int i, n, log_length = exact_log2 (length);
4605 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4606 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4607
4608 result_chain->quick_grow (length);
4609 memcpy (result_chain->address (), dr_chain.address (),
4610 length * sizeof (tree));
4611
4612 if (length == 3)
4613 {
4614 unsigned int j0 = 0, j1 = 0, j2 = 0;
4615
4616 for (j = 0; j < 3; j++)
4617 {
4618 int nelt0 = ((3 - j) * nelt) % 3;
4619 int nelt1 = ((3 - j) * nelt + 1) % 3;
4620 int nelt2 = ((3 - j) * nelt + 2) % 3;
4621
4622 for (i = 0; i < nelt; i++)
4623 {
4624 if (3 * i + nelt0 < nelt)
4625 sel[3 * i + nelt0] = j0++;
4626 if (3 * i + nelt1 < nelt)
4627 sel[3 * i + nelt1] = nelt + j1++;
4628 if (3 * i + nelt2 < nelt)
4629 sel[3 * i + nelt2] = 0;
4630 }
4631 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4632
4633 for (i = 0; i < nelt; i++)
4634 {
4635 if (3 * i + nelt0 < nelt)
4636 sel[3 * i + nelt0] = 3 * i + nelt0;
4637 if (3 * i + nelt1 < nelt)
4638 sel[3 * i + nelt1] = 3 * i + nelt1;
4639 if (3 * i + nelt2 < nelt)
4640 sel[3 * i + nelt2] = nelt + j2++;
4641 }
4642 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4643
4644 vect1 = dr_chain[0];
4645 vect2 = dr_chain[1];
4646
4647 /* Create interleaving stmt:
4648 low = VEC_PERM_EXPR <vect1, vect2,
4649 {j, nelt, *, j + 1, nelt + j + 1, *,
4650 j + 2, nelt + j + 2, *, ...}> */
4651 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4652 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4653 vect2, perm3_mask_low);
4654 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4655
4656 vect1 = data_ref;
4657 vect2 = dr_chain[2];
4658 /* Create interleaving stmt:
4659 high = VEC_PERM_EXPR <vect1, vect2,
4660 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4661 6, 7, nelt + j + 2, ...}> */
4662 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4663 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4664 vect2, perm3_mask_high);
4665 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4666 (*result_chain)[j] = data_ref;
4667 }
4668 }
4669 else
4670 {
4671 /* If length is not equal to 3 then only power of 2 is supported. */
4672 gcc_assert (exact_log2 (length) != -1);
4673
4674 for (i = 0, n = nelt / 2; i < n; i++)
4675 {
4676 sel[i * 2] = i;
4677 sel[i * 2 + 1] = i + nelt;
4678 }
4679 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4680
4681 for (i = 0; i < nelt; i++)
4682 sel[i] += nelt / 2;
4683 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4684
4685 for (i = 0, n = log_length; i < n; i++)
4686 {
4687 for (j = 0; j < length/2; j++)
4688 {
4689 vect1 = dr_chain[j];
4690 vect2 = dr_chain[j+length/2];
4691
4692 /* Create interleaving stmt:
4693 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4694 ...}> */
4695 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4696 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4697 vect2, perm_mask_high);
4698 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4699 (*result_chain)[2*j] = high;
4700
4701 /* Create interleaving stmt:
4702 low = VEC_PERM_EXPR <vect1, vect2,
4703 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4704 ...}> */
4705 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4706 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4707 vect2, perm_mask_low);
4708 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4709 (*result_chain)[2*j+1] = low;
4710 }
4711 memcpy (dr_chain.address (), result_chain->address (),
4712 length * sizeof (tree));
4713 }
4714 }
4715 }
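 /* E.g. for V8HI (nelt == 8) the power-of-two path above uses
 perm_mask_high = {0, 8, 1, 9, 2, 10, 3, 11} and
 perm_mask_low = {4, 12, 5, 13, 6, 14, 7, 15}, which produce the I1/I2
 vectors shown in the function comment.  */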
4716
4717 /* Function vect_setup_realignment
4718
4719 This function is called when vectorizing an unaligned load using
4720 the dr_explicit_realign[_optimized] scheme.
4721 This function generates the following code at the loop prolog:
4722
4723 p = initial_addr;
4724 x msq_init = *(floor(p)); # prolog load
4725 realignment_token = call target_builtin;
4726 loop:
4727 x msq = phi (msq_init, ---)
4728
4729 The stmts marked with x are generated only for the case of
4730 dr_explicit_realign_optimized.
4731
4732 The code above sets up a new (vector) pointer, pointing to the first
4733 location accessed by STMT, and a "floor-aligned" load using that pointer.
4734 It also generates code to compute the "realignment-token" (if the relevant
4735 target hook was defined), and creates a phi-node at the loop-header bb
4736 whose arguments are the result of the prolog-load (created by this
4737 function) and the result of a load that takes place in the loop (to be
4738 created by the caller to this function).
4739
4740 For the case of dr_explicit_realign_optimized:
4741 The caller to this function uses the phi-result (msq) to create the
4742 realignment code inside the loop, and sets up the missing phi argument,
4743 as follows:
4744 loop:
4745 msq = phi (msq_init, lsq)
4746 lsq = *(floor(p')); # load in loop
4747 result = realign_load (msq, lsq, realignment_token);
4748
4749 For the case of dr_explicit_realign:
4750 loop:
4751 msq = *(floor(p)); # load in loop
4752 p' = p + (VS-1);
4753 lsq = *(floor(p')); # load in loop
4754 result = realign_load (msq, lsq, realignment_token);
4755
4756 Input:
4757 STMT - (scalar) load stmt to be vectorized. This load accesses
4758 a memory location that may be unaligned.
4759 BSI - place where new code is to be inserted.
4760 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4761 is used.
4762
4763 Output:
4764 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4765 target hook, if defined.
4766 Return value - the result of the loop-header phi node. */
4767
4768 tree
4769 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4770 tree *realignment_token,
4771 enum dr_alignment_support alignment_support_scheme,
4772 tree init_addr,
4773 struct loop **at_loop)
4774 {
4775 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4776 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4777 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4778 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4779 struct loop *loop = NULL;
4780 edge pe = NULL;
4781 tree scalar_dest = gimple_assign_lhs (stmt);
4782 tree vec_dest;
4783 gimple inc;
4784 tree ptr;
4785 tree data_ref;
4786 basic_block new_bb;
4787 tree msq_init = NULL_TREE;
4788 tree new_temp;
4789 gphi *phi_stmt;
4790 tree msq = NULL_TREE;
4791 gimple_seq stmts = NULL;
4792 bool inv_p;
4793 bool compute_in_loop = false;
4794 bool nested_in_vect_loop = false;
4795 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4796 struct loop *loop_for_initial_load = NULL;
4797
4798 if (loop_vinfo)
4799 {
4800 loop = LOOP_VINFO_LOOP (loop_vinfo);
4801 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4802 }
4803
4804 gcc_assert (alignment_support_scheme == dr_explicit_realign
4805 || alignment_support_scheme == dr_explicit_realign_optimized);
4806
4807 /* We need to generate three things:
4808 1. the misalignment computation
4809 2. the extra vector load (for the optimized realignment scheme).
4810 3. the phi node for the two vectors from which the realignment is
4811 done (for the optimized realignment scheme). */
4812
4813 /* 1. Determine where to generate the misalignment computation.
4814
4815 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4816 calculation will be generated by this function, outside the loop (in the
4817 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4818 caller, inside the loop.
4819
4820 Background: If the misalignment remains fixed throughout the iterations of
4821 the loop, then both realignment schemes are applicable, and also the
4822 misalignment computation can be done outside LOOP. This is because we are
4823 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4824 are a multiple of VS (the Vector Size), and therefore the misalignment in
4825 different vectorized LOOP iterations is always the same.
4826 The problem arises only if the memory access is in an inner-loop nested
4827 inside LOOP, which is now being vectorized using outer-loop vectorization.
4828 This is the only case when the misalignment of the memory access may not
4829 remain fixed throughout the iterations of the inner-loop (as explained in
4830 detail in vect_supportable_dr_alignment). In this case, not only is the
4831 optimized realignment scheme not applicable, but also the misalignment
4832 computation (and generation of the realignment token that is passed to
4833 REALIGN_LOAD) have to be done inside the loop.
4834
4835 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4836 or not, which in turn determines if the misalignment is computed inside
4837 the inner-loop, or outside LOOP. */
4838
4839 if (init_addr != NULL_TREE || !loop_vinfo)
4840 {
4841 compute_in_loop = true;
4842 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4843 }
4844
4845
4846 /* 2. Determine where to generate the extra vector load.
4847
4848 For the optimized realignment scheme, instead of generating two vector
4849 loads in each iteration, we generate a single extra vector load in the
4850 preheader of the loop, and in each iteration reuse the result of the
4851 vector load from the previous iteration. In case the memory access is in
4852 an inner-loop nested inside LOOP, which is now being vectorized using
4853 outer-loop vectorization, we need to determine whether this initial vector
4854 load should be generated at the preheader of the inner-loop, or can be
4855 generated at the preheader of LOOP. If the memory access has no evolution
4856 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4857 to be generated inside LOOP (in the preheader of the inner-loop). */
4858
4859 if (nested_in_vect_loop)
4860 {
4861 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4862 bool invariant_in_outerloop =
4863 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4864 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4865 }
4866 else
4867 loop_for_initial_load = loop;
4868 if (at_loop)
4869 *at_loop = loop_for_initial_load;
4870
4871 if (loop_for_initial_load)
4872 pe = loop_preheader_edge (loop_for_initial_load);
4873
4874 /* 3. For the case of the optimized realignment, create the first vector
4875 load at the loop preheader. */
4876
4877 if (alignment_support_scheme == dr_explicit_realign_optimized)
4878 {
4879 /* Create msq_init = *(floor(p1)) in the loop preheader */
4880 gassign *new_stmt;
4881
4882 gcc_assert (!compute_in_loop);
4883 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4884 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4885 NULL_TREE, &init_addr, NULL, &inc,
4886 true, &inv_p);
4887 new_temp = copy_ssa_name (ptr);
4888 new_stmt = gimple_build_assign
4889 (new_temp, BIT_AND_EXPR, ptr,
4890 build_int_cst (TREE_TYPE (ptr),
4891 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
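 /* E.g. for a vector type with 16-byte alignment the constant built
 above is -16, so the BIT_AND_EXPR clears the low four address bits,
 giving the floor (p1) of the function comment.  */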
4892 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4893 gcc_assert (!new_bb);
4894 data_ref
4895 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4896 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4897 new_stmt = gimple_build_assign (vec_dest, data_ref);
4898 new_temp = make_ssa_name (vec_dest, new_stmt);
4899 gimple_assign_set_lhs (new_stmt, new_temp);
4900 if (pe)
4901 {
4902 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4903 gcc_assert (!new_bb);
4904 }
4905 else
4906 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4907
4908 msq_init = gimple_assign_lhs (new_stmt);
4909 }
4910
4911 /* 4. Create realignment token using a target builtin, if available.
4912 It is done either inside the containing loop, or before LOOP (as
4913 determined above). */
4914
4915 if (targetm.vectorize.builtin_mask_for_load)
4916 {
4917 gcall *new_stmt;
4918 tree builtin_decl;
4919
4920 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4921 if (!init_addr)
4922 {
4923 /* Generate the INIT_ADDR computation outside LOOP. */
4924 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4925 NULL_TREE, loop);
4926 if (loop)
4927 {
4928 pe = loop_preheader_edge (loop);
4929 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4930 gcc_assert (!new_bb);
4931 }
4932 else
4933 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4934 }
4935
4936 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4937 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4938 vec_dest =
4939 vect_create_destination_var (scalar_dest,
4940 gimple_call_return_type (new_stmt));
4941 new_temp = make_ssa_name (vec_dest, new_stmt);
4942 gimple_call_set_lhs (new_stmt, new_temp);
4943
4944 if (compute_in_loop)
4945 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4946 else
4947 {
4948 /* Generate the misalignment computation outside LOOP. */
4949 pe = loop_preheader_edge (loop);
4950 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4951 gcc_assert (!new_bb);
4952 }
4953
4954 *realignment_token = gimple_call_lhs (new_stmt);
4955
4956 /* The result of the CALL_EXPR to this builtin is determined from
4957 the value of the parameter and no global variables are touched
4958 which makes the builtin a "const" function. Requiring the
4959 builtin to have the "const" attribute makes it unnecessary
4960 to call mark_call_clobbered. */
4961 gcc_assert (TREE_READONLY (builtin_decl));
4962 }
4963
4964 if (alignment_support_scheme == dr_explicit_realign)
4965 return msq;
4966
4967 gcc_assert (!compute_in_loop);
4968 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4969
4970
4971 /* 5. Create msq = phi <msq_init, lsq> in loop */
4972
4973 pe = loop_preheader_edge (containing_loop);
4974 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4975 msq = make_ssa_name (vec_dest);
4976 phi_stmt = create_phi_node (msq, containing_loop->header);
4977 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4978
4979 return msq;
4980 }
4981
4982
4983 /* Function vect_grouped_load_supported.
4984
4985 Returns TRUE if even and odd permutations are supported,
4986 and FALSE otherwise. */
4987
4988 bool
4989 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4990 {
4991 machine_mode mode = TYPE_MODE (vectype);
4992
4993 /* vect_permute_load_chain requires the group size to be equal to 3 or
4994 be a power of two. */
4995 if (count != 3 && exact_log2 (count) == -1)
4996 {
4997 if (dump_enabled_p ())
4998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4999 "the size of the group of accesses"
5000 " is not a power of 2 or not equal to 3\n");
5001 return false;
5002 }
5003
5004 /* Check that the permutation is supported. */
5005 if (VECTOR_MODE_P (mode))
5006 {
5007 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5008 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5009
5010 if (count == 3)
5011 {
5012 unsigned int k;
5013 for (k = 0; k < 3; k++)
5014 {
5015 for (i = 0; i < nelt; i++)
5016 if (3 * i + k < 2 * nelt)
5017 sel[i] = 3 * i + k;
5018 else
5019 sel[i] = 0;
5020 if (!can_vec_perm_p (mode, false, sel))
5021 {
5022 if (dump_enabled_p ())
5023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5024 "shuffle of 3 loads is not supported by"
5025 " target\n");
5026 return false;
5027 }
5028 for (i = 0, j = 0; i < nelt; i++)
5029 if (3 * i + k < 2 * nelt)
5030 sel[i] = i;
5031 else
5032 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5033 if (!can_vec_perm_p (mode, false, sel))
5034 {
5035 if (dump_enabled_p ())
5036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5037 "shuffle of 3 loads is not supported by"
5038 " target\n");
5039 return false;
5040 }
5041 }
5042 return true;
5043 }
5044 else
5045 {
5046 /* If length is not equal to 3 then only power of 2 is supported. */
5047 gcc_assert (exact_log2 (count) != -1);
5048 for (i = 0; i < nelt; i++)
5049 sel[i] = i * 2;
5050 if (can_vec_perm_p (mode, false, sel))
5051 {
5052 for (i = 0; i < nelt; i++)
5053 sel[i] = i * 2 + 1;
5054 if (can_vec_perm_p (mode, false, sel))
5055 return true;
5056 }
5057 }
5058 }
5059
5060 if (dump_enabled_p ())
5061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5062 "extract even/odd not supported by target\n");
5063 return false;
5064 }
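 /* E.g. for V8HI (nelt == 8) and k == 0 the two masks checked above are
 {0, 3, 6, 9, 12, 15, 0, 0} and {0, 1, 2, 3, 4, 5, 10, 13}; applied in
 sequence by vect_permute_load_chain they extract every third element
 starting at element 0.  */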
5065
5066 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5067 type VECTYPE. */
5068
5069 bool
5070 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5071 {
5072 return vect_lanes_optab_supported_p ("vec_load_lanes",
5073 vec_load_lanes_optab,
5074 vectype, count);
5075 }
5076
5077 /* Function vect_permute_load_chain.
5078
5079 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5080 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5081 the input data correctly. Return the final references for loads in
5082 RESULT_CHAIN.
5083
5084 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5085 The input is 4 vectors each containing 8 elements. We assign a number to each
5086 element, the input sequence is:
5087
5088 1st vec: 0 1 2 3 4 5 6 7
5089 2nd vec: 8 9 10 11 12 13 14 15
5090 3rd vec: 16 17 18 19 20 21 22 23
5091 4th vec: 24 25 26 27 28 29 30 31
5092
5093 The output sequence should be:
5094
5095 1st vec: 0 4 8 12 16 20 24 28
5096 2nd vec: 1 5 9 13 17 21 25 29
5097 3rd vec: 2 6 10 14 18 22 26 30
5098 4th vec: 3 7 11 15 19 23 27 31
5099
5100 i.e., the first output vector should contain the first elements of each
5101 interleaving group, etc.
5102
5103 We use extract_even/odd instructions to create such output. The input of
5104 each extract_even/odd operation is two vectors
5105 1st vec 2nd vec
5106 0 1 2 3 4 5 6 7
5107
5108 and the output is the vector of extracted even/odd elements. The output of
5109 extract_even will be: 0 2 4 6
5110 and of extract_odd: 1 3 5 7
5111
5112
5113 The permutation is done in log LENGTH stages. In each stage extract_even
5114 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5115 their order. In our example,
5116
5117 E1: extract_even (1st vec, 2nd vec)
5118 E2: extract_odd (1st vec, 2nd vec)
5119 E3: extract_even (3rd vec, 4th vec)
5120 E4: extract_odd (3rd vec, 4th vec)
5121
5122 The output for the first stage will be:
5123
5124 E1: 0 2 4 6 8 10 12 14
5125 E2: 1 3 5 7 9 11 13 15
5126 E3: 16 18 20 22 24 26 28 30
5127 E4: 17 19 21 23 25 27 29 31
5128
5129 In order to proceed and create the correct sequence for the next stage (or
5130 for the correct output, if the second stage is the last one, as in our
5131 example), we first put the output of extract_even operation and then the
5132 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5133 The input for the second stage is:
5134
5135 1st vec (E1): 0 2 4 6 8 10 12 14
5136 2nd vec (E3): 16 18 20 22 24 26 28 30
5137 3rd vec (E2): 1 3 5 7 9 11 13 15
5138 4th vec (E4): 17 19 21 23 25 27 29 31
5139
5140 The output of the second stage:
5141
5142 E1: 0 4 8 12 16 20 24 28
5143 E2: 2 6 10 14 18 22 26 30
5144 E3: 1 5 9 13 17 21 25 29
5145 E4: 3 7 11 15 19 23 27 31
5146
5147 And RESULT_CHAIN after reordering:
5148
5149 1st vec (E1): 0 4 8 12 16 20 24 28
5150 2nd vec (E3): 1 5 9 13 17 21 25 29
5151 3rd vec (E2): 2 6 10 14 18 22 26 30
5152 4th vec (E4): 3 7 11 15 19 23 27 31. */
5153
5154 static void
5155 vect_permute_load_chain (vec<tree> dr_chain,
5156 unsigned int length,
5157 gimple stmt,
5158 gimple_stmt_iterator *gsi,
5159 vec<tree> *result_chain)
5160 {
5161 tree data_ref, first_vect, second_vect;
5162 tree perm_mask_even, perm_mask_odd;
5163 tree perm3_mask_low, perm3_mask_high;
5164 gimple perm_stmt;
5165 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5166 unsigned int i, j, log_length = exact_log2 (length);
5167 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5168 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5169
5170 result_chain->quick_grow (length);
5171 memcpy (result_chain->address (), dr_chain.address (),
5172 length * sizeof (tree));
5173
5174 if (length == 3)
5175 {
5176 unsigned int k;
5177
5178 for (k = 0; k < 3; k++)
5179 {
5180 for (i = 0; i < nelt; i++)
5181 if (3 * i + k < 2 * nelt)
5182 sel[i] = 3 * i + k;
5183 else
5184 sel[i] = 0;
5185 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5186
5187 for (i = 0, j = 0; i < nelt; i++)
5188 if (3 * i + k < 2 * nelt)
5189 sel[i] = i;
5190 else
5191 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5192
5193 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5194
5195 first_vect = dr_chain[0];
5196 second_vect = dr_chain[1];
5197
5198 /* Create interleaving stmt (low part of):
5199 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5200 ...}> */
5201 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5202 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5203 second_vect, perm3_mask_low);
5204 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5205
5206 /* Create interleaving stmt (high part of):
5207 high = VEC_PERM_EXPR <low, 3rd vec, perm3_mask_high>, i.e. keep the
5208 elements gathered so far and fill the tail from the third vector.  */
5209 first_vect = data_ref;
5210 second_vect = dr_chain[2];
5211 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5212 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5213 second_vect, perm3_mask_high);
5214 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5215 (*result_chain)[k] = data_ref;
5216 }
5217 }
5218 else
5219 {
5220 /* If length is not equal to 3 then only a power of 2 is supported.  */
5221 gcc_assert (exact_log2 (length) != -1);
5222
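 /* The two selectors below implement extract_even and extract_odd: for
    nelt == 8 they are {0, 2, 4, 6, 8, 10, 12, 14} and
    {1, 3, 5, 7, 9, 11, 13, 15}, picking the even/odd elements of the
    concatenation of two input vectors.  */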
5223 for (i = 0; i < nelt; ++i)
5224 sel[i] = i * 2;
5225 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5226
5227 for (i = 0; i < nelt; ++i)
5228 sel[i] = i * 2 + 1;
5229 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5230
5231 for (i = 0; i < log_length; i++)
5232 {
5233 for (j = 0; j < length; j += 2)
5234 {
5235 first_vect = dr_chain[j];
5236 second_vect = dr_chain[j+1];
5237
5238 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5239 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5240 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5241 first_vect, second_vect,
5242 perm_mask_even);
5243 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5244 (*result_chain)[j/2] = data_ref;
5245
5246 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5247 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5248 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5249 first_vect, second_vect,
5250 perm_mask_odd);
5251 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5252 (*result_chain)[j/2+length/2] = data_ref;
5253 }
5254 memcpy (dr_chain.address (), result_chain->address (),
5255 length * sizeof (tree));
5256 }
5257 }
5258 }
5259
5260 /* Function vect_shift_permute_load_chain.
5261
5262 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5263 a sequence of stmts to reorder the input data accordingly.
5264 Return the final references for loads in RESULT_CHAIN.
5265 Return true if successful, false otherwise.
5266
5267 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5268 The input is 3 vectors each containing 8 elements. We assign a
5269 number to each element, the input sequence is:
5270
5271 1st vec: 0 1 2 3 4 5 6 7
5272 2nd vec: 8 9 10 11 12 13 14 15
5273 3rd vec: 16 17 18 19 20 21 22 23
5274
5275 The output sequence should be:
5276
5277 1st vec: 0 3 6 9 12 15 18 21
5278 2nd vec: 1 4 7 10 13 16 19 22
5279 3rd vec: 2 5 8 11 14 17 20 23
5280
5281 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5282
5283 First we shuffle all 3 vectors to get correct elements order:
5284
5285 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5286 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5287 3rd vec: (16 19 22) (17 20 23) (18 21)
5288
5289 Next we unite and shift the vectors 3 times:
5290
5291 1st step:
5292 shift right by 6 the concatenation of:
5293 "1st vec" and "2nd vec"
5294 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5295 "2nd vec" and "3rd vec"
5296 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5297 "3rd vec" and "1st vec"
5298 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5299 | New vectors |
5300
5301 So that now new vectors are:
5302
5303 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5304 2nd vec: (10 13) (16 19 22) (17 20 23)
5305 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5306
5307 2nd step:
5308 shift right by 5 the concatenation of:
5309 "1st vec" and "3rd vec"
5310 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5311 "2nd vec" and "1st vec"
5312 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5313 "3rd vec" and "2nd vec"
5314 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5315 | New vectors |
5316
5317 So that now new vectors are:
5318
5319 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5320 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5321 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5322
5323 3rd step:
5324 shift right by 5 the concatenation of:
5325 "1st vec" and "1st vec"
5326 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5327 shift right by 3 the concatenation of:
5328 "2nd vec" and "2nd vec"
5329 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5330 | New vectors |
5331
5332 So that now all vectors are READY:
5333 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5334 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5335 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5336
5337 This algorithm is faster than the one in vect_permute_load_chain if:
5338 1. "shift of a concatenation" is faster than a general permutation.
5339 This is usually the case.
5340 2. The TARGET machine can't execute vector instructions in parallel.
5341 This matters because each step of this algorithm depends on the previous
5342 one, whereas the algorithm in vect_permute_load_chain is much more parallel.
5343
5344 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5345 */
5346
5347 static bool
5348 vect_shift_permute_load_chain (vec<tree> dr_chain,
5349 unsigned int length,
5350 gimple stmt,
5351 gimple_stmt_iterator *gsi,
5352 vec<tree> *result_chain)
5353 {
5354 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5355 tree perm2_mask1, perm2_mask2, perm3_mask;
5356 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5357 gimple perm_stmt;
5358
5359 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5360 unsigned int i;
5361 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5362 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5363 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5364 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5365
5366 result_chain->quick_grow (length);
5367 memcpy (result_chain->address (), dr_chain.address (),
5368 length * sizeof (tree));
5369
5370 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5371 {
5372 unsigned int j, log_length = exact_log2 (length);
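 /* The two selectors generated below split a vector into its even and odd
    elements.  For vector length 8, perm2_mask1 is {0 2 4 6 1 3 5 7} (even
    elements in the low half, odd elements in the high half) and perm2_mask2
    is {1 3 5 7 0 2 4 6} (odd elements in the low half, even elements in
    the high half).  */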
5373 for (i = 0; i < nelt / 2; ++i)
5374 sel[i] = i * 2;
5375 for (i = 0; i < nelt / 2; ++i)
5376 sel[nelt / 2 + i] = i * 2 + 1;
5377 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5378 {
5379 if (dump_enabled_p ())
5380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5381 "shuffle of 2 fields structure is not \
5382 supported by target\n");
5383 return false;
5384 }
5385 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5386
5387 for (i = 0; i < nelt / 2; ++i)
5388 sel[i] = i * 2 + 1;
5389 for (i = 0; i < nelt / 2; ++i)
5390 sel[nelt / 2 + i] = i * 2;
5391 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5392 {
5393 if (dump_enabled_p ())
5394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5395 "shuffle of 2 fields structure is not \
5396 supported by target\n");
5397 return false;
5398 }
5399 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5400
5401 /* Generating permutation constant to shift all elements.
5402 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5403 for (i = 0; i < nelt; i++)
5404 sel[i] = nelt / 2 + i;
5405 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5406 {
5407 if (dump_enabled_p ())
5408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5409 "shift permutation is not supported by target\n");
5410 return false;
5411 }
5412 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5413
5414 /* Generating permutation constant that selects the low half of the first
5415 vector and the high half of the second.  For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5416 for (i = 0; i < nelt / 2; i++)
5417 sel[i] = i;
5418 for (i = nelt / 2; i < nelt; i++)
5419 sel[i] = nelt + i;
5420 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5421 {
5422 if (dump_enabled_p ())
5423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5424 "select is not supported by target\n");
5425 return false;
5426 }
5427 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5428
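 /* Each iteration of the inner loop below handles one pair of input
    vectors: the two in-vector shuffles split each input into its even and
    odd elements, the shift of their concatenation then collects all the
    odd elements, and the select collects all the even elements, matching
    the extract_even/extract_odd semantics.  */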
5429 for (i = 0; i < log_length; i++)
5430 {
5431 for (j = 0; j < length; j += 2)
5432 {
5433 first_vect = dr_chain[j];
5434 second_vect = dr_chain[j + 1];
5435
5436 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5437 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5438 first_vect, first_vect,
5439 perm2_mask1);
5440 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5441 vect[0] = data_ref;
5442
5443 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5444 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5445 second_vect, second_vect,
5446 perm2_mask2);
5447 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5448 vect[1] = data_ref;
5449
5450 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5451 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5452 vect[0], vect[1], shift1_mask);
5453 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5454 (*result_chain)[j/2 + length/2] = data_ref;
5455
5456 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5457 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5458 vect[0], vect[1], select_mask);
5459 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5460 (*result_chain)[j/2] = data_ref;
5461 }
5462 memcpy (dr_chain.address (), result_chain->address (),
5463 length * sizeof (tree));
5464 }
5465 return true;
5466 }
5467 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5468 {
5469 unsigned int k = 0, l = 0;
5470
5471 /* Generating permutation constant to get all elements in right order.
5472 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5473 for (i = 0; i < nelt; i++)
5474 {
5475 if (3 * k + (l % 3) >= nelt)
5476 {
5477 k = 0;
5478 l += (3 - (nelt % 3));
5479 }
5480 sel[i] = 3 * k + (l % 3);
5481 k++;
5482 }
5483 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5484 {
5485 if (dump_enabled_p ())
5486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5487 "shuffle of 3 fields structure is not \
5488 supported by target\n");
5489 return false;
5490 }
5491 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5492
5493 /* Generating permutation constant to shift all elements.
5494 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5495 for (i = 0; i < nelt; i++)
5496 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5497 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5498 {
5499 if (dump_enabled_p ())
5500 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5501 "shift permutation is not supported by target\n");
5502 return false;
5503 }
5504 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5505
5506 /* Generating permutation constant to shift all elements.
5507 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5508 for (i = 0; i < nelt; i++)
5509 sel[i] = 2 * (nelt / 3) + 1 + i;
5510 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5511 {
5512 if (dump_enabled_p ())
5513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5514 "shift permutation is not supported by target\n");
5515 return false;
5516 }
5517 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5518
5519 /* Generating permutation constant to shift all elements.
5520 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5521 for (i = 0; i < nelt; i++)
5522 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5523 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5524 {
5525 if (dump_enabled_p ())
5526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5527 "shift permutation is not supported by target\n");
5528 return false;
5529 }
5530 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5531
5532 /* Generating permutation constant to shift all elements.
5533 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5534 for (i = 0; i < nelt; i++)
5535 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5536 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5537 {
5538 if (dump_enabled_p ())
5539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5540 "shift permutation is not supported by target\n");
5541 return false;
5542 }
5543 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5544
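 /* The statements below implement the three steps described above: first
    shuffle each of the three input vectors with perm3_mask, then perform
    two rounds of "shift of a concatenation" with shift1_mask and
    shift2_mask, and finally shift two of the intermediate vectors with
    shift3_mask and shift4_mask to obtain the remaining output vectors.  */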
5545 for (k = 0; k < 3; k++)
5546 {
5547 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5548 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5549 dr_chain[k], dr_chain[k],
5550 perm3_mask);
5551 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5552 vect[k] = data_ref;
5553 }
5554
5555 for (k = 0; k < 3; k++)
5556 {
5557 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5558 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5559 vect[k % 3], vect[(k + 1) % 3],
5560 shift1_mask);
5561 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5562 vect_shift[k] = data_ref;
5563 }
5564
5565 for (k = 0; k < 3; k++)
5566 {
5567 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5568 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5569 vect_shift[(4 - k) % 3],
5570 vect_shift[(3 - k) % 3],
5571 shift2_mask);
5572 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5573 vect[k] = data_ref;
5574 }
5575
5576 (*result_chain)[3 - (nelt % 3)] = vect[2];
5577
5578 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5579 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5580 vect[0], shift3_mask);
5581 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5582 (*result_chain)[nelt % 3] = data_ref;
5583
5584 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5585 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5586 vect[1], shift4_mask);
5587 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5588 (*result_chain)[0] = data_ref;
5589 return true;
5590 }
5591 return false;
5592 }
5593
5594 /* Function vect_transform_grouped_load.
5595
5596 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5597 to perform their permutation and assign the resulting vectorized statements to
5598 the scalar statements.
5599 */
5600
5601 void
5602 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5603 gimple_stmt_iterator *gsi)
5604 {
5605 machine_mode mode;
5606 vec<tree> result_chain = vNULL;
5607
5608 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5609 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the permuted
5610 vectors that are ready for vector computation.  */
5611 result_chain.create (size);
5612
5613 /* If the reassociation width for the vector type is 2 or greater, the target
5614 machine can execute 2 or more vector instructions in parallel.  Otherwise
5615 try to get the chain for the load group using vect_shift_permute_load_chain. */
5616 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5617 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5618 || exact_log2 (size) != -1
5619 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5620 gsi, &result_chain))
5621 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5622 vect_record_grouped_load_vectors (stmt, result_chain);
5623 result_chain.release ();
5624 }
5625
5626 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5627 generated as part of the vectorization of STMT. Assign the statement
5628 for each vector to the associated scalar statement. */
5629
5630 void
5631 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5632 {
5633 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5634 gimple next_stmt, new_stmt;
5635 unsigned int i, gap_count;
5636 tree tmp_data_ref;
5637
5638 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5639 Since we scan the chain starting from its first node, their order
5640 corresponds to the order of data-refs in RESULT_CHAIN.  */
5641 next_stmt = first_stmt;
5642 gap_count = 1;
5643 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5644 {
5645 if (!next_stmt)
5646 break;
5647
5648 /* Skip the gaps.  Loads created for the gaps will be removed later by the
5649 dead code elimination pass.  No need to check for the first stmt in
5650 the group, since it always exists.
5651 GROUP_GAP is the number of steps in elements from the previous
5652 access (if there is no gap GROUP_GAP is 1). We skip loads that
5653 correspond to the gaps. */
5654 if (next_stmt != first_stmt
5655 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5656 {
5657 gap_count++;
5658 continue;
5659 }
5660
5661 while (next_stmt)
5662 {
5663 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5664 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5665 copies, and we put the new vector statement in the first available
5666 RELATED_STMT. */
5667 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5668 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5669 else
5670 {
5671 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5672 {
5673 gimple prev_stmt =
5674 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5675 gimple rel_stmt =
5676 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5677 while (rel_stmt)
5678 {
5679 prev_stmt = rel_stmt;
5680 rel_stmt =
5681 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5682 }
5683
5684 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5685 new_stmt;
5686 }
5687 }
5688
5689 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5690 gap_count = 1;
5691 /* If NEXT_STMT accesses the same DR as the previous statement,
5692 put the same TMP_DATA_REF as its vectorized statement; otherwise
5693 get the next data-ref from RESULT_CHAIN. */
5694 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5695 break;
5696 }
5697 }
5698 }
5699
5700 /* Function vect_can_force_dr_alignment_p.
5701
5702 Returns whether the alignment of DECL can be forced to an
5703 ALIGNMENT-bit boundary.  */
5704
5705 bool
5706 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5707 {
5708 if (TREE_CODE (decl) != VAR_DECL)
5709 return false;
5710
5711 if (decl_in_symtab_p (decl)
5712 && !symtab_node::get (decl)->can_increase_alignment_p ())
5713 return false;
5714
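 /* Decls with static storage are limited by what the object file format
    supports (MAX_OFILE_ALIGNMENT); automatic variables are limited by the
    supported stack alignment (MAX_STACK_ALIGNMENT).  */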
5715 if (TREE_STATIC (decl))
5716 return (alignment <= MAX_OFILE_ALIGNMENT);
5717 else
5718 return (alignment <= MAX_STACK_ALIGNMENT);
5719 }
5720
5721
5722 /* Return whether the data reference DR is supported with respect to its
5723 alignment.
5724 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5725 if it is aligned, i.e., check if it is possible to vectorize it with a different
5726 alignment. */
5727
5728 enum dr_alignment_support
5729 vect_supportable_dr_alignment (struct data_reference *dr,
5730 bool check_aligned_accesses)
5731 {
5732 gimple stmt = DR_STMT (dr);
5733 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5734 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5735 machine_mode mode = TYPE_MODE (vectype);
5736 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5737 struct loop *vect_loop = NULL;
5738 bool nested_in_vect_loop = false;
5739
5740 if (aligned_access_p (dr) && !check_aligned_accesses)
5741 return dr_aligned;
5742
5743 /* For now assume all conditional loads/stores support unaligned
5744 access without any special code. */
5745 if (is_gimple_call (stmt)
5746 && gimple_call_internal_p (stmt)
5747 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5748 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5749 return dr_unaligned_supported;
5750
5751 if (loop_vinfo)
5752 {
5753 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5754 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5755 }
5756
5757 /* Possibly unaligned access. */
5758
5759 /* We can choose between using the implicit realignment scheme (generating
5760 a misaligned_move stmt) and the explicit realignment scheme (generating
5761 aligned loads with a REALIGN_LOAD). There are two variants to the
5762 explicit realignment scheme: optimized, and unoptimized.
5763 We can optimize the realignment only if the step between consecutive
5764 vector loads is equal to the vector size. Since the vector memory
5765 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5766 is guaranteed that the misalignment amount remains the same throughout the
5767 execution of the vectorized loop. Therefore, we can create the
5768 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5769 at the loop preheader.
5770
5771 However, in the case of outer-loop vectorization, when vectorizing a
5772 memory access in the inner-loop nested within the LOOP that is now being
5773 vectorized, while it is guaranteed that the misalignment of the
5774 vectorized memory access will remain the same in different outer-loop
5775 iterations, it is *not* guaranteed that it will remain the same throughout
5776 the execution of the inner-loop. This is because the inner-loop advances
5777 with the original scalar step (and not in steps of VS). If the inner-loop
5778 step happens to be a multiple of VS, then the misalignment remains fixed
5779 and we can use the optimized realignment scheme. For example:
5780
5781 for (i=0; i<N; i++)
5782 for (j=0; j<M; j++)
5783 s += a[i+j];
5784
5785 When vectorizing the i-loop in the above example, the step between
5786 consecutive vector loads is 1, and so the misalignment does not remain
5787 fixed across the execution of the inner-loop, and the realignment cannot
5788 be optimized (as illustrated in the following pseudo vectorized loop):
5789
5790 for (i=0; i<N; i+=4)
5791 for (j=0; j<M; j++){
5792 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5793 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5794 // (assuming that we start from an aligned address).
5795 }
5796
5797 We therefore have to use the unoptimized realignment scheme:
5798
5799 for (i=0; i<N; i+=4)
5800 for (j=k; j<M; j+=4)
5801 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5802 // that the misalignment of the initial address is
5803 // 0).
5804
5805 The loop can then be vectorized as follows:
5806
5807 for (k=0; k<4; k++){
5808 rt = get_realignment_token (&vp[k]);
5809 for (i=0; i<N; i+=4){
5810 v1 = vp[i+k];
5811 for (j=k; j<M; j+=4){
5812 v2 = vp[i+j+VS-1];
5813 va = REALIGN_LOAD <v1,v2,rt>;
5814 vs += va;
5815 v1 = v2;
5816 }
5817 }
5818 } */
5819
5820 if (DR_IS_READ (dr))
5821 {
5822 bool is_packed = false;
5823 tree type = (TREE_TYPE (DR_REF (dr)));
5824
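 /* Prefer the explicit realignment scheme when the target provides a
    vec_realign_load pattern (and, if it implements the mask-for-load hook,
    the hook returns a builtin).  Per the discussion above, the optimized
    variant is only usable when loop information is available and, for
    accesses nested in an inner loop, when the step equals the vector
    size.  */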
5825 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5826 && (!targetm.vectorize.builtin_mask_for_load
5827 || targetm.vectorize.builtin_mask_for_load ()))
5828 {
5829 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5830 if ((nested_in_vect_loop
5831 && (TREE_INT_CST_LOW (DR_STEP (dr))
5832 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5833 || !loop_vinfo)
5834 return dr_explicit_realign;
5835 else
5836 return dr_explicit_realign_optimized;
5837 }
5838 if (!known_alignment_for_access_p (dr))
5839 is_packed = not_size_aligned (DR_REF (dr));
5840
5841 if ((TYPE_USER_ALIGN (type) && !is_packed)
5842 || targetm.vectorize.
5843 support_vector_misalignment (mode, type,
5844 DR_MISALIGNMENT (dr), is_packed))
5845 /* Can't software pipeline the loads, but can at least do them. */
5846 return dr_unaligned_supported;
5847 }
5848 else
5849 {
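 /* A misaligned store: there is no explicit realignment scheme for stores,
    so we can only rely on the target's support for misaligned vector
    stores.  */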
5850 bool is_packed = false;
5851 tree type = (TREE_TYPE (DR_REF (dr)));
5852
5853 if (!known_alignment_for_access_p (dr))
5854 is_packed = not_size_aligned (DR_REF (dr));
5855
5856 if ((TYPE_USER_ALIGN (type) && !is_packed)
5857 || targetm.vectorize.
5858 support_vector_misalignment (mode, type,
5859 DR_MISALIGNMENT (dr), is_packed))
5860 return dr_unaligned_supported;
5861 }
5862
5863 /* Unsupported. */
5864 return dr_unaligned_unsupported;
5865 }