1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "tm_p.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "cgraph.h"
35 #include "dumpfile.h"
36 #include "alias.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "expr.h"
50 #include "builtins.h"
51 #include "params.h"
52
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
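/* For example, with a 128-bit vector mode and COUNT == 2 the array mode
must be a 256-bit integer mode; if no suitable integer mode exists (and
the target does not advertise support via array_mode_supported_p),
mode_for_size returns BLKmode and the lanes optab is not used.  */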
55
56 static bool
57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
58 tree vectype, unsigned HOST_WIDE_INT count)
59 {
60 machine_mode mode, array_mode;
61 bool limit_p;
62
63 mode = TYPE_MODE (vectype);
64 limit_p = !targetm.array_mode_supported_p (mode, count);
65 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
66 MODE_INT, limit_p);
67
68 if (array_mode == BLKmode)
69 {
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
73 GET_MODE_NAME (mode), count);
74 return false;
75 }
76
77 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
78 {
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81 "cannot use %s<%s><%s>\n", name,
82 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
83 return false;
84 }
85
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE, vect_location,
88 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
89 GET_MODE_NAME (mode));
90
91 return true;
92 }
93
94
95 /* Return the smallest scalar part of STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type is different than the type of the arguments (e.g., demotion,
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation - we'd miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
110 types. */
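/* For example, for a widening multiply such as
'int_prod = (int) short_a * (int) short_b' (a WIDEN_MULT_EXPR), the lhs
type is int but the smallest scalar type is short, and it is the short
accesses that determine the vectorization factor.  */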
111
112 tree
113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
114 HOST_WIDE_INT *rhs_size_unit)
115 {
116 tree scalar_type = gimple_expr_type (stmt);
117 HOST_WIDE_INT lhs, rhs;
118
119 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
120
121 if (is_gimple_assign (stmt)
122 && (gimple_assign_cast_p (stmt)
123 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
126 {
127 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
128
129 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
130 if (rhs < lhs)
131 scalar_type = rhs_type;
132 }
133
134 *lhs_size_unit = lhs;
135 *rhs_size_unit = rhs;
136 return scalar_type;
137 }
138
139
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return false if versioning is not supported. */
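/* The run-time test emitted later for such a DDR is roughly a check that
the address ranges accessed through DDR_A and DDR_B during the vectorized
loop do not overlap, e.g.
(addr_a + segment_len_a <= addr_b) || (addr_b + segment_len_b <= addr_a),
and the loop is versioned on the conjunction of all such checks.  */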
143
144 static bool
145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
146 {
147 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
148
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
150 return false;
151
152 if (dump_enabled_p ())
153 {
154 dump_printf_loc (MSG_NOTE, vect_location,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
157 dump_printf (MSG_NOTE, " and ");
158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
159 dump_printf (MSG_NOTE, "\n");
160 }
161
162 if (optimize_loop_nest_for_size_p (loop))
163 {
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
166 "versioning not supported when optimizing"
167 " for size.\n");
168 return false;
169 }
170
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
172 if (loop->inner)
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176 "versioning not yet supported for outer-loops.\n");
177 return false;
178 }
179
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
181 step. */
182 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
184 {
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
187 "versioning not yet supported for non-constant "
188 "step\n");
189 return false;
190 }
191
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
193 return true;
194 }
195
196
197 /* Function vect_analyze_data_ref_dependence.
198
199 Return TRUE if there (might) exist a dependence between a memory-reference
200 DRA and a memory-reference DRB.  If the dependence can instead be checked
201 at run-time by versioning for alias, return FALSE.  Adjust *MAX_VF according to
202 the data dependence. */
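/* For example, for a loop body like 'a[i+3] = a[i] + x' the dependence
distance is 3, so *MAX_VF is reduced to 3; with a larger vectorization
factor the vectorized load of a[i+3] would be executed before the store
from the iteration that produces it.  */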
203
204 static bool
205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
206 loop_vec_info loop_vinfo, int *max_vf)
207 {
208 unsigned int i;
209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
210 struct data_reference *dra = DDR_A (ddr);
211 struct data_reference *drb = DDR_B (ddr);
212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
214 lambda_vector dist_v;
215 unsigned int loop_depth;
216
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
220 gcc_unreachable ();
221
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
224 return false;
225
226 if (dra == drb
227 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
228 return false;
229
230 /* Even if we have an anti-dependence then, as the vectorized loop covers at
231 least two scalar iterations, there is always also a true dependence.
232 As the vectorizer does not re-order loads and stores, we can ignore
233 the anti-dependence if TBAA can disambiguate both DRs, similar to the
234 case with known negative distance anti-dependences (positive
235 distance anti-dependences would violate TBAA constraints). */
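/* For example, an anti-dependence between a read through 'int *p' and a
write through 'float *q' can be ignored here, since under TBAA their
alias sets do not conflict.  */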
236 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
237 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
238 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
239 get_alias_set (DR_REF (drb))))
240 return false;
241
242 /* Unknown data dependence. */
243 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
244 {
245 /* If user asserted safelen consecutive iterations can be
246 executed concurrently, assume independence. */
247 if (loop->safelen >= 2)
248 {
249 if (loop->safelen < *max_vf)
250 *max_vf = loop->safelen;
251 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
252 return false;
253 }
254
255 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
256 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
257 {
258 if (dump_enabled_p ())
259 {
260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
261 "versioning for alias not supported for: "
262 "can't determine dependence between ");
263 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
264 DR_REF (dra));
265 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
267 DR_REF (drb));
268 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
269 }
270 return true;
271 }
272
273 if (dump_enabled_p ())
274 {
275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
276 "versioning for alias required: "
277 "can't determine dependence between ");
278 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
279 DR_REF (dra));
280 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
281 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
282 DR_REF (drb));
283 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
284 }
285
286 /* Add to list of ddrs that need to be tested at run-time. */
287 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
288 }
289
290 /* Known data dependence. */
291 if (DDR_NUM_DIST_VECTS (ddr) == 0)
292 {
293 /* If user asserted safelen consecutive iterations can be
294 executed concurrently, assume independence. */
295 if (loop->safelen >= 2)
296 {
297 if (loop->safelen < *max_vf)
298 *max_vf = loop->safelen;
299 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
300 return false;
301 }
302
303 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
304 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
305 {
306 if (dump_enabled_p ())
307 {
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
309 "versioning for alias not supported for: "
310 "bad dist vector for ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 DR_REF (dra));
313 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
315 DR_REF (drb));
316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
317 }
318 return true;
319 }
320
321 if (dump_enabled_p ())
322 {
323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
324 "versioning for alias required: "
325 "bad dist vector for ");
326 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
327 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
328 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
329 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
330 }
331 /* Add to list of ddrs that need to be tested at run-time. */
332 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
333 }
334
335 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
336 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
337 {
338 int dist = dist_v[loop_depth];
339
340 if (dump_enabled_p ())
341 dump_printf_loc (MSG_NOTE, vect_location,
342 "dependence distance = %d.\n", dist);
343
344 if (dist == 0)
345 {
346 if (dump_enabled_p ())
347 {
348 dump_printf_loc (MSG_NOTE, vect_location,
349 "dependence distance == 0 between ");
350 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
351 dump_printf (MSG_NOTE, " and ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
353 dump_printf (MSG_NOTE, "\n");
354 }
355
356 /* When we perform grouped accesses and perform implicit CSE
357 by detecting equal accesses and doing disambiguation with
358 runtime alias tests like for
359 .. = a[i];
360 .. = a[i+1];
361 a[i] = ..;
362 a[i+1] = ..;
363 *p = ..;
364 .. = a[i];
365 .. = a[i+1];
366 where we will end up loading { a[i], a[i+1] } once, make
367 sure that inserting group loads before the first load and
368 stores after the last store will do the right thing.
369 Similar for groups like
370 a[i] = ...;
371 ... = a[i];
372 a[i+1] = ...;
373 where loads from the group interleave with the store. */
374 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
375 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
376 {
377 gimple *earlier_stmt;
378 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
379 if (DR_IS_WRITE
380 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
381 {
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 "READ_WRITE dependence in interleaving."
385 "\n");
386 return true;
387 }
388 }
389
390 continue;
391 }
392
393 if (dist > 0 && DDR_REVERSED_P (ddr))
394 {
395 /* If DDR_REVERSED_P the order of the data-refs in DDR was
396 reversed (to make distance vector positive), and the actual
397 distance is negative. */
398 if (dump_enabled_p ())
399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
400 "dependence distance negative.\n");
401 /* Record a negative dependence distance to later limit the
402 amount of stmt copying / unrolling we can perform.
403 Only need to handle read-after-write dependence. */
404 if (DR_IS_READ (drb)
405 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
406 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
407 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
408 continue;
409 }
410
411 if (abs (dist) >= 2
412 && abs (dist) < *max_vf)
413 {
414 /* The dependence distance requires reduction of the maximal
415 vectorization factor. */
416 *max_vf = abs (dist);
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_NOTE, vect_location,
419 "adjusting maximal vectorization factor to %i\n",
420 *max_vf);
421 }
422
423 if (abs (dist) >= *max_vf)
424 {
425 /* Dependence distance does not create dependence, as far as
426 vectorization is concerned, in this case. */
427 if (dump_enabled_p ())
428 dump_printf_loc (MSG_NOTE, vect_location,
429 "dependence distance >= VF.\n");
430 continue;
431 }
432
433 if (dump_enabled_p ())
434 {
435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
436 "not vectorized, possible dependence "
437 "between data-refs ");
438 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
439 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
440 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
441 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
442 }
443
444 return true;
445 }
446
447 return false;
448 }
449
450 /* Function vect_analyze_data_ref_dependences.
451
452 Examine all the data references in the loop, and make sure there do not
453 exist any data dependences between them. Set *MAX_VF according to
454 the maximum vectorization factor the data dependences allow. */
455
456 bool
457 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
458 {
459 unsigned int i;
460 struct data_dependence_relation *ddr;
461
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "=== vect_analyze_data_ref_dependences ===\n");
465
466 LOOP_VINFO_DDRS (loop_vinfo)
467 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
468 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
469 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
470 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
471 &LOOP_VINFO_DDRS (loop_vinfo),
472 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
473 return false;
474
475 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
476 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
477 return false;
478
479 return true;
480 }
481
482
483 /* Function vect_slp_analyze_data_ref_dependence.
484
485 Return TRUE if there (might) exist a dependence between a memory-reference
486 DRA and a memory-reference DRB that prohibits vectorization of the
487 basic block containing them.  Return FALSE if the dependence is known
488 not to prevent basic-block vectorization. */
489
490 static bool
491 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
492 {
493 struct data_reference *dra = DDR_A (ddr);
494 struct data_reference *drb = DDR_B (ddr);
495
496 /* We need to check dependences of statements marked as unvectorizable
497 as well; they can still prohibit vectorization. */
498
499 /* Independent data accesses. */
500 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
501 return false;
502
503 if (dra == drb)
504 return false;
505
506 /* Read-read is OK. */
507 if (DR_IS_READ (dra) && DR_IS_READ (drb))
508 return false;
509
510 /* If dra and drb are part of the same interleaving chain consider
511 them independent. */
512 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
513 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
514 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
515 return false;
516
517 /* Unknown data dependence. */
518 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
519 {
520 if (dump_enabled_p ())
521 {
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "can't determine dependence between ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
525 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
526 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
527 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
528 }
529 }
530 else if (dump_enabled_p ())
531 {
532 dump_printf_loc (MSG_NOTE, vect_location,
533 "determined dependence between ");
534 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
535 dump_printf (MSG_NOTE, " and ");
536 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
537 dump_printf (MSG_NOTE, "\n");
538 }
539
540 /* We do not vectorize basic blocks with write-write dependencies. */
541 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
542 return true;
543
544 /* If we have a read-write dependence, check that the load is before the store.
545 When we vectorize basic blocks, a vector load can only be placed before the
546 corresponding scalar load, and a vector store can only be placed after its
547 corresponding scalar store.  So the order of the accesses is preserved in
548 case the load is before the store. */
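/* For example, in a basic block containing 'x = a[0]; a[0] = y;' the load
comes first, so the pair does not prevent SLP vectorization; if the store
came first, as in 'a[0] = y; x = a[0];', we would have to give up.  */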
549 gimple *earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
550 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
551 {
552 /* That only holds for load-store pairs taking part in vectorization. */
553 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
554 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
555 return false;
556 }
557
558 return true;
559 }
560
561
562 /* Analyze dependences involved in the transform of SLP NODE. */
563
564 static bool
565 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node)
566 {
567 /* This walks over all stmts involved in the SLP load/store done
568 in NODE verifying we can sink them up to the last stmt in the
569 group. */
570 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
571 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
572 {
573 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
574 if (access == last_access)
575 continue;
576 stmt_vec_info access_stmt_info = vinfo_for_stmt (access);
577 gimple_stmt_iterator gsi = gsi_for_stmt (access);
578 gsi_next (&gsi);
579 for (; gsi_stmt (gsi) != last_access; gsi_next (&gsi))
580 {
581 gimple *stmt = gsi_stmt (gsi);
582 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
583 if (!STMT_VINFO_DATA_REF (stmt_info)
584 || (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
585 && DR_IS_READ (STMT_VINFO_DATA_REF (access_stmt_info))))
586 continue;
587
588 ddr_p ddr = initialize_data_dependence_relation
589 (STMT_VINFO_DATA_REF (access_stmt_info),
590 STMT_VINFO_DATA_REF (stmt_info), vNULL);
591 if (vect_slp_analyze_data_ref_dependence (ddr))
592 {
593 /* ??? If the dependence analysis failed we can resort to the
594 alias oracle which can handle more kinds of stmts. */
595 free_dependence_relation (ddr);
596 return false;
597 }
598 free_dependence_relation (ddr);
599 }
600 }
601 return true;
602 }
603
604
605 /* Function vect_slp_analyze_data_ref_dependences.
606
607 Examine all the data references in the basic-block, and make sure there
608 do not exist any data dependences between them that would prevent
609 vectorization of the SLP instances. */
610
611 bool
612 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
613 {
614 if (dump_enabled_p ())
615 dump_printf_loc (MSG_NOTE, vect_location,
616 "=== vect_slp_analyze_data_ref_dependences ===\n");
617
618 slp_instance instance;
619 slp_tree load;
620 unsigned int i, j;
621 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
622 {
623 bool remove = false;
624 /* Verify we can sink loads to the vectorized stmt insert location. */
625 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, load)
626 if (! vect_slp_analyze_node_dependences (instance, load))
627 {
628 remove = true;
629 break;
630 }
631 /* Verify we can sink stores to the vectorized stmt insert location. */
632 slp_tree store = SLP_INSTANCE_TREE (instance);
633 if (!remove
634 && STMT_VINFO_DATA_REF
635 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0]))
636 && ! vect_slp_analyze_node_dependences (instance, store))
637 remove = true;
638 if (remove)
639 {
640 dump_printf_loc (MSG_NOTE, vect_location,
641 "removing SLP instance operations starting from: ");
642 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
643 SLP_TREE_SCALAR_STMTS
644 (SLP_INSTANCE_TREE (instance))[0], 0);
645 vect_free_slp_instance (instance);
646 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
647 }
648 i++;
649 }
650
651 if (!BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
652 return false;
653
654 return true;
655 }
656
657
658 /* Function vect_compute_data_ref_alignment
659
660 Compute the misalignment of the data reference DR.
661
662 Output:
663 1. If during the misalignment computation it is found that the data reference
664 cannot be vectorized then false is returned.
665 2. DR_MISALIGNMENT (DR) is defined.
666
667 FOR NOW: No analysis is actually performed. Misalignment is calculated
668 only for trivial cases. TODO. */
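/* For example, for a V4SI access to &a[4], where 'a' is a 16-byte aligned
array of 32-bit ints, DR_MISALIGNMENT is set to 0; for an access to &a[1]
it is set to 4 (bytes); if the base alignment cannot be determined it is
left as -1 (unknown).  */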
669
670 static bool
671 vect_compute_data_ref_alignment (struct data_reference *dr)
672 {
673 gimple *stmt = DR_STMT (dr);
674 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
675 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
676 struct loop *loop = NULL;
677 tree ref = DR_REF (dr);
678 tree vectype;
679 tree base, base_addr;
680 tree misalign = NULL_TREE;
681 tree aligned_to;
682 unsigned HOST_WIDE_INT alignment;
683
684 if (dump_enabled_p ())
685 dump_printf_loc (MSG_NOTE, vect_location,
686 "vect_compute_data_ref_alignment:\n");
687
688 if (loop_vinfo)
689 loop = LOOP_VINFO_LOOP (loop_vinfo);
690
691 /* Initialize misalignment to unknown. */
692 SET_DR_MISALIGNMENT (dr, -1);
693
694 if (tree_fits_shwi_p (DR_STEP (dr)))
695 misalign = DR_INIT (dr);
696 aligned_to = DR_ALIGNED_TO (dr);
697 base_addr = DR_BASE_ADDRESS (dr);
698 vectype = STMT_VINFO_VECTYPE (stmt_info);
699
700 /* In case the dataref is in an inner-loop of the loop that is being
701 vectorized (LOOP), we use the base and misalignment information
702 relative to the outer-loop (LOOP). This is ok only if the misalignment
703 stays the same throughout the execution of the inner-loop, which is why
704 we have to check that the stride of the dataref in the inner-loop is a
705 multiple of the vector size. */
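/* E.g. with 16-byte vectors, an inner-loop step of 32 bytes keeps the
misalignment constant across inner-loop iterations, whereas a step of
8 bytes does not, and the misalignment must then be treated as unknown.  */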
706 if (loop && nested_in_vect_loop_p (loop, stmt))
707 {
708 tree step = DR_STEP (dr);
709
710 if (tree_fits_shwi_p (step)
711 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
712 {
713 if (dump_enabled_p ())
714 dump_printf_loc (MSG_NOTE, vect_location,
715 "inner step divides the vector-size.\n");
716 misalign = STMT_VINFO_DR_INIT (stmt_info);
717 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
718 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
719 }
720 else
721 {
722 if (dump_enabled_p ())
723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
724 "inner step doesn't divide the vector-size.\n");
725 misalign = NULL_TREE;
726 }
727 }
728
729 /* Similarly we can only use base and misalignment information relative to
730 an innermost loop if the misalignment stays the same throughout the
731 execution of the loop. As above, this is the case if the stride of
732 the dataref is a multiple of the vector size. */
733 else
734 {
735 tree step = DR_STEP (dr);
736 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
737
738 if (tree_fits_shwi_p (step)
739 && ((tree_to_shwi (step) * vf)
740 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
741 {
742 if (dump_enabled_p ())
743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
744 "step doesn't divide the vector-size.\n");
745 misalign = NULL_TREE;
746 }
747 }
748
749 /* To look at alignment of the base we have to preserve an inner MEM_REF
750 as that carries alignment information of the actual access. */
751 base = ref;
752 while (handled_component_p (base))
753 base = TREE_OPERAND (base, 0);
754 if (TREE_CODE (base) == MEM_REF)
755 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
756 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
757 unsigned int base_alignment = get_object_alignment (base);
758
759 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
760 DR_VECT_AUX (dr)->base_element_aligned = true;
761
762 alignment = TYPE_ALIGN_UNIT (vectype);
763
764 if ((compare_tree_int (aligned_to, alignment) < 0)
765 || !misalign)
766 {
767 if (dump_enabled_p ())
768 {
769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
770 "Unknown alignment for access: ");
771 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
772 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
773 }
774 return true;
775 }
776
777 if (base_alignment < TYPE_ALIGN (vectype))
778 {
779 /* Strip an inner MEM_REF to a bare decl if possible. */
780 if (TREE_CODE (base) == MEM_REF
781 && integer_zerop (TREE_OPERAND (base, 1))
782 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
783 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
784
785 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
786 {
787 if (dump_enabled_p ())
788 {
789 dump_printf_loc (MSG_NOTE, vect_location,
790 "can't force alignment of ref: ");
791 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
792 dump_printf (MSG_NOTE, "\n");
793 }
794 return true;
795 }
796
797 /* Force the alignment of the decl.
798 NOTE: This is the only change to the code we make during
799 the analysis phase, before deciding to vectorize the loop. */
800 if (dump_enabled_p ())
801 {
802 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
803 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
804 dump_printf (MSG_NOTE, "\n");
805 }
806
807 DR_VECT_AUX (dr)->base_decl = base;
808 DR_VECT_AUX (dr)->base_misaligned = true;
809 DR_VECT_AUX (dr)->base_element_aligned = true;
810 }
811
812 /* If this is a backward running DR then first access in the larger
813 vectype actually is N-1 elements before the address in the DR.
814 Adjust misalign accordingly. */
815 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
816 {
817 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
818 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
819 otherwise we wouldn't be here. */
820 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
821 /* PLUS because DR_STEP was negative. */
822 misalign = size_binop (PLUS_EXPR, misalign, offset);
823 }
824
825 SET_DR_MISALIGNMENT (dr,
826 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
827
828 if (dump_enabled_p ())
829 {
830 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
831 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
832 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
833 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
834 }
835
836 return true;
837 }
838
839
840 /* Function vect_compute_data_refs_alignment
841
842 Compute the misalignment of data references in the loop.
843 Return FALSE if a data reference is found that cannot be vectorized. */
844
845 static bool
846 vect_compute_data_refs_alignment (vec_info *vinfo)
847 {
848 vec<data_reference_p> datarefs = vinfo->datarefs;
849 struct data_reference *dr;
850 unsigned int i;
851
852 FOR_EACH_VEC_ELT (datarefs, i, dr)
853 {
854 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
855 if (STMT_VINFO_VECTORIZABLE (stmt_info)
856 && !vect_compute_data_ref_alignment (dr))
857 {
858 /* Strided accesses perform only component accesses, misalignment
859 information is irrelevant for them. */
860 if (STMT_VINFO_STRIDED_P (stmt_info)
861 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
862 continue;
863
864 if (is_a <bb_vec_info> (vinfo))
865 {
866 /* Mark unsupported statement as unvectorizable. */
867 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
868 continue;
869 }
870 else
871 return false;
872 }
873 }
874
875 return true;
876 }
877
878
879 /* Function vect_update_misalignment_for_peel
880
881 DR - the data reference whose misalignment is to be adjusted.
882 DR_PEEL - the data reference whose misalignment is being made
883 zero in the vector loop by the peel.
884 NPEEL - the number of iterations in the peel loop if the misalignment
885 of DR_PEEL is known at compile time. */
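/* For example, with 4-byte elements, NPEEL == 3, a 16-byte vector
alignment and a known misalignment of 4 bytes, the updated misalignment
is (4 + 3 * 4) & 15 == 0; if either misalignment is unknown, DR's
misalignment is reset to -1 (unknown).  */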
886
887 static void
888 vect_update_misalignment_for_peel (struct data_reference *dr,
889 struct data_reference *dr_peel, int npeel)
890 {
891 unsigned int i;
892 vec<dr_p> same_align_drs;
893 struct data_reference *current_dr;
894 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
895 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
896 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
897 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
898
899 /* For interleaved data accesses the step in the loop must be multiplied by
900 the size of the interleaving group. */
901 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
902 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
903 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
904 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
905
906 /* It can be assumed that the data refs with the same alignment as dr_peel
907 are aligned in the vector loop. */
908 same_align_drs
909 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
910 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
911 {
912 if (current_dr != dr)
913 continue;
914 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
915 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
916 SET_DR_MISALIGNMENT (dr, 0);
917 return;
918 }
919
920 if (known_alignment_for_access_p (dr)
921 && known_alignment_for_access_p (dr_peel))
922 {
923 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
924 int misal = DR_MISALIGNMENT (dr);
925 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
926 misal += negative ? -npeel * dr_size : npeel * dr_size;
927 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
928 SET_DR_MISALIGNMENT (dr, misal);
929 return;
930 }
931
932 if (dump_enabled_p ())
933 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
934 SET_DR_MISALIGNMENT (dr, -1);
935 }
936
937
938 /* Function vect_verify_datarefs_alignment
939
940 Return TRUE if all data references in the loop can be
941 handled with respect to alignment. */
942
943 bool
944 vect_verify_datarefs_alignment (vec_info *vinfo)
945 {
946 vec<data_reference_p> datarefs = vinfo->datarefs;
947 struct data_reference *dr;
948 enum dr_alignment_support supportable_dr_alignment;
949 unsigned int i;
950
951 FOR_EACH_VEC_ELT (datarefs, i, dr)
952 {
953 gimple *stmt = DR_STMT (dr);
954 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
955
956 if (!STMT_VINFO_RELEVANT_P (stmt_info))
957 continue;
958
959 /* For interleaving, only the alignment of the first access matters.
960 Skip statements marked as not vectorizable. */
961 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
962 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
963 || !STMT_VINFO_VECTORIZABLE (stmt_info))
964 continue;
965
966 /* Strided accesses perform only component accesses, alignment is
967 irrelevant for them. */
968 if (STMT_VINFO_STRIDED_P (stmt_info)
969 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
970 continue;
971
972 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
973 if (!supportable_dr_alignment)
974 {
975 if (dump_enabled_p ())
976 {
977 if (DR_IS_READ (dr))
978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
979 "not vectorized: unsupported unaligned load.");
980 else
981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
982 "not vectorized: unsupported unaligned "
983 "store.");
984
985 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
986 DR_REF (dr));
987 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
988 }
989 return false;
990 }
991 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
992 dump_printf_loc (MSG_NOTE, vect_location,
993 "Vectorizing an unaligned access.\n");
994 }
995 return true;
996 }
997
998 /* Given a memory reference EXP, return whether its alignment is less
999 than its size. */
1000
1001 static bool
1002 not_size_aligned (tree exp)
1003 {
1004 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1005 return true;
1006
1007 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1008 > get_object_alignment (exp));
1009 }
1010
1011 /* Function vector_alignment_reachable_p
1012
1013 Return true if vector alignment for DR is reachable by peeling
1014 a few loop iterations. Return false otherwise. */
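/* For example, with a 16-byte V4SI vector, a known misalignment of 8 bytes
is reachable by peeling two scalar iterations, whereas a misalignment of
6 bytes is not, since 6 is not a multiple of the 4-byte element size.  */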
1015
1016 static bool
1017 vector_alignment_reachable_p (struct data_reference *dr)
1018 {
1019 gimple *stmt = DR_STMT (dr);
1020 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1021 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1022
1023 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1024 {
1025 /* For interleaved accesses we peel only if the number of iterations in
1026 the prolog loop (VF - misalignment) is a multiple of the
1027 number of interleaved accesses. */
1028 int elem_size, mis_in_elements;
1029 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1030
1031 /* FORNOW: handle only known alignment. */
1032 if (!known_alignment_for_access_p (dr))
1033 return false;
1034
1035 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1036 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1037
1038 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1039 return false;
1040 }
1041
1042 /* If the misalignment is known at compile time then allow peeling
1043 only if natural alignment is reachable through peeling. */
1044 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1045 {
1046 HOST_WIDE_INT elmsize =
1047 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1048 if (dump_enabled_p ())
1049 {
1050 dump_printf_loc (MSG_NOTE, vect_location,
1051 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1052 dump_printf (MSG_NOTE,
1053 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1054 }
1055 if (DR_MISALIGNMENT (dr) % elmsize)
1056 {
1057 if (dump_enabled_p ())
1058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1059 "data size does not divide the misalignment.\n");
1060 return false;
1061 }
1062 }
1063
1064 if (!known_alignment_for_access_p (dr))
1065 {
1066 tree type = TREE_TYPE (DR_REF (dr));
1067 bool is_packed = not_size_aligned (DR_REF (dr));
1068 if (dump_enabled_p ())
1069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1070 "Unknown misalignment, is_packed = %d\n",is_packed);
1071 if ((TYPE_USER_ALIGN (type) && !is_packed)
1072 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1073 return true;
1074 else
1075 return false;
1076 }
1077
1078 return true;
1079 }
1080
1081
1082 /* Calculate the cost of the memory access represented by DR. */
1083
1084 static void
1085 vect_get_data_access_cost (struct data_reference *dr,
1086 unsigned int *inside_cost,
1087 unsigned int *outside_cost,
1088 stmt_vector_for_cost *body_cost_vec)
1089 {
1090 gimple *stmt = DR_STMT (dr);
1091 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1092 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1093 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1094 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1095 int ncopies = vf / nunits;
1096
1097 if (DR_IS_READ (dr))
1098 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1099 NULL, body_cost_vec, false);
1100 else
1101 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1102
1103 if (dump_enabled_p ())
1104 dump_printf_loc (MSG_NOTE, vect_location,
1105 "vect_get_data_access_cost: inside_cost = %d, "
1106 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1107 }
1108
1109
1110 typedef struct _vect_peel_info
1111 {
1112 int npeel;
1113 struct data_reference *dr;
1114 unsigned int count;
1115 } *vect_peel_info;
1116
1117 typedef struct _vect_peel_extended_info
1118 {
1119 struct _vect_peel_info peel_info;
1120 unsigned int inside_cost;
1121 unsigned int outside_cost;
1122 stmt_vector_for_cost body_cost_vec;
1123 } *vect_peel_extended_info;
1124
1125
1126 /* Peeling hashtable helpers. */
1127
1128 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1129 {
1130 static inline hashval_t hash (const _vect_peel_info *);
1131 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1132 };
1133
1134 inline hashval_t
1135 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1136 {
1137 return (hashval_t) peel_info->npeel;
1138 }
1139
1140 inline bool
1141 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1142 {
1143 return (a->npeel == b->npeel);
1144 }
1145
1146
1147 /* Insert DR into peeling hash table with NPEEL as key. */
1148
1149 static void
1150 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1151 loop_vec_info loop_vinfo, struct data_reference *dr,
1152 int npeel)
1153 {
1154 struct _vect_peel_info elem, *slot;
1155 _vect_peel_info **new_slot;
1156 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1157
1158 elem.npeel = npeel;
1159 slot = peeling_htab->find (&elem);
1160 if (slot)
1161 slot->count++;
1162 else
1163 {
1164 slot = XNEW (struct _vect_peel_info);
1165 slot->npeel = npeel;
1166 slot->dr = dr;
1167 slot->count = 1;
1168 new_slot = peeling_htab->find_slot (slot, INSERT);
1169 *new_slot = slot;
1170 }
1171
1172 if (!supportable_dr_alignment
1173 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1174 slot->count += VECT_MAX_COST;
1175 }
1176
1177
1178 /* Traverse peeling hash table to find peeling option that aligns maximum
1179 number of data accesses. */
1180
1181 int
1182 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1183 _vect_peel_extended_info *max)
1184 {
1185 vect_peel_info elem = *slot;
1186
1187 if (elem->count > max->peel_info.count
1188 || (elem->count == max->peel_info.count
1189 && max->peel_info.npeel > elem->npeel))
1190 {
1191 max->peel_info.npeel = elem->npeel;
1192 max->peel_info.count = elem->count;
1193 max->peel_info.dr = elem->dr;
1194 }
1195
1196 return 1;
1197 }
1198
1199
1200 /* Traverse peeling hash table and calculate cost for each peeling option.
1201 Find the one with the lowest cost. */
1202
1203 int
1204 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1205 _vect_peel_extended_info *min)
1206 {
1207 vect_peel_info elem = *slot;
1208 int save_misalignment, dummy;
1209 unsigned int inside_cost = 0, outside_cost = 0, i;
1210 gimple *stmt = DR_STMT (elem->dr);
1211 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1212 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1213 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1214 struct data_reference *dr;
1215 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1216
1217 prologue_cost_vec.create (2);
1218 body_cost_vec.create (2);
1219 epilogue_cost_vec.create (2);
1220
1221 FOR_EACH_VEC_ELT (datarefs, i, dr)
1222 {
1223 stmt = DR_STMT (dr);
1224 stmt_info = vinfo_for_stmt (stmt);
1225 /* For interleaving, only the alignment of the first access
1226 matters. */
1227 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1228 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1229 continue;
1230
1231 save_misalignment = DR_MISALIGNMENT (dr);
1232 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1233 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1234 &body_cost_vec);
1235 SET_DR_MISALIGNMENT (dr, save_misalignment);
1236 }
1237
1238 outside_cost += vect_get_known_peeling_cost
1239 (loop_vinfo, elem->npeel, &dummy,
1240 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1241 &prologue_cost_vec, &epilogue_cost_vec);
1242
1243 /* Prologue and epilogue costs are added to the target model later.
1244 These costs depend only on the scalar iteration cost, the
1245 number of peeling iterations finally chosen, and the number of
1246 misaligned statements. So discard the information found here. */
1247 prologue_cost_vec.release ();
1248 epilogue_cost_vec.release ();
1249
1250 if (inside_cost < min->inside_cost
1251 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1252 {
1253 min->inside_cost = inside_cost;
1254 min->outside_cost = outside_cost;
1255 min->body_cost_vec.release ();
1256 min->body_cost_vec = body_cost_vec;
1257 min->peel_info.dr = elem->dr;
1258 min->peel_info.npeel = elem->npeel;
1259 }
1260 else
1261 body_cost_vec.release ();
1262
1263 return 1;
1264 }
1265
1266
1267 /* Choose best peeling option by traversing peeling hash table and either
1268 choosing an option with the lowest cost (if cost model is enabled) or the
1269 option that aligns as many accesses as possible. */
1270
1271 static struct data_reference *
1272 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1273 loop_vec_info loop_vinfo,
1274 unsigned int *npeel,
1275 stmt_vector_for_cost *body_cost_vec)
1276 {
1277 struct _vect_peel_extended_info res;
1278
1279 res.peel_info.dr = NULL;
1280 res.body_cost_vec = stmt_vector_for_cost ();
1281
1282 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1283 {
1284 res.inside_cost = INT_MAX;
1285 res.outside_cost = INT_MAX;
1286 peeling_htab->traverse <_vect_peel_extended_info *,
1287 vect_peeling_hash_get_lowest_cost> (&res);
1288 }
1289 else
1290 {
1291 res.peel_info.count = 0;
1292 peeling_htab->traverse <_vect_peel_extended_info *,
1293 vect_peeling_hash_get_most_frequent> (&res);
1294 }
1295
1296 *npeel = res.peel_info.npeel;
1297 *body_cost_vec = res.body_cost_vec;
1298 return res.peel_info.dr;
1299 }
1300
1301
1302 /* Function vect_enhance_data_refs_alignment
1303
1304 This pass will use loop versioning and loop peeling in order to enhance
1305 the alignment of data references in the loop.
1306
1307 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1308 original loop is to be vectorized. Any other loops that are created by
1309 the transformations performed in this pass are not supposed to be
1310 vectorized. This restriction will be relaxed.
1311
1312 This pass will require a cost model to guide it whether to apply peeling
1313 or versioning or a combination of the two. For example, the scheme that
1314 Intel uses when given a loop with several memory accesses, is as follows:
1315 choose one memory access ('p') whose alignment you want to force by doing
1316 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1317 other accesses are not necessarily aligned, or (2) use loop versioning to
1318 generate one loop in which all accesses are aligned, and another loop in
1319 which only 'p' is necessarily aligned.
1320
1321 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1322 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1323 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1324
1325 Devising a cost model is the most critical aspect of this work. It will
1326 guide us on which access to peel for, whether to use loop versioning, how
1327 many versions to create, etc. The cost model will probably consist of
1328 generic considerations as well as target specific considerations (on
1329 powerpc for example, misaligned stores are more painful than misaligned
1330 loads).
1331
1332 Here are the general steps involved in alignment enhancements:
1333
1334 -- original loop, before alignment analysis:
1335 for (i=0; i<N; i++){
1336 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1337 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1338 }
1339
1340 -- After vect_compute_data_refs_alignment:
1341 for (i=0; i<N; i++){
1342 x = q[i]; # DR_MISALIGNMENT(q) = 3
1343 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1344 }
1345
1346 -- Possibility 1: we do loop versioning:
1347 if (p is aligned) {
1348 for (i=0; i<N; i++){ # loop 1A
1349 x = q[i]; # DR_MISALIGNMENT(q) = 3
1350 p[i] = y; # DR_MISALIGNMENT(p) = 0
1351 }
1352 }
1353 else {
1354 for (i=0; i<N; i++){ # loop 1B
1355 x = q[i]; # DR_MISALIGNMENT(q) = 3
1356 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1357 }
1358 }
1359
1360 -- Possibility 2: we do loop peeling:
1361 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1362 x = q[i];
1363 p[i] = y;
1364 }
1365 for (i = 3; i < N; i++){ # loop 2A
1366 x = q[i]; # DR_MISALIGNMENT(q) = 0
1367 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1368 }
1369
1370 -- Possibility 3: combination of loop peeling and versioning:
1371 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1372 x = q[i];
1373 p[i] = y;
1374 }
1375 if (p is aligned) {
1376 for (i = 3; i<N; i++){ # loop 3A
1377 x = q[i]; # DR_MISALIGNMENT(q) = 0
1378 p[i] = y; # DR_MISALIGNMENT(p) = 0
1379 }
1380 }
1381 else {
1382 for (i = 3; i<N; i++){ # loop 3B
1383 x = q[i]; # DR_MISALIGNMENT(q) = 0
1384 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1385 }
1386 }
1387
1388 These loops are later passed to loop_transform to be vectorized. The
1389 vectorizer will use the alignment information to guide the transformation
1390 (whether to generate regular loads/stores, or with special handling for
1391 misalignment). */
1392
1393 bool
1394 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1395 {
1396 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1397 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1398 enum dr_alignment_support supportable_dr_alignment;
1399 struct data_reference *dr0 = NULL, *first_store = NULL;
1400 struct data_reference *dr;
1401 unsigned int i, j;
1402 bool do_peeling = false;
1403 bool do_versioning = false;
1404 bool stat;
1405 gimple *stmt;
1406 stmt_vec_info stmt_info;
1407 unsigned int npeel = 0;
1408 bool all_misalignments_unknown = true;
1409 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1410 unsigned possible_npeel_number = 1;
1411 tree vectype;
1412 unsigned int nelements, mis, same_align_drs_max = 0;
1413 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1414 hash_table<peel_info_hasher> peeling_htab (1);
1415
1416 if (dump_enabled_p ())
1417 dump_printf_loc (MSG_NOTE, vect_location,
1418 "=== vect_enhance_data_refs_alignment ===\n");
1419
1420 /* Reset data so we can safely be called multiple times. */
1421 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1422 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1423
1424 /* While cost model enhancements are expected in the future, the high level
1425 view of the code at this time is as follows:
1426
1427 A) If there is a misaligned access then see if peeling to align
1428 this access can make all data references satisfy
1429 vect_supportable_dr_alignment. If so, update data structures
1430 as needed and return true.
1431
1432 B) If peeling wasn't possible and there is a data reference with an
1433 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1434 then see if loop versioning checks can be used to make all data
1435 references satisfy vect_supportable_dr_alignment. If so, update
1436 data structures as needed and return true.
1437
1438 C) If neither peeling nor versioning were successful then return false if
1439 any data reference does not satisfy vect_supportable_dr_alignment.
1440
1441 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1442
1443 Note, Possibility 3 above (which is peeling and versioning together) is not
1444 being done at this time. */
1445
1446 /* (1) Peeling to force alignment. */
1447
1448 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1449 Considerations:
1450 + How many accesses will become aligned due to the peeling
1451 - How many accesses will become unaligned due to the peeling,
1452 and the cost of misaligned accesses.
1453 - The cost of peeling (the extra runtime checks, the increase
1454 in code size). */
1455
1456 FOR_EACH_VEC_ELT (datarefs, i, dr)
1457 {
1458 stmt = DR_STMT (dr);
1459 stmt_info = vinfo_for_stmt (stmt);
1460
1461 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1462 continue;
1463
1464 /* For interleaving, only the alignment of the first access
1465 matters. */
1466 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1467 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1468 continue;
1469
1470 /* For invariant accesses there is nothing to enhance. */
1471 if (integer_zerop (DR_STEP (dr)))
1472 continue;
1473
1474 /* Strided accesses perform only component accesses, alignment is
1475 irrelevant for them. */
1476 if (STMT_VINFO_STRIDED_P (stmt_info)
1477 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1478 continue;
1479
1480 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1481 do_peeling = vector_alignment_reachable_p (dr);
1482 if (do_peeling)
1483 {
1484 if (known_alignment_for_access_p (dr))
1485 {
1486 unsigned int npeel_tmp;
1487 bool negative = tree_int_cst_compare (DR_STEP (dr),
1488 size_zero_node) < 0;
1489
1490 /* Save info about DR in the hash table. */
1491 vectype = STMT_VINFO_VECTYPE (stmt_info);
1492 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1493 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1494 TREE_TYPE (DR_REF (dr))));
1495 npeel_tmp = (negative
1496 ? (mis - nelements) : (nelements - mis))
1497 & (nelements - 1);
1498
1499 /* For multiple types, it is possible that the bigger type access
1500 will have more than one peeling option.  E.g., a loop with two
1501 types: one of size (vector size / 4), and the other one of
1502 size (vector size / 8).  The vectorization factor will be 8.  If both
1503 accesses are misaligned by 3, the first one needs one scalar
1504 iteration to be aligned, and the second one needs 5.  But the
1505 first one will also be aligned by peeling 5 scalar
1506 iterations, and in that case both accesses will be aligned.
1507 Hence, except for the immediate peeling amount, we also want
1508 to try to add a full vector size, while we don't exceed the
1509 vectorization factor.
1510 We do this automatically for the cost model, since we calculate the cost
1511 for every peeling option. */
1512 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1513 {
1514 if (STMT_SLP_TYPE (stmt_info))
1515 possible_npeel_number
1516 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1517 else
1518 possible_npeel_number = vf / nelements;
1519 }
1520
1521 /* Handle the aligned case. We may decide to align some other
1522 access, making DR unaligned. */
1523 if (DR_MISALIGNMENT (dr) == 0)
1524 {
1525 npeel_tmp = 0;
1526 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1527 possible_npeel_number++;
1528 }
1529
1530 for (j = 0; j < possible_npeel_number; j++)
1531 {
1532 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1533 dr, npeel_tmp);
1534 npeel_tmp += nelements;
1535 }
1536
1537 all_misalignments_unknown = false;
1538 /* Data-ref that was chosen for the case that all the
1539 misalignments are unknown is not relevant anymore, since we
1540 have a data-ref with known alignment. */
1541 dr0 = NULL;
1542 }
1543 else
1544 {
1545 /* If we don't know any misalignment values, we prefer
1546 peeling for the data-ref that has the maximum number of data-refs
1547 with the same alignment, unless the target prefers to align
1548 stores over loads. */
1549 if (all_misalignments_unknown)
1550 {
1551 unsigned same_align_drs
1552 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1553 if (!dr0
1554 || same_align_drs_max < same_align_drs)
1555 {
1556 same_align_drs_max = same_align_drs;
1557 dr0 = dr;
1558 }
1559 /* For data-refs with the same number of related
1560 accesses prefer the one where the misalign
1561 computation will be invariant in the outermost loop. */
1562 else if (same_align_drs_max == same_align_drs)
1563 {
1564 struct loop *ivloop0, *ivloop;
1565 ivloop0 = outermost_invariant_loop_for_expr
1566 (loop, DR_BASE_ADDRESS (dr0));
1567 ivloop = outermost_invariant_loop_for_expr
1568 (loop, DR_BASE_ADDRESS (dr));
1569 if ((ivloop && !ivloop0)
1570 || (ivloop && ivloop0
1571 && flow_loop_nested_p (ivloop, ivloop0)))
1572 dr0 = dr;
1573 }
1574
1575 if (!first_store && DR_IS_WRITE (dr))
1576 first_store = dr;
1577 }
1578
1579 /* If there are both known and unknown misaligned accesses in the
1580 loop, we choose peeling amount according to the known
1581 accesses. */
1582 if (!supportable_dr_alignment)
1583 {
1584 dr0 = dr;
1585 if (!first_store && DR_IS_WRITE (dr))
1586 first_store = dr;
1587 }
1588 }
1589 }
1590 else
1591 {
1592 if (!aligned_access_p (dr))
1593 {
1594 if (dump_enabled_p ())
1595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1596 "vector alignment may not be reachable\n");
1597 break;
1598 }
1599 }
1600 }
1601
1602 /* Check if we can possibly peel the loop. */
1603 if (!vect_can_advance_ivs_p (loop_vinfo)
1604 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1605 || loop->inner)
1606 do_peeling = false;
1607
1608 if (do_peeling
1609 && all_misalignments_unknown
1610 && vect_supportable_dr_alignment (dr0, false))
1611 {
1612 /* Check if the target prefers to align stores over loads, i.e., if
1613 misaligned stores are more expensive than misaligned loads (taking
1614 drs with same alignment into account). */
1615 if (first_store && DR_IS_READ (dr0))
1616 {
1617 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1618 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1619 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1620 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1621 stmt_vector_for_cost dummy;
1622 dummy.create (2);
1623
1624 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1625 &dummy);
1626 vect_get_data_access_cost (first_store, &store_inside_cost,
1627 &store_outside_cost, &dummy);
1628
1629 dummy.release ();
1630
1631 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1632 aligning the load DR0). */
1633 load_inside_penalty = store_inside_cost;
1634 load_outside_penalty = store_outside_cost;
1635 for (i = 0;
1636 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1637 DR_STMT (first_store))).iterate (i, &dr);
1638 i++)
1639 if (DR_IS_READ (dr))
1640 {
1641 load_inside_penalty += load_inside_cost;
1642 load_outside_penalty += load_outside_cost;
1643 }
1644 else
1645 {
1646 load_inside_penalty += store_inside_cost;
1647 load_outside_penalty += store_outside_cost;
1648 }
1649
1650 /* Calculate the penalty for leaving DR0 unaligned (by
1651 aligning the FIRST_STORE). */
1652 store_inside_penalty = load_inside_cost;
1653 store_outside_penalty = load_outside_cost;
1654 for (i = 0;
1655 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1656 DR_STMT (dr0))).iterate (i, &dr);
1657 i++)
1658 if (DR_IS_READ (dr))
1659 {
1660 store_inside_penalty += load_inside_cost;
1661 store_outside_penalty += load_outside_cost;
1662 }
1663 else
1664 {
1665 store_inside_penalty += store_inside_cost;
1666 store_outside_penalty += store_outside_cost;
1667 }
1668
1669 if (load_inside_penalty > store_inside_penalty
1670 || (load_inside_penalty == store_inside_penalty
1671 && load_outside_penalty > store_outside_penalty))
1672 dr0 = first_store;
1673 }
1674
1675 /* In case there are only loads with different unknown misalignments, use
1676 peeling only if it may help to align other accesses in the loop or
1677 if it may help improve load bandwidth when we'd end up using
1678 unaligned loads. */
1679 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1680 if (!first_store
1681 && !STMT_VINFO_SAME_ALIGN_REFS (
1682 vinfo_for_stmt (DR_STMT (dr0))).length ()
1683 && (vect_supportable_dr_alignment (dr0, false)
1684 != dr_unaligned_supported
1685 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1686 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1687 do_peeling = false;
1688 }
1689
1690 if (do_peeling && !dr0)
1691 {
1692 /* Peeling is possible, but no data access requires alignment in order
1693 to be supported. So we try to choose the best possible peeling. */
1694
1695 /* We should get here only if there are drs with known misalignment. */
1696 gcc_assert (!all_misalignments_unknown);
1697
1698 /* Choose the best peeling from the hash table. */
1699 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1700 loop_vinfo, &npeel,
1701 &body_cost_vec);
1702 if (!dr0 || !npeel)
1703 do_peeling = false;
1704 }
1705
1706 if (do_peeling)
1707 {
1708 stmt = DR_STMT (dr0);
1709 stmt_info = vinfo_for_stmt (stmt);
1710 vectype = STMT_VINFO_VECTYPE (stmt_info);
1711 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1712
1713 if (known_alignment_for_access_p (dr0))
1714 {
1715 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1716 size_zero_node) < 0;
1717 if (!npeel)
1718 {
1719 /* Since it's known at compile time, compute the number of
1720 iterations in the peeled loop (the peeling factor) for use in
1721 updating DR_MISALIGNMENT values. The peeling factor is the
1722 vectorization factor minus the misalignment as an element
1723 count. */
1724 mis = DR_MISALIGNMENT (dr0);
1725 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1726 npeel = ((negative ? mis - nelements : nelements - mis)
1727 & (nelements - 1));
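 /* For example (illustrative values): for V4SI with 4-byte elements,
 nelements = 4; a DR_MISALIGNMENT of 8 bytes gives mis = 2 elements
 and, for a positive step, npeel = (4 - 2) & 3 = 2 peeled iterations.  */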
1728 }
1729
1730 /* For interleaved data access every iteration accesses all the
1731 members of the group, therefore we divide the number of iterations
1732 by the group size. */
1733 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1734 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1735 npeel /= GROUP_SIZE (stmt_info);
1736
1737 if (dump_enabled_p ())
1738 dump_printf_loc (MSG_NOTE, vect_location,
1739 "Try peeling by %d\n", npeel);
1740 }
1741
1742 /* Ensure that all data refs can be vectorized after the peel. */
1743 FOR_EACH_VEC_ELT (datarefs, i, dr)
1744 {
1745 int save_misalignment;
1746
1747 if (dr == dr0)
1748 continue;
1749
1750 stmt = DR_STMT (dr);
1751 stmt_info = vinfo_for_stmt (stmt);
1752 /* For interleaving, only the alignment of the first access
1753 matters. */
1754 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1755 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1756 continue;
1757
1758 /* Strided accesses perform only component accesses, alignment is
1759 irrelevant for them. */
1760 if (STMT_VINFO_STRIDED_P (stmt_info)
1761 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1762 continue;
1763
1764 save_misalignment = DR_MISALIGNMENT (dr);
1765 vect_update_misalignment_for_peel (dr, dr0, npeel);
1766 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1767 SET_DR_MISALIGNMENT (dr, save_misalignment);
1768
1769 if (!supportable_dr_alignment)
1770 {
1771 do_peeling = false;
1772 break;
1773 }
1774 }
1775
1776 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1777 {
1778 stat = vect_verify_datarefs_alignment (loop_vinfo);
1779 if (!stat)
1780 do_peeling = false;
1781 else
1782 {
1783 body_cost_vec.release ();
1784 return stat;
1785 }
1786 }
1787
1788 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1789 if (do_peeling)
1790 {
1791 unsigned max_allowed_peel
1792 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1793 if (max_allowed_peel != (unsigned)-1)
1794 {
1795 unsigned max_peel = npeel;
1796 if (max_peel == 0)
1797 {
1798 gimple *dr_stmt = DR_STMT (dr0);
1799 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1800 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1801 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1802 }
1803 if (max_peel > max_allowed_peel)
1804 {
1805 do_peeling = false;
1806 if (dump_enabled_p ())
1807 dump_printf_loc (MSG_NOTE, vect_location,
1808 "Disable peeling, max peels reached: %d\n", max_peel);
1809 }
1810 }
1811 }
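 /* For example (illustrative): when NPEEL is zero (peel amount unknown at
 compile time), a V4SI access gives max_peel = 4 - 1 = 3, so peeling is
 disabled whenever --param vect-max-peeling-for-alignment is set below 3.  */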
1812
1813 /* Cost model #2 - if peeling may result in a remaining loop not
1814 iterating enough to be vectorized then do not peel. */
1815 if (do_peeling
1816 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1817 {
1818 unsigned max_peel
1819 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1820 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1821 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1822 do_peeling = false;
1823 }
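 /* For example (illustrative): with a vectorization factor of 4 and
 npeel = 2, at least 4 + 2 = 6 known iterations are required; with
 fewer, the loop remaining after the peel could not be vectorized.  */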
1824
1825 if (do_peeling)
1826 {
1827 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1828 If the misalignment of DR_i is identical to that of dr0 then set
1829 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1830 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1831 by the peeling factor times the element size of DR_i (MOD the
1832 vectorization factor times the size). Otherwise, the
1833 misalignment of DR_i must be set to unknown. */
1834 FOR_EACH_VEC_ELT (datarefs, i, dr)
1835 if (dr != dr0)
1836 vect_update_misalignment_for_peel (dr, dr0, npeel);
1837
1838 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1839 if (npeel)
1840 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1841 else
1842 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1843 = DR_MISALIGNMENT (dr0);
1844 SET_DR_MISALIGNMENT (dr0, 0);
1845 if (dump_enabled_p ())
1846 {
1847 dump_printf_loc (MSG_NOTE, vect_location,
1848 "Alignment of access forced using peeling.\n");
1849 dump_printf_loc (MSG_NOTE, vect_location,
1850 "Peeling for alignment will be applied.\n");
1851 }
1852 /* The inside-loop cost will be accounted for in vectorizable_load
1853 and vectorizable_store correctly with adjusted alignments.
1854 Drop the body_cost_vec on the floor here. */
1855 body_cost_vec.release ();
1856
1857 stat = vect_verify_datarefs_alignment (loop_vinfo);
1858 gcc_assert (stat);
1859 return stat;
1860 }
1861 }
1862
1863 body_cost_vec.release ();
1864
1865 /* (2) Versioning to force alignment. */
1866
1867 /* Try versioning if:
1868 1) the loop is optimized for speed, and
1869 2) there is at least one unsupported misaligned data ref with an unknown
1870 misalignment, and
1871 3) all misaligned data refs with a known misalignment are supported, and
1872 4) the number of runtime alignment checks is within reason. */
1873
1874 do_versioning =
1875 optimize_loop_nest_for_speed_p (loop)
1876 && (!loop->inner); /* FORNOW */
1877
1878 if (do_versioning)
1879 {
1880 FOR_EACH_VEC_ELT (datarefs, i, dr)
1881 {
1882 stmt = DR_STMT (dr);
1883 stmt_info = vinfo_for_stmt (stmt);
1884
1885 /* For interleaving, only the alignment of the first access
1886 matters. */
1887 if (aligned_access_p (dr)
1888 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1889 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1890 continue;
1891
1892 if (STMT_VINFO_STRIDED_P (stmt_info))
1893 {
1894 /* Strided loads perform only component accesses, alignment is
1895 irrelevant for them. */
1896 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1897 continue;
1898 do_versioning = false;
1899 break;
1900 }
1901
1902 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1903
1904 if (!supportable_dr_alignment)
1905 {
1906 gimple *stmt;
1907 int mask;
1908 tree vectype;
1909
1910 if (known_alignment_for_access_p (dr)
1911 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1912 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1913 {
1914 do_versioning = false;
1915 break;
1916 }
1917
1918 stmt = DR_STMT (dr);
1919 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1920 gcc_assert (vectype);
1921
1922 /* The rightmost bits of an aligned address must be zeros.
1923 Construct the mask needed for this test. For example,
1924 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1925 mask must be 15 = 0xf. */
1926 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1927
1928 /* FORNOW: use the same mask to test all potentially unaligned
1929 references in the loop. The vectorizer currently supports
1930 a single vector size, see the reference to
1931 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1932 vectorization factor is computed. */
1933 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1934 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1935 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1936 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1937 DR_STMT (dr));
1938 }
1939 }
1940
1941 /* Versioning requires at least one misaligned data reference. */
1942 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1943 do_versioning = false;
1944 else if (!do_versioning)
1945 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1946 }
1947
1948 if (do_versioning)
1949 {
1950 vec<gimple *> may_misalign_stmts
1951 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1952 gimple *stmt;
1953
1954 /* It can now be assumed that the data references in the statements
1955 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1956 of the loop being vectorized. */
1957 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1958 {
1959 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1960 dr = STMT_VINFO_DATA_REF (stmt_info);
1961 SET_DR_MISALIGNMENT (dr, 0);
1962 if (dump_enabled_p ())
1963 dump_printf_loc (MSG_NOTE, vect_location,
1964 "Alignment of access forced using versioning.\n");
1965 }
1966
1967 if (dump_enabled_p ())
1968 dump_printf_loc (MSG_NOTE, vect_location,
1969 "Versioning for alignment will be applied.\n");
1970
1971 /* Peeling and versioning can't be done together at this time. */
1972 gcc_assert (! (do_peeling && do_versioning));
1973
1974 stat = vect_verify_datarefs_alignment (loop_vinfo);
1975 gcc_assert (stat);
1976 return stat;
1977 }
1978
1979 /* This point is reached if neither peeling nor versioning is being done. */
1980 gcc_assert (! (do_peeling || do_versioning));
1981
1982 stat = vect_verify_datarefs_alignment (loop_vinfo);
1983 return stat;
1984 }
1985
1986
1987 /* Function vect_find_same_alignment_drs.
1988
1989 Update group and alignment relations according to the chosen
1990 vectorization factor. */
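/* For example (illustrative): with a vectorization factor of 4, two int
   references a[i] and a[i+4] have dependence distance 4; 4 % 4 == 0 and
   the access sizes match, so the two references are recorded as having
   the same alignment (pairs with distance 0 qualify trivially).  */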
1991
1992 static void
1993 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1994 loop_vec_info loop_vinfo)
1995 {
1996 unsigned int i;
1997 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1998 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1999 struct data_reference *dra = DDR_A (ddr);
2000 struct data_reference *drb = DDR_B (ddr);
2001 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2002 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2003 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2004 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2005 lambda_vector dist_v;
2006 unsigned int loop_depth;
2007
2008 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2009 return;
2010
2011 if (dra == drb)
2012 return;
2013
2014 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2015 return;
2016
2017 /* Loop-based vectorization and known data dependence. */
2018 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2019 return;
2020
2021 /* Data-dependence analysis reports a distance vector of zero
2022 for data-references that overlap only in the first iteration
2023 but have steps of different sign (see PR45764).
2024 So as a sanity check require equal DR_STEP. */
2025 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2026 return;
2027
2028 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2029 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2030 {
2031 int dist = dist_v[loop_depth];
2032
2033 if (dump_enabled_p ())
2034 dump_printf_loc (MSG_NOTE, vect_location,
2035 "dependence distance = %d.\n", dist);
2036
2037 /* Same loop iteration. */
2038 if (dist == 0
2039 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2040 {
2041 /* Two references with distance zero have the same alignment. */
2042 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2043 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2044 if (dump_enabled_p ())
2045 {
2046 dump_printf_loc (MSG_NOTE, vect_location,
2047 "accesses have the same alignment.\n");
2048 dump_printf (MSG_NOTE,
2049 "dependence distance modulo vf == 0 between ");
2050 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2051 dump_printf (MSG_NOTE, " and ");
2052 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2053 dump_printf (MSG_NOTE, "\n");
2054 }
2055 }
2056 }
2057 }
2058
2059
2060 /* Function vect_analyze_data_refs_alignment
2061
2062 Analyze the alignment of the data-references in the loop.
2063 Return FALSE if a data reference is found that cannot be vectorized. */
2064
2065 bool
2066 vect_analyze_data_refs_alignment (vec_info *vinfo)
2067 {
2068 if (dump_enabled_p ())
2069 dump_printf_loc (MSG_NOTE, vect_location,
2070 "=== vect_analyze_data_refs_alignment ===\n");
2071
2072 /* Mark groups of data references with same alignment using
2073 data dependence information. */
2074 if (is_a <loop_vec_info> (vinfo))
2075 {
2076 vec<ddr_p> ddrs = vinfo->ddrs;
2077 struct data_dependence_relation *ddr;
2078 unsigned int i;
2079
2080 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2081 vect_find_same_alignment_drs (ddr, as_a <loop_vec_info> (vinfo));
2082 }
2083
2084 if (!vect_compute_data_refs_alignment (vinfo))
2085 {
2086 if (dump_enabled_p ())
2087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2088 "not vectorized: can't calculate alignment "
2089 "for data ref.\n");
2090 return false;
2091 }
2092
2093 return true;
2094 }
2095
2096
2097 /* Analyze groups of accesses: check that DR belongs to a group of
2098 accesses of legal size, step, etc. Detect gaps, single element
2099 interleaving, and other special cases. Set grouped access info.
2100 Collect groups of strided stores for further use in SLP analysis.
2101 Worker for vect_analyze_group_access. */
2102
2103 static bool
2104 vect_analyze_group_access_1 (struct data_reference *dr)
2105 {
2106 tree step = DR_STEP (dr);
2107 tree scalar_type = TREE_TYPE (DR_REF (dr));
2108 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2109 gimple *stmt = DR_STMT (dr);
2110 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2111 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2112 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2113 HOST_WIDE_INT dr_step = -1;
2114 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2115 bool slp_impossible = false;
2116 struct loop *loop = NULL;
2117
2118 if (loop_vinfo)
2119 loop = LOOP_VINFO_LOOP (loop_vinfo);
2120
2121 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2122 size of the interleaving group (including gaps). */
2123 if (tree_fits_shwi_p (step))
2124 {
2125 dr_step = tree_to_shwi (step);
2126 groupsize = absu_hwi (dr_step) / type_size;
2127 }
2128 else
2129 groupsize = 0;
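 /* For example (illustrative): 4-byte elements with DR_STEP 32 give
 groupsize = 32 / 4 = 8 elements (including any gaps); a non-constant
 step leaves groupsize at 0.  */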
2130
2131 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2132 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2133 {
2134 /* Check if this DR is a part of interleaving, and is a single
2135 element of the group that is accessed in the loop. */
2136
2137 /* Gaps are supported only for loads. STEP must be a multiple of the type
2138 size. The size of the group must be a power of 2. */
2139 if (DR_IS_READ (dr)
2140 && (dr_step % type_size) == 0
2141 && groupsize > 0
2142 && exact_log2 (groupsize) != -1)
2143 {
2144 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2145 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2146 if (dump_enabled_p ())
2147 {
2148 dump_printf_loc (MSG_NOTE, vect_location,
2149 "Detected single element interleaving ");
2150 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2151 dump_printf (MSG_NOTE, " step ");
2152 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2153 dump_printf (MSG_NOTE, "\n");
2154 }
2155
2156 if (loop_vinfo)
2157 {
2158 if (dump_enabled_p ())
2159 dump_printf_loc (MSG_NOTE, vect_location,
2160 "Data access with gaps requires scalar "
2161 "epilogue loop\n");
2162 if (loop->inner)
2163 {
2164 if (dump_enabled_p ())
2165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2166 "Peeling for outer loop is not"
2167 " supported\n");
2168 return false;
2169 }
2170
2171 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2172 }
2173
2174 return true;
2175 }
2176
2177 if (dump_enabled_p ())
2178 {
2179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2180 "not consecutive access ");
2181 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2182 }
2183
2184 if (bb_vinfo)
2185 {
2186 /* Mark the statement as unvectorizable. */
2187 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2188 return true;
2189 }
2190
2191 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2192 STMT_VINFO_STRIDED_P (stmt_info) = true;
2193 return true;
2194 }
2195
2196 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2197 {
2198 /* First stmt in the interleaving chain. Check the chain. */
2199 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2200 struct data_reference *data_ref = dr;
2201 unsigned int count = 1;
2202 tree prev_init = DR_INIT (data_ref);
2203 gimple *prev = stmt;
2204 HOST_WIDE_INT diff, gaps = 0;
2205
2206 while (next)
2207 {
2208 /* Skip same data-refs. In case two or more stmts share a
2209 data-ref (supported only for loads), we vectorize only the first
2210 stmt, and the rest get their vectorized loads from the first
2211 one. */
2212 if (!tree_int_cst_compare (DR_INIT (data_ref),
2213 DR_INIT (STMT_VINFO_DATA_REF (
2214 vinfo_for_stmt (next)))))
2215 {
2216 if (DR_IS_WRITE (data_ref))
2217 {
2218 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2220 "Two store stmts share the same dr.\n");
2221 return false;
2222 }
2223
2224 if (dump_enabled_p ())
2225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2226 "Two or more load stmts share the same dr.\n");
2227
2228 /* For load use the same data-ref load. */
2229 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2230
2231 prev = next;
2232 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2233 continue;
2234 }
2235
2236 prev = next;
2237 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2238
2239 /* All group members have the same STEP by construction. */
2240 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2241
2242 /* Check that the distance between two accesses is equal to the type
2243 size. Otherwise, we have gaps. */
2244 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2245 - TREE_INT_CST_LOW (prev_init)) / type_size;
2246 if (diff != 1)
2247 {
2248 /* FORNOW: SLP of accesses with gaps is not supported. */
2249 slp_impossible = true;
2250 if (DR_IS_WRITE (data_ref))
2251 {
2252 if (dump_enabled_p ())
2253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2254 "interleaved store with gaps\n");
2255 return false;
2256 }
2257
2258 gaps += diff - 1;
2259 }
2260
2261 last_accessed_element += diff;
2262
2263 /* Store the gap from the previous member of the group. If there is no
2264 gap in the access, GROUP_GAP is always 1. */
2265 GROUP_GAP (vinfo_for_stmt (next)) = diff;
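 /* For example (illustrative): 4-byte loads at byte offsets 0 and 8
 give diff = 2, i.e. a one-element gap, so the second member gets
 GROUP_GAP 2 and SLP of the group is marked impossible.  */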
2266
2267 prev_init = DR_INIT (data_ref);
2268 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2269 /* Count the number of data-refs in the chain. */
2270 count++;
2271 }
2272
2273 if (groupsize == 0)
2274 groupsize = count + gaps;
2275
2276 if (groupsize > UINT_MAX)
2277 {
2278 if (dump_enabled_p ())
2279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2280 "group is too large\n");
2281 return false;
2282 }
2283
2284 /* Check that the size of the interleaving is equal to count for stores,
2285 i.e., that there are no gaps. */
2286 if (groupsize != count
2287 && !DR_IS_READ (dr))
2288 {
2289 if (dump_enabled_p ())
2290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2291 "interleaved store with gaps\n");
2292 return false;
2293 }
2294
2295 /* If there is a gap after the last load in the group it is the
2296 difference between the groupsize and the last accessed
2297 element.
2298 When there is no gap, this difference should be 0. */
2299 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
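 /* For example (illustrative): a group of size 4 whose last accessed
 element is 3 gets GROUP_GAP 1 on the first statement, i.e. one unused
 element at the end of each group instance.  */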
2300
2301 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2302 if (dump_enabled_p ())
2303 {
2304 dump_printf_loc (MSG_NOTE, vect_location,
2305 "Detected interleaving ");
2306 if (DR_IS_READ (dr))
2307 dump_printf (MSG_NOTE, "load ");
2308 else
2309 dump_printf (MSG_NOTE, "store ");
2310 dump_printf (MSG_NOTE, "of size %u starting with ",
2311 (unsigned)groupsize);
2312 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2313 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2314 dump_printf_loc (MSG_NOTE, vect_location,
2315 "There is a gap of %u elements after the group\n",
2316 GROUP_GAP (vinfo_for_stmt (stmt)));
2317 }
2318
2319 /* SLP: create an SLP data structure for every interleaving group of
2320 stores for further analysis in vect_analyze_slp. */
2321 if (DR_IS_WRITE (dr) && !slp_impossible)
2322 {
2323 if (loop_vinfo)
2324 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2325 if (bb_vinfo)
2326 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2327 }
2328
2329 /* If there is a gap at the end of the group, or the group size cannot
2330 be made a multiple of the vector element count, then we access excess
2331 elements in the last iteration and thus need to peel that off. */
2332 if (loop_vinfo
2333 && (groupsize - last_accessed_element > 0
2334 || exact_log2 (groupsize) == -1))
2335
2336 {
2337 if (dump_enabled_p ())
2338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2339 "Data access with gaps requires scalar "
2340 "epilogue loop\n");
2341 if (loop->inner)
2342 {
2343 if (dump_enabled_p ())
2344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2345 "Peeling for outer loop is not supported\n");
2346 return false;
2347 }
2348
2349 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2350 }
2351 }
2352
2353 return true;
2354 }
2355
2356 /* Analyze groups of accesses: check that DR belongs to a group of
2357 accesses of legal size, step, etc. Detect gaps, single element
2358 interleaving, and other special cases. Set grouped access info.
2359 Collect groups of strided stores for further use in SLP analysis. */
2360
2361 static bool
2362 vect_analyze_group_access (struct data_reference *dr)
2363 {
2364 if (!vect_analyze_group_access_1 (dr))
2365 {
2366 /* Dissolve the group if present. */
2367 gimple *next;
2368 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2369 while (stmt)
2370 {
2371 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2372 next = GROUP_NEXT_ELEMENT (vinfo);
2373 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2374 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2375 stmt = next;
2376 }
2377 return false;
2378 }
2379 return true;
2380 }
2381
2382 /* Analyze the access pattern of the data-reference DR.
2383 In case of non-consecutive accesses call vect_analyze_group_access() to
2384 analyze groups of accesses. */
2385
2386 static bool
2387 vect_analyze_data_ref_access (struct data_reference *dr)
2388 {
2389 tree step = DR_STEP (dr);
2390 tree scalar_type = TREE_TYPE (DR_REF (dr));
2391 gimple *stmt = DR_STMT (dr);
2392 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2393 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2394 struct loop *loop = NULL;
2395
2396 if (loop_vinfo)
2397 loop = LOOP_VINFO_LOOP (loop_vinfo);
2398
2399 if (loop_vinfo && !step)
2400 {
2401 if (dump_enabled_p ())
2402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2403 "bad data-ref access in loop\n");
2404 return false;
2405 }
2406
2407 /* Allow loads with zero step in inner-loop vectorization. */
2408 if (loop_vinfo && integer_zerop (step))
2409 {
2410 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2411 if (!nested_in_vect_loop_p (loop, stmt))
2412 return DR_IS_READ (dr);
2413 /* Allow references with zero step only for outer loops marked
2414 with pragma omp simd - it guarantees the absence of
2415 loop-carried dependencies between inner loop iterations. */
2416 if (!loop->force_vectorize)
2417 {
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_NOTE, vect_location,
2420 "zero step in inner loop of nest\n");
2421 return false;
2422 }
2423 }
2424
2425 if (loop && nested_in_vect_loop_p (loop, stmt))
2426 {
2427 /* Interleaved accesses are not yet supported within outer-loop
2428 vectorization for references in the inner-loop. */
2429 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2430
2431 /* For the rest of the analysis we use the outer-loop step. */
2432 step = STMT_VINFO_DR_STEP (stmt_info);
2433 if (integer_zerop (step))
2434 {
2435 if (dump_enabled_p ())
2436 dump_printf_loc (MSG_NOTE, vect_location,
2437 "zero step in outer loop.\n");
2438 return DR_IS_READ (dr);
2439 }
2440 }
2441
2442 /* Consecutive? */
2443 if (TREE_CODE (step) == INTEGER_CST)
2444 {
2445 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2446 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2447 || (dr_step < 0
2448 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2449 {
2450 /* Mark that it is not interleaving. */
2451 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2452 return true;
2453 }
2454 }
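 /* For example (illustrative): an int access with DR_STEP 4, equal to the
 4-byte type size (or -4 for a backward walk), is consecutive and
 therefore not part of an interleaving group.  */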
2455
2456 if (loop && nested_in_vect_loop_p (loop, stmt))
2457 {
2458 if (dump_enabled_p ())
2459 dump_printf_loc (MSG_NOTE, vect_location,
2460 "grouped access in outer loop.\n");
2461 return false;
2462 }
2463
2464
2465 /* Assume this is a DR handled by the non-constant strided load case. */
2466 if (TREE_CODE (step) != INTEGER_CST)
2467 return (STMT_VINFO_STRIDED_P (stmt_info)
2468 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2469 || vect_analyze_group_access (dr)));
2470
2471 /* Not consecutive access - check if it's a part of interleaving group. */
2472 return vect_analyze_group_access (dr);
2473 }
2474
2475
2476
2477 /* A helper function used in the comparator function to sort data
2478 references. T1 and T2 are two trees to be compared.
2479 The function returns -1, 0, or 1. */
2480
2481 static int
2482 compare_tree (tree t1, tree t2)
2483 {
2484 int i, cmp;
2485 enum tree_code code;
2486 char tclass;
2487
2488 if (t1 == t2)
2489 return 0;
2490 if (t1 == NULL)
2491 return -1;
2492 if (t2 == NULL)
2493 return 1;
2494
2495
2496 if (TREE_CODE (t1) != TREE_CODE (t2))
2497 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2498
2499 code = TREE_CODE (t1);
2500 switch (code)
2501 {
2502 /* For const values, we can just use hash values for comparisons. */
2503 case INTEGER_CST:
2504 case REAL_CST:
2505 case FIXED_CST:
2506 case STRING_CST:
2507 case COMPLEX_CST:
2508 case VECTOR_CST:
2509 {
2510 hashval_t h1 = iterative_hash_expr (t1, 0);
2511 hashval_t h2 = iterative_hash_expr (t2, 0);
2512 if (h1 != h2)
2513 return h1 < h2 ? -1 : 1;
2514 break;
2515 }
2516
2517 case SSA_NAME:
2518 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2519 if (cmp != 0)
2520 return cmp;
2521
2522 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2523 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2524 break;
2525
2526 default:
2527 tclass = TREE_CODE_CLASS (code);
2528
2529 /* For var-decl, we could compare their UIDs. */
2530 if (tclass == tcc_declaration)
2531 {
2532 if (DECL_UID (t1) != DECL_UID (t2))
2533 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2534 break;
2535 }
2536
2537 /* For expressions with operands, compare their operands recursively. */
2538 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2539 {
2540 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2541 if (cmp != 0)
2542 return cmp;
2543 }
2544 }
2545
2546 return 0;
2547 }
2548
2549
2550 /* Compare two data-references DRA and DRB in order to sort them into
2551 chunks suitable for grouping. */
2552
2553 static int
2554 dr_group_sort_cmp (const void *dra_, const void *drb_)
2555 {
2556 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2557 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2558 int cmp;
2559
2560 /* Stabilize sort. */
2561 if (dra == drb)
2562 return 0;
2563
2564 /* Ordering of DRs according to base. */
2565 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2566 {
2567 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2568 if (cmp != 0)
2569 return cmp;
2570 }
2571
2572 /* And according to DR_OFFSET. */
2573 if (!dr_equal_offsets_p (dra, drb))
2574 {
2575 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2576 if (cmp != 0)
2577 return cmp;
2578 }
2579
2580 /* Put reads before writes. */
2581 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2582 return DR_IS_READ (dra) ? -1 : 1;
2583
2584 /* Then sort by access size. */
2585 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2586 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2587 {
2588 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2589 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2590 if (cmp != 0)
2591 return cmp;
2592 }
2593
2594 /* And by step. */
2595 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2596 {
2597 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2598 if (cmp != 0)
2599 return cmp;
2600 }
2601
2602 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2603 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2604 if (cmp == 0)
2605 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2606 return cmp;
2607 }
2608
2609 /* Function vect_analyze_data_ref_accesses.
2610
2611 Analyze the access pattern of all the data references in the loop.
2612
2613 FORNOW: the only access pattern that is considered vectorizable is a
2614 simple step 1 (consecutive) access.
2615
2616 FORNOW: handle only arrays and pointer accesses. */
2617
2618 bool
2619 vect_analyze_data_ref_accesses (vec_info *vinfo)
2620 {
2621 unsigned int i;
2622 vec<data_reference_p> datarefs = vinfo->datarefs;
2623 struct data_reference *dr;
2624
2625 if (dump_enabled_p ())
2626 dump_printf_loc (MSG_NOTE, vect_location,
2627 "=== vect_analyze_data_ref_accesses ===\n");
2628
2629 if (datarefs.is_empty ())
2630 return true;
2631
2632 /* Sort the array of datarefs to make building the interleaving chains
2633 linear. Don't modify the original vector's order, it is needed for
2634 determining what dependencies are reversed. */
2635 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2636 datarefs_copy.qsort (dr_group_sort_cmp);
2637
2638 /* Build the interleaving chains. */
2639 for (i = 0; i < datarefs_copy.length () - 1;)
2640 {
2641 data_reference_p dra = datarefs_copy[i];
2642 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2643 stmt_vec_info lastinfo = NULL;
2644 for (i = i + 1; i < datarefs_copy.length (); ++i)
2645 {
2646 data_reference_p drb = datarefs_copy[i];
2647 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2648
2649 /* ??? Imperfect sorting (non-compatible types, non-modulo
2650 accesses, same accesses) can lead to a group being artificially
2651 split here as we don't just skip over those. If it really
2652 matters we can push those to a worklist and re-iterate
2653 over them. Then we can just skip ahead to the next DR here. */
2654
2655 /* Check that the data-refs have the same first location (except init)
2656 and that they are both either stores or loads (not a load and a store,
2657 and not masked loads or stores). */
2658 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2659 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2660 DR_BASE_ADDRESS (drb), 0)
2661 || !dr_equal_offsets_p (dra, drb)
2662 || !gimple_assign_single_p (DR_STMT (dra))
2663 || !gimple_assign_single_p (DR_STMT (drb)))
2664 break;
2665
2666 /* Check that the data-refs have the same constant size. */
2667 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2668 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2669 if (!tree_fits_uhwi_p (sza)
2670 || !tree_fits_uhwi_p (szb)
2671 || !tree_int_cst_equal (sza, szb))
2672 break;
2673
2674 /* Check that the data-refs have the same step. */
2675 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2676 break;
2677
2678 /* Do not place the same access in the interleaving chain twice. */
2679 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2680 break;
2681
2682 /* Check the types are compatible.
2683 ??? We don't distinguish this during sorting. */
2684 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2685 TREE_TYPE (DR_REF (drb))))
2686 break;
2687
2688 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2689 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2690 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2691 gcc_assert (init_a < init_b);
2692
2693 /* If init_b == init_a + the size of the type * k, we have an
2694 interleaving, and DRA is accessed before DRB. */
2695 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2696 if ((init_b - init_a) % type_size_a != 0)
2697 break;
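 /* For example (illustrative): two 4-byte loads with DR_INITs 0 and 8
 satisfy (8 - 0) % 4 == 0, so they may still belong to the same group,
 with a one-element gap between them.  */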
2698
2699 /* If we have a store, require the accesses to be adjacent. This splits
2700 groups into chunks we support (we don't support vectorization
2701 of stores with gaps). */
2702 if (!DR_IS_READ (dra)
2703 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2704 (DR_INIT (datarefs_copy[i-1]))
2705 != type_size_a))
2706 break;
2707
2708 /* If the step (when constant and nonzero) does not exceed the
2709 difference between the data-refs' inits, DRB would fall into a later
2710 instance of the group; splitting here keeps groups at suitable sizes. */
2711 if (tree_fits_shwi_p (DR_STEP (dra)))
2712 {
2713 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2714 if (step != 0 && step <= (init_b - init_a))
2715 break;
2716 }
2717
2718 if (dump_enabled_p ())
2719 {
2720 dump_printf_loc (MSG_NOTE, vect_location,
2721 "Detected interleaving ");
2722 if (DR_IS_READ (dra))
2723 dump_printf (MSG_NOTE, "load ");
2724 else
2725 dump_printf (MSG_NOTE, "store ");
2726 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2727 dump_printf (MSG_NOTE, " and ");
2728 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2729 dump_printf (MSG_NOTE, "\n");
2730 }
2731
2732 /* Link the found element into the group list. */
2733 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2734 {
2735 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2736 lastinfo = stmtinfo_a;
2737 }
2738 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2739 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2740 lastinfo = stmtinfo_b;
2741 }
2742 }
2743
2744 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2745 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2746 && !vect_analyze_data_ref_access (dr))
2747 {
2748 if (dump_enabled_p ())
2749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2750 "not vectorized: complicated access pattern.\n");
2751
2752 if (is_a <bb_vec_info> (vinfo))
2753 {
2754 /* Mark the statement as not vectorizable. */
2755 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2756 continue;
2757 }
2758 else
2759 {
2760 datarefs_copy.release ();
2761 return false;
2762 }
2763 }
2764
2765 datarefs_copy.release ();
2766 return true;
2767 }
2768
2769
2770 /* Operator == between two dr_with_seg_len objects.
2771
2772 This equality operator is used to make sure two data refs
2773 are the same so that we will consider combining the
2774 aliasing checks of the two pairs of data-dependent data
2775 refs. */
2776
2777 static bool
2778 operator == (const dr_with_seg_len& d1,
2779 const dr_with_seg_len& d2)
2780 {
2781 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2782 DR_BASE_ADDRESS (d2.dr), 0)
2783 && compare_tree (d1.offset, d2.offset) == 0
2784 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2785 }
2786
2787 /* Function comp_dr_with_seg_len_pair.
2788
2789 Comparison function for sorting objects of dr_with_seg_len_pair_t
2790 so that we can combine aliasing checks in one scan. */
2791
2792 static int
2793 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2794 {
2795 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2796 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2797
2798 const dr_with_seg_len &p11 = p1->first,
2799 &p12 = p1->second,
2800 &p21 = p2->first,
2801 &p22 = p2->second;
2802
2803 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2804 if a and c have the same base address and step, and b and d have the same
2805 base address and step. Therefore, if either a&c or b&d don't have the same
2806 address and step, we don't care about the order of the two pairs after sorting. */
2807 int comp_res;
2808
2809 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2810 DR_BASE_ADDRESS (p21.dr))) != 0)
2811 return comp_res;
2812 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2813 DR_BASE_ADDRESS (p22.dr))) != 0)
2814 return comp_res;
2815 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2816 return comp_res;
2817 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2818 return comp_res;
2819 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2820 return comp_res;
2821 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2822 return comp_res;
2823
2824 return 0;
2825 }
2826
2827 /* Function vect_vfa_segment_size.
2828
2829 Create an expression that computes the size of segment
2830 that will be accessed for a data reference. The functions takes into
2831 account that realignment loads may access one more vector.
2832
2833 Input:
2834 DR: The data reference.
2835 LENGTH_FACTOR: segment length to consider.
2836
2837 Return an expression whose value is the size of segment which will be
2838 accessed by DR. */
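 /* For example (illustrative): with DR_STEP 4 and LENGTH_FACTOR equal to
 a vectorization factor of 4, the segment is 4 * 4 = 16 bytes; a
 dr_explicit_realign_optimized access adds one extra vector size.  */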
2839
2840 static tree
2841 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2842 {
2843 tree segment_length;
2844
2845 if (integer_zerop (DR_STEP (dr)))
2846 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2847 else
2848 segment_length = size_binop (MULT_EXPR,
2849 fold_convert (sizetype, DR_STEP (dr)),
2850 fold_convert (sizetype, length_factor));
2851
2852 if (vect_supportable_dr_alignment (dr, false)
2853 == dr_explicit_realign_optimized)
2854 {
2855 tree vector_size = TYPE_SIZE_UNIT
2856 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2857
2858 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2859 }
2860 return segment_length;
2861 }
2862
2863 /* Function vect_prune_runtime_alias_test_list.
2864
2865 Prune a list of ddrs to be tested at run-time by versioning for alias.
2866 Merge several alias checks into one if possible.
2867 Return FALSE if the resulting list of ddrs is longer than allowed by
2868 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2869
2870 bool
2871 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2872 {
2873 vec<ddr_p> may_alias_ddrs =
2874 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2875 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2876 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2877 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2878 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2879
2880 ddr_p ddr;
2881 unsigned int i;
2882 tree length_factor;
2883
2884 if (dump_enabled_p ())
2885 dump_printf_loc (MSG_NOTE, vect_location,
2886 "=== vect_prune_runtime_alias_test_list ===\n");
2887
2888 if (may_alias_ddrs.is_empty ())
2889 return true;
2890
2891 /* Basically, for each pair of dependent data refs store_ptr_0
2892 and load_ptr_0, we create an expression:
2893
2894 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2895 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2896
2897 for aliasing checks. However, in some cases we can decrease
2898 the number of checks by combining two checks into one. For
2899 example, suppose we have another pair of data refs store_ptr_0
2900 and load_ptr_1, and if the following condition is satisfied:
2901
2902 load_ptr_0 < load_ptr_1 &&
2903 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2904
2905 (this condition means, in each iteration of vectorized loop,
2906 the accessed memory of store_ptr_0 cannot be between the memory
2907 of load_ptr_0 and load_ptr_1.)
2908
2909 we can then use only the following expression to finish the
2910 aliasing checks between store_ptr_0 & load_ptr_0 and
2911 store_ptr_0 & load_ptr_1:
2912
2913 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2914 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2915
2916 Note that we only consider the case where load_ptr_0 and load_ptr_1
2917 have the same base address. */
2918
2919 comp_alias_ddrs.create (may_alias_ddrs.length ());
2920
2921 /* First, we collect all data ref pairs for aliasing checks. */
2922 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2923 {
2924 struct data_reference *dr_a, *dr_b;
2925 gimple *dr_group_first_a, *dr_group_first_b;
2926 tree segment_length_a, segment_length_b;
2927 gimple *stmt_a, *stmt_b;
2928
2929 dr_a = DDR_A (ddr);
2930 stmt_a = DR_STMT (DDR_A (ddr));
2931 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2932 if (dr_group_first_a)
2933 {
2934 stmt_a = dr_group_first_a;
2935 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2936 }
2937
2938 dr_b = DDR_B (ddr);
2939 stmt_b = DR_STMT (DDR_B (ddr));
2940 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2941 if (dr_group_first_b)
2942 {
2943 stmt_b = dr_group_first_b;
2944 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2945 }
2946
2947 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2948 length_factor = scalar_loop_iters;
2949 else
2950 length_factor = size_int (vect_factor);
2951 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2952 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2953
2954 dr_with_seg_len_pair_t dr_with_seg_len_pair
2955 (dr_with_seg_len (dr_a, segment_length_a),
2956 dr_with_seg_len (dr_b, segment_length_b));
2957
2958 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2959 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2960
2961 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2962 }
2963
2964 /* Second, we sort the collected data ref pairs so that we can scan
2965 them once to combine all possible aliasing checks. */
2966 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2967
2968 /* Third, we scan the sorted dr pairs and check if we can combine
2969 alias checks of two neighbouring dr pairs. */
2970 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2971 {
2972 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2973 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2974 *dr_b1 = &comp_alias_ddrs[i-1].second,
2975 *dr_a2 = &comp_alias_ddrs[i].first,
2976 *dr_b2 = &comp_alias_ddrs[i].second;
2977
2978 /* Remove duplicate data ref pairs. */
2979 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2980 {
2981 if (dump_enabled_p ())
2982 {
2983 dump_printf_loc (MSG_NOTE, vect_location,
2984 "found equal ranges ");
2985 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2986 DR_REF (dr_a1->dr));
2987 dump_printf (MSG_NOTE, ", ");
2988 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2989 DR_REF (dr_b1->dr));
2990 dump_printf (MSG_NOTE, " and ");
2991 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2992 DR_REF (dr_a2->dr));
2993 dump_printf (MSG_NOTE, ", ");
2994 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2995 DR_REF (dr_b2->dr));
2996 dump_printf (MSG_NOTE, "\n");
2997 }
2998
2999 comp_alias_ddrs.ordered_remove (i--);
3000 continue;
3001 }
3002
3003 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3004 {
3005 /* We consider the case where DR_B1 and DR_B2 are the same memref,
3006 and DR_A1 and DR_A2 are two consecutive memrefs. */
3007 if (*dr_a1 == *dr_a2)
3008 {
3009 std::swap (dr_a1, dr_b1);
3010 std::swap (dr_a2, dr_b2);
3011 }
3012
3013 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3014 DR_BASE_ADDRESS (dr_a2->dr),
3015 0)
3016 || !tree_fits_shwi_p (dr_a1->offset)
3017 || !tree_fits_shwi_p (dr_a2->offset))
3018 continue;
3019
3020 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
3021 - tree_to_shwi (dr_a1->offset));
3022
3023
3024 /* Now we check if the following condition is satisfied:
3025
3026 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3027
3028 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
3029 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3030 have to make a best estimate. We can get the minimum value
3031 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3032 then either of the following two conditions can guarantee the
3033 one above:
3034
3035 1: DIFF <= MIN_SEG_LEN_B
3036 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
3037
3038 */
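 /* For example (illustrative): DIFF = 20, SEGMENT_LENGTH_A = 16 and
 MIN_SEG_LEN_B = 16: condition 1 fails (20 > 16) but condition 2 holds
 (20 - 16 = 4 < 16), so the two checks are merged and DR_A1's segment
 length is updated to DR_A2's segment length + 20.  */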
3039
3040 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
3041 ? tree_to_shwi (dr_b1->seg_len)
3042 : vect_factor);
3043
3044 if (diff <= min_seg_len_b
3045 || (tree_fits_shwi_p (dr_a1->seg_len)
3046 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
3047 {
3048 if (dump_enabled_p ())
3049 {
3050 dump_printf_loc (MSG_NOTE, vect_location,
3051 "merging ranges for ");
3052 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3053 DR_REF (dr_a1->dr));
3054 dump_printf (MSG_NOTE, ", ");
3055 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3056 DR_REF (dr_b1->dr));
3057 dump_printf (MSG_NOTE, " and ");
3058 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3059 DR_REF (dr_a2->dr));
3060 dump_printf (MSG_NOTE, ", ");
3061 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3062 DR_REF (dr_b2->dr));
3063 dump_printf (MSG_NOTE, "\n");
3064 }
3065
3066 dr_a1->seg_len = size_binop (PLUS_EXPR,
3067 dr_a2->seg_len, size_int (diff));
3068 comp_alias_ddrs.ordered_remove (i--);
3069 }
3070 }
3071 }
3072
3073 dump_printf_loc (MSG_NOTE, vect_location,
3074 "improved number of alias checks from %d to %d\n",
3075 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3076 if ((int) comp_alias_ddrs.length () >
3077 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3078 return false;
3079
3080 return true;
3081 }
3082
3083 /* Check whether a non-affine read or write in stmt is suitable for gather load
3084 or scatter store and if so, return a builtin decl for that operation. */
3085
3086 tree
3087 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
3088 tree *offp, int *scalep)
3089 {
3090 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3091 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3092 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3093 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3094 tree offtype = NULL_TREE;
3095 tree decl, base, off;
3096 machine_mode pmode;
3097 int punsignedp, reversep, pvolatilep = 0;
3098
3099 base = DR_REF (dr);
3100 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3101 see if we can use the def stmt of the address. */
3102 if (is_gimple_call (stmt)
3103 && gimple_call_internal_p (stmt)
3104 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3105 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3106 && TREE_CODE (base) == MEM_REF
3107 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3108 && integer_zerop (TREE_OPERAND (base, 1))
3109 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3110 {
3111 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3112 if (is_gimple_assign (def_stmt)
3113 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3114 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3115 }
3116
3117 /* The gather and scatter builtins need address of the form
3118 loop_invariant + vector * {1, 2, 4, 8}
3119 or
3120 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3121 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3122 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3123 multiplications and additions in it. To get a vector, we need
3124 a single SSA_NAME that will be defined in the loop and will
3125 contain everything that is not loop invariant and that can be
3126 vectorized. The following code attempts to find such a preexisting
3127 SSA_NAME OFF and put the loop invariants into a tree BASE
3128 that can be gimplified before the loop. */
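 /* For example (hypothetical source): for a loop doing sum += a[b[i]]
 with 4-byte elements of A, the code below would end up with BASE
 pointing at A, OFF being the SSA_NAME holding (a possibly sign-extended)
 b[i], and SCALE == 4.  */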
3129 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3130 &punsignedp, &reversep, &pvolatilep, false);
3131 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3132
3133 if (TREE_CODE (base) == MEM_REF)
3134 {
3135 if (!integer_zerop (TREE_OPERAND (base, 1)))
3136 {
3137 if (off == NULL_TREE)
3138 {
3139 offset_int moff = mem_ref_offset (base);
3140 off = wide_int_to_tree (sizetype, moff);
3141 }
3142 else
3143 off = size_binop (PLUS_EXPR, off,
3144 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3145 }
3146 base = TREE_OPERAND (base, 0);
3147 }
3148 else
3149 base = build_fold_addr_expr (base);
3150
3151 if (off == NULL_TREE)
3152 off = size_zero_node;
3153
3154 /* If base is not loop invariant, then either off is 0 and we start with
3155 just the constant offset in the loop invariant BASE and continue with
3156 base as OFF, or we give up.
3157 We could handle the latter case by gimplifying the addition of base + off
3158 into some SSA_NAME and using that as off, but for now punt. */
3159 if (!expr_invariant_in_loop_p (loop, base))
3160 {
3161 if (!integer_zerop (off))
3162 return NULL_TREE;
3163 off = base;
3164 base = size_int (pbitpos / BITS_PER_UNIT);
3165 }
3166 /* Otherwise put base + constant offset into the loop invariant BASE
3167 and continue with OFF. */
3168 else
3169 {
3170 base = fold_convert (sizetype, base);
3171 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3172 }
3173
3174 /* OFF at this point may be either a SSA_NAME or some tree expression
3175 from get_inner_reference. Try to peel off loop invariants from it
3176 into BASE as long as possible. */
3177 STRIP_NOPS (off);
3178 while (offtype == NULL_TREE)
3179 {
3180 enum tree_code code;
3181 tree op0, op1, add = NULL_TREE;
3182
3183 if (TREE_CODE (off) == SSA_NAME)
3184 {
3185 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3186
3187 if (expr_invariant_in_loop_p (loop, off))
3188 return NULL_TREE;
3189
3190 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3191 break;
3192
3193 op0 = gimple_assign_rhs1 (def_stmt);
3194 code = gimple_assign_rhs_code (def_stmt);
3195 op1 = gimple_assign_rhs2 (def_stmt);
3196 }
3197 else
3198 {
3199 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3200 return NULL_TREE;
3201 code = TREE_CODE (off);
3202 extract_ops_from_tree (off, &code, &op0, &op1);
3203 }
3204 switch (code)
3205 {
3206 case POINTER_PLUS_EXPR:
3207 case PLUS_EXPR:
3208 if (expr_invariant_in_loop_p (loop, op0))
3209 {
3210 add = op0;
3211 off = op1;
3212 do_add:
3213 add = fold_convert (sizetype, add);
3214 if (scale != 1)
3215 add = size_binop (MULT_EXPR, add, size_int (scale));
3216 base = size_binop (PLUS_EXPR, base, add);
3217 continue;
3218 }
3219 if (expr_invariant_in_loop_p (loop, op1))
3220 {
3221 add = op1;
3222 off = op0;
3223 goto do_add;
3224 }
3225 break;
3226 case MINUS_EXPR:
3227 if (expr_invariant_in_loop_p (loop, op1))
3228 {
3229 add = fold_convert (sizetype, op1);
3230 add = size_binop (MINUS_EXPR, size_zero_node, add);
3231 off = op0;
3232 goto do_add;
3233 }
3234 break;
3235 case MULT_EXPR:
3236 if (scale == 1 && tree_fits_shwi_p (op1))
3237 {
3238 scale = tree_to_shwi (op1);
3239 off = op0;
3240 continue;
3241 }
3242 break;
3243 case SSA_NAME:
3244 off = op0;
3245 continue;
3246 CASE_CONVERT:
3247 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3248 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3249 break;
3250 if (TYPE_PRECISION (TREE_TYPE (op0))
3251 == TYPE_PRECISION (TREE_TYPE (off)))
3252 {
3253 off = op0;
3254 continue;
3255 }
3256 if (TYPE_PRECISION (TREE_TYPE (op0))
3257 < TYPE_PRECISION (TREE_TYPE (off)))
3258 {
3259 off = op0;
3260 offtype = TREE_TYPE (off);
3261 STRIP_NOPS (off);
3262 continue;
3263 }
3264 break;
3265 default:
3266 break;
3267 }
3268 break;
3269 }
3270
3271 /* If at the end OFF still isn't an SSA_NAME or isn't
3272 defined in the loop, punt. */
3273 if (TREE_CODE (off) != SSA_NAME
3274 || expr_invariant_in_loop_p (loop, off))
3275 return NULL_TREE;
3276
3277 if (offtype == NULL_TREE)
3278 offtype = TREE_TYPE (off);
3279
3280 if (DR_IS_READ (dr))
3281 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3282 offtype, scale);
3283 else
3284 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3285 offtype, scale);
3286
3287 if (decl == NULL_TREE)
3288 return NULL_TREE;
3289
3290 if (basep)
3291 *basep = base;
3292 if (offp)
3293 *offp = off;
3294 if (scalep)
3295 *scalep = scale;
3296 return decl;
3297 }
3298
3299 /* Function vect_analyze_data_refs.
3300
3301 Find all the data references in the loop or basic block.
3302
3303 The general structure of the analysis of data refs in the vectorizer is as
3304 follows:
3305 1- vect_analyze_data_refs(loop/bb): call
3306 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3307 in the loop/bb and their dependences.
3308 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3309 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3310 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3311
3312 */
3313
3314 bool
3315 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3316 {
3317 struct loop *loop = NULL;
3318 unsigned int i;
3319 struct data_reference *dr;
3320 tree scalar_type;
3321
3322 if (dump_enabled_p ())
3323 dump_printf_loc (MSG_NOTE, vect_location,
3324 "=== vect_analyze_data_refs ===\n");
3325
3326 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3327 loop = LOOP_VINFO_LOOP (loop_vinfo);
3328
3329 /* Go through the data-refs, check that the analysis succeeded. Update
3330 pointer from stmt_vec_info struct to DR and vectype. */
3331
3332 vec<data_reference_p> datarefs = vinfo->datarefs;
3333 FOR_EACH_VEC_ELT (datarefs, i, dr)
3334 {
3335 gimple *stmt;
3336 stmt_vec_info stmt_info;
3337 tree base, offset, init;
3338 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3339 bool simd_lane_access = false;
3340 int vf;
3341
3342 again:
3343 if (!dr || !DR_REF (dr))
3344 {
3345 if (dump_enabled_p ())
3346 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3347 "not vectorized: unhandled data-ref\n");
3348 return false;
3349 }
3350
3351 stmt = DR_STMT (dr);
3352 stmt_info = vinfo_for_stmt (stmt);
3353
3354 /* Discard clobbers from the dataref vector. We will remove
3355 clobber stmts during vectorization. */
3356 if (gimple_clobber_p (stmt))
3357 {
3358 free_data_ref (dr);
3359 if (i == datarefs.length () - 1)
3360 {
3361 datarefs.pop ();
3362 break;
3363 }
3364 datarefs.ordered_remove (i);
3365 dr = datarefs[i];
3366 goto again;
3367 }
3368
3369 /* Check that analysis of the data-ref succeeded. */
3370 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3371 || !DR_STEP (dr))
3372 {
3373 bool maybe_gather
3374 = DR_IS_READ (dr)
3375 && !TREE_THIS_VOLATILE (DR_REF (dr))
3376 && targetm.vectorize.builtin_gather != NULL;
3377 bool maybe_scatter
3378 = DR_IS_WRITE (dr)
3379 && !TREE_THIS_VOLATILE (DR_REF (dr))
3380 && targetm.vectorize.builtin_scatter != NULL;
3381 bool maybe_simd_lane_access
3382 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3383
3384 /* If the target supports vector gather loads or scatter stores, or if
3385 this might be a SIMD lane access, see if they can be used. */
3386 if (is_a <loop_vec_info> (vinfo)
3387 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3388 && !nested_in_vect_loop_p (loop, stmt))
3389 {
3390 struct data_reference *newdr
3391 = create_data_ref (NULL, loop_containing_stmt (stmt),
3392 DR_REF (dr), stmt, maybe_scatter ? false : true);
3393 gcc_assert (newdr != NULL && DR_REF (newdr));
3394 if (DR_BASE_ADDRESS (newdr)
3395 && DR_OFFSET (newdr)
3396 && DR_INIT (newdr)
3397 && DR_STEP (newdr)
3398 && integer_zerop (DR_STEP (newdr)))
3399 {
3400 if (maybe_simd_lane_access)
3401 {
3402 tree off = DR_OFFSET (newdr);
3403 STRIP_NOPS (off);
3404 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3405 && TREE_CODE (off) == MULT_EXPR
3406 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3407 {
3408 tree step = TREE_OPERAND (off, 1);
3409 off = TREE_OPERAND (off, 0);
3410 STRIP_NOPS (off);
3411 if (CONVERT_EXPR_P (off)
3412 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3413 0)))
3414 < TYPE_PRECISION (TREE_TYPE (off)))
3415 off = TREE_OPERAND (off, 0);
3416 if (TREE_CODE (off) == SSA_NAME)
3417 {
3418 gimple *def = SSA_NAME_DEF_STMT (off);
3419 tree reft = TREE_TYPE (DR_REF (newdr));
3420 if (is_gimple_call (def)
3421 && gimple_call_internal_p (def)
3422 && (gimple_call_internal_fn (def)
3423 == IFN_GOMP_SIMD_LANE))
3424 {
3425 tree arg = gimple_call_arg (def, 0);
3426 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3427 arg = SSA_NAME_VAR (arg);
3428 if (arg == loop->simduid
3429 /* For now. */
3430 && tree_int_cst_equal
3431 (TYPE_SIZE_UNIT (reft),
3432 step))
3433 {
3434 DR_OFFSET (newdr) = ssize_int (0);
3435 DR_STEP (newdr) = step;
3436 DR_ALIGNED_TO (newdr)
3437 = size_int (BIGGEST_ALIGNMENT);
3438 dr = newdr;
3439 simd_lane_access = true;
3440 }
3441 }
3442 }
3443 }
3444 }
3445 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3446 {
3447 dr = newdr;
3448 if (maybe_gather)
3449 gatherscatter = GATHER;
3450 else
3451 gatherscatter = SCATTER;
3452 }
3453 }
3454 if (gatherscatter == SG_NONE && !simd_lane_access)
3455 free_data_ref (newdr);
3456 }
3457
3458 if (gatherscatter == SG_NONE && !simd_lane_access)
3459 {
3460 if (dump_enabled_p ())
3461 {
3462 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3463 "not vectorized: data ref analysis "
3464 "failed ");
3465 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3466 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3467 }
3468
3469 if (is_a <bb_vec_info> (vinfo))
3470 break;
3471
3472 return false;
3473 }
3474 }
3475
3476 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3477 {
3478 if (dump_enabled_p ())
3479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3480 "not vectorized: base addr of dr is a "
3481 "constant\n");
3482
3483 if (is_a <bb_vec_info> (vinfo))
3484 break;
3485
3486 if (gatherscatter != SG_NONE || simd_lane_access)
3487 free_data_ref (dr);
3488 return false;
3489 }
3490
3491 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3492 {
3493 if (dump_enabled_p ())
3494 {
3495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3496 "not vectorized: volatile type ");
3497 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3498 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3499 }
3500
3501 if (is_a <bb_vec_info> (vinfo))
3502 break;
3503
3504 return false;
3505 }
3506
3507 if (stmt_can_throw_internal (stmt))
3508 {
3509 if (dump_enabled_p ())
3510 {
3511 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3512 "not vectorized: statement can throw an "
3513 "exception ");
3514 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3515 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3516 }
3517
3518 if (is_a <bb_vec_info> (vinfo))
3519 break;
3520
3521 if (gatherscatter != SG_NONE || simd_lane_access)
3522 free_data_ref (dr);
3523 return false;
3524 }
3525
3526 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3527 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3528 {
3529 if (dump_enabled_p ())
3530 {
3531 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3532 "not vectorized: statement is bitfield "
3533 "access ");
3534 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3535 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3536 }
3537
3538 if (is_a <bb_vec_info> (vinfo))
3539 break;
3540
3541 if (gatherscatter != SG_NONE || simd_lane_access)
3542 free_data_ref (dr);
3543 return false;
3544 }
3545
3546 base = unshare_expr (DR_BASE_ADDRESS (dr));
3547 offset = unshare_expr (DR_OFFSET (dr));
3548 init = unshare_expr (DR_INIT (dr));
3549
3550 if (is_gimple_call (stmt)
3551 && (!gimple_call_internal_p (stmt)
3552 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3553 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3554 {
3555 if (dump_enabled_p ())
3556 {
3557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3558 "not vectorized: dr in a call ");
3559 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3560 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3561 }
3562
3563 if (is_a <bb_vec_info> (vinfo))
3564 break;
3565
3566 if (gatherscatter != SG_NONE || simd_lane_access)
3567 free_data_ref (dr);
3568 return false;
3569 }
3570
3571 /* Update DR field in stmt_vec_info struct. */
3572
3573 /* If the dataref is in an inner-loop of the loop that is considered for
3574 vectorization, we also want to analyze the access relative to
3575 the outer-loop (DR contains information only relative to the
3576 inner-most enclosing loop). We do that by building a reference to the
3577 first location accessed by the inner-loop, and analyze it relative to
3578 the outer-loop. */
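/* For example (illustrative, 2-byte "short" elements): for

     for (i = 0; i < N; i++)
       for (j = 0; j < M; j++)
	 s += in[i + j];

   the inner-loop DR for in[i+j] has base &in, offset i*2, init 0 and
   step 2; the analysis below produces an outer-loop view with base &in,
   offset 0, init 0 and a 2-byte outer step, i.e. the first location
   accessed by the inner-loop advances by 2 bytes per outer iteration.  */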
3579 if (loop && nested_in_vect_loop_p (loop, stmt))
3580 {
3581 tree outer_step, outer_base, outer_init;
3582 HOST_WIDE_INT pbitsize, pbitpos;
3583 tree poffset;
3584 machine_mode pmode;
3585 int punsignedp, preversep, pvolatilep;
3586 affine_iv base_iv, offset_iv;
3587 tree dinit;
3588
3589 /* Build a reference to the first location accessed by the
3590 inner-loop: *(BASE+INIT). (The first location is actually
3591 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3592 tree inner_base = build_fold_indirect_ref
3593 (fold_build_pointer_plus (base, init));
3594
3595 if (dump_enabled_p ())
3596 {
3597 dump_printf_loc (MSG_NOTE, vect_location,
3598 "analyze in outer-loop: ");
3599 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3600 dump_printf (MSG_NOTE, "\n");
3601 }
3602
3603 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3604 &poffset, &pmode, &punsignedp,
3605 &preversep, &pvolatilep, false);
3606 gcc_assert (outer_base != NULL_TREE);
3607
3608 if (pbitpos % BITS_PER_UNIT != 0)
3609 {
3610 if (dump_enabled_p ())
3611 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3612 "failed: bit offset alignment.\n");
3613 return false;
3614 }
3615
3616 if (preversep)
3617 {
3618 if (dump_enabled_p ())
3619 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3620 "failed: reverse storage order.\n");
3621 return false;
3622 }
3623
3624 outer_base = build_fold_addr_expr (outer_base);
3625 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3626 &base_iv, false))
3627 {
3628 if (dump_enabled_p ())
3629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3630 "failed: evolution of base is not affine.\n");
3631 return false;
3632 }
3633
3634 if (offset)
3635 {
3636 if (poffset)
3637 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3638 poffset);
3639 else
3640 poffset = offset;
3641 }
3642
3643 if (!poffset)
3644 {
3645 offset_iv.base = ssize_int (0);
3646 offset_iv.step = ssize_int (0);
3647 }
3648 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3649 &offset_iv, false))
3650 {
3651 if (dump_enabled_p ())
3652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3653 "evolution of offset is not affine.\n");
3654 return false;
3655 }
3656
3657 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3658 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3659 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3660 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3661 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3662
3663 outer_step = size_binop (PLUS_EXPR,
3664 fold_convert (ssizetype, base_iv.step),
3665 fold_convert (ssizetype, offset_iv.step));
3666
3667 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3668 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3669 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3670 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3671 STMT_VINFO_DR_OFFSET (stmt_info) =
3672 fold_convert (ssizetype, offset_iv.base);
3673 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3674 size_int (highest_pow2_factor (offset_iv.base));
3675
3676 if (dump_enabled_p ())
3677 {
3678 dump_printf_loc (MSG_NOTE, vect_location,
3679 "\touter base_address: ");
3680 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3681 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3682 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3683 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3684 STMT_VINFO_DR_OFFSET (stmt_info));
3685 dump_printf (MSG_NOTE,
3686 "\n\touter constant offset from base address: ");
3687 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3688 STMT_VINFO_DR_INIT (stmt_info));
3689 dump_printf (MSG_NOTE, "\n\touter step: ");
3690 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3691 STMT_VINFO_DR_STEP (stmt_info));
3692 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3693 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3694 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3695 dump_printf (MSG_NOTE, "\n");
3696 }
3697 }
3698
3699 if (STMT_VINFO_DATA_REF (stmt_info))
3700 {
3701 if (dump_enabled_p ())
3702 {
3703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3704 "not vectorized: more than one data ref "
3705 "in stmt: ");
3706 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3707 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3708 }
3709
3710 if (is_a <bb_vec_info> (vinfo))
3711 break;
3712
3713 if (gatherscatter != SG_NONE || simd_lane_access)
3714 free_data_ref (dr);
3715 return false;
3716 }
3717
3718 STMT_VINFO_DATA_REF (stmt_info) = dr;
3719 if (simd_lane_access)
3720 {
3721 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3722 free_data_ref (datarefs[i]);
3723 datarefs[i] = dr;
3724 }
3725
3726 /* Set vectype for STMT. */
3727 scalar_type = TREE_TYPE (DR_REF (dr));
3728 STMT_VINFO_VECTYPE (stmt_info)
3729 = get_vectype_for_scalar_type (scalar_type);
3730 if (!STMT_VINFO_VECTYPE (stmt_info))
3731 {
3732 if (dump_enabled_p ())
3733 {
3734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3735 "not vectorized: no vectype for stmt: ");
3736 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3737 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3738 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3739 scalar_type);
3740 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3741 }
3742
3743 if (is_a <bb_vec_info> (vinfo))
3744 {
3745 /* No vector type is fine, the ref can still participate
3746 in dependence analysis, we just can't vectorize it. */
3747 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3748 continue;
3749 }
3750
3751 if (gatherscatter != SG_NONE || simd_lane_access)
3752 {
3753 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3754 if (gatherscatter != SG_NONE)
3755 free_data_ref (dr);
3756 }
3757 return false;
3758 }
3759 else
3760 {
3761 if (dump_enabled_p ())
3762 {
3763 dump_printf_loc (MSG_NOTE, vect_location,
3764 "got vectype for stmt: ");
3765 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3766 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3767 STMT_VINFO_VECTYPE (stmt_info));
3768 dump_printf (MSG_NOTE, "\n");
3769 }
3770 }
3771
3772 /* Adjust the minimal vectorization factor according to the
3773 vector type. */
3774 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3775 if (vf > *min_vf)
3776 *min_vf = vf;
3777
3778 if (gatherscatter != SG_NONE)
3779 {
3780 tree off;
3781 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3782 NULL, &off, NULL)
3783 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3784 {
3785 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3786 free_data_ref (dr);
3787 if (dump_enabled_p ())
3788 {
3789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3790 (gatherscatter == GATHER) ?
3791 "not vectorized: not suitable for gather "
3792 "load " :
3793 "not vectorized: not suitable for scatter "
3794 "store ");
3795 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3796 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3797 }
3798 return false;
3799 }
3800
3801 datarefs[i] = dr;
3802 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3803 }
3804
3805 else if (is_a <loop_vec_info> (vinfo)
3806 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3807 {
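/* DR_STEP is not a compile-time constant, e.g. a[i * n] where n is
   loop-invariant but unknown at compile time.  Such an access cannot
   use a constant vector step; it is recorded as strided and handled
   later, except in an inner-loop of the loop being vectorized, where
   we simply give up.  */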
3808 if (nested_in_vect_loop_p (loop, stmt))
3809 {
3810 if (dump_enabled_p ())
3811 {
3812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3813 "not vectorized: not suitable for strided "
3814 "load ");
3815 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3816 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3817 }
3818 return false;
3819 }
3820 STMT_VINFO_STRIDED_P (stmt_info) = true;
3821 }
3822 }
3823
3824 /* If we stopped analysis at the first dataref we could not analyze
3825 when trying to vectorize a basic-block, mark the rest of the datarefs
3826 as not vectorizable and truncate the vector of datarefs. That
3827 avoids wasting time analyzing their dependences. */
3828 if (i != datarefs.length ())
3829 {
3830 gcc_assert (is_a <bb_vec_info> (vinfo));
3831 for (unsigned j = i; j < datarefs.length (); ++j)
3832 {
3833 data_reference_p dr = datarefs[j];
3834 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3835 free_data_ref (dr);
3836 }
3837 datarefs.truncate (i);
3838 }
3839
3840 return true;
3841 }
3842
3843
3844 /* Function vect_get_new_vect_var.
3845
3846 Returns a name for a new variable. The current naming scheme uses the
3847 prefix "vect", "stmp" or "vectp" (depending on the value of VAR_KIND)
3848 for vectorizer-generated variables, and appends "_" and NAME to it if
3849 provided. */
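/* For example (illustrative): with NAME == "a",
   vect_get_new_vect_var (vectype, vect_simple_var, "a") creates a
   variable named like "vect_a", while vect_pointer_var would yield
   "vectp_a" and vect_scalar_var "stmp_a".  */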
3850
3851 tree
3852 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3853 {
3854 const char *prefix;
3855 tree new_vect_var;
3856
3857 switch (var_kind)
3858 {
3859 case vect_simple_var:
3860 prefix = "vect";
3861 break;
3862 case vect_scalar_var:
3863 prefix = "stmp";
3864 break;
3865 case vect_pointer_var:
3866 prefix = "vectp";
3867 break;
3868 default:
3869 gcc_unreachable ();
3870 }
3871
3872 if (name)
3873 {
3874 char* tmp = concat (prefix, "_", name, NULL);
3875 new_vect_var = create_tmp_reg (type, tmp);
3876 free (tmp);
3877 }
3878 else
3879 new_vect_var = create_tmp_reg (type, prefix);
3880
3881 return new_vect_var;
3882 }
3883
3884 /* Like vect_get_new_vect_var but return an SSA name. */
3885
3886 tree
3887 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3888 {
3889 const char *prefix;
3890 tree new_vect_var;
3891
3892 switch (var_kind)
3893 {
3894 case vect_simple_var:
3895 prefix = "vect";
3896 break;
3897 case vect_scalar_var:
3898 prefix = "stmp";
3899 break;
3900 case vect_pointer_var:
3901 prefix = "vectp";
3902 break;
3903 default:
3904 gcc_unreachable ();
3905 }
3906
3907 if (name)
3908 {
3909 char* tmp = concat (prefix, "_", name, NULL);
3910 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3911 free (tmp);
3912 }
3913 else
3914 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3915
3916 return new_vect_var;
3917 }
3918
3919 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3920
3921 static void
3922 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3923 stmt_vec_info stmt_info)
3924 {
3925 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3926 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3927 int misalign = DR_MISALIGNMENT (dr);
3928 if (misalign == -1)
3929 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3930 else
3931 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3932 }
3933
3934 /* Function vect_create_addr_base_for_vector_ref.
3935
3936 Create an expression that computes the address of the first memory location
3937 that will be accessed for a data reference.
3938
3939 Input:
3940 STMT: The statement containing the data reference.
3941 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3942 OFFSET: Optional. If supplied, it is added to the initial address.
3943 LOOP: Specify relative to which loop-nest should the address be computed.
3944 For example, when the dataref is in an inner-loop nested in an
3945 outer-loop that is now being vectorized, LOOP can be either the
3946 outer-loop, or the inner-loop. The first memory location accessed
3947 by the following dataref ('in' points to short):
3948
3949 for (i=0; i<N; i++)
3950 for (j=0; j<M; j++)
3951 s += in[i+j]
3952
3953 is as follows:
3954 if LOOP=i_loop: &in (relative to i_loop)
3955 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3956 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3957 initial address. Unlike OFFSET, which is the number of elements to
3958 be added, BYTE_OFFSET is measured in bytes.
3959
3960 Output:
3961 1. Return an SSA_NAME whose value is the address of the memory location of
3962 the first vector of the data reference.
3963 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3964 these statement(s) which define the returned SSA_NAME.
3965
3966 FORNOW: We are only handling array accesses with step 1. */
3967
3968 tree
3969 vect_create_addr_base_for_vector_ref (gimple *stmt,
3970 gimple_seq *new_stmt_list,
3971 tree offset,
3972 struct loop *loop,
3973 tree byte_offset)
3974 {
3975 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3976 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3977 tree data_ref_base;
3978 const char *base_name;
3979 tree addr_base;
3980 tree dest;
3981 gimple_seq seq = NULL;
3982 tree base_offset;
3983 tree init;
3984 tree vect_ptr_type;
3985 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3986 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3987
3988 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3989 {
3990 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3991
3992 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3993
3994 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3995 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3996 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3997 }
3998 else
3999 {
4000 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4001 base_offset = unshare_expr (DR_OFFSET (dr));
4002 init = unshare_expr (DR_INIT (dr));
4003 }
4004
4005 if (loop_vinfo)
4006 base_name = get_name (data_ref_base);
4007 else
4008 {
4009 base_offset = ssize_int (0);
4010 init = ssize_int (0);
4011 base_name = get_name (DR_REF (dr));
4012 }
4013
4014 /* Create base_offset */
4015 base_offset = size_binop (PLUS_EXPR,
4016 fold_convert (sizetype, base_offset),
4017 fold_convert (sizetype, init));
4018
4019 if (offset)
4020 {
4021 offset = fold_build2 (MULT_EXPR, sizetype,
4022 fold_convert (sizetype, offset), step);
4023 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4024 base_offset, offset);
4025 }
4026 if (byte_offset)
4027 {
4028 byte_offset = fold_convert (sizetype, byte_offset);
4029 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4030 base_offset, byte_offset);
4031 }
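/* At this point, in the loop case (illustrative summary):
     base_offset = DR_OFFSET + DR_INIT
		   + OFFSET * sizeof (element)   (if OFFSET was given)
		   + BYTE_OFFSET                 (if BYTE_OFFSET was given)
   and the address built below is DR_BASE_ADDRESS + base_offset.  */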
4032
4033 /* base + base_offset */
4034 if (loop_vinfo)
4035 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4036 else
4037 {
4038 addr_base = build1 (ADDR_EXPR,
4039 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4040 unshare_expr (DR_REF (dr)));
4041 }
4042
4043 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4044 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4045 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4046 gimple_seq_add_seq (new_stmt_list, seq);
4047
4048 if (DR_PTR_INFO (dr)
4049 && TREE_CODE (addr_base) == SSA_NAME
4050 && !SSA_NAME_PTR_INFO (addr_base))
4051 {
4052 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4053 if (offset || byte_offset)
4054 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4055 }
4056
4057 if (dump_enabled_p ())
4058 {
4059 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4060 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4061 dump_printf (MSG_NOTE, "\n");
4062 }
4063
4064 return addr_base;
4065 }
4066
4067
4068 /* Function vect_create_data_ref_ptr.
4069
4070 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4071 location accessed in the loop by STMT, along with the def-use update
4072 chain to appropriately advance the pointer through the loop iterations.
4073 Also set aliasing information for the pointer. This pointer is used by
4074 the callers to this function to create a memory reference expression for
4075 vector load/store access.
4076
4077 Input:
4078 1. STMT: a stmt that references memory. Expected to be of the form
4079 GIMPLE_ASSIGN <name, data-ref> or
4080 GIMPLE_ASSIGN <data-ref, name>.
4081 2. AGGR_TYPE: the type of the reference, which should be either a vector
4082 or an array.
4083 3. AT_LOOP: the loop where the vector memref is to be created.
4084 4. OFFSET (optional): an offset to be added to the initial address accessed
4085 by the data-ref in STMT.
4086 5. BSI: location where the new stmts are to be placed if there is no loop.
4087 6. ONLY_INIT: indicates whether ap is to be updated in the loop, or remain
4088 pointing to the initial address.
4089 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4090 to the initial address accessed by the data-ref in STMT. This is
4091 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4092 in bytes.
4093
4094 Output:
4095 1. Declare a new ptr to vector_type, and have it point to the base of the
4096 data reference (initial address accessed by the data reference).
4097 For example, for vector of type V8HI, the following code is generated:
4098
4099 v8hi *ap;
4100 ap = (v8hi *)initial_address;
4101
4102 if OFFSET is not supplied:
4103 initial_address = &a[init];
4104 if OFFSET is supplied:
4105 initial_address = &a[init + OFFSET];
4106 if BYTE_OFFSET is supplied:
4107 initial_address = &a[init] + BYTE_OFFSET;
4108
4109 Return the initial_address in INITIAL_ADDRESS.
4110
4111 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4112 update the pointer in each iteration of the loop.
4113
4114 Return the increment stmt that updates the pointer in PTR_INCR.
4115
4116 3. Set INV_P to true if the access pattern of the data reference in the
4117 vectorized loop is invariant. Set it to false otherwise.
4118
4119 4. Return the pointer. */
4120
4121 tree
4122 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4123 tree offset, tree *initial_address,
4124 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4125 bool only_init, bool *inv_p, tree byte_offset)
4126 {
4127 const char *base_name;
4128 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4129 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4130 struct loop *loop = NULL;
4131 bool nested_in_vect_loop = false;
4132 struct loop *containing_loop = NULL;
4133 tree aggr_ptr_type;
4134 tree aggr_ptr;
4135 tree new_temp;
4136 gimple_seq new_stmt_list = NULL;
4137 edge pe = NULL;
4138 basic_block new_bb;
4139 tree aggr_ptr_init;
4140 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4141 tree aptr;
4142 gimple_stmt_iterator incr_gsi;
4143 bool insert_after;
4144 tree indx_before_incr, indx_after_incr;
4145 gimple *incr;
4146 tree step;
4147 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4148
4149 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4150 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4151
4152 if (loop_vinfo)
4153 {
4154 loop = LOOP_VINFO_LOOP (loop_vinfo);
4155 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4156 containing_loop = (gimple_bb (stmt))->loop_father;
4157 pe = loop_preheader_edge (loop);
4158 }
4159 else
4160 {
4161 gcc_assert (bb_vinfo);
4162 only_init = true;
4163 *ptr_incr = NULL;
4164 }
4165
4166 /* Check the step (evolution) of the load in LOOP, and record
4167 whether it's invariant. */
4168 if (nested_in_vect_loop)
4169 step = STMT_VINFO_DR_STEP (stmt_info);
4170 else
4171 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4172
4173 if (integer_zerop (step))
4174 *inv_p = true;
4175 else
4176 *inv_p = false;
4177
4178 /* Create an expression for the first address accessed by this load
4179 in LOOP. */
4180 base_name = get_name (DR_BASE_ADDRESS (dr));
4181
4182 if (dump_enabled_p ())
4183 {
4184 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4185 dump_printf_loc (MSG_NOTE, vect_location,
4186 "create %s-pointer variable to type: ",
4187 get_tree_code_name (TREE_CODE (aggr_type)));
4188 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4189 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4190 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4191 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4192 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4193 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4194 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4195 else
4196 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4197 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4198 dump_printf (MSG_NOTE, "\n");
4199 }
4200
4201 /* (1) Create the new aggregate-pointer variable.
4202 Vector and array types inherit the alias set of their component
4203 type by default so we need to use a ref-all pointer if the data
4204 reference does not conflict with the created aggregated data
4205 reference because it is not addressable. */
4206 bool need_ref_all = false;
4207 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4208 get_alias_set (DR_REF (dr))))
4209 need_ref_all = true;
4210 /* Likewise for any of the data references in the stmt group. */
4211 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4212 {
4213 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4214 do
4215 {
4216 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4217 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4218 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4219 get_alias_set (DR_REF (sdr))))
4220 {
4221 need_ref_all = true;
4222 break;
4223 }
4224 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4225 }
4226 while (orig_stmt);
4227 }
4228 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4229 need_ref_all);
4230 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4231
4232
4233 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4234 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4235 def-use update cycles for the pointer: one relative to the outer-loop
4236 (LOOP), which is what steps (2) and (3) below do. The other is relative
4237 to the inner-loop (which is the inner-most loop containing the dataref),
4238 and this is done by step (4) below.
4239
4240 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4241 inner-most loop, and so steps (2),(3) work the same, and step (4) is
4242 redundant. Steps (2),(3) create the following:
4243
4244 vp0 = &base_addr;
4245 LOOP: vp1 = phi(vp0,vp2)
4246 ...
4247 ...
4248 vp2 = vp1 + step
4249 goto LOOP
4250
4251 If there is an inner-loop nested in LOOP, then step (4) will also be
4252 applied, and an additional update in the inner-loop will be created:
4253
4254 vp0 = &base_addr;
4255 LOOP: vp1 = phi(vp0,vp2)
4256 ...
4257 inner: vp3 = phi(vp1,vp4)
4258 vp4 = vp3 + inner_step
4259 if () goto inner
4260 ...
4261 vp2 = vp1 + step
4262 if () goto LOOP */
4263
4264 /* (2) Calculate the initial address of the aggregate-pointer, and set
4265 the aggregate-pointer to point to it before the loop. */
4266
4267 /* Create &(base[init_val+offset]) + byte_offset in the loop preheader. */
4268
4269 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4270 offset, loop, byte_offset);
4271 if (new_stmt_list)
4272 {
4273 if (pe)
4274 {
4275 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4276 gcc_assert (!new_bb);
4277 }
4278 else
4279 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4280 }
4281
4282 *initial_address = new_temp;
4283 aggr_ptr_init = new_temp;
4284
4285 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4286 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4287 inner-loop nested in LOOP (during outer-loop vectorization). */
4288
4289 /* No update in loop is required. */
4290 if (only_init && (!loop_vinfo || at_loop == loop))
4291 aptr = aggr_ptr_init;
4292 else
4293 {
4294 /* The step of the aggregate pointer is the type size. */
4295 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4296 /* One exception to the above is when the scalar step of the load in
4297 LOOP is zero. In this case the step here is also zero. */
4298 if (*inv_p)
4299 iv_step = size_zero_node;
4300 else if (tree_int_cst_sgn (step) == -1)
4301 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4302
4303 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4304
4305 create_iv (aggr_ptr_init,
4306 fold_convert (aggr_ptr_type, iv_step),
4307 aggr_ptr, loop, &incr_gsi, insert_after,
4308 &indx_before_incr, &indx_after_incr);
4309 incr = gsi_stmt (incr_gsi);
4310 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4311
4312 /* Copy the points-to information if it exists. */
4313 if (DR_PTR_INFO (dr))
4314 {
4315 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4316 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4317 }
4318 if (ptr_incr)
4319 *ptr_incr = incr;
4320
4321 aptr = indx_before_incr;
4322 }
4323
4324 if (!nested_in_vect_loop || only_init)
4325 return aptr;
4326
4327
4328 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4329 nested in LOOP, if it exists. */
4330
4331 gcc_assert (nested_in_vect_loop);
4332 if (!only_init)
4333 {
4334 standard_iv_increment_position (containing_loop, &incr_gsi,
4335 &insert_after);
4336 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4337 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4338 &indx_after_incr);
4339 incr = gsi_stmt (incr_gsi);
4340 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4341
4342 /* Copy the points-to information if it exists. */
4343 if (DR_PTR_INFO (dr))
4344 {
4345 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4346 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4347 }
4348 if (ptr_incr)
4349 *ptr_incr = incr;
4350
4351 return indx_before_incr;
4352 }
4353 else
4354 gcc_unreachable ();
4355 }
4356
4357
4358 /* Function bump_vector_ptr
4359
4360 Increment a pointer (to a vector type) by vector-size. If requested,
4361 i.e. if PTR_INCR is given, then also connect the new increment stmt
4362 to the existing def-use update-chain of the pointer, by modifying
4363 the PTR_INCR as illustrated below:
4364
4365 The pointer def-use update-chain before this function:
4366 DATAREF_PTR = phi (p_0, p_2)
4367 ....
4368 PTR_INCR: p_2 = DATAREF_PTR + step
4369
4370 The pointer def-use update-chain after this function:
4371 DATAREF_PTR = phi (p_0, p_2)
4372 ....
4373 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4374 ....
4375 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4376
4377 Input:
4378 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4379 in the loop.
4380 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4381 the loop. The increment amount across iterations is expected
4382 to be vector_size.
4383 BSI - location where the new update stmt is to be placed.
4384 STMT - the original scalar memory-access stmt that is being vectorized.
4385 BUMP - optional. The offset by which to bump the pointer. If not given,
4386 the offset is assumed to be vector_size.
4387
4388 Output: Return NEW_DATAREF_PTR as illustrated above.
4389
4390 */
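/* For example (illustrative): for a V8HI dataref with no explicit BUMP,
   the update amount is TYPE_SIZE_UNIT (V8HI) == 16 bytes, so the stmt
   generated below is simply

     NEW_DATAREF_PTR = DATAREF_PTR + 16;   (a POINTER_PLUS_EXPR)  */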
4391
4392 tree
4393 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4394 gimple *stmt, tree bump)
4395 {
4396 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4397 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4398 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4399 tree update = TYPE_SIZE_UNIT (vectype);
4400 gassign *incr_stmt;
4401 ssa_op_iter iter;
4402 use_operand_p use_p;
4403 tree new_dataref_ptr;
4404
4405 if (bump)
4406 update = bump;
4407
4408 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4409 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4410 else
4411 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4412 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4413 dataref_ptr, update);
4414 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4415
4416 /* Copy the points-to information if it exists. */
4417 if (DR_PTR_INFO (dr))
4418 {
4419 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4420 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4421 }
4422
4423 if (!ptr_incr)
4424 return new_dataref_ptr;
4425
4426 /* Update the vector-pointer's cross-iteration increment. */
4427 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4428 {
4429 tree use = USE_FROM_PTR (use_p);
4430
4431 if (use == dataref_ptr)
4432 SET_USE (use_p, new_dataref_ptr);
4433 else
4434 gcc_assert (tree_int_cst_compare (use, update) == 0);
4435 }
4436
4437 return new_dataref_ptr;
4438 }
4439
4440
4441 /* Function vect_create_destination_var.
4442
4443 Create a new temporary of type VECTYPE. */
4444
4445 tree
4446 vect_create_destination_var (tree scalar_dest, tree vectype)
4447 {
4448 tree vec_dest;
4449 const char *name;
4450 char *new_name;
4451 tree type;
4452 enum vect_var_kind kind;
4453
4454 kind = vectype ? vect_simple_var : vect_scalar_var;
4455 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4456
4457 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4458
4459 name = get_name (scalar_dest);
4460 if (name)
4461 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4462 else
4463 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4464 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4465 free (new_name);
4466
4467 return vec_dest;
4468 }
4469
4470 /* Function vect_grouped_store_supported.
4471
4472 Returns TRUE if interleave high and interleave low permutations
4473 are supported, and FALSE otherwise. */
4474
4475 bool
4476 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4477 {
4478 machine_mode mode = TYPE_MODE (vectype);
4479
4480 /* vect_permute_store_chain requires the group size to be equal to 3 or
4481 be a power of two. */
4482 if (count != 3 && exact_log2 (count) == -1)
4483 {
4484 if (dump_enabled_p ())
4485 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4486 "the size of the group of accesses"
4487 " is not a power of 2 or not eqaul to 3\n");
4488 return false;
4489 }
4490
4491 /* Check that the permutation is supported. */
4492 if (VECTOR_MODE_P (mode))
4493 {
4494 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4495 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4496
4497 if (count == 3)
4498 {
4499 unsigned int j0 = 0, j1 = 0, j2 = 0;
4500 unsigned int i, j;
4501
4502 for (j = 0; j < 3; j++)
4503 {
4504 int nelt0 = ((3 - j) * nelt) % 3;
4505 int nelt1 = ((3 - j) * nelt + 1) % 3;
4506 int nelt2 = ((3 - j) * nelt + 2) % 3;
4507 for (i = 0; i < nelt; i++)
4508 {
4509 if (3 * i + nelt0 < nelt)
4510 sel[3 * i + nelt0] = j0++;
4511 if (3 * i + nelt1 < nelt)
4512 sel[3 * i + nelt1] = nelt + j1++;
4513 if (3 * i + nelt2 < nelt)
4514 sel[3 * i + nelt2] = 0;
4515 }
4516 if (!can_vec_perm_p (mode, false, sel))
4517 {
4518 if (dump_enabled_p ())
4519 dump_printf (MSG_MISSED_OPTIMIZATION,
4520 "permutaion op not supported by target.\n");
4521 return false;
4522 }
4523
4524 for (i = 0; i < nelt; i++)
4525 {
4526 if (3 * i + nelt0 < nelt)
4527 sel[3 * i + nelt0] = 3 * i + nelt0;
4528 if (3 * i + nelt1 < nelt)
4529 sel[3 * i + nelt1] = 3 * i + nelt1;
4530 if (3 * i + nelt2 < nelt)
4531 sel[3 * i + nelt2] = nelt + j2++;
4532 }
4533 if (!can_vec_perm_p (mode, false, sel))
4534 {
4535 if (dump_enabled_p ())
4536 dump_printf (MSG_MISSED_OPTIMIZATION,
4537 "permutaion op not supported by target.\n");
4538 return false;
4539 }
4540 }
4541 return true;
4542 }
4543 else
4544 {
4545 /* If length is not equal to 3 then only a power of 2 is supported. */
4546 gcc_assert (exact_log2 (count) != -1);
4547
4548 for (i = 0; i < nelt / 2; i++)
4549 {
4550 sel[i * 2] = i;
4551 sel[i * 2 + 1] = i + nelt;
4552 }
4553 if (can_vec_perm_p (mode, false, sel))
4554 {
4555 for (i = 0; i < nelt; i++)
4556 sel[i] += nelt / 2;
4557 if (can_vec_perm_p (mode, false, sel))
4558 return true;
4559 }
4560 }
4561 }
4562
4563 if (dump_enabled_p ())
4564 dump_printf (MSG_MISSED_OPTIMIZATION,
4565 "permutaion op not supported by target.\n");
4566 return false;
4567 }
4568
4569
4570 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4571 type VECTYPE. */
4572
4573 bool
4574 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4575 {
4576 return vect_lanes_optab_supported_p ("vec_store_lanes",
4577 vec_store_lanes_optab,
4578 vectype, count);
4579 }
4580
4581
4582 /* Function vect_permute_store_chain.
4583
4584 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4585 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4586 the data correctly for the stores. Return the final references for stores
4587 in RESULT_CHAIN.
4588
4589 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4590 The input is 4 vectors each containing 8 elements. We assign a number to
4591 each element, the input sequence is:
4592
4593 1st vec: 0 1 2 3 4 5 6 7
4594 2nd vec: 8 9 10 11 12 13 14 15
4595 3rd vec: 16 17 18 19 20 21 22 23
4596 4th vec: 24 25 26 27 28 29 30 31
4597
4598 The output sequence should be:
4599
4600 1st vec: 0 8 16 24 1 9 17 25
4601 2nd vec: 2 10 18 26 3 11 19 27
4602 3rd vec: 4 12 20 28 5 13 21 29
4603 4th vec: 6 14 22 30 7 15 23 31
4604
4605 i.e., we interleave the contents of the four vectors in their order.
4606
4607 We use interleave_high/low instructions to create such output. The input of
4608 each interleave_high/low operation is two vectors:
4609 1st vec 2nd vec
4610 0 1 2 3 4 5 6 7
4611 the even elements of the result vector are obtained left-to-right from the
4612 high/low elements of the first vector. The odd elements of the result are
4613 obtained left-to-right from the high/low elements of the second vector.
4614 The output of interleave_high will be: 0 4 1 5
4615 and of interleave_low: 2 6 3 7
4616
4617
4618 The permutation is done in log LENGTH stages. In each stage interleave_high
4619 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4620 where the first argument is taken from the first half of DR_CHAIN and the
4621 second argument from its second half.
4622 In our example,
4623
4624 I1: interleave_high (1st vec, 3rd vec)
4625 I2: interleave_low (1st vec, 3rd vec)
4626 I3: interleave_high (2nd vec, 4th vec)
4627 I4: interleave_low (2nd vec, 4th vec)
4628
4629 The output for the first stage is:
4630
4631 I1: 0 16 1 17 2 18 3 19
4632 I2: 4 20 5 21 6 22 7 23
4633 I3: 8 24 9 25 10 26 11 27
4634 I4: 12 28 13 29 14 30 15 31
4635
4636 The output of the second stage, i.e. the final result is:
4637
4638 I1: 0 8 16 24 1 9 17 25
4639 I2: 2 10 18 26 3 11 19 27
4640 I3: 4 12 20 28 5 13 21 29
4641 I4: 6 14 22 30 7 15 23 31. */
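/* For example (illustrative), for V8HI (nelt == 8) the two selectors
   built below for the power-of-2 case are

     perm_mask_high = { 0, 8, 1, 9, 2, 10, 3, 11 }
     perm_mask_low  = { 4, 12, 5, 13, 6, 14, 7, 15 }

   i.e. exactly the interleave-high/low operations described above.  */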
4642
4643 void
4644 vect_permute_store_chain (vec<tree> dr_chain,
4645 unsigned int length,
4646 gimple *stmt,
4647 gimple_stmt_iterator *gsi,
4648 vec<tree> *result_chain)
4649 {
4650 tree vect1, vect2, high, low;
4651 gimple *perm_stmt;
4652 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4653 tree perm_mask_low, perm_mask_high;
4654 tree data_ref;
4655 tree perm3_mask_low, perm3_mask_high;
4656 unsigned int i, n, log_length = exact_log2 (length);
4657 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4658 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4659
4660 result_chain->quick_grow (length);
4661 memcpy (result_chain->address (), dr_chain.address (),
4662 length * sizeof (tree));
4663
4664 if (length == 3)
4665 {
4666 unsigned int j0 = 0, j1 = 0, j2 = 0;
4667
4668 for (j = 0; j < 3; j++)
4669 {
4670 int nelt0 = ((3 - j) * nelt) % 3;
4671 int nelt1 = ((3 - j) * nelt + 1) % 3;
4672 int nelt2 = ((3 - j) * nelt + 2) % 3;
4673
4674 for (i = 0; i < nelt; i++)
4675 {
4676 if (3 * i + nelt0 < nelt)
4677 sel[3 * i + nelt0] = j0++;
4678 if (3 * i + nelt1 < nelt)
4679 sel[3 * i + nelt1] = nelt + j1++;
4680 if (3 * i + nelt2 < nelt)
4681 sel[3 * i + nelt2] = 0;
4682 }
4683 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4684
4685 for (i = 0; i < nelt; i++)
4686 {
4687 if (3 * i + nelt0 < nelt)
4688 sel[3 * i + nelt0] = 3 * i + nelt0;
4689 if (3 * i + nelt1 < nelt)
4690 sel[3 * i + nelt1] = 3 * i + nelt1;
4691 if (3 * i + nelt2 < nelt)
4692 sel[3 * i + nelt2] = nelt + j2++;
4693 }
4694 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4695
4696 vect1 = dr_chain[0];
4697 vect2 = dr_chain[1];
4698
4699 /* Create interleaving stmt:
4700 low = VEC_PERM_EXPR <vect1, vect2,
4701 {j, nelt, *, j + 1, nelt + j + 1, *,
4702 j + 2, nelt + j + 2, *, ...}> */
4703 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4704 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4705 vect2, perm3_mask_low);
4706 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4707
4708 vect1 = data_ref;
4709 vect2 = dr_chain[2];
4710 /* Create interleaving stmt:
4711 low = VEC_PERM_EXPR <vect1, vect2,
4712 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4713 6, 7, nelt + j + 2, ...}> */
4714 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4715 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4716 vect2, perm3_mask_high);
4717 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4718 (*result_chain)[j] = data_ref;
4719 }
4720 }
4721 else
4722 {
4723 /* If length is not equal to 3 then only a power of 2 is supported. */
4724 gcc_assert (exact_log2 (length) != -1);
4725
4726 for (i = 0, n = nelt / 2; i < n; i++)
4727 {
4728 sel[i * 2] = i;
4729 sel[i * 2 + 1] = i + nelt;
4730 }
4731 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4732
4733 for (i = 0; i < nelt; i++)
4734 sel[i] += nelt / 2;
4735 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4736
4737 for (i = 0, n = log_length; i < n; i++)
4738 {
4739 for (j = 0; j < length/2; j++)
4740 {
4741 vect1 = dr_chain[j];
4742 vect2 = dr_chain[j+length/2];
4743
4744 /* Create interleaving stmt:
4745 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4746 ...}> */
4747 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4748 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4749 vect2, perm_mask_high);
4750 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4751 (*result_chain)[2*j] = high;
4752
4753 /* Create interleaving stmt:
4754 low = VEC_PERM_EXPR <vect1, vect2,
4755 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4756 ...}> */
4757 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4758 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4759 vect2, perm_mask_low);
4760 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4761 (*result_chain)[2*j+1] = low;
4762 }
4763 memcpy (dr_chain.address (), result_chain->address (),
4764 length * sizeof (tree));
4765 }
4766 }
4767 }
4768
4769 /* Function vect_setup_realignment
4770
4771 This function is called when vectorizing an unaligned load using
4772 the dr_explicit_realign[_optimized] scheme.
4773 This function generates the following code at the loop prolog:
4774
4775 p = initial_addr;
4776 x msq_init = *(floor(p)); # prolog load
4777 realignment_token = call target_builtin;
4778 loop:
4779 x msq = phi (msq_init, ---)
4780
4781 The stmts marked with x are generated only for the case of
4782 dr_explicit_realign_optimized.
4783
4784 The code above sets up a new (vector) pointer, pointing to the first
4785 location accessed by STMT, and a "floor-aligned" load using that pointer.
4786 It also generates code to compute the "realignment-token" (if the relevant
4787 target hook was defined), and creates a phi-node at the loop-header bb
4788 whose arguments are the result of the prolog-load (created by this
4789 function) and the result of a load that takes place in the loop (to be
4790 created by the caller to this function).
4791
4792 For the case of dr_explicit_realign_optimized:
4793 The caller to this function uses the phi-result (msq) to create the
4794 realignment code inside the loop, and sets up the missing phi argument,
4795 as follows:
4796 loop:
4797 msq = phi (msq_init, lsq)
4798 lsq = *(floor(p')); # load in loop
4799 result = realign_load (msq, lsq, realignment_token);
4800
4801 For the case of dr_explicit_realign:
4802 loop:
4803 msq = *(floor(p)); # load in loop
4804 p' = p + (VS-1);
4805 lsq = *(floor(p')); # load in loop
4806 result = realign_load (msq, lsq, realignment_token);
4807
4808 Input:
4809 STMT - (scalar) load stmt to be vectorized. This load accesses
4810 a memory location that may be unaligned.
4811 BSI - place where new code is to be inserted.
4812 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4813 is used.
4814
4815 Output:
4816 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4817 target hook, if defined.
4818 Return value - the result of the loop-header phi node. */
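/* For example (illustrative): with 4-element vectors, if the scalar
   address p is misaligned by 2 elements, then
     msq = *floor(p)          = { x, x, e0, e1 }
     lsq = *floor(p + VS - 1) = { e2, e3, y, y }
   and realign_load (msq, lsq, realignment_token) produces the wanted
   vector { e0, e1, e2, e3 }.  */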
4819
4820 tree
4821 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4822 tree *realignment_token,
4823 enum dr_alignment_support alignment_support_scheme,
4824 tree init_addr,
4825 struct loop **at_loop)
4826 {
4827 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4828 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4829 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4830 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4831 struct loop *loop = NULL;
4832 edge pe = NULL;
4833 tree scalar_dest = gimple_assign_lhs (stmt);
4834 tree vec_dest;
4835 gimple *inc;
4836 tree ptr;
4837 tree data_ref;
4838 basic_block new_bb;
4839 tree msq_init = NULL_TREE;
4840 tree new_temp;
4841 gphi *phi_stmt;
4842 tree msq = NULL_TREE;
4843 gimple_seq stmts = NULL;
4844 bool inv_p;
4845 bool compute_in_loop = false;
4846 bool nested_in_vect_loop = false;
4847 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4848 struct loop *loop_for_initial_load = NULL;
4849
4850 if (loop_vinfo)
4851 {
4852 loop = LOOP_VINFO_LOOP (loop_vinfo);
4853 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4854 }
4855
4856 gcc_assert (alignment_support_scheme == dr_explicit_realign
4857 || alignment_support_scheme == dr_explicit_realign_optimized);
4858
4859 /* We need to generate three things:
4860 1. the misalignment computation
4861 2. the extra vector load (for the optimized realignment scheme).
4862 3. the phi node for the two vectors from which the realignment is
4863 done (for the optimized realignment scheme). */
4864
4865 /* 1. Determine where to generate the misalignment computation.
4866
4867 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4868 calculation will be generated by this function, outside the loop (in the
4869 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4870 caller, inside the loop.
4871
4872 Background: If the misalignment remains fixed throughout the iterations of
4873 the loop, then both realignment schemes are applicable, and also the
4874 misalignment computation can be done outside LOOP. This is because we are
4875 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4876 are a multiple of VS (the Vector Size), and therefore the misalignment in
4877 different vectorized LOOP iterations is always the same.
4878 The problem arises only if the memory access is in an inner-loop nested
4879 inside LOOP, which is now being vectorized using outer-loop vectorization.
4880 This is the only case when the misalignment of the memory access may not
4881 remain fixed throughout the iterations of the inner-loop (as explained in
4882 detail in vect_supportable_dr_alignment). In this case, not only is the
4883 optimized realignment scheme not applicable, but also the misalignment
4884 computation (and generation of the realignment token that is passed to
4885 REALIGN_LOAD) have to be done inside the loop.
4886
4887 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4888 or not, which in turn determines if the misalignment is computed inside
4889 the inner-loop, or outside LOOP. */
4890
4891 if (init_addr != NULL_TREE || !loop_vinfo)
4892 {
4893 compute_in_loop = true;
4894 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4895 }
4896
4897
4898 /* 2. Determine where to generate the extra vector load.
4899
4900 For the optimized realignment scheme, instead of generating two vector
4901 loads in each iteration, we generate a single extra vector load in the
4902 preheader of the loop, and in each iteration reuse the result of the
4903 vector load from the previous iteration. In case the memory access is in
4904 an inner-loop nested inside LOOP, which is now being vectorized using
4905 outer-loop vectorization, we need to determine whether this initial vector
4906 load should be generated at the preheader of the inner-loop, or can be
4907 generated at the preheader of LOOP. If the memory access has no evolution
4908 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4909 to be generated inside LOOP (in the preheader of the inner-loop). */
4910
4911 if (nested_in_vect_loop)
4912 {
4913 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4914 bool invariant_in_outerloop =
4915 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4916 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4917 }
4918 else
4919 loop_for_initial_load = loop;
4920 if (at_loop)
4921 *at_loop = loop_for_initial_load;
4922
4923 if (loop_for_initial_load)
4924 pe = loop_preheader_edge (loop_for_initial_load);
4925
4926 /* 3. For the case of the optimized realignment, create the first vector
4927 load at the loop preheader. */
4928
4929 if (alignment_support_scheme == dr_explicit_realign_optimized)
4930 {
4931 /* Create msq_init = *(floor(p1)) in the loop preheader */
4932 gassign *new_stmt;
4933
4934 gcc_assert (!compute_in_loop);
4935 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4936 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4937 NULL_TREE, &init_addr, NULL, &inc,
4938 true, &inv_p);
4939 if (TREE_CODE (ptr) == SSA_NAME)
4940 new_temp = copy_ssa_name (ptr);
4941 else
4942 new_temp = make_ssa_name (TREE_TYPE (ptr));
4943 new_stmt = gimple_build_assign
4944 (new_temp, BIT_AND_EXPR, ptr,
4945 build_int_cst (TREE_TYPE (ptr),
4946 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
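/* new_temp is PTR with its low bits masked off, i.e. PTR rounded down
   to the vector alignment; e.g. for a 16-byte vector this computes
   PTR & -16.  */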
4947 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4948 gcc_assert (!new_bb);
4949 data_ref
4950 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4951 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4952 new_stmt = gimple_build_assign (vec_dest, data_ref);
4953 new_temp = make_ssa_name (vec_dest, new_stmt);
4954 gimple_assign_set_lhs (new_stmt, new_temp);
4955 if (pe)
4956 {
4957 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4958 gcc_assert (!new_bb);
4959 }
4960 else
4961 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4962
4963 msq_init = gimple_assign_lhs (new_stmt);
4964 }
4965
4966 /* 4. Create realignment token using a target builtin, if available.
4967 It is done either inside the containing loop, or before LOOP (as
4968 determined above). */
4969
4970 if (targetm.vectorize.builtin_mask_for_load)
4971 {
4972 gcall *new_stmt;
4973 tree builtin_decl;
4974
4975 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4976 if (!init_addr)
4977 {
4978 /* Generate the INIT_ADDR computation outside LOOP. */
4979 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4980 NULL_TREE, loop);
4981 if (loop)
4982 {
4983 pe = loop_preheader_edge (loop);
4984 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4985 gcc_assert (!new_bb);
4986 }
4987 else
4988 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4989 }
4990
4991 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4992 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4993 vec_dest =
4994 vect_create_destination_var (scalar_dest,
4995 gimple_call_return_type (new_stmt));
4996 new_temp = make_ssa_name (vec_dest, new_stmt);
4997 gimple_call_set_lhs (new_stmt, new_temp);
4998
4999 if (compute_in_loop)
5000 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5001 else
5002 {
5003 /* Generate the misalignment computation outside LOOP. */
5004 pe = loop_preheader_edge (loop);
5005 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5006 gcc_assert (!new_bb);
5007 }
5008
5009 *realignment_token = gimple_call_lhs (new_stmt);
5010
5011 /* The result of the CALL_EXPR to this builtin is determined from
5012 the value of the parameter, and no global variables are touched,
5013 which makes the builtin a "const" function. Requiring the
5014 builtin to have the "const" attribute makes it unnecessary
5015 to call mark_call_clobbered. */
5016 gcc_assert (TREE_READONLY (builtin_decl));
5017 }
5018
5019 if (alignment_support_scheme == dr_explicit_realign)
5020 return msq;
5021
5022 gcc_assert (!compute_in_loop);
5023 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5024
5025
5026 /* 5. Create msq = phi <msq_init, lsq> in loop */
5027
5028 pe = loop_preheader_edge (containing_loop);
5029 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5030 msq = make_ssa_name (vec_dest);
5031 phi_stmt = create_phi_node (msq, containing_loop->header);
5032 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5033
5034 return msq;
5035 }
5036
5037
5038 /* Function vect_grouped_load_supported.
5039
5040 Returns TRUE if even and odd permutations are supported,
5041 and FALSE otherwise. */
5042
5043 bool
5044 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5045 {
5046 machine_mode mode = TYPE_MODE (vectype);
5047
5048 /* vect_permute_load_chain requires the group size to be equal to 3 or
5049 be a power of two. */
5050 if (count != 3 && exact_log2 (count) == -1)
5051 {
5052 if (dump_enabled_p ())
5053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5054 "the size of the group of accesses"
5055 " is not a power of 2 or not equal to 3\n");
5056 return false;
5057 }
5058
5059 /* Check that the permutation is supported. */
5060 if (VECTOR_MODE_P (mode))
5061 {
5062 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5063 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5064
5065 if (count == 3)
5066 {
5067 unsigned int k;
5068 for (k = 0; k < 3; k++)
5069 {
5070 for (i = 0; i < nelt; i++)
5071 if (3 * i + k < 2 * nelt)
5072 sel[i] = 3 * i + k;
5073 else
5074 sel[i] = 0;
5075 if (!can_vec_perm_p (mode, false, sel))
5076 {
5077 if (dump_enabled_p ())
5078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5079 "shuffle of 3 loads is not supported by"
5080 " target\n");
5081 return false;
5082 }
5083 for (i = 0, j = 0; i < nelt; i++)
5084 if (3 * i + k < 2 * nelt)
5085 sel[i] = i;
5086 else
5087 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5088 if (!can_vec_perm_p (mode, false, sel))
5089 {
5090 if (dump_enabled_p ())
5091 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5092 "shuffle of 3 loads is not supported by"
5093 " target\n");
5094 return false;
5095 }
5096 }
5097 return true;
5098 }
5099 else
5100 {
5101 /* If length is not equal to 3 then only a power of 2 is supported. */
5102 gcc_assert (exact_log2 (count) != -1);
5103 for (i = 0; i < nelt; i++)
5104 sel[i] = i * 2;
5105 if (can_vec_perm_p (mode, false, sel))
5106 {
5107 for (i = 0; i < nelt; i++)
5108 sel[i] = i * 2 + 1;
5109 if (can_vec_perm_p (mode, false, sel))
5110 return true;
5111 }
5112 }
5113 }
5114
5115 if (dump_enabled_p ())
5116 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5117 "extract even/odd not supported by target\n");
5118 return false;
5119 }
5120
5121 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5122 type VECTYPE. */
5123
5124 bool
5125 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5126 {
5127 return vect_lanes_optab_supported_p ("vec_load_lanes",
5128 vec_load_lanes_optab,
5129 vectype, count);
5130 }
5131
5132 /* Function vect_permute_load_chain.
5133
5134 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5135 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5136 the input data correctly. Return the final references for loads in
5137 RESULT_CHAIN.
5138
5139 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5140 The input is 4 vectors each containing 8 elements. We assign a number to each
5141 element, the input sequence is:
5142
5143 1st vec: 0 1 2 3 4 5 6 7
5144 2nd vec: 8 9 10 11 12 13 14 15
5145 3rd vec: 16 17 18 19 20 21 22 23
5146 4th vec: 24 25 26 27 28 29 30 31
5147
5148 The output sequence should be:
5149
5150 1st vec: 0 4 8 12 16 20 24 28
5151 2nd vec: 1 5 9 13 17 21 25 29
5152 3rd vec: 2 6 10 14 18 22 26 30
5153 4th vec: 3 7 11 15 19 23 27 31
5154
5155 i.e., the first output vector should contain the first elements of each
5156 interleaving group, etc.
5157
5158 We use extract_even/odd instructions to create such output. The input of
5159 each extract_even/odd operation is two vectors
5160 1st vec 2nd vec
5161 0 1 2 3 4 5 6 7
5162
5163 and the output is the vector of extracted even/odd elements. The output of
5164 extract_even will be: 0 2 4 6
5165 and of extract_odd: 1 3 5 7
5166
5167
5168 The permutation is done in log LENGTH stages. In each stage extract_even
5169 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5170 their order. In our example,
5171
5172 E1: extract_even (1st vec, 2nd vec)
5173 E2: extract_odd (1st vec, 2nd vec)
5174 E3: extract_even (3rd vec, 4th vec)
5175 E4: extract_odd (3rd vec, 4th vec)
5176
5177 The output for the first stage will be:
5178
5179 E1: 0 2 4 6 8 10 12 14
5180 E2: 1 3 5 7 9 11 13 15
5181 E3: 16 18 20 22 24 26 28 30
5182 E4: 17 19 21 23 25 27 29 31
5183
5184 In order to proceed and create the correct sequence for the next stage (or
5185 for the correct output, if the second stage is the last one, as in our
5186 example), we first put the output of extract_even operation and then the
5187 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5188 The input for the second stage is:
5189
5190 1st vec (E1): 0 2 4 6 8 10 12 14
5191 2nd vec (E3): 16 18 20 22 24 26 28 30
5192 3rd vec (E2): 1 3 5 7 9 11 13 15
5193 4th vec (E4): 17 19 21 23 25 27 29 31
5194
5195 The output of the second stage:
5196
5197 E1: 0 4 8 12 16 20 24 28
5198 E2: 2 6 10 14 18 22 26 30
5199 E3: 1 5 9 13 17 21 25 29
5200 E4: 3 7 11 15 19 23 27 31
5201
5202 And RESULT_CHAIN after reordering:
5203
5204 1st vec (E1): 0 4 8 12 16 20 24 28
5205 2nd vec (E3): 1 5 9 13 17 21 25 29
5206 3rd vec (E2): 2 6 10 14 18 22 26 30
5207 4th vec (E4): 3 7 11 15 19 23 27 31. */
5208
5209 static void
5210 vect_permute_load_chain (vec<tree> dr_chain,
5211 unsigned int length,
5212 gimple *stmt,
5213 gimple_stmt_iterator *gsi,
5214 vec<tree> *result_chain)
5215 {
5216 tree data_ref, first_vect, second_vect;
5217 tree perm_mask_even, perm_mask_odd;
5218 tree perm3_mask_low, perm3_mask_high;
5219 gimple *perm_stmt;
5220 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5221 unsigned int i, j, log_length = exact_log2 (length);
5222 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5223 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5224
5225 result_chain->quick_grow (length);
5226 memcpy (result_chain->address (), dr_chain.address (),
5227 length * sizeof (tree));
5228
5229 if (length == 3)
5230 {
5231 unsigned int k;
5232
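/* Each iteration K builds the K-th output vector, i.e. the vector of
   elements K, 3 + K, 6 + K, ... of the interleaved input, using two
   VEC_PERM_EXPRs over the three input vectors.  */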
5233 for (k = 0; k < 3; k++)
5234 {
5235 for (i = 0; i < nelt; i++)
5236 if (3 * i + k < 2 * nelt)
5237 sel[i] = 3 * i + k;
5238 else
5239 sel[i] = 0;
5240 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5241
5242 for (i = 0, j = 0; i < nelt; i++)
5243 if (3 * i + k < 2 * nelt)
5244 sel[i] = i;
5245 else
5246 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5247
5248 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5249
5250 first_vect = dr_chain[0];
5251 second_vect = dr_chain[1];
5252
5253 /* Create interleaving stmt (low part of):
5254 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5255 ...}> */
5256 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5257 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5258 second_vect, perm3_mask_low);
5259 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5260
5261 /* Create interleaving stmt (high part of):
5262 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5263 ...}> */
5264 first_vect = data_ref;
5265 second_vect = dr_chain[2];
5266 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5267 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5268 second_vect, perm3_mask_high);
5269 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5270 (*result_chain)[k] = data_ref;
5271 }
5272 }
5273 else
5274 {
5275 /* If the length is not equal to 3, then only a power of 2 is supported.  */
5276 gcc_assert (exact_log2 (length) != -1);
5277
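/* Build the extract-even mask {0, 2, 4, ...} and the extract-odd mask
   {1, 3, 5, ...} once; the same masks are used in every stage.  */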
5278 for (i = 0; i < nelt; ++i)
5279 sel[i] = i * 2;
5280 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5281
5282 for (i = 0; i < nelt; ++i)
5283 sel[i] = i * 2 + 1;
5284 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5285
5286 for (i = 0; i < log_length; i++)
5287 {
5288 for (j = 0; j < length; j += 2)
5289 {
5290 first_vect = dr_chain[j];
5291 second_vect = dr_chain[j+1];
5292
5293 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5294 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5295 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5296 first_vect, second_vect,
5297 perm_mask_even);
5298 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5299 (*result_chain)[j/2] = data_ref;
5300
5301 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5302 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5303 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5304 first_vect, second_vect,
5305 perm_mask_odd);
5306 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5307 (*result_chain)[j/2+length/2] = data_ref;
5308 }
5309 memcpy (dr_chain.address (), result_chain->address (),
5310 length * sizeof (tree));
5311 }
5312 }
5313 }
5314
5315 /* Function vect_shift_permute_load_chain.
5316
5317 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5318 a sequence of stmts to reorder the input data accordingly.
5319 Return the final references for loads in RESULT_CHAIN.
5320 Return true if successful, false otherwise.
5321
5322 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5323 The input is 3 vectors each containing 8 elements. We assign a
5324 number to each element, the input sequence is:
5325
5326 1st vec: 0 1 2 3 4 5 6 7
5327 2nd vec: 8 9 10 11 12 13 14 15
5328 3rd vec: 16 17 18 19 20 21 22 23
5329
5330 The output sequence should be:
5331
5332 1st vec: 0 3 6 9 12 15 18 21
5333 2nd vec: 1 4 7 10 13 16 19 22
5334 3rd vec: 2 5 8 11 14 17 20 23
5335
5336 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5337
5338 First we shuffle all 3 vectors to get correct elements order:
5339
5340 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5341 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5342 3rd vec: (16 19 22) (17 20 23) (18 21)
5343
5344 Next we concatenate and shift the vectors 3 times:
5345
5346 1st step:
5347 shift right by 6 the concatenation of:
5348 "1st vec" and "2nd vec"
5349 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5350 "2nd vec" and "3rd vec"
5351 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5352 "3rd vec" and "1st vec"
5353 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5354 | New vectors |
5355
5356 So that now new vectors are:
5357
5358 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5359 2nd vec: (10 13) (16 19 22) (17 20 23)
5360 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5361
5362 2nd step:
5363 shift right by 5 the concatenation of:
5364 "1st vec" and "3rd vec"
5365 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5366 "2nd vec" and "1st vec"
5367 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5368 "3rd vec" and "2nd vec"
5369 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5370 | New vectors |
5371
5372 So that now new vectors are:
5373
5374 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5375 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5376 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5377
5378 3rd step:
5379 shift right by 5 the concatenation of:
5380 "1st vec" and "1st vec"
5381 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5382 shift right by 3 the concatenation of:
5383 "2nd vec" and "2nd vec"
5384 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5385 | New vectors |
5386
5387 So that now all vectors are READY:
5388 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5389 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5390 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5391
5392 This algorithm is faster than the one in vect_permute_load_chain if:
5393 1. "shift of a concatenation" is faster than a general permutation.
5394 This is usually the case.
5395 2. The TARGET machine can't execute vector instructions in parallel.
5396 This is because each step of the algorithm depends on the previous one;
5397 the algorithm in vect_permute_load_chain is much more parallel.
5398
5399 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5400 */
5401
5402 static bool
5403 vect_shift_permute_load_chain (vec<tree> dr_chain,
5404 unsigned int length,
5405 gimple *stmt,
5406 gimple_stmt_iterator *gsi,
5407 vec<tree> *result_chain)
5408 {
5409 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5410 tree perm2_mask1, perm2_mask2, perm3_mask;
5411 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5412 gimple *perm_stmt;
5413
5414 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5415 unsigned int i;
5416 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5417 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5418 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5419 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5420
5421 result_chain->quick_grow (length);
5422 memcpy (result_chain->address (), dr_chain.address (),
5423 length * sizeof (tree));
5424
5425 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5426 {
5427 unsigned int j, log_length = exact_log2 (length);
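/* Generating permutation constant to reorder a single vector so that
   its even elements come first and its odd elements second.
   For vector length 8 it is {0 2 4 6 1 3 5 7}.  */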
5428 for (i = 0; i < nelt / 2; ++i)
5429 sel[i] = i * 2;
5430 for (i = 0; i < nelt / 2; ++i)
5431 sel[nelt / 2 + i] = i * 2 + 1;
5432 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5433 {
5434 if (dump_enabled_p ())
5435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5436 "shuffle of 2 fields structure is not \
5437 supported by target\n");
5438 return false;
5439 }
5440 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5441
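/* The same reordering with the odd elements first.
   For vector length 8 it is {1 3 5 7 0 2 4 6}.  */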
5442 for (i = 0; i < nelt / 2; ++i)
5443 sel[i] = i * 2 + 1;
5444 for (i = 0; i < nelt / 2; ++i)
5445 sel[nelt / 2 + i] = i * 2;
5446 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5447 {
5448 if (dump_enabled_p ())
5449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5450 "shuffle of 2 fields structure is not \
5451 supported by target\n");
5452 return false;
5453 }
5454 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5455
5456 /* Generating permutation constant to shift all elements.
5457 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5458 for (i = 0; i < nelt; i++)
5459 sel[i] = nelt / 2 + i;
5460 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5461 {
5462 if (dump_enabled_p ())
5463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5464 "shift permutation is not supported by target\n");
5465 return false;
5466 }
5467 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5468
5469 /* Generating permutation constant to select vector from 2.
5470 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5471 for (i = 0; i < nelt / 2; i++)
5472 sel[i] = i;
5473 for (i = nelt / 2; i < nelt; i++)
5474 sel[i] = nelt + i;
5475 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5476 {
5477 if (dump_enabled_p ())
5478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5479 "select is not supported by target\n");
5480 return false;
5481 }
5482 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5483
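/* For every pair of input vectors, the shift below collects their odd
   elements and the select collects their even elements.  */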
5484 for (i = 0; i < log_length; i++)
5485 {
5486 for (j = 0; j < length; j += 2)
5487 {
5488 first_vect = dr_chain[j];
5489 second_vect = dr_chain[j + 1];
5490
5491 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5492 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5493 first_vect, first_vect,
5494 perm2_mask1);
5495 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5496 vect[0] = data_ref;
5497
5498 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5499 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5500 second_vect, second_vect,
5501 perm2_mask2);
5502 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5503 vect[1] = data_ref;
5504
5505 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5506 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5507 vect[0], vect[1], shift1_mask);
5508 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5509 (*result_chain)[j/2 + length/2] = data_ref;
5510
5511 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5512 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5513 vect[0], vect[1], select_mask);
5514 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5515 (*result_chain)[j/2] = data_ref;
5516 }
5517 memcpy (dr_chain.address (), result_chain->address (),
5518 length * sizeof (tree));
5519 }
5520 return true;
5521 }
5522 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5523 {
5524 unsigned int k = 0, l = 0;
5525
5526 /* Generating permutation constant to get all elements in the right order.
5527 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5528 for (i = 0; i < nelt; i++)
5529 {
5530 if (3 * k + (l % 3) >= nelt)
5531 {
5532 k = 0;
5533 l += (3 - (nelt % 3));
5534 }
5535 sel[i] = 3 * k + (l % 3);
5536 k++;
5537 }
5538 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5539 {
5540 if (dump_enabled_p ())
5541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5542 "shuffle of 3 fields structure is not \
5543 supported by target\n");
5544 return false;
5545 }
5546 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5547
5548 /* Generating permutation constant to shift all elements.
5549 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5550 for (i = 0; i < nelt; i++)
5551 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5552 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5553 {
5554 if (dump_enabled_p ())
5555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5556 "shift permutation is not supported by target\n");
5557 return false;
5558 }
5559 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5560
5561 /* Generating permutation constant to shift all elements.
5562 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5563 for (i = 0; i < nelt; i++)
5564 sel[i] = 2 * (nelt / 3) + 1 + i;
5565 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5566 {
5567 if (dump_enabled_p ())
5568 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5569 "shift permutation is not supported by target\n");
5570 return false;
5571 }
5572 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5573
5574 /* Generating permutation constant to shift all elements.
5575 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5576 for (i = 0; i < nelt; i++)
5577 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5578 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5579 {
5580 if (dump_enabled_p ())
5581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5582 "shift permutation is not supported by target\n");
5583 return false;
5584 }
5585 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5586
5587 /* Generating permutation constant to shift all elements.
5588 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5589 for (i = 0; i < nelt; i++)
5590 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5591 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5592 {
5593 if (dump_enabled_p ())
5594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5595 "shift permutation is not supported by target\n");
5596 return false;
5597 }
5598 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5599
5600 for (k = 0; k < 3; k++)
5601 {
5602 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5603 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5604 dr_chain[k], dr_chain[k],
5605 perm3_mask);
5606 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5607 vect[k] = data_ref;
5608 }
5609
5610 for (k = 0; k < 3; k++)
5611 {
5612 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5613 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5614 vect[k % 3], vect[(k + 1) % 3],
5615 shift1_mask);
5616 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5617 vect_shift[k] = data_ref;
5618 }
5619
5620 for (k = 0; k < 3; k++)
5621 {
5622 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5623 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5624 vect_shift[(4 - k) % 3],
5625 vect_shift[(3 - k) % 3],
5626 shift2_mask);
5627 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5628 vect[k] = data_ref;
5629 }
5630
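/* Assemble RESULT_CHAIN from the shifted vectors; the final placement
   depends on NELT % 3.  */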
5631 (*result_chain)[3 - (nelt % 3)] = vect[2];
5632
5633 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5634 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5635 vect[0], shift3_mask);
5636 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5637 (*result_chain)[nelt % 3] = data_ref;
5638
5639 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5640 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5641 vect[1], shift4_mask);
5642 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5643 (*result_chain)[0] = data_ref;
5644 return true;
5645 }
5646 return false;
5647 }
5648
5649 /* Function vect_transform_grouped_load.
5650
5651 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5652 to perform their permutation and ascribe the resulting vectorized statements to
5653 the scalar statements.
5654 */
5655
5656 void
5657 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5658 gimple_stmt_iterator *gsi)
5659 {
5660 machine_mode mode;
5661 vec<tree> result_chain = vNULL;
5662
5663 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5664 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
5665 permuted vectors, which are ready for vector computation.  */
5666 result_chain.create (size);
5667
5668 /* If the reassociation width for the vector type is 2 or greater, the
5669 target machine can execute 2 or more vector instructions in parallel.
5670 Otherwise try to get the chain for the load group using vect_shift_permute_load_chain.  */
5671 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5672 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5673 || exact_log2 (size) != -1
5674 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5675 gsi, &result_chain))
5676 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5677 vect_record_grouped_load_vectors (stmt, result_chain);
5678 result_chain.release ();
5679 }
5680
5681 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5682 generated as part of the vectorization of STMT. Assign the statement
5683 for each vector to the associated scalar statement. */
5684
5685 void
5686 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5687 {
5688 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5689 gimple *next_stmt, *new_stmt;
5690 unsigned int i, gap_count;
5691 tree tmp_data_ref;
5692
5693 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5694 Since we scan the chain starting from its first node, their order
5695 corresponds to the order of data-refs in RESULT_CHAIN.  */
5696 next_stmt = first_stmt;
5697 gap_count = 1;
5698 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5699 {
5700 if (!next_stmt)
5701 break;
5702
5703 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5704 code elimination pass later. No need to check for the first stmt in
5705 the group, since it always exists.
5706 GROUP_GAP is the number of steps in elements from the previous
5707 access (if there is no gap GROUP_GAP is 1). We skip loads that
5708 correspond to the gaps. */
5709 if (next_stmt != first_stmt
5710 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5711 {
5712 gap_count++;
5713 continue;
5714 }
5715
5716 while (next_stmt)
5717 {
5718 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5719 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5720 copies, and we put the new vector statement in the first available
5721 RELATED_STMT. */
5722 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5723 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5724 else
5725 {
5726 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5727 {
5728 gimple *prev_stmt =
5729 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5730 gimple *rel_stmt =
5731 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5732 while (rel_stmt)
5733 {
5734 prev_stmt = rel_stmt;
5735 rel_stmt =
5736 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5737 }
5738
5739 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5740 new_stmt;
5741 }
5742 }
5743
5744 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5745 gap_count = 1;
5746 /* If NEXT_STMT accesses the same DR as the previous statement,
5747 put the same TMP_DATA_REF as its vectorized statement; otherwise
5748 get the next data-ref from RESULT_CHAIN. */
5749 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5750 break;
5751 }
5752 }
5753 }
5754
5755 /* Function vect_can_force_dr_alignment_p.
5756
5757 Returns whether the alignment of DECL can be forced to be aligned
5758 on an ALIGNMENT-bit boundary.  */
5759
5760 bool
5761 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5762 {
5763 if (TREE_CODE (decl) != VAR_DECL)
5764 return false;
5765
5766 if (decl_in_symtab_p (decl)
5767 && !symtab_node::get (decl)->can_increase_alignment_p ())
5768 return false;
5769
5770 if (TREE_STATIC (decl))
5771 return (alignment <= MAX_OFILE_ALIGNMENT);
5772 else
5773 return (alignment <= MAX_STACK_ALIGNMENT);
5774 }
5775
5776
5777 /* Return whether the data reference DR is supported with respect to its
5778 alignment.
5779 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5780 if it is aligned, i.e., check if it is possible to vectorize it with a different
5781 alignment. */
5782
5783 enum dr_alignment_support
5784 vect_supportable_dr_alignment (struct data_reference *dr,
5785 bool check_aligned_accesses)
5786 {
5787 gimple *stmt = DR_STMT (dr);
5788 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5789 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5790 machine_mode mode = TYPE_MODE (vectype);
5791 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5792 struct loop *vect_loop = NULL;
5793 bool nested_in_vect_loop = false;
5794
5795 if (aligned_access_p (dr) && !check_aligned_accesses)
5796 return dr_aligned;
5797
5798 /* For now assume all conditional loads/stores support unaligned
5799 access without any special code. */
5800 if (is_gimple_call (stmt)
5801 && gimple_call_internal_p (stmt)
5802 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5803 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5804 return dr_unaligned_supported;
5805
5806 if (loop_vinfo)
5807 {
5808 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5809 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5810 }
5811
5812 /* Possibly unaligned access. */
5813
5814 /* We can choose between using the implicit realignment scheme (generating
5815 a misaligned_move stmt) and the explicit realignment scheme (generating
5816 aligned loads with a REALIGN_LOAD). There are two variants to the
5817 explicit realignment scheme: optimized, and unoptimized.
5818 We can optimize the realignment only if the step between consecutive
5819 vector loads is equal to the vector size. Since the vector memory
5820 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5821 is guaranteed that the misalignment amount remains the same throughout the
5822 execution of the vectorized loop. Therefore, we can create the
5823 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5824 at the loop preheader.
5825
5826 However, in the case of outer-loop vectorization, when vectorizing a
5827 memory access in the inner-loop nested within the LOOP that is now being
5828 vectorized, while it is guaranteed that the misalignment of the
5829 vectorized memory access will remain the same in different outer-loop
5830 iterations, it is *not* guaranteed that is will remain the same throughout
5831 the execution of the inner-loop. This is because the inner-loop advances
5832 with the original scalar step (and not in steps of VS). If the inner-loop
5833 step happens to be a multiple of VS, then the misalignment remains fixed
5834 and we can use the optimized realignment scheme. For example:
5835
5836 for (i=0; i<N; i++)
5837 for (j=0; j<M; j++)
5838 s += a[i+j];
5839
5840 When vectorizing the i-loop in the above example, the step between
5841 consecutive vector loads is 1, and so the misalignment does not remain
5842 fixed across the execution of the inner-loop, and the realignment cannot
5843 be optimized (as illustrated in the following pseudo vectorized loop):
5844
5845 for (i=0; i<N; i+=4)
5846 for (j=0; j<M; j++){
5847 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5848 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5849 // (assuming that we start from an aligned address).
5850 }
5851
5852 We therefore have to use the unoptimized realignment scheme:
5853
5854 for (i=0; i<N; i+=4)
5855 for (j=k; j<M; j+=4)
5856 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5857 // that the misalignment of the initial address is
5858 // 0).
5859
5860 The loop can then be vectorized as follows:
5861
5862 for (k=0; k<4; k++){
5863 rt = get_realignment_token (&vp[k]);
5864 for (i=0; i<N; i+=4){
5865 v1 = vp[i+k];
5866 for (j=k; j<M; j+=4){
5867 v2 = vp[i+j+VS-1];
5868 va = REALIGN_LOAD <v1,v2,rt>;
5869 vs += va;
5870 v1 = v2;
5871 }
5872 }
5873 } */
5874
5875 if (DR_IS_READ (dr))
5876 {
5877 bool is_packed = false;
5878 tree type = (TREE_TYPE (DR_REF (dr)));
5879
5880 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5881 && (!targetm.vectorize.builtin_mask_for_load
5882 || targetm.vectorize.builtin_mask_for_load ()))
5883 {
5884 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5885 if ((nested_in_vect_loop
5886 && (TREE_INT_CST_LOW (DR_STEP (dr))
5887 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5888 || !loop_vinfo)
5889 return dr_explicit_realign;
5890 else
5891 return dr_explicit_realign_optimized;
5892 }
5893 if (!known_alignment_for_access_p (dr))
5894 is_packed = not_size_aligned (DR_REF (dr));
5895
5896 if ((TYPE_USER_ALIGN (type) && !is_packed)
5897 || targetm.vectorize.
5898 support_vector_misalignment (mode, type,
5899 DR_MISALIGNMENT (dr), is_packed))
5900 /* Can't software pipeline the loads, but can at least do them. */
5901 return dr_unaligned_supported;
5902 }
5903 else
5904 {
5905 bool is_packed = false;
5906 tree type = (TREE_TYPE (DR_REF (dr)));
5907
5908 if (!known_alignment_for_access_p (dr))
5909 is_packed = not_size_aligned (DR_REF (dr));
5910
5911 if ((TYPE_USER_ALIGN (type) && !is_packed)
5912 || targetm.vectorize.
5913 support_vector_misalignment (mode, type,
5914 DR_MISALIGNMENT (dr), is_packed))
5915 return dr_unaligned_supported;
5916 }
5917
5918 /* Unsupported. */
5919 return dr_unaligned_unsupported;
5920 }