gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "tm_p.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "cgraph.h"
35 #include "dumpfile.h"
36 #include "alias.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "expr.h"
50 #include "builtins.h"
51 #include "params.h"
52
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
55
56 static bool
57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
58 tree vectype, unsigned HOST_WIDE_INT count)
59 {
60 machine_mode mode, array_mode;
61 bool limit_p;
62
63 mode = TYPE_MODE (vectype);
64 limit_p = !targetm.array_mode_supported_p (mode, count);
65 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
66 MODE_INT, limit_p);
67
68 if (array_mode == BLKmode)
69 {
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
73 GET_MODE_NAME (mode), count);
74 return false;
75 }
76
77 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
78 {
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81 "cannot use %s<%s><%s>\n", name,
82 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
83 return false;
84 }
85
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE, vect_location,
88 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
89 GET_MODE_NAME (mode));
90
91 return true;
92 }
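
/* A minimal conceptual sketch, not GCC internals: what a load-lanes
   operation provides.  COUNT interleaved vectors are loaded and
   de-interleaved in a single operation.  For count == 3 the scalar
   equivalent is:

     struct rgb { float r, g, b; };
     void split (const struct rgb *in, float *r, float *g, float *b, int n)
     {
       for (int i = 0; i < n; i++)
         {
           r[i] = in[i].r;   # lane 0 of each structure
           g[i] = in[i].g;   # lane 1
           b[i] = in[i].b;   # lane 2
         }
     }

   The check above asks whether the target provides an integer (array)
   mode wide enough to hold all COUNT vectors and a lanes optab handler
   for it.  */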
93
94
95 /* Return the smallest scalar part of STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type is different from the type of the arguments (e.g., demotion,
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation - we'd miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
110 types. */
111
112 tree
113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
114 HOST_WIDE_INT *rhs_size_unit)
115 {
116 tree scalar_type = gimple_expr_type (stmt);
117 HOST_WIDE_INT lhs, rhs;
118
119 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
120
121 if (is_gimple_assign (stmt)
122 && (gimple_assign_cast_p (stmt)
123 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
126 {
127 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
128
129 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
130 if (rhs < lhs)
131 scalar_type = rhs_type;
132 }
133
134 *lhs_size_unit = lhs;
135 *rhs_size_unit = rhs;
136 return scalar_type;
137 }
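
/* An illustrative example, not taken from GCC: why the rhs of a
   promotion must be inspected.  Here 'short' never appears as an lhs,
   yet it is the smallest scalar type and therefore what limits the
   vectorization factor:

     void widen (int *out, const short *in, int n)
     {
       for (int i = 0; i < n; i++)
         out[i] = (int) in[i];   # smallest scalar type is 'short'
     }
*/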
138
139
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return false if versioning is not supported. */
143
144 static bool
145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
146 {
147 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
148
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
150 return false;
151
152 if (dump_enabled_p ())
153 {
154 dump_printf_loc (MSG_NOTE, vect_location,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
157 dump_printf (MSG_NOTE, " and ");
158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
159 dump_printf (MSG_NOTE, "\n");
160 }
161
162 if (optimize_loop_nest_for_size_p (loop))
163 {
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
166 "versioning not supported when optimizing"
167 " for size.\n");
168 return false;
169 }
170
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
172 if (loop->inner)
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176 "versioning not yet supported for outer-loops.\n");
177 return false;
178 }
179
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
181 step. */
182 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
184 {
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
187 "versioning not yet supported for non-constant "
188 "step\n");
189 return false;
190 }
191
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
193 return true;
194 }
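
/* Conceptually, a DDR recorded here leads to the loop being versioned
   under a run-time overlap test of roughly this shape (an illustrative
   sketch only, not the exact guard the vectorizer emits):

     if (a + len <= b || b + len <= a)   # the two segments do not overlap
       vectorized_loop ();
     else
       scalar_loop ();
*/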
195
196
197 /* Function vect_analyze_data_ref_dependence.
198
199 Return TRUE if there (might) exist a dependence between a memory-reference
200 DRA and a memory-reference DRB. Return FALSE when the dependence can
201 instead be checked at run-time by versioning for alias. Adjust *MAX_VF
202 according to the data dependence. */
203
204 static bool
205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
206 loop_vec_info loop_vinfo, int *max_vf)
207 {
208 unsigned int i;
209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
210 struct data_reference *dra = DDR_A (ddr);
211 struct data_reference *drb = DDR_B (ddr);
212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
214 lambda_vector dist_v;
215 unsigned int loop_depth;
216
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
220 gcc_unreachable ();
221
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
224 return false;
225
226 if (dra == drb
227 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
228 return false;
229
230 /* Even if we have an anti-dependence then, as the vectorized loop covers at
231 least two scalar iterations, there is always also a true dependence.
232 As the vectorizer does not re-order loads and stores we can ignore
233 the anti-dependence if TBAA can disambiguate both DRs similar to the
234 case with known negative distance anti-dependences (positive
235 distance anti-dependences would violate TBAA constraints). */
236 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
237 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
238 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
239 get_alias_set (DR_REF (drb))))
240 return false;
241
242 /* Unknown data dependence. */
243 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
244 {
245 /* If user asserted safelen consecutive iterations can be
246 executed concurrently, assume independence. */
247 if (loop->safelen >= 2)
248 {
249 if (loop->safelen < *max_vf)
250 *max_vf = loop->safelen;
251 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
252 return false;
253 }
254
255 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
256 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
257 {
258 if (dump_enabled_p ())
259 {
260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
261 "versioning for alias not supported for: "
262 "can't determine dependence between ");
263 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
264 DR_REF (dra));
265 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
267 DR_REF (drb));
268 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
269 }
270 return true;
271 }
272
273 if (dump_enabled_p ())
274 {
275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
276 "versioning for alias required: "
277 "can't determine dependence between ");
278 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
279 DR_REF (dra));
280 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
281 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
282 DR_REF (drb));
283 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
284 }
285
286 /* Add to list of ddrs that need to be tested at run-time. */
287 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
288 }
289
290 /* Known data dependence. */
291 if (DDR_NUM_DIST_VECTS (ddr) == 0)
292 {
293 /* If user asserted safelen consecutive iterations can be
294 executed concurrently, assume independence. */
295 if (loop->safelen >= 2)
296 {
297 if (loop->safelen < *max_vf)
298 *max_vf = loop->safelen;
299 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
300 return false;
301 }
302
303 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
304 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
305 {
306 if (dump_enabled_p ())
307 {
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
309 "versioning for alias not supported for: "
310 "bad dist vector for ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 DR_REF (dra));
313 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
315 DR_REF (drb));
316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
317 }
318 return true;
319 }
320
321 if (dump_enabled_p ())
322 {
323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
324 "versioning for alias required: "
325 "bad dist vector for ");
326 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
327 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
328 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
329 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
330 }
331 /* Add to list of ddrs that need to be tested at run-time. */
332 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
333 }
334
335 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
336 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
337 {
338 int dist = dist_v[loop_depth];
339
340 if (dump_enabled_p ())
341 dump_printf_loc (MSG_NOTE, vect_location,
342 "dependence distance = %d.\n", dist);
343
344 if (dist == 0)
345 {
346 if (dump_enabled_p ())
347 {
348 dump_printf_loc (MSG_NOTE, vect_location,
349 "dependence distance == 0 between ");
350 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
351 dump_printf (MSG_NOTE, " and ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
353 dump_printf (MSG_NOTE, "\n");
354 }
355
356 /* When we perform grouped accesses and perform implicit CSE
357 by detecting equal accesses and doing disambiguation with
358 runtime alias tests like for
359 .. = a[i];
360 .. = a[i+1];
361 a[i] = ..;
362 a[i+1] = ..;
363 *p = ..;
364 .. = a[i];
365 .. = a[i+1];
366 where we will end up loading { a[i], a[i+1] } once, make
367 sure that inserting group loads before the first load and
368 stores after the last store will do the right thing.
369 Similar for groups like
370 a[i] = ...;
371 ... = a[i];
372 a[i+1] = ...;
373 where loads from the group interleave with the store. */
374 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
375 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
376 {
377 gimple *earlier_stmt;
378 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
379 if (DR_IS_WRITE
380 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
381 {
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 "READ_WRITE dependence in interleaving."
385 "\n");
386 return true;
387 }
388 }
389
390 continue;
391 }
392
393 if (dist > 0 && DDR_REVERSED_P (ddr))
394 {
395 /* If DDR_REVERSED_P the order of the data-refs in DDR was
396 reversed (to make distance vector positive), and the actual
397 distance is negative. */
398 if (dump_enabled_p ())
399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
400 "dependence distance negative.\n");
401 /* Record a negative dependence distance to later limit the
402 amount of stmt copying / unrolling we can perform.
403 Only need to handle read-after-write dependence. */
404 if (DR_IS_READ (drb)
405 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
406 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
407 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
408 continue;
409 }
410
411 if (abs (dist) >= 2
412 && abs (dist) < *max_vf)
413 {
414 /* The dependence distance requires reduction of the maximal
415 vectorization factor. */
416 *max_vf = abs (dist);
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_NOTE, vect_location,
419 "adjusting maximal vectorization factor to %i\n",
420 *max_vf);
421 }
422
423 if (abs (dist) >= *max_vf)
424 {
425 /* Dependence distance does not create dependence, as far as
426 vectorization is concerned, in this case. */
427 if (dump_enabled_p ())
428 dump_printf_loc (MSG_NOTE, vect_location,
429 "dependence distance >= VF.\n");
430 continue;
431 }
432
433 if (dump_enabled_p ())
434 {
435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
436 "not vectorized, possible dependence "
437 "between data-refs ");
438 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
439 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
440 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
441 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
442 }
443
444 return true;
445 }
446
447 return false;
448 }
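
/* An illustrative example, not GCC code, of a known dependence distance
   capping the vectorization factor: the store to a[i] is read again
   three iterations later, so at most three iterations can be executed
   as one vector and *max_vf is reduced to 3.

     for (i = 3; i < n; i++)
       a[i] = a[i - 3] + 1;   # dependence distance == 3
*/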
449
450 /* Function vect_analyze_data_ref_dependences.
451
452 Examine all the data references in the loop, and make sure there do not
453 exist any data dependences between them. Set *MAX_VF according to
454 the maximum vectorization factor the data dependences allow. */
455
456 bool
457 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
458 {
459 unsigned int i;
460 struct data_dependence_relation *ddr;
461
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "=== vect_analyze_data_ref_dependences ===\n");
465
466 LOOP_VINFO_DDRS (loop_vinfo)
467 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
468 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
469 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
470 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
471 &LOOP_VINFO_DDRS (loop_vinfo),
472 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
473 return false;
474
475 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
476 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
477 return false;
478
479 return true;
480 }
481
482
483 /* Function vect_slp_analyze_data_ref_dependence.
484
485 Return TRUE if there (might) exist a dependence between a memory-reference
486 DRA and a memory-reference DRB that prevents SLP vectorization of the
487 basic-block, and FALSE otherwise. Unlike the loop variant above, there
488 is no *MAX_VF to adjust and no run-time alias check is created. */
489
490 static bool
491 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
492 {
493 struct data_reference *dra = DDR_A (ddr);
494 struct data_reference *drb = DDR_B (ddr);
495
496 /* We need to check dependences of statements marked as unvectorizable
497 as well, they still can prohibit vectorization. */
498
499 /* Independent data accesses. */
500 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
501 return false;
502
503 if (dra == drb)
504 return false;
505
506 /* Read-read is OK. */
507 if (DR_IS_READ (dra) && DR_IS_READ (drb))
508 return false;
509
510 /* If dra and drb are part of the same interleaving chain consider
511 them independent. */
512 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
513 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
514 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
515 return false;
516
517 /* Unknown data dependence. */
518 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
519 {
520 if (dump_enabled_p ())
521 {
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "can't determine dependence between ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
525 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
526 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
527 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
528 }
529 }
530 else if (dump_enabled_p ())
531 {
532 dump_printf_loc (MSG_NOTE, vect_location,
533 "determined dependence between ");
534 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
535 dump_printf (MSG_NOTE, " and ");
536 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
537 dump_printf (MSG_NOTE, "\n");
538 }
539
540 /* We do not vectorize basic blocks with write-write dependencies. */
541 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
542 return true;
543
544 /* If we have a read-write dependence, check that the load is before the store.
545 When we vectorize basic blocks, vector load can be only before
546 corresponding scalar load, and vector store can be only after its
547 corresponding scalar store. So the order of the accesses is preserved in
548 case the load is before the store. */
549 gimple *earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
550 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
551 {
552 /* That only holds for load-store pairs taking part in vectorization. */
553 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
554 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
555 return false;
556 }
557
558 return true;
559 }
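
/* An illustrative basic-block example, not GCC code: the loads from x[]
   appear before the stores to x[], so the read-write dependence does
   not block SLP; if the stores came first, the pair would be rejected.

     t0 = x[0];    t1 = x[1];      # loads first
     x[0] = y[0];  x[1] = y[1];    # stores after the loads
*/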
560
561
562 /* Function vect_slp_analyze_data_ref_dependences.
563
564 Examine all the data references in the basic-block, and make sure there
565 do not exist any data dependences between them. Return FALSE if a
566 dependence that prevents SLP vectorization is found. */
567
568 bool
569 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
570 {
571 struct data_dependence_relation *ddr;
572 unsigned int i;
573
574 if (dump_enabled_p ())
575 dump_printf_loc (MSG_NOTE, vect_location,
576 "=== vect_slp_analyze_data_ref_dependences ===\n");
577
578 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
579 &BB_VINFO_DDRS (bb_vinfo),
580 vNULL, true))
581 return false;
582
583 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
584 if (vect_slp_analyze_data_ref_dependence (ddr))
585 return false;
586
587 return true;
588 }
589
590
591 /* Function vect_compute_data_ref_alignment
592
593 Compute the misalignment of the data reference DR.
594
595 Output:
596 1. If during the misalignment computation it is found that the data reference
597 cannot be vectorized then false is returned.
598 2. DR_MISALIGNMENT (DR) is defined.
599
600 FOR NOW: No analysis is actually performed. Misalignment is calculated
601 only for trivial cases. TODO. */
602
603 static bool
604 vect_compute_data_ref_alignment (struct data_reference *dr)
605 {
606 gimple *stmt = DR_STMT (dr);
607 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
608 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
609 struct loop *loop = NULL;
610 tree ref = DR_REF (dr);
611 tree vectype;
612 tree base, base_addr;
613 tree misalign = NULL_TREE;
614 tree aligned_to;
615 unsigned HOST_WIDE_INT alignment;
616
617 if (dump_enabled_p ())
618 dump_printf_loc (MSG_NOTE, vect_location,
619 "vect_compute_data_ref_alignment:\n");
620
621 if (loop_vinfo)
622 loop = LOOP_VINFO_LOOP (loop_vinfo);
623
624 /* Initialize misalignment to unknown. */
625 SET_DR_MISALIGNMENT (dr, -1);
626
627 if (tree_fits_shwi_p (DR_STEP (dr)))
628 misalign = DR_INIT (dr);
629 aligned_to = DR_ALIGNED_TO (dr);
630 base_addr = DR_BASE_ADDRESS (dr);
631 vectype = STMT_VINFO_VECTYPE (stmt_info);
632
633 /* In case the dataref is in an inner-loop of the loop that is being
634 vectorized (LOOP), we use the base and misalignment information
635 relative to the outer-loop (LOOP). This is ok only if the misalignment
636 stays the same throughout the execution of the inner-loop, which is why
637 we have to check that the stride of the dataref in the inner-loop is a
638 multiple of the vector size. */
639 if (loop && nested_in_vect_loop_p (loop, stmt))
640 {
641 tree step = DR_STEP (dr);
642
643 if (tree_fits_shwi_p (step)
644 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
645 {
646 if (dump_enabled_p ())
647 dump_printf_loc (MSG_NOTE, vect_location,
648 "inner step divides the vector-size.\n");
649 misalign = STMT_VINFO_DR_INIT (stmt_info);
650 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
651 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
652 }
653 else
654 {
655 if (dump_enabled_p ())
656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
657 "inner step doesn't divide the vector-size.\n");
658 misalign = NULL_TREE;
659 }
660 }
661
662 /* Similarly we can only use base and misalignment information relative to
663 an innermost loop if the misalignment stays the same throughout the
664 execution of the loop. As above, this is the case if the stride of the
665 dataref times the vectorization factor is a multiple of the vector size. */
666 else
667 {
668 tree step = DR_STEP (dr);
669 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
670
671 if (tree_fits_shwi_p (step)
672 && ((tree_to_shwi (step) * vf)
673 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
674 {
675 if (dump_enabled_p ())
676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
677 "step doesn't divide the vector-size.\n");
678 misalign = NULL_TREE;
679 }
680 }
681
682 /* To look at alignment of the base we have to preserve an inner MEM_REF
683 as that carries alignment information of the actual access. */
684 base = ref;
685 while (handled_component_p (base))
686 base = TREE_OPERAND (base, 0);
687 if (TREE_CODE (base) == MEM_REF)
688 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
689 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
690 unsigned int base_alignment = get_object_alignment (base);
691
692 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
693 DR_VECT_AUX (dr)->base_element_aligned = true;
694
695 alignment = TYPE_ALIGN_UNIT (vectype);
696
697 if ((compare_tree_int (aligned_to, alignment) < 0)
698 || !misalign)
699 {
700 if (dump_enabled_p ())
701 {
702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
703 "Unknown alignment for access: ");
704 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
705 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
706 }
707 return true;
708 }
709
710 if (base_alignment < TYPE_ALIGN (vectype))
711 {
712 /* Strip an inner MEM_REF to a bare decl if possible. */
713 if (TREE_CODE (base) == MEM_REF
714 && integer_zerop (TREE_OPERAND (base, 1))
715 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
716 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
717
718 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
719 {
720 if (dump_enabled_p ())
721 {
722 dump_printf_loc (MSG_NOTE, vect_location,
723 "can't force alignment of ref: ");
724 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
725 dump_printf (MSG_NOTE, "\n");
726 }
727 return true;
728 }
729
730 /* Force the alignment of the decl.
731 NOTE: This is the only change to the code we make during
732 the analysis phase, before deciding to vectorize the loop. */
733 if (dump_enabled_p ())
734 {
735 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
736 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
737 dump_printf (MSG_NOTE, "\n");
738 }
739
740 DR_VECT_AUX (dr)->base_decl = base;
741 DR_VECT_AUX (dr)->base_misaligned = true;
742 DR_VECT_AUX (dr)->base_element_aligned = true;
743 }
744
745 /* If this is a backward running DR then the first access in the larger
746 vectype actually is N-1 elements before the address in the DR.
747 Adjust misalign accordingly. */
748 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
749 {
750 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
751 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
752 otherwise we wouldn't be here. */
753 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
754 /* PLUS because DR_STEP was negative. */
755 misalign = size_binop (PLUS_EXPR, misalign, offset);
756 }
757
758 SET_DR_MISALIGNMENT (dr,
759 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
760
761 if (dump_enabled_p ())
762 {
763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
764 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
765 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
766 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
767 }
768
769 return true;
770 }
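
/* A worked example, illustrative only: for a V4SF access (16-byte
   vectors, 4-byte elements) with a sufficiently aligned base and
   DR_INIT == 8, the recorded misalignment is 8.  If the same access
   runs backwards with step -4, the first vector element really sits
   (4 - 1) * 4 == 12 bytes below the address in the DR, so the stored
   value is floor_mod (8 - 12, 16) == 12.  */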
771
772
773 /* Function vect_compute_data_refs_alignment
774
775 Compute the misalignment of data references in the loop.
776 Return FALSE if a data reference is found that cannot be vectorized. */
777
778 static bool
779 vect_compute_data_refs_alignment (vec_info *vinfo)
780 {
781 vec<data_reference_p> datarefs = vinfo->datarefs;
782 struct data_reference *dr;
783 unsigned int i;
784
785 FOR_EACH_VEC_ELT (datarefs, i, dr)
786 {
787 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
788 if (STMT_VINFO_VECTORIZABLE (stmt_info)
789 && !vect_compute_data_ref_alignment (dr))
790 {
791 /* Strided accesses perform only component accesses, misalignment
792 information is irrelevant for them. */
793 if (STMT_VINFO_STRIDED_P (stmt_info)
794 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
795 continue;
796
797 if (is_a <bb_vec_info> (vinfo))
798 {
799 /* Mark unsupported statement as unvectorizable. */
800 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
801 continue;
802 }
803 else
804 return false;
805 }
806 }
807
808 return true;
809 }
810
811
812 /* Function vect_update_misalignment_for_peel
813
814 DR - the data reference whose misalignment is to be adjusted.
815 DR_PEEL - the data reference whose misalignment is being made
816 zero in the vector loop by the peel.
817 NPEEL - the number of iterations in the peel loop if the misalignment
818 of DR_PEEL is known at compile time. */
819
820 static void
821 vect_update_misalignment_for_peel (struct data_reference *dr,
822 struct data_reference *dr_peel, int npeel)
823 {
824 unsigned int i;
825 vec<dr_p> same_align_drs;
826 struct data_reference *current_dr;
827 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
828 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
829 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
830 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
831
832 /* For interleaved data accesses the step in the loop must be multiplied by
833 the size of the interleaving group. */
834 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
835 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
836 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
837 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
838
839 /* It can be assumed that the data refs with the same alignment as dr_peel
840 are aligned in the vector loop. */
841 same_align_drs
842 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
843 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
844 {
845 if (current_dr != dr)
846 continue;
847 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
848 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
849 SET_DR_MISALIGNMENT (dr, 0);
850 return;
851 }
852
853 if (known_alignment_for_access_p (dr)
854 && known_alignment_for_access_p (dr_peel))
855 {
856 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
857 int misal = DR_MISALIGNMENT (dr);
858 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
859 misal += negative ? -npeel * dr_size : npeel * dr_size;
860 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
861 SET_DR_MISALIGNMENT (dr, misal);
862 return;
863 }
864
865 if (dump_enabled_p ())
866 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
867 SET_DR_MISALIGNMENT (dr, -1);
868 }
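
/* A worked example, illustrative only: with 4-byte elements and 16-byte
   vectors, peeling npeel == 2 iterations for some other data reference
   moves a DR that was misaligned by 4 bytes to (4 + 2 * 4) & 15 == 12,
   while the data reference that was peeled for becomes aligned
   (misalignment 0).  */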
869
870
871 /* Function vect_verify_datarefs_alignment
872
873 Return TRUE if all data references in the loop can be
874 handled with respect to alignment. */
875
876 bool
877 vect_verify_datarefs_alignment (vec_info *vinfo)
878 {
879 vec<data_reference_p> datarefs = vinfo->datarefs;
880 struct data_reference *dr;
881 enum dr_alignment_support supportable_dr_alignment;
882 unsigned int i;
883
884 FOR_EACH_VEC_ELT (datarefs, i, dr)
885 {
886 gimple *stmt = DR_STMT (dr);
887 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
888
889 if (!STMT_VINFO_RELEVANT_P (stmt_info))
890 continue;
891
892 /* For interleaving, only the alignment of the first access matters.
893 Skip statements marked as not vectorizable. */
894 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
895 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
896 || !STMT_VINFO_VECTORIZABLE (stmt_info))
897 continue;
898
899 /* Strided accesses perform only component accesses, alignment is
900 irrelevant for them. */
901 if (STMT_VINFO_STRIDED_P (stmt_info)
902 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
903 continue;
904
905 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
906 if (!supportable_dr_alignment)
907 {
908 if (dump_enabled_p ())
909 {
910 if (DR_IS_READ (dr))
911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
912 "not vectorized: unsupported unaligned load.");
913 else
914 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
915 "not vectorized: unsupported unaligned "
916 "store.");
917
918 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
919 DR_REF (dr));
920 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
921 }
922 return false;
923 }
924 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
925 dump_printf_loc (MSG_NOTE, vect_location,
926 "Vectorizing an unaligned access.\n");
927 }
928 return true;
929 }
930
931 /* Given a memory reference EXP, return whether its alignment is less
932 than its size. */
933
934 static bool
935 not_size_aligned (tree exp)
936 {
937 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
938 return true;
939
940 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
941 > get_object_alignment (exp));
942 }
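
/* An illustrative case, not from GCC: a 'double' field of a structure
   declared with __attribute__((packed)) may have an object alignment of
   only 1 byte while its size is 8 bytes, so a reference to it is "not
   size aligned" and is treated as packed by the alignment checks below.  */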
943
944 /* Function vector_alignment_reachable_p
945
946 Return true if vector alignment for DR is reachable by peeling
947 a few loop iterations. Return false otherwise. */
948
949 static bool
950 vector_alignment_reachable_p (struct data_reference *dr)
951 {
952 gimple *stmt = DR_STMT (dr);
953 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
954 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
955
956 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
957 {
958 /* For interleaved accesses we peel only if the number of iterations in
959 the prolog loop ({VF - misalignment}) is a multiple of the
960 number of interleaved accesses. */
961 int elem_size, mis_in_elements;
962 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
963
964 /* FORNOW: handle only known alignment. */
965 if (!known_alignment_for_access_p (dr))
966 return false;
967
968 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
969 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
970
971 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
972 return false;
973 }
974
975 /* If misalignment is known at compile time then allow peeling
976 only if natural alignment is reachable through peeling. */
977 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
978 {
979 HOST_WIDE_INT elmsize =
980 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
981 if (dump_enabled_p ())
982 {
983 dump_printf_loc (MSG_NOTE, vect_location,
984 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
985 dump_printf (MSG_NOTE,
986 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
987 }
988 if (DR_MISALIGNMENT (dr) % elmsize)
989 {
990 if (dump_enabled_p ())
991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
992 "data size does not divide the misalignment.\n");
993 return false;
994 }
995 }
996
997 if (!known_alignment_for_access_p (dr))
998 {
999 tree type = TREE_TYPE (DR_REF (dr));
1000 bool is_packed = not_size_aligned (DR_REF (dr));
1001 if (dump_enabled_p ())
1002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1003 "Unknown misalignment, is_packed = %d\n",is_packed);
1004 if ((TYPE_USER_ALIGN (type) && !is_packed)
1005 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1006 return true;
1007 else
1008 return false;
1009 }
1010
1011 return true;
1012 }
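
/* An illustrative example, not GCC code: with 4-byte elements and
   16-byte vectors, a known misalignment of 8 bytes is fixed by peeling
   two scalar iterations, but a misalignment of 6 bytes can never be
   removed by peeling whole iterations (each peel advances the access by
   a multiple of 4), so the function above rejects it.  */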
1013
1014
1015 /* Calculate the cost of the memory access represented by DR. */
1016
1017 static void
1018 vect_get_data_access_cost (struct data_reference *dr,
1019 unsigned int *inside_cost,
1020 unsigned int *outside_cost,
1021 stmt_vector_for_cost *body_cost_vec)
1022 {
1023 gimple *stmt = DR_STMT (dr);
1024 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1025 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1026 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1027 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1028 int ncopies = vf / nunits;
1029
1030 if (DR_IS_READ (dr))
1031 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1032 NULL, body_cost_vec, false);
1033 else
1034 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1035
1036 if (dump_enabled_p ())
1037 dump_printf_loc (MSG_NOTE, vect_location,
1038 "vect_get_data_access_cost: inside_cost = %d, "
1039 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1040 }
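
/* A small worked example, illustrative only: with a vectorization
   factor of 8 and a vector type holding 4 elements, ncopies is 2, so
   the cost of two vector loads or stores is accumulated for this data
   reference.  */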
1041
1042
1043 typedef struct _vect_peel_info
1044 {
1045 int npeel;
1046 struct data_reference *dr;
1047 unsigned int count;
1048 } *vect_peel_info;
1049
1050 typedef struct _vect_peel_extended_info
1051 {
1052 struct _vect_peel_info peel_info;
1053 unsigned int inside_cost;
1054 unsigned int outside_cost;
1055 stmt_vector_for_cost body_cost_vec;
1056 } *vect_peel_extended_info;
1057
1058
1059 /* Peeling hashtable helpers. */
1060
1061 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1062 {
1063 static inline hashval_t hash (const _vect_peel_info *);
1064 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1065 };
1066
1067 inline hashval_t
1068 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1069 {
1070 return (hashval_t) peel_info->npeel;
1071 }
1072
1073 inline bool
1074 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1075 {
1076 return (a->npeel == b->npeel);
1077 }
1078
1079
1080 /* Insert DR into peeling hash table with NPEEL as key. */
1081
1082 static void
1083 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1084 loop_vec_info loop_vinfo, struct data_reference *dr,
1085 int npeel)
1086 {
1087 struct _vect_peel_info elem, *slot;
1088 _vect_peel_info **new_slot;
1089 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1090
1091 elem.npeel = npeel;
1092 slot = peeling_htab->find (&elem);
1093 if (slot)
1094 slot->count++;
1095 else
1096 {
1097 slot = XNEW (struct _vect_peel_info);
1098 slot->npeel = npeel;
1099 slot->dr = dr;
1100 slot->count = 1;
1101 new_slot = peeling_htab->find_slot (slot, INSERT);
1102 *new_slot = slot;
1103 }
1104
1105 if (!supportable_dr_alignment
1106 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1107 slot->count += VECT_MAX_COST;
1108 }
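
/* An illustrative example, not GCC code: if three data references all
   become aligned after peeling 2 scalar iterations, the slot keyed by
   npeel == 2 ends up with count == 3, making that peeling amount the
   most attractive one for vect_peeling_hash_get_most_frequent below.  */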
1109
1110
1111 /* Traverse the peeling hash table to find the peeling option that aligns
1112 the maximum number of data accesses. */
1113
1114 int
1115 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1116 _vect_peel_extended_info *max)
1117 {
1118 vect_peel_info elem = *slot;
1119
1120 if (elem->count > max->peel_info.count
1121 || (elem->count == max->peel_info.count
1122 && max->peel_info.npeel > elem->npeel))
1123 {
1124 max->peel_info.npeel = elem->npeel;
1125 max->peel_info.count = elem->count;
1126 max->peel_info.dr = elem->dr;
1127 }
1128
1129 return 1;
1130 }
1131
1132
1133 /* Traverse the peeling hash table and calculate the cost of each peeling option.
1134 Find the one with the lowest cost. */
1135
1136 int
1137 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1138 _vect_peel_extended_info *min)
1139 {
1140 vect_peel_info elem = *slot;
1141 int save_misalignment, dummy;
1142 unsigned int inside_cost = 0, outside_cost = 0, i;
1143 gimple *stmt = DR_STMT (elem->dr);
1144 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1145 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1146 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1147 struct data_reference *dr;
1148 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1149
1150 prologue_cost_vec.create (2);
1151 body_cost_vec.create (2);
1152 epilogue_cost_vec.create (2);
1153
1154 FOR_EACH_VEC_ELT (datarefs, i, dr)
1155 {
1156 stmt = DR_STMT (dr);
1157 stmt_info = vinfo_for_stmt (stmt);
1158 /* For interleaving, only the alignment of the first access
1159 matters. */
1160 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1161 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1162 continue;
1163
1164 save_misalignment = DR_MISALIGNMENT (dr);
1165 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1166 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1167 &body_cost_vec);
1168 SET_DR_MISALIGNMENT (dr, save_misalignment);
1169 }
1170
1171 outside_cost += vect_get_known_peeling_cost
1172 (loop_vinfo, elem->npeel, &dummy,
1173 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1174 &prologue_cost_vec, &epilogue_cost_vec);
1175
1176 /* Prologue and epilogue costs are added to the target model later.
1177 These costs depend only on the scalar iteration cost, the
1178 number of peeling iterations finally chosen, and the number of
1179 misaligned statements. So discard the information found here. */
1180 prologue_cost_vec.release ();
1181 epilogue_cost_vec.release ();
1182
1183 if (inside_cost < min->inside_cost
1184 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1185 {
1186 min->inside_cost = inside_cost;
1187 min->outside_cost = outside_cost;
1188 min->body_cost_vec.release ();
1189 min->body_cost_vec = body_cost_vec;
1190 min->peel_info.dr = elem->dr;
1191 min->peel_info.npeel = elem->npeel;
1192 }
1193 else
1194 body_cost_vec.release ();
1195
1196 return 1;
1197 }
1198
1199
1200 /* Choose the best peeling option by traversing the peeling hash table and
1201 either choosing the option with the lowest cost (if the cost model is
1202 enabled) or the option that aligns as many accesses as possible. */
1203
1204 static struct data_reference *
1205 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1206 loop_vec_info loop_vinfo,
1207 unsigned int *npeel,
1208 stmt_vector_for_cost *body_cost_vec)
1209 {
1210 struct _vect_peel_extended_info res;
1211
1212 res.peel_info.dr = NULL;
1213 res.body_cost_vec = stmt_vector_for_cost ();
1214
1215 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1216 {
1217 res.inside_cost = INT_MAX;
1218 res.outside_cost = INT_MAX;
1219 peeling_htab->traverse <_vect_peel_extended_info *,
1220 vect_peeling_hash_get_lowest_cost> (&res);
1221 }
1222 else
1223 {
1224 res.peel_info.count = 0;
1225 peeling_htab->traverse <_vect_peel_extended_info *,
1226 vect_peeling_hash_get_most_frequent> (&res);
1227 }
1228
1229 *npeel = res.peel_info.npeel;
1230 *body_cost_vec = res.body_cost_vec;
1231 return res.peel_info.dr;
1232 }
1233
1234
1235 /* Function vect_enhance_data_refs_alignment
1236
1237 This pass will use loop versioning and loop peeling in order to enhance
1238 the alignment of data references in the loop.
1239
1240 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1241 original loop is to be vectorized. Any other loops that are created by
1242 the transformations performed in this pass - are not supposed to be
1243 vectorized. This restriction will be relaxed.
1244
1245 This pass will require a cost model to guide it on whether to apply peeling
1246 or versioning or a combination of the two. For example, the scheme that
1247 Intel uses when given a loop with several memory accesses is as follows:
1248 choose one memory access ('p') whose alignment you want to force by doing
1249 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1250 other accesses are not necessarily aligned, or (2) use loop versioning to
1251 generate one loop in which all accesses are aligned, and another loop in
1252 which only 'p' is necessarily aligned.
1253
1254 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1255 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1256 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1257
1258 Devising a cost model is the most critical aspect of this work. It will
1259 guide us on which access to peel for, whether to use loop versioning, how
1260 many versions to create, etc. The cost model will probably consist of
1261 generic considerations as well as target specific considerations (on
1262 powerpc for example, misaligned stores are more painful than misaligned
1263 loads).
1264
1265 Here are the general steps involved in alignment enhancements:
1266
1267 -- original loop, before alignment analysis:
1268 for (i=0; i<N; i++){
1269 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1270 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1271 }
1272
1273 -- After vect_compute_data_refs_alignment:
1274 for (i=0; i<N; i++){
1275 x = q[i]; # DR_MISALIGNMENT(q) = 3
1276 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1277 }
1278
1279 -- Possibility 1: we do loop versioning:
1280 if (p is aligned) {
1281 for (i=0; i<N; i++){ # loop 1A
1282 x = q[i]; # DR_MISALIGNMENT(q) = 3
1283 p[i] = y; # DR_MISALIGNMENT(p) = 0
1284 }
1285 }
1286 else {
1287 for (i=0; i<N; i++){ # loop 1B
1288 x = q[i]; # DR_MISALIGNMENT(q) = 3
1289 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1290 }
1291 }
1292
1293 -- Possibility 2: we do loop peeling:
1294 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1295 x = q[i];
1296 p[i] = y;
1297 }
1298 for (i = 3; i < N; i++){ # loop 2A
1299 x = q[i]; # DR_MISALIGNMENT(q) = 0
1300 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1301 }
1302
1303 -- Possibility 3: combination of loop peeling and versioning:
1304 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1305 x = q[i];
1306 p[i] = y;
1307 }
1308 if (p is aligned) {
1309 for (i = 3; i<N; i++){ # loop 3A
1310 x = q[i]; # DR_MISALIGNMENT(q) = 0
1311 p[i] = y; # DR_MISALIGNMENT(p) = 0
1312 }
1313 }
1314 else {
1315 for (i = 3; i<N; i++){ # loop 3B
1316 x = q[i]; # DR_MISALIGNMENT(q) = 0
1317 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1318 }
1319 }
1320
1321 These loops are later passed to loop_transform to be vectorized. The
1322 vectorizer will use the alignment information to guide the transformation
1323 (whether to generate regular loads/stores, or with special handling for
1324 misalignment). */
1325
1326 bool
1327 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1328 {
1329 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1330 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1331 enum dr_alignment_support supportable_dr_alignment;
1332 struct data_reference *dr0 = NULL, *first_store = NULL;
1333 struct data_reference *dr;
1334 unsigned int i, j;
1335 bool do_peeling = false;
1336 bool do_versioning = false;
1337 bool stat;
1338 gimple *stmt;
1339 stmt_vec_info stmt_info;
1340 unsigned int npeel = 0;
1341 bool all_misalignments_unknown = true;
1342 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1343 unsigned possible_npeel_number = 1;
1344 tree vectype;
1345 unsigned int nelements, mis, same_align_drs_max = 0;
1346 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1347 hash_table<peel_info_hasher> peeling_htab (1);
1348
1349 if (dump_enabled_p ())
1350 dump_printf_loc (MSG_NOTE, vect_location,
1351 "=== vect_enhance_data_refs_alignment ===\n");
1352
1353 /* Reset data so we can safely be called multiple times. */
1354 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1355 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1356
1357 /* While cost model enhancements are expected in the future, the high level
1358 view of the code at this time is as follows:
1359
1360 A) If there is a misaligned access then see if peeling to align
1361 this access can make all data references satisfy
1362 vect_supportable_dr_alignment. If so, update data structures
1363 as needed and return true.
1364
1365 B) If peeling wasn't possible and there is a data reference with an
1366 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1367 then see if loop versioning checks can be used to make all data
1368 references satisfy vect_supportable_dr_alignment. If so, update
1369 data structures as needed and return true.
1370
1371 C) If neither peeling nor versioning were successful then return false if
1372 any data reference does not satisfy vect_supportable_dr_alignment.
1373
1374 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1375
1376 Note, Possibility 3 above (which is peeling and versioning together) is not
1377 being done at this time. */
1378
1379 /* (1) Peeling to force alignment. */
1380
1381 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1382 Considerations:
1383 + How many accesses will become aligned due to the peeling
1384 - How many accesses will become unaligned due to the peeling,
1385 and the cost of misaligned accesses.
1386 - The cost of peeling (the extra runtime checks, the increase
1387 in code size). */
1388
1389 FOR_EACH_VEC_ELT (datarefs, i, dr)
1390 {
1391 stmt = DR_STMT (dr);
1392 stmt_info = vinfo_for_stmt (stmt);
1393
1394 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1395 continue;
1396
1397 /* For interleaving, only the alignment of the first access
1398 matters. */
1399 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1400 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1401 continue;
1402
1403 /* For invariant accesses there is nothing to enhance. */
1404 if (integer_zerop (DR_STEP (dr)))
1405 continue;
1406
1407 /* Strided accesses perform only component accesses, alignment is
1408 irrelevant for them. */
1409 if (STMT_VINFO_STRIDED_P (stmt_info)
1410 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1411 continue;
1412
1413 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1414 do_peeling = vector_alignment_reachable_p (dr);
1415 if (do_peeling)
1416 {
1417 if (known_alignment_for_access_p (dr))
1418 {
1419 unsigned int npeel_tmp;
1420 bool negative = tree_int_cst_compare (DR_STEP (dr),
1421 size_zero_node) < 0;
1422
1423 /* Save info about DR in the hash table. */
1424 vectype = STMT_VINFO_VECTYPE (stmt_info);
1425 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1426 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1427 TREE_TYPE (DR_REF (dr))));
1428 npeel_tmp = (negative
1429 ? (mis - nelements) : (nelements - mis))
1430 & (nelements - 1);
1431
1432 /* For multiple types, it is possible that the bigger type access
1433 will have more than one peeling option. E.g., a loop with two
1434 types: one of size (vector size / 4), and the other one of
1435 size (vector size / 8). Vectorization factor will be 8. If both
1436 accesses are misaligned by 3, the first one needs one scalar
1437 iteration to be aligned, and the second one needs 5. But the
1438 first one will be aligned also by peeling 5 scalar
1439 iterations, and in that case both accesses will be aligned.
1440 Hence, except for the immediate peeling amount, we also want
1441 to try to add a full vector size, as long as we don't exceed the
1442 vectorization factor.
1443 We do this automatically for the cost model, since we calculate the cost
1444 for every peeling option. */
1445 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1446 {
1447 if (STMT_SLP_TYPE (stmt_info))
1448 possible_npeel_number
1449 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1450 else
1451 possible_npeel_number = vf / nelements;
1452 }
1453
1454 /* Handle the aligned case. We may decide to align some other
1455 access, making DR unaligned. */
1456 if (DR_MISALIGNMENT (dr) == 0)
1457 {
1458 npeel_tmp = 0;
1459 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1460 possible_npeel_number++;
1461 }
1462
1463 for (j = 0; j < possible_npeel_number; j++)
1464 {
1465 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1466 dr, npeel_tmp);
1467 npeel_tmp += nelements;
1468 }
1469
1470 all_misalignments_unknown = false;
1471 /* Data-ref that was chosen for the case that all the
1472 misalignments are unknown is not relevant anymore, since we
1473 have a data-ref with known alignment. */
1474 dr0 = NULL;
1475 }
1476 else
1477 {
1478 /* If we don't know any misalignment values, we prefer
1479 peeling for the data-ref that has the maximum number of data-refs
1480 with the same alignment, unless the target prefers to align
1481 stores over loads. */
1482 if (all_misalignments_unknown)
1483 {
1484 unsigned same_align_drs
1485 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1486 if (!dr0
1487 || same_align_drs_max < same_align_drs)
1488 {
1489 same_align_drs_max = same_align_drs;
1490 dr0 = dr;
1491 }
1492 /* For data-refs with the same number of related
1493 accesses prefer the one where the misalign
1494 computation will be invariant in the outermost loop. */
1495 else if (same_align_drs_max == same_align_drs)
1496 {
1497 struct loop *ivloop0, *ivloop;
1498 ivloop0 = outermost_invariant_loop_for_expr
1499 (loop, DR_BASE_ADDRESS (dr0));
1500 ivloop = outermost_invariant_loop_for_expr
1501 (loop, DR_BASE_ADDRESS (dr));
1502 if ((ivloop && !ivloop0)
1503 || (ivloop && ivloop0
1504 && flow_loop_nested_p (ivloop, ivloop0)))
1505 dr0 = dr;
1506 }
1507
1508 if (!first_store && DR_IS_WRITE (dr))
1509 first_store = dr;
1510 }
1511
1512 /* If there are both known and unknown misaligned accesses in the
1513 loop, we choose peeling amount according to the known
1514 accesses. */
1515 if (!supportable_dr_alignment)
1516 {
1517 dr0 = dr;
1518 if (!first_store && DR_IS_WRITE (dr))
1519 first_store = dr;
1520 }
1521 }
1522 }
1523 else
1524 {
1525 if (!aligned_access_p (dr))
1526 {
1527 if (dump_enabled_p ())
1528 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1529 "vector alignment may not be reachable\n");
1530 break;
1531 }
1532 }
1533 }
1534
1535 /* Check if we can possibly peel the loop. */
1536 if (!vect_can_advance_ivs_p (loop_vinfo)
1537 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1538 || loop->inner)
1539 do_peeling = false;
1540
1541 if (do_peeling
1542 && all_misalignments_unknown
1543 && vect_supportable_dr_alignment (dr0, false))
1544 {
1545 /* Check if the target prefers to align stores over loads, i.e., if
1546 misaligned stores are more expensive than misaligned loads (taking
1547 drs with same alignment into account). */
1548 if (first_store && DR_IS_READ (dr0))
1549 {
1550 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1551 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1552 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1553 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1554 stmt_vector_for_cost dummy;
1555 dummy.create (2);
1556
1557 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1558 &dummy);
1559 vect_get_data_access_cost (first_store, &store_inside_cost,
1560 &store_outside_cost, &dummy);
1561
1562 dummy.release ();
1563
1564 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1565 aligning the load DR0). */
1566 load_inside_penalty = store_inside_cost;
1567 load_outside_penalty = store_outside_cost;
1568 for (i = 0;
1569 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1570 DR_STMT (first_store))).iterate (i, &dr);
1571 i++)
1572 if (DR_IS_READ (dr))
1573 {
1574 load_inside_penalty += load_inside_cost;
1575 load_outside_penalty += load_outside_cost;
1576 }
1577 else
1578 {
1579 load_inside_penalty += store_inside_cost;
1580 load_outside_penalty += store_outside_cost;
1581 }
1582
1583 /* Calculate the penalty for leaving DR0 unaligned (by
1584 aligning the FIRST_STORE). */
1585 store_inside_penalty = load_inside_cost;
1586 store_outside_penalty = load_outside_cost;
1587 for (i = 0;
1588 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1589 DR_STMT (dr0))).iterate (i, &dr);
1590 i++)
1591 if (DR_IS_READ (dr))
1592 {
1593 store_inside_penalty += load_inside_cost;
1594 store_outside_penalty += load_outside_cost;
1595 }
1596 else
1597 {
1598 store_inside_penalty += store_inside_cost;
1599 store_outside_penalty += store_outside_cost;
1600 }
1601
1602 if (load_inside_penalty > store_inside_penalty
1603 || (load_inside_penalty == store_inside_penalty
1604 && load_outside_penalty > store_outside_penalty))
1605 dr0 = first_store;
1606 }
1607
1608 /* In case there are only loads with different unknown misalignments, use
1609 peeling only if it may help to align other accesses in the loop or
1610 if it may help improve load bandwidth when we'd end up using
1611 unaligned loads. */
1612 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1613 if (!first_store
1614 && !STMT_VINFO_SAME_ALIGN_REFS (
1615 vinfo_for_stmt (DR_STMT (dr0))).length ()
1616 && (vect_supportable_dr_alignment (dr0, false)
1617 != dr_unaligned_supported
1618 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1619 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1620 do_peeling = false;
1621 }
1622
1623 if (do_peeling && !dr0)
1624 {
1625 /* Peeling is possible, but no data access strictly requires alignment
1626 in order to be supported. So we try to choose the best possible peeling. */
1627
1628 /* We should get here only if there are drs with known misalignment. */
1629 gcc_assert (!all_misalignments_unknown);
1630
1631 /* Choose the best peeling from the hash table. */
1632 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1633 loop_vinfo, &npeel,
1634 &body_cost_vec);
1635 if (!dr0 || !npeel)
1636 do_peeling = false;
1637 }
1638
1639 if (do_peeling)
1640 {
1641 stmt = DR_STMT (dr0);
1642 stmt_info = vinfo_for_stmt (stmt);
1643 vectype = STMT_VINFO_VECTYPE (stmt_info);
1644 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1645
1646 if (known_alignment_for_access_p (dr0))
1647 {
1648 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1649 size_zero_node) < 0;
1650 if (!npeel)
1651 {
1652 /* Since it's known at compile time, compute the number of
1653 iterations in the peeled loop (the peeling factor) for use in
1654 updating DR_MISALIGNMENT values. The peeling factor is the
1655 vectorization factor minus the misalignment as an element
1656 count. */
1657 mis = DR_MISALIGNMENT (dr0);
1658 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1659 npeel = ((negative ? mis - nelements : nelements - mis)
1660 & (nelements - 1));
1661 }
1662
1663 /* For interleaved data access every iteration accesses all the
1664 members of the group, therefore we divide the number of iterations
1665 by the group size. */
1666 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1667 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1668 npeel /= GROUP_SIZE (stmt_info);
1669
1670 if (dump_enabled_p ())
1671 dump_printf_loc (MSG_NOTE, vect_location,
1672 "Try peeling by %d\n", npeel);
1673 }
1674
1675 /* Ensure that all data refs can be vectorized after the peel. */
1676 FOR_EACH_VEC_ELT (datarefs, i, dr)
1677 {
1678 int save_misalignment;
1679
1680 if (dr == dr0)
1681 continue;
1682
1683 stmt = DR_STMT (dr);
1684 stmt_info = vinfo_for_stmt (stmt);
1685 /* For interleaving, only the alignment of the first access
1686 matters. */
1687 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1688 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1689 continue;
1690
1691 /* Strided accesses perform only component accesses; alignment is
1692 irrelevant for them. */
1693 if (STMT_VINFO_STRIDED_P (stmt_info)
1694 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1695 continue;
1696
1697 save_misalignment = DR_MISALIGNMENT (dr);
1698 vect_update_misalignment_for_peel (dr, dr0, npeel);
1699 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1700 SET_DR_MISALIGNMENT (dr, save_misalignment);
1701
1702 if (!supportable_dr_alignment)
1703 {
1704 do_peeling = false;
1705 break;
1706 }
1707 }
1708
1709 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1710 {
1711 stat = vect_verify_datarefs_alignment (loop_vinfo);
1712 if (!stat)
1713 do_peeling = false;
1714 else
1715 {
1716 body_cost_vec.release ();
1717 return stat;
1718 }
1719 }
1720
1721 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1722 if (do_peeling)
1723 {
1724 unsigned max_allowed_peel
1725 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1726 if (max_allowed_peel != (unsigned)-1)
1727 {
1728 unsigned max_peel = npeel;
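/* An npeel of zero here means the peel amount is not known at compile
   time; conservatively assume the maximum of nelements - 1 peeled
   iterations.  */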
1729 if (max_peel == 0)
1730 {
1731 gimple *dr_stmt = DR_STMT (dr0);
1732 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1733 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1734 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1735 }
1736 if (max_peel > max_allowed_peel)
1737 {
1738 do_peeling = false;
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_NOTE, vect_location,
1741 "Disable peeling, max peels reached: %d\n", max_peel);
1742 }
1743 }
1744 }
1745
1746 /* Cost model #2 - if peeling may result in a remaining loop not
1747 iterating enough to be vectorized then do not peel. */
1748 if (do_peeling
1749 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1750 {
1751 unsigned max_peel
1752 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1753 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1754 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1755 do_peeling = false;
1756 }
1757
1758 if (do_peeling)
1759 {
1760 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1761 If the misalignment of DR_i is identical to that of dr0 then set
1762 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1763 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1764 by the peeling factor times the element size of DR_i (MOD the
1765 vectorization factor times the size). Otherwise, the
1766 misalignment of DR_i must be set to unknown. */
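/* For example, with 4-byte elements, a vectorization factor of 4 and
   npeel == 2, a DR_i whose known misalignment was 4 bytes becomes
   (4 + 2 * 4) % (4 * 4) = 12 bytes after the update.  */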
1767 FOR_EACH_VEC_ELT (datarefs, i, dr)
1768 if (dr != dr0)
1769 vect_update_misalignment_for_peel (dr, dr0, npeel);
1770
1771 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1772 if (npeel)
1773 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1774 else
1775 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1776 = DR_MISALIGNMENT (dr0);
1777 SET_DR_MISALIGNMENT (dr0, 0);
1778 if (dump_enabled_p ())
1779 {
1780 dump_printf_loc (MSG_NOTE, vect_location,
1781 "Alignment of access forced using peeling.\n");
1782 dump_printf_loc (MSG_NOTE, vect_location,
1783 "Peeling for alignment will be applied.\n");
1784 }
1785 /* The inside-loop cost will be accounted for in vectorizable_load
1786 and vectorizable_store correctly with adjusted alignments.
1787 Drop the body_cost_vec on the floor here. */
1788 body_cost_vec.release ();
1789
1790 stat = vect_verify_datarefs_alignment (loop_vinfo);
1791 gcc_assert (stat);
1792 return stat;
1793 }
1794 }
1795
1796 body_cost_vec.release ();
1797
1798 /* (2) Versioning to force alignment. */
1799
1800 /* Try versioning if:
1801 1) the loop is optimized for speed, and
1802 2) there is at least one unsupported misaligned data ref with an unknown
1803 misalignment, and
1804 3) all misaligned data refs with a known misalignment are supported, and
1805 4) the number of runtime alignment checks is within reason. */
1806
1807 do_versioning =
1808 optimize_loop_nest_for_speed_p (loop)
1809 && (!loop->inner); /* FORNOW */
1810
1811 if (do_versioning)
1812 {
1813 FOR_EACH_VEC_ELT (datarefs, i, dr)
1814 {
1815 stmt = DR_STMT (dr);
1816 stmt_info = vinfo_for_stmt (stmt);
1817
1818 /* For interleaving, only the alignment of the first access
1819 matters. */
1820 if (aligned_access_p (dr)
1821 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1822 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1823 continue;
1824
1825 if (STMT_VINFO_STRIDED_P (stmt_info))
1826 {
1827 /* Strided loads perform only component accesses; alignment is
1828 irrelevant for them. */
1829 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1830 continue;
1831 do_versioning = false;
1832 break;
1833 }
1834
1835 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1836
1837 if (!supportable_dr_alignment)
1838 {
1839 gimple *stmt;
1840 int mask;
1841 tree vectype;
1842
1843 if (known_alignment_for_access_p (dr)
1844 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1845 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1846 {
1847 do_versioning = false;
1848 break;
1849 }
1850
1851 stmt = DR_STMT (dr);
1852 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1853 gcc_assert (vectype);
1854
1855 /* The rightmost bits of an aligned address must be zeros.
1856 Construct the mask needed for this test. For example,
1857 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1858 mask must be 15 = 0xf. */
1859 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1860
1861 /* FORNOW: use the same mask to test all potentially unaligned
1862 references in the loop. The vectorizer currently supports
1863 a single vector size, see the reference to
1864 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1865 vectorization factor is computed. */
1866 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1867 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1868 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1869 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1870 DR_STMT (dr));
1871 }
1872 }
1873
1874 /* Versioning requires at least one misaligned data reference. */
1875 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1876 do_versioning = false;
1877 else if (!do_versioning)
1878 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1879 }
1880
1881 if (do_versioning)
1882 {
1883 vec<gimple *> may_misalign_stmts
1884 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1885 gimple *stmt;
1886
1887 /* It can now be assumed that the data references in the statements
1888 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1889 of the loop being vectorized. */
1890 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1891 {
1892 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1893 dr = STMT_VINFO_DATA_REF (stmt_info);
1894 SET_DR_MISALIGNMENT (dr, 0);
1895 if (dump_enabled_p ())
1896 dump_printf_loc (MSG_NOTE, vect_location,
1897 "Alignment of access forced using versioning.\n");
1898 }
1899
1900 if (dump_enabled_p ())
1901 dump_printf_loc (MSG_NOTE, vect_location,
1902 "Versioning for alignment will be applied.\n");
1903
1904 /* Peeling and versioning can't be done together at this time. */
1905 gcc_assert (! (do_peeling && do_versioning));
1906
1907 stat = vect_verify_datarefs_alignment (loop_vinfo);
1908 gcc_assert (stat);
1909 return stat;
1910 }
1911
1912 /* This point is reached if neither peeling nor versioning is being done. */
1913 gcc_assert (! (do_peeling || do_versioning));
1914
1915 stat = vect_verify_datarefs_alignment (loop_vinfo);
1916 return stat;
1917 }
1918
1919
1920 /* Function vect_find_same_alignment_drs.
1921
1922 Update group and alignment relations according to the chosen
1923 vectorization factor. */
1924
1925 static void
1926 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1927 loop_vec_info loop_vinfo)
1928 {
1929 unsigned int i;
1930 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1931 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1932 struct data_reference *dra = DDR_A (ddr);
1933 struct data_reference *drb = DDR_B (ddr);
1934 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1935 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1936 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1937 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1938 lambda_vector dist_v;
1939 unsigned int loop_depth;
1940
1941 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1942 return;
1943
1944 if (dra == drb)
1945 return;
1946
1947 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1948 return;
1949
1950 /* Loop-based vectorization and known data dependence. */
1951 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1952 return;
1953
1954 /* Data-dependence analysis reports a distance vector of zero
1955 for data-references that overlap only in the first iteration
1956 but have steps of different sign (see PR45764).
1957 So as a sanity check require equal DR_STEP. */
1958 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1959 return;
1960
1961 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1962 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1963 {
1964 int dist = dist_v[loop_depth];
1965
1966 if (dump_enabled_p ())
1967 dump_printf_loc (MSG_NOTE, vect_location,
1968 "dependence distance = %d.\n", dist);
1969
1970 /* Same loop iteration. */
1971 if (dist == 0
1972 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1973 {
1974 /* Two references with distance zero have the same alignment. */
1975 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1976 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1977 if (dump_enabled_p ())
1978 {
1979 dump_printf_loc (MSG_NOTE, vect_location,
1980 "accesses have the same alignment.\n");
1981 dump_printf (MSG_NOTE,
1982 "dependence distance modulo vf == 0 between ");
1983 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1984 dump_printf (MSG_NOTE, " and ");
1985 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1986 dump_printf (MSG_NOTE, "\n");
1987 }
1988 }
1989 }
1990 }
1991
1992
1993 /* Function vect_analyze_data_refs_alignment
1994
1995 Analyze the alignment of the data-references in the loop.
1996 Return FALSE if a data reference is found that cannot be vectorized. */
1997
1998 bool
1999 vect_analyze_data_refs_alignment (vec_info *vinfo)
2000 {
2001 if (dump_enabled_p ())
2002 dump_printf_loc (MSG_NOTE, vect_location,
2003 "=== vect_analyze_data_refs_alignment ===\n");
2004
2005 /* Mark groups of data references with same alignment using
2006 data dependence information. */
2007 if (is_a <loop_vec_info> (vinfo))
2008 {
2009 vec<ddr_p> ddrs = vinfo->ddrs;
2010 struct data_dependence_relation *ddr;
2011 unsigned int i;
2012
2013 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2014 vect_find_same_alignment_drs (ddr, as_a <loop_vec_info> (vinfo));
2015 }
2016
2017 if (!vect_compute_data_refs_alignment (vinfo))
2018 {
2019 if (dump_enabled_p ())
2020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2021 "not vectorized: can't calculate alignment "
2022 "for data ref.\n");
2023 return false;
2024 }
2025
2026 return true;
2027 }
2028
2029
2030 /* Analyze groups of accesses: check that DR belongs to a group of
2031 accesses of legal size, step, etc. Detect gaps, single element
2032 interleaving, and other special cases. Set grouped access info.
2033 Collect groups of strided stores for further use in SLP analysis.
2034 Worker for vect_analyze_group_access. */
2035
2036 static bool
2037 vect_analyze_group_access_1 (struct data_reference *dr)
2038 {
2039 tree step = DR_STEP (dr);
2040 tree scalar_type = TREE_TYPE (DR_REF (dr));
2041 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2042 gimple *stmt = DR_STMT (dr);
2043 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2044 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2045 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2046 HOST_WIDE_INT dr_step = -1;
2047 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2048 bool slp_impossible = false;
2049 struct loop *loop = NULL;
2050
2051 if (loop_vinfo)
2052 loop = LOOP_VINFO_LOOP (loop_vinfo);
2053
2054 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2055 size of the interleaving group (including gaps). */
2056 if (tree_fits_shwi_p (step))
2057 {
2058 dr_step = tree_to_shwi (step);
2059 groupsize = absu_hwi (dr_step) / type_size;
2060 }
2061 else
2062 groupsize = 0;
2063
2064 /* A non-consecutive access is possible only if it is part of interleaving. */
2065 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2066 {
2067 /* Check if this DR is part of an interleaving group, and is a single
2068 element of the group that is accessed in the loop. */
2069
2070 /* Gaps are supported only for loads. STEP must be a multiple of the type
2071 size. The size of the group must be a power of 2. */
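/* For example, a load from a[4*i] with 4-byte elements has STEP 16,
   so GROUPSIZE is 4 (a power of 2) and the access is recorded as a
   single element interleaving group of size 4.  */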
2072 if (DR_IS_READ (dr)
2073 && (dr_step % type_size) == 0
2074 && groupsize > 0
2075 && exact_log2 (groupsize) != -1)
2076 {
2077 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2078 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2079 if (dump_enabled_p ())
2080 {
2081 dump_printf_loc (MSG_NOTE, vect_location,
2082 "Detected single element interleaving ");
2083 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2084 dump_printf (MSG_NOTE, " step ");
2085 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2086 dump_printf (MSG_NOTE, "\n");
2087 }
2088
2089 if (loop_vinfo)
2090 {
2091 if (dump_enabled_p ())
2092 dump_printf_loc (MSG_NOTE, vect_location,
2093 "Data access with gaps requires scalar "
2094 "epilogue loop\n");
2095 if (loop->inner)
2096 {
2097 if (dump_enabled_p ())
2098 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2099 "Peeling for outer loop is not"
2100 " supported\n");
2101 return false;
2102 }
2103
2104 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2105 }
2106
2107 return true;
2108 }
2109
2110 if (dump_enabled_p ())
2111 {
2112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2113 "not consecutive access ");
2114 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2115 }
2116
2117 if (bb_vinfo)
2118 {
2119 /* Mark the statement as unvectorizable. */
2120 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2121 return true;
2122 }
2123
2124 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2125 STMT_VINFO_STRIDED_P (stmt_info) = true;
2126 return true;
2127 }
2128
2129 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2130 {
2131 /* First stmt in the interleaving chain. Check the chain. */
2132 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2133 struct data_reference *data_ref = dr;
2134 unsigned int count = 1;
2135 tree prev_init = DR_INIT (data_ref);
2136 gimple *prev = stmt;
2137 HOST_WIDE_INT diff, gaps = 0;
2138
2139 while (next)
2140 {
2141 /* Skip same data-refs. In case two or more stmts share the same
2142 data-ref (supported only for loads), we vectorize only the first
2143 stmt, and the rest get their vectorized loads from the first
2144 one. */
2145 if (!tree_int_cst_compare (DR_INIT (data_ref),
2146 DR_INIT (STMT_VINFO_DATA_REF (
2147 vinfo_for_stmt (next)))))
2148 {
2149 if (DR_IS_WRITE (data_ref))
2150 {
2151 if (dump_enabled_p ())
2152 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2153 "Two store stmts share the same dr.\n");
2154 return false;
2155 }
2156
2157 if (dump_enabled_p ())
2158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2159 "Two or more load stmts share the same dr.\n");
2160
2161 /* For load use the same data-ref load. */
2162 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2163
2164 prev = next;
2165 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2166 continue;
2167 }
2168
2169 prev = next;
2170 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2171
2172 /* All group members have the same STEP by construction. */
2173 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2174
2175 /* Check that the distance between two accesses is equal to the type
2176 size. Otherwise, we have gaps. */
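/* For example, group members with DR_INITs 0, 4 and 12 and 4-byte
   elements give diffs of 1 and 2; the second diff records a gap of
   one element before that member.  */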
2177 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2178 - TREE_INT_CST_LOW (prev_init)) / type_size;
2179 if (diff != 1)
2180 {
2181 /* FORNOW: SLP of accesses with gaps is not supported. */
2182 slp_impossible = true;
2183 if (DR_IS_WRITE (data_ref))
2184 {
2185 if (dump_enabled_p ())
2186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2187 "interleaved store with gaps\n");
2188 return false;
2189 }
2190
2191 gaps += diff - 1;
2192 }
2193
2194 last_accessed_element += diff;
2195
2196 /* Store the gap from the previous member of the group. If there is no
2197 gap in the access, GROUP_GAP is always 1. */
2198 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2199
2200 prev_init = DR_INIT (data_ref);
2201 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2202 /* Count the number of data-refs in the chain. */
2203 count++;
2204 }
2205
2206 if (groupsize == 0)
2207 groupsize = count + gaps;
2208
2209 if (groupsize > UINT_MAX)
2210 {
2211 if (dump_enabled_p ())
2212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2213 "group is too large\n");
2214 return false;
2215 }
2216
2217 /* Check that the size of the interleaving is equal to count for stores,
2218 i.e., that there are no gaps. */
2219 if (groupsize != count
2220 && !DR_IS_READ (dr))
2221 {
2222 if (dump_enabled_p ())
2223 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2224 "interleaved store with gaps\n");
2225 return false;
2226 }
2227
2228 /* If there is a gap after the last load in the group it is the
2229 difference between the groupsize and the last accessed
2230 element.
2231 When there is no gap, this difference should be 0. */
2232 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2233
2234 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2235 if (dump_enabled_p ())
2236 {
2237 dump_printf_loc (MSG_NOTE, vect_location,
2238 "Detected interleaving ");
2239 if (DR_IS_READ (dr))
2240 dump_printf (MSG_NOTE, "load ");
2241 else
2242 dump_printf (MSG_NOTE, "store ");
2243 dump_printf (MSG_NOTE, "of size %u starting with ",
2244 (unsigned)groupsize);
2245 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2246 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2247 dump_printf_loc (MSG_NOTE, vect_location,
2248 "There is a gap of %u elements after the group\n",
2249 GROUP_GAP (vinfo_for_stmt (stmt)));
2250 }
2251
2252 /* SLP: create an SLP data structure for every interleaving group of
2253 stores for further analysis in vect_analyze_slp. */
2254 if (DR_IS_WRITE (dr) && !slp_impossible)
2255 {
2256 if (loop_vinfo)
2257 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2258 if (bb_vinfo)
2259 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2260 }
2261
2262 /* If there is a gap at the end of the group, or the group size cannot
2263 be made a multiple of the vector element count then we access excess
2264 elements in the last iteration and thus need to peel that off. */
2265 if (loop_vinfo
2266 && (groupsize - last_accessed_element > 0
2267 || exact_log2 (groupsize) == -1))
2268
2269 {
2270 if (dump_enabled_p ())
2271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2272 "Data access with gaps requires scalar "
2273 "epilogue loop\n");
2274 if (loop->inner)
2275 {
2276 if (dump_enabled_p ())
2277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2278 "Peeling for outer loop is not supported\n");
2279 return false;
2280 }
2281
2282 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2283 }
2284 }
2285
2286 return true;
2287 }
2288
2289 /* Analyze groups of accesses: check that DR belongs to a group of
2290 accesses of legal size, step, etc. Detect gaps, single element
2291 interleaving, and other special cases. Set grouped access info.
2292 Collect groups of strided stores for further use in SLP analysis. */
2293
2294 static bool
2295 vect_analyze_group_access (struct data_reference *dr)
2296 {
2297 if (!vect_analyze_group_access_1 (dr))
2298 {
2299 /* Dissolve the group if present. */
2300 gimple *next;
2301 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2302 while (stmt)
2303 {
2304 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2305 next = GROUP_NEXT_ELEMENT (vinfo);
2306 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2307 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2308 stmt = next;
2309 }
2310 return false;
2311 }
2312 return true;
2313 }
2314
2315 /* Analyze the access pattern of the data-reference DR.
2316 In case of non-consecutive accesses call vect_analyze_group_access() to
2317 analyze groups of accesses. */
2318
2319 static bool
2320 vect_analyze_data_ref_access (struct data_reference *dr)
2321 {
2322 tree step = DR_STEP (dr);
2323 tree scalar_type = TREE_TYPE (DR_REF (dr));
2324 gimple *stmt = DR_STMT (dr);
2325 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2326 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2327 struct loop *loop = NULL;
2328
2329 if (loop_vinfo)
2330 loop = LOOP_VINFO_LOOP (loop_vinfo);
2331
2332 if (loop_vinfo && !step)
2333 {
2334 if (dump_enabled_p ())
2335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2336 "bad data-ref access in loop\n");
2337 return false;
2338 }
2339
2340 /* Allow loads with zero step in inner-loop vectorization. */
2341 if (loop_vinfo && integer_zerop (step))
2342 {
2343 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2344 if (!nested_in_vect_loop_p (loop, stmt))
2345 return DR_IS_READ (dr);
2346 /* Allow references with zero step for outer loops marked
2347 with pragma omp simd only - it guarantees absence of
2348 loop-carried dependencies between inner loop iterations. */
2349 if (!loop->force_vectorize)
2350 {
2351 if (dump_enabled_p ())
2352 dump_printf_loc (MSG_NOTE, vect_location,
2353 "zero step in inner loop of nest\n");
2354 return false;
2355 }
2356 }
2357
2358 if (loop && nested_in_vect_loop_p (loop, stmt))
2359 {
2360 /* Interleaved accesses are not yet supported within outer-loop
2361 vectorization for references in the inner-loop. */
2362 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2363
2364 /* For the rest of the analysis we use the outer-loop step. */
2365 step = STMT_VINFO_DR_STEP (stmt_info);
2366 if (integer_zerop (step))
2367 {
2368 if (dump_enabled_p ())
2369 dump_printf_loc (MSG_NOTE, vect_location,
2370 "zero step in outer loop.\n");
2371 return DR_IS_READ (dr);
2372 }
2373 }
2374
2375 /* Consecutive? */
2376 if (TREE_CODE (step) == INTEGER_CST)
2377 {
2378 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2379 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2380 || (dr_step < 0
2381 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2382 {
2383 /* Mark that it is not interleaving. */
2384 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2385 return true;
2386 }
2387 }
2388
2389 if (loop && nested_in_vect_loop_p (loop, stmt))
2390 {
2391 if (dump_enabled_p ())
2392 dump_printf_loc (MSG_NOTE, vect_location,
2393 "grouped access in outer loop.\n");
2394 return false;
2395 }
2396
2397
2398 /* Assume this is a DR handled by the non-constant strided load case. */
2399 if (TREE_CODE (step) != INTEGER_CST)
2400 return (STMT_VINFO_STRIDED_P (stmt_info)
2401 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2402 || vect_analyze_group_access (dr)));
2403
2404 /* Non-consecutive access - check if it's part of an interleaving group. */
2405 return vect_analyze_group_access (dr);
2406 }
2407
2408
2409
2410 /* A helper function used in the comparator function to sort data
2411 references. T1 and T2 are the two trees to be compared.
2412 The function returns -1, 0, or 1. */
2413
2414 static int
2415 compare_tree (tree t1, tree t2)
2416 {
2417 int i, cmp;
2418 enum tree_code code;
2419 char tclass;
2420
2421 if (t1 == t2)
2422 return 0;
2423 if (t1 == NULL)
2424 return -1;
2425 if (t2 == NULL)
2426 return 1;
2427
2428
2429 if (TREE_CODE (t1) != TREE_CODE (t2))
2430 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2431
2432 code = TREE_CODE (t1);
2433 switch (code)
2434 {
2435 /* For const values, we can just use hash values for comparisons. */
2436 case INTEGER_CST:
2437 case REAL_CST:
2438 case FIXED_CST:
2439 case STRING_CST:
2440 case COMPLEX_CST:
2441 case VECTOR_CST:
2442 {
2443 hashval_t h1 = iterative_hash_expr (t1, 0);
2444 hashval_t h2 = iterative_hash_expr (t2, 0);
2445 if (h1 != h2)
2446 return h1 < h2 ? -1 : 1;
2447 break;
2448 }
2449
2450 case SSA_NAME:
2451 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2452 if (cmp != 0)
2453 return cmp;
2454
2455 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2456 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2457 break;
2458
2459 default:
2460 tclass = TREE_CODE_CLASS (code);
2461
2462 /* For declarations, we can compare their UIDs. */
2463 if (tclass == tcc_declaration)
2464 {
2465 if (DECL_UID (t1) != DECL_UID (t2))
2466 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2467 break;
2468 }
2469
2470 /* For expressions with operands, compare their operands recursively. */
2471 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2472 {
2473 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2474 if (cmp != 0)
2475 return cmp;
2476 }
2477 }
2478
2479 return 0;
2480 }
2481
2482
2483 /* Compare two data-references DRA and DRB to sort them into chunks
2484 suitable for grouping. */
2485
2486 static int
2487 dr_group_sort_cmp (const void *dra_, const void *drb_)
2488 {
2489 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2490 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2491 int cmp;
2492
2493 /* Stabilize sort. */
2494 if (dra == drb)
2495 return 0;
2496
2497 /* Ordering of DRs according to base. */
2498 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2499 {
2500 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2501 if (cmp != 0)
2502 return cmp;
2503 }
2504
2505 /* And according to DR_OFFSET. */
2506 if (!dr_equal_offsets_p (dra, drb))
2507 {
2508 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2509 if (cmp != 0)
2510 return cmp;
2511 }
2512
2513 /* Put reads before writes. */
2514 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2515 return DR_IS_READ (dra) ? -1 : 1;
2516
2517 /* Then sort by access size. */
2518 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2519 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2520 {
2521 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2522 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2523 if (cmp != 0)
2524 return cmp;
2525 }
2526
2527 /* And by step. */
2528 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2529 {
2530 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2531 if (cmp != 0)
2532 return cmp;
2533 }
2534
2535 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2536 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2537 if (cmp == 0)
2538 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2539 return cmp;
2540 }
2541
2542 /* Function vect_analyze_data_ref_accesses.
2543
2544 Analyze the access pattern of all the data references in the loop.
2545
2546 FORNOW: the only access pattern that is considered vectorizable is a
2547 simple step 1 (consecutive) access.
2548
2549 FORNOW: handle only arrays and pointer accesses. */
2550
2551 bool
2552 vect_analyze_data_ref_accesses (vec_info *vinfo)
2553 {
2554 unsigned int i;
2555 vec<data_reference_p> datarefs = vinfo->datarefs;
2556 struct data_reference *dr;
2557
2558 if (dump_enabled_p ())
2559 dump_printf_loc (MSG_NOTE, vect_location,
2560 "=== vect_analyze_data_ref_accesses ===\n");
2561
2562 if (datarefs.is_empty ())
2563 return true;
2564
2565 /* Sort the array of datarefs to make building the interleaving chains
2566 linear. Don't modify the original vector's order; it is needed for
2567 determining what dependencies are reversed. */
2568 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2569 datarefs_copy.qsort (dr_group_sort_cmp);
2570
2571 /* Build the interleaving chains. */
2572 for (i = 0; i < datarefs_copy.length () - 1;)
2573 {
2574 data_reference_p dra = datarefs_copy[i];
2575 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2576 stmt_vec_info lastinfo = NULL;
2577 for (i = i + 1; i < datarefs_copy.length (); ++i)
2578 {
2579 data_reference_p drb = datarefs_copy[i];
2580 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2581
2582 /* ??? Imperfect sorting (non-compatible types, non-modulo
2583 accesses, same accesses) can lead to a group being artificially
2584 split here as we don't just skip over those. If it really
2585 matters we can push those to a worklist and re-iterate
2586 over them. Then we can just skip ahead to the next DR here. */
2587
2588 /* Check that the data-refs have the same first location (except init)
2589 and that they are both either stores or loads (not a load and a store,
2590 and not masked loads or stores). */
2591 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2592 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2593 DR_BASE_ADDRESS (drb), 0)
2594 || !dr_equal_offsets_p (dra, drb)
2595 || !gimple_assign_single_p (DR_STMT (dra))
2596 || !gimple_assign_single_p (DR_STMT (drb)))
2597 break;
2598
2599 /* Check that the data-refs have the same constant size. */
2600 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2601 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2602 if (!tree_fits_uhwi_p (sza)
2603 || !tree_fits_uhwi_p (szb)
2604 || !tree_int_cst_equal (sza, szb))
2605 break;
2606
2607 /* Check that the data-refs have the same step. */
2608 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2609 break;
2610
2611 /* Do not place the same access in the interleaving chain twice. */
2612 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2613 break;
2614
2615 /* Check the types are compatible.
2616 ??? We don't distinguish this during sorting. */
2617 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2618 TREE_TYPE (DR_REF (drb))))
2619 break;
2620
2621 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2622 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2623 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2624 gcc_assert (init_a < init_b);
2625
2626 /* If init_b == init_a + the size of the type * k, we have an
2627 interleaving, and DRA is accessed before DRB. */
2628 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2629 if ((init_b - init_a) % type_size_a != 0)
2630 break;
2631
2632 /* If we have a store, require that the accesses be adjacent. This splits
2633 groups into chunks we support (we don't support vectorization
2634 of stores with gaps). */
2635 if (!DR_IS_READ (dra)
2636 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2637 (DR_INIT (datarefs_copy[i-1]))
2638 != type_size_a))
2639 break;
2640
2641 /* Requiring the step (if constant and nonzero) to be greater than the
2642 difference between the data-refs' inits splits groups into
2643 suitable sizes. */
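/* For example, with a constant step of 32 bytes, data-refs with inits
   0 and 40 are not placed in the same group because 32 <= 40 - 0.  */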
2644 if (tree_fits_shwi_p (DR_STEP (dra)))
2645 {
2646 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2647 if (step != 0 && step <= (init_b - init_a))
2648 break;
2649 }
2650
2651 if (dump_enabled_p ())
2652 {
2653 dump_printf_loc (MSG_NOTE, vect_location,
2654 "Detected interleaving ");
2655 if (DR_IS_READ (dra))
2656 dump_printf (MSG_NOTE, "load ");
2657 else
2658 dump_printf (MSG_NOTE, "store ");
2659 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2660 dump_printf (MSG_NOTE, " and ");
2661 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2662 dump_printf (MSG_NOTE, "\n");
2663 }
2664
2665 /* Link the found element into the group list. */
2666 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2667 {
2668 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2669 lastinfo = stmtinfo_a;
2670 }
2671 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2672 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2673 lastinfo = stmtinfo_b;
2674 }
2675 }
2676
2677 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2678 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2679 && !vect_analyze_data_ref_access (dr))
2680 {
2681 if (dump_enabled_p ())
2682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2683 "not vectorized: complicated access pattern.\n");
2684
2685 if (is_a <bb_vec_info> (vinfo))
2686 {
2687 /* Mark the statement as not vectorizable. */
2688 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2689 continue;
2690 }
2691 else
2692 {
2693 datarefs_copy.release ();
2694 return false;
2695 }
2696 }
2697
2698 datarefs_copy.release ();
2699 return true;
2700 }
2701
2702
2703 /* Operator == between two dr_with_seg_len objects.
2704
2705 This equality operator is used to make sure two data refs
2706 are the same one, so that we can consider combining the
2707 aliasing checks of two pairs of data-dependent data
2708 refs. */
2709
2710 static bool
2711 operator == (const dr_with_seg_len& d1,
2712 const dr_with_seg_len& d2)
2713 {
2714 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2715 DR_BASE_ADDRESS (d2.dr), 0)
2716 && compare_tree (d1.offset, d2.offset) == 0
2717 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2718 }
2719
2720 /* Function comp_dr_with_seg_len_pair.
2721
2722 Comparison function for sorting objects of dr_with_seg_len_pair_t
2723 so that we can combine aliasing checks in one scan. */
2724
2725 static int
2726 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2727 {
2728 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2729 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2730
2731 const dr_with_seg_len &p11 = p1->first,
2732 &p12 = p1->second,
2733 &p21 = p2->first,
2734 &p22 = p2->second;
2735
2736 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2737 if a and c have the same base address and step, and b and d have the same
2738 address and step. Therefore, if either a&c or b&d don't have the same address
2739 and step, we don't care about the order of those two pairs after sorting.
2740 int comp_res;
2741
2742 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2743 DR_BASE_ADDRESS (p21.dr))) != 0)
2744 return comp_res;
2745 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2746 DR_BASE_ADDRESS (p22.dr))) != 0)
2747 return comp_res;
2748 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2749 return comp_res;
2750 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2751 return comp_res;
2752 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2753 return comp_res;
2754 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2755 return comp_res;
2756
2757 return 0;
2758 }
2759
2760 /* Function vect_vfa_segment_size.
2761
2762 Create an expression that computes the size of the segment
2763 that will be accessed for a data reference. The function takes into
2764 account that realignment loads may access one more vector.
2765
2766 Input:
2767 DR: The data reference.
2768 LENGTH_FACTOR: segment length to consider.
2769
2770 Return an expression whose value is the size of segment which will be
2771 accessed by DR. */
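/* As an illustration: for a data reference with a 4-byte step and
   LENGTH_FACTOR equal to a vectorization factor of 4, the segment
   length is 16 bytes, plus one extra vector size if the access uses
   dr_explicit_realign_optimized.  */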
2772
2773 static tree
2774 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2775 {
2776 tree segment_length;
2777
2778 if (integer_zerop (DR_STEP (dr)))
2779 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2780 else
2781 segment_length = size_binop (MULT_EXPR,
2782 fold_convert (sizetype, DR_STEP (dr)),
2783 fold_convert (sizetype, length_factor));
2784
2785 if (vect_supportable_dr_alignment (dr, false)
2786 == dr_explicit_realign_optimized)
2787 {
2788 tree vector_size = TYPE_SIZE_UNIT
2789 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2790
2791 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2792 }
2793 return segment_length;
2794 }
2795
2796 /* Function vect_prune_runtime_alias_test_list.
2797
2798 Prune a list of ddrs to be tested at run-time by versioning for alias.
2799 Merge several alias checks into one if possible.
2800 Return FALSE if the resulting list of ddrs is longer than allowed by
2801 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2802
2803 bool
2804 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2805 {
2806 vec<ddr_p> may_alias_ddrs =
2807 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2808 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2809 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2810 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2811 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2812
2813 ddr_p ddr;
2814 unsigned int i;
2815 tree length_factor;
2816
2817 if (dump_enabled_p ())
2818 dump_printf_loc (MSG_NOTE, vect_location,
2819 "=== vect_prune_runtime_alias_test_list ===\n");
2820
2821 if (may_alias_ddrs.is_empty ())
2822 return true;
2823
2824 /* Basically, for each pair of dependent data refs store_ptr_0
2825 and load_ptr_0, we create an expression:
2826
2827 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2828 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2829
2830 for aliasing checks. However, in some cases we can decrease
2831 the number of checks by combining two checks into one. For
2832 example, suppose we have another pair of data refs store_ptr_0
2833 and load_ptr_1, and if the following condition is satisfied:
2834
2835 load_ptr_0 < load_ptr_1 &&
2836 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2837
2838 (this condition means that, in each iteration of the vectorized loop,
2839 the accessed memory of store_ptr_0 cannot be between the memory
2840 of load_ptr_0 and load_ptr_1.)
2841
2842 we can then use only the following expression to finish the
2843 aliasing checks between store_ptr_0 & load_ptr_0 and
2844 store_ptr_0 & load_ptr_1:
2845
2846 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2847 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2848
2849 Note that we only consider the case in which load_ptr_0 and load_ptr_1
2850 have the same base address. */
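/* As a concrete illustration of the condition above: if load_ptr_1
   == load_ptr_0 + 4 and load_segment_length_0 is 16 bytes, then
   4 - 16 < store_segment_length_0 holds for any positive store
   segment, so the two checks against store_ptr_0 can be merged.  */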
2851
2852 comp_alias_ddrs.create (may_alias_ddrs.length ());
2853
2854 /* First, we collect all data ref pairs for aliasing checks. */
2855 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2856 {
2857 struct data_reference *dr_a, *dr_b;
2858 gimple *dr_group_first_a, *dr_group_first_b;
2859 tree segment_length_a, segment_length_b;
2860 gimple *stmt_a, *stmt_b;
2861
2862 dr_a = DDR_A (ddr);
2863 stmt_a = DR_STMT (DDR_A (ddr));
2864 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2865 if (dr_group_first_a)
2866 {
2867 stmt_a = dr_group_first_a;
2868 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2869 }
2870
2871 dr_b = DDR_B (ddr);
2872 stmt_b = DR_STMT (DDR_B (ddr));
2873 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2874 if (dr_group_first_b)
2875 {
2876 stmt_b = dr_group_first_b;
2877 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2878 }
2879
2880 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2881 length_factor = scalar_loop_iters;
2882 else
2883 length_factor = size_int (vect_factor);
2884 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2885 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2886
2887 dr_with_seg_len_pair_t dr_with_seg_len_pair
2888 (dr_with_seg_len (dr_a, segment_length_a),
2889 dr_with_seg_len (dr_b, segment_length_b));
2890
2891 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2892 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2893
2894 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2895 }
2896
2897 /* Second, we sort the collected data ref pairs so that we can scan
2898 them once to combine all possible aliasing checks. */
2899 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2900
2901 /* Third, we scan the sorted dr pairs and check if we can combine
2902 alias checks of two neighbouring dr pairs. */
2903 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2904 {
2905 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2906 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2907 *dr_b1 = &comp_alias_ddrs[i-1].second,
2908 *dr_a2 = &comp_alias_ddrs[i].first,
2909 *dr_b2 = &comp_alias_ddrs[i].second;
2910
2911 /* Remove duplicate data ref pairs. */
2912 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2913 {
2914 if (dump_enabled_p ())
2915 {
2916 dump_printf_loc (MSG_NOTE, vect_location,
2917 "found equal ranges ");
2918 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2919 DR_REF (dr_a1->dr));
2920 dump_printf (MSG_NOTE, ", ");
2921 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2922 DR_REF (dr_b1->dr));
2923 dump_printf (MSG_NOTE, " and ");
2924 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2925 DR_REF (dr_a2->dr));
2926 dump_printf (MSG_NOTE, ", ");
2927 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2928 DR_REF (dr_b2->dr));
2929 dump_printf (MSG_NOTE, "\n");
2930 }
2931
2932 comp_alias_ddrs.ordered_remove (i--);
2933 continue;
2934 }
2935
2936 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2937 {
2938 /* We consider the case that DR_B1 and DR_B2 are the same memref,
2939 and DR_A1 and DR_A2 are two consecutive memrefs. */
2940 if (*dr_a1 == *dr_a2)
2941 {
2942 std::swap (dr_a1, dr_b1);
2943 std::swap (dr_a2, dr_b2);
2944 }
2945
2946 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2947 DR_BASE_ADDRESS (dr_a2->dr),
2948 0)
2949 || !tree_fits_shwi_p (dr_a1->offset)
2950 || !tree_fits_shwi_p (dr_a2->offset))
2951 continue;
2952
2953 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2954 - tree_to_shwi (dr_a1->offset));
2955
2956
2957 /* Now we check if the following condition is satisfied:
2958
2959 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2960
2961 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2962 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant, so we
2963 have to make a conservative estimate. We take a constant lower
2964 bound of SEGMENT_LENGTH_B, represented by MIN_SEG_LEN_B; then
2965 either of the following two conditions can guarantee the
2966 one above:
2967
2968 1: DIFF <= MIN_SEG_LEN_B
2969 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2970
2971 */
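/* Either condition is sufficient: MIN_SEG_LEN_B is a lower bound for
   SEGMENT_LENGTH_B, so condition 2 directly implies the one above,
   and condition 1 implies condition 2 whenever SEGMENT_LENGTH_A is
   positive.  */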
2972
2973 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2974 ? tree_to_shwi (dr_b1->seg_len)
2975 : vect_factor);
2976
2977 if (diff <= min_seg_len_b
2978 || (tree_fits_shwi_p (dr_a1->seg_len)
2979 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2980 {
2981 if (dump_enabled_p ())
2982 {
2983 dump_printf_loc (MSG_NOTE, vect_location,
2984 "merging ranges for ");
2985 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2986 DR_REF (dr_a1->dr));
2987 dump_printf (MSG_NOTE, ", ");
2988 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2989 DR_REF (dr_b1->dr));
2990 dump_printf (MSG_NOTE, " and ");
2991 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2992 DR_REF (dr_a2->dr));
2993 dump_printf (MSG_NOTE, ", ");
2994 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2995 DR_REF (dr_b2->dr));
2996 dump_printf (MSG_NOTE, "\n");
2997 }
2998
2999 dr_a1->seg_len = size_binop (PLUS_EXPR,
3000 dr_a2->seg_len, size_int (diff));
3001 comp_alias_ddrs.ordered_remove (i--);
3002 }
3003 }
3004 }
3005
3006 dump_printf_loc (MSG_NOTE, vect_location,
3007 "improved number of alias checks from %d to %d\n",
3008 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3009 if ((int) comp_alias_ddrs.length () >
3010 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3011 return false;
3012
3013 return true;
3014 }
3015
3016 /* Check whether a non-affine read or write in stmt is suitable for gather load
3017 or scatter store and if so, return a builtin decl for that operation. */
3018
3019 tree
3020 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
3021 tree *offp, int *scalep)
3022 {
3023 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3024 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3025 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3026 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3027 tree offtype = NULL_TREE;
3028 tree decl, base, off;
3029 machine_mode pmode;
3030 int punsignedp, pvolatilep;
3031
3032 base = DR_REF (dr);
3033 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF;
3034 see if we can use the def stmt of the address. */
3035 if (is_gimple_call (stmt)
3036 && gimple_call_internal_p (stmt)
3037 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3038 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3039 && TREE_CODE (base) == MEM_REF
3040 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3041 && integer_zerop (TREE_OPERAND (base, 1))
3042 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3043 {
3044 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3045 if (is_gimple_assign (def_stmt)
3046 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3047 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3048 }
3049
3050 /* The gather and scatter builtins need address of the form
3051 loop_invariant + vector * {1, 2, 4, 8}
3052 or
3053 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3054 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3055 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3056 multiplications and additions in it. To get a vector, we need
3057 a single SSA_NAME that will be defined in the loop and will
3058 contain everything that is not loop invariant and that can be
3059 vectorized. The following code attempts to find such a preexisting
3060 SSA_NAME OFF and put the loop invariants into a tree BASE
3061 that can be gimplified before the loop. */
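/* For instance (illustrative only), for a gather-like read p[idx[i]]
   with 4-byte elements, the loop-invariant address of p[0] ends up in
   BASE, the SSA_NAME holding idx[i] (possibly sign-extended) becomes
   OFF, and SCALE becomes 4.  */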
3062 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3063 &pmode, &punsignedp, &pvolatilep, false);
3064 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3065
3066 if (TREE_CODE (base) == MEM_REF)
3067 {
3068 if (!integer_zerop (TREE_OPERAND (base, 1)))
3069 {
3070 if (off == NULL_TREE)
3071 {
3072 offset_int moff = mem_ref_offset (base);
3073 off = wide_int_to_tree (sizetype, moff);
3074 }
3075 else
3076 off = size_binop (PLUS_EXPR, off,
3077 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3078 }
3079 base = TREE_OPERAND (base, 0);
3080 }
3081 else
3082 base = build_fold_addr_expr (base);
3083
3084 if (off == NULL_TREE)
3085 off = size_zero_node;
3086
3087 /* If base is not loop invariant, then if off is 0 we start with just
3088 the constant offset in the loop invariant BASE and continue with base
3089 as OFF; otherwise give up.
3090 We could handle that case by gimplifying the addition of base + off
3091 into some SSA_NAME and using that as OFF, but for now punt. */
3092 if (!expr_invariant_in_loop_p (loop, base))
3093 {
3094 if (!integer_zerop (off))
3095 return NULL_TREE;
3096 off = base;
3097 base = size_int (pbitpos / BITS_PER_UNIT);
3098 }
3099 /* Otherwise put base + constant offset into the loop invariant BASE
3100 and continue with OFF. */
3101 else
3102 {
3103 base = fold_convert (sizetype, base);
3104 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3105 }
3106
3107 /* OFF at this point may be either a SSA_NAME or some tree expression
3108 from get_inner_reference. Try to peel off loop invariants from it
3109 into BASE as long as possible. */
3110 STRIP_NOPS (off);
3111 while (offtype == NULL_TREE)
3112 {
3113 enum tree_code code;
3114 tree op0, op1, add = NULL_TREE;
3115
3116 if (TREE_CODE (off) == SSA_NAME)
3117 {
3118 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3119
3120 if (expr_invariant_in_loop_p (loop, off))
3121 return NULL_TREE;
3122
3123 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3124 break;
3125
3126 op0 = gimple_assign_rhs1 (def_stmt);
3127 code = gimple_assign_rhs_code (def_stmt);
3128 op1 = gimple_assign_rhs2 (def_stmt);
3129 }
3130 else
3131 {
3132 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3133 return NULL_TREE;
3134 code = TREE_CODE (off);
3135 extract_ops_from_tree (off, &code, &op0, &op1);
3136 }
3137 switch (code)
3138 {
3139 case POINTER_PLUS_EXPR:
3140 case PLUS_EXPR:
3141 if (expr_invariant_in_loop_p (loop, op0))
3142 {
3143 add = op0;
3144 off = op1;
3145 do_add:
3146 add = fold_convert (sizetype, add);
3147 if (scale != 1)
3148 add = size_binop (MULT_EXPR, add, size_int (scale));
3149 base = size_binop (PLUS_EXPR, base, add);
3150 continue;
3151 }
3152 if (expr_invariant_in_loop_p (loop, op1))
3153 {
3154 add = op1;
3155 off = op0;
3156 goto do_add;
3157 }
3158 break;
3159 case MINUS_EXPR:
3160 if (expr_invariant_in_loop_p (loop, op1))
3161 {
3162 add = fold_convert (sizetype, op1);
3163 add = size_binop (MINUS_EXPR, size_zero_node, add);
3164 off = op0;
3165 goto do_add;
3166 }
3167 break;
3168 case MULT_EXPR:
3169 if (scale == 1 && tree_fits_shwi_p (op1))
3170 {
3171 scale = tree_to_shwi (op1);
3172 off = op0;
3173 continue;
3174 }
3175 break;
3176 case SSA_NAME:
3177 off = op0;
3178 continue;
3179 CASE_CONVERT:
3180 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3181 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3182 break;
3183 if (TYPE_PRECISION (TREE_TYPE (op0))
3184 == TYPE_PRECISION (TREE_TYPE (off)))
3185 {
3186 off = op0;
3187 continue;
3188 }
3189 if (TYPE_PRECISION (TREE_TYPE (op0))
3190 < TYPE_PRECISION (TREE_TYPE (off)))
3191 {
3192 off = op0;
3193 offtype = TREE_TYPE (off);
3194 STRIP_NOPS (off);
3195 continue;
3196 }
3197 break;
3198 default:
3199 break;
3200 }
3201 break;
3202 }
3203
3204 /* If at the end OFF still isn't a SSA_NAME or isn't
3205 defined in the loop, punt. */
3206 if (TREE_CODE (off) != SSA_NAME
3207 || expr_invariant_in_loop_p (loop, off))
3208 return NULL_TREE;
3209
3210 if (offtype == NULL_TREE)
3211 offtype = TREE_TYPE (off);
3212
3213 if (DR_IS_READ (dr))
3214 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3215 offtype, scale);
3216 else
3217 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3218 offtype, scale);
3219
3220 if (decl == NULL_TREE)
3221 return NULL_TREE;
3222
3223 if (basep)
3224 *basep = base;
3225 if (offp)
3226 *offp = off;
3227 if (scalep)
3228 *scalep = scale;
3229 return decl;
3230 }
3231
3232 /* Function vect_analyze_data_refs.
3233
3234 Find all the data references in the loop or basic block.
3235
3236 The general structure of the analysis of data refs in the vectorizer is as
3237 follows:
3238 1- vect_analyze_data_refs(loop/bb): call
3239 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3240 in the loop/bb and their dependences.
3241 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3242 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3243 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3244
3245 */
3246
3247 bool
3248 vect_analyze_data_refs (vec_info *vinfo, int *min_vf, unsigned *n_stmts)
3249 {
3250 struct loop *loop = NULL;
3251 basic_block bb = NULL;
3252 unsigned int i;
3253 vec<data_reference_p> datarefs;
3254 struct data_reference *dr;
3255 tree scalar_type;
3256
3257 if (dump_enabled_p ())
3258 dump_printf_loc (MSG_NOTE, vect_location,
3259 "=== vect_analyze_data_refs ===\n");
3260
3261 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3262 {
3263 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3264
3265 loop = LOOP_VINFO_LOOP (loop_vinfo);
3266 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3267 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3268 {
3269 if (dump_enabled_p ())
3270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3271 "not vectorized: loop contains function calls"
3272 " or data references that cannot be analyzed\n");
3273 return false;
3274 }
3275
3276 for (i = 0; i < loop->num_nodes; i++)
3277 {
3278 gimple_stmt_iterator gsi;
3279
3280 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3281 {
3282 gimple *stmt = gsi_stmt (gsi);
3283 if (is_gimple_debug (stmt))
3284 continue;
3285 ++*n_stmts;
3286 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3287 {
3288 if (is_gimple_call (stmt) && loop->safelen)
3289 {
3290 tree fndecl = gimple_call_fndecl (stmt), op;
3291 if (fndecl != NULL_TREE)
3292 {
3293 struct cgraph_node *node = cgraph_node::get (fndecl);
3294 if (node != NULL && node->simd_clones != NULL)
3295 {
3296 unsigned int j, n = gimple_call_num_args (stmt);
3297 for (j = 0; j < n; j++)
3298 {
3299 op = gimple_call_arg (stmt, j);
3300 if (DECL_P (op)
3301 || (REFERENCE_CLASS_P (op)
3302 && get_base_address (op)))
3303 break;
3304 }
3305 op = gimple_call_lhs (stmt);
3306 /* Ignore #pragma omp declare simd functions
3307 if they don't have data references in the
3308 call stmt itself. */
3309 if (j == n
3310 && !(op
3311 && (DECL_P (op)
3312 || (REFERENCE_CLASS_P (op)
3313 && get_base_address (op)))))
3314 continue;
3315 }
3316 }
3317 }
3318 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3319 if (dump_enabled_p ())
3320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3321 "not vectorized: loop contains function "
3322 "calls or data references that cannot "
3323 "be analyzed\n");
3324 return false;
3325 }
3326 }
3327 }
3328
3329 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3330 }
3331 else
3332 {
3333 bb_vec_info bb_vinfo = as_a <bb_vec_info> (vinfo);
3334 gimple_stmt_iterator gsi;
3335
3336 bb = BB_VINFO_BB (bb_vinfo);
3337 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3338 {
3339 gimple *stmt = gsi_stmt (gsi);
3340 if (is_gimple_debug (stmt))
3341 continue;
3342 ++*n_stmts;
3343 if (!find_data_references_in_stmt (NULL, stmt,
3344 &BB_VINFO_DATAREFS (bb_vinfo)))
3345 {
3346 /* Mark the rest of the basic-block as unvectorizable. */
3347 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3348 {
3349 stmt = gsi_stmt (gsi);
3350 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3351 }
3352 break;
3353 }
3354 }
3355
3356 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3357 }
3358
3359 /* Go through the data-refs, check that the analysis succeeded. Update
3360 pointer from stmt_vec_info struct to DR and vectype. */
3361
3362 FOR_EACH_VEC_ELT (datarefs, i, dr)
3363 {
3364 gimple *stmt;
3365 stmt_vec_info stmt_info;
3366 tree base, offset, init;
3367 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3368 bool simd_lane_access = false;
3369 int vf;
3370
3371 again:
3372 if (!dr || !DR_REF (dr))
3373 {
3374 if (dump_enabled_p ())
3375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3376 "not vectorized: unhandled data-ref\n");
3377 return false;
3378 }
3379
3380 stmt = DR_STMT (dr);
3381 stmt_info = vinfo_for_stmt (stmt);
3382
3383 /* Discard clobbers from the dataref vector. We will remove
3384 clobber stmts during vectorization. */
3385 if (gimple_clobber_p (stmt))
3386 {
3387 free_data_ref (dr);
3388 if (i == datarefs.length () - 1)
3389 {
3390 datarefs.pop ();
3391 break;
3392 }
3393 datarefs.ordered_remove (i);
3394 dr = datarefs[i];
3395 goto again;
3396 }
3397
3398 /* Check that analysis of the data-ref succeeded. */
3399 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3400 || !DR_STEP (dr))
3401 {
3402 bool maybe_gather
3403 = DR_IS_READ (dr)
3404 && !TREE_THIS_VOLATILE (DR_REF (dr))
3405 && targetm.vectorize.builtin_gather != NULL;
3406 bool maybe_scatter
3407 = DR_IS_WRITE (dr)
3408 && !TREE_THIS_VOLATILE (DR_REF (dr))
3409 && targetm.vectorize.builtin_scatter != NULL;
3410 bool maybe_simd_lane_access
3411 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3412
3413 /* If the target supports vector gather loads or scatter stores, or if
3414 this might be a SIMD lane access, see if they can be used. */
3415 if (is_a <loop_vec_info> (vinfo)
3416 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3417 && !nested_in_vect_loop_p (loop, stmt))
3418 {
3419 struct data_reference *newdr
3420 = create_data_ref (NULL, loop_containing_stmt (stmt),
3421 DR_REF (dr), stmt, maybe_scatter ? false : true);
3422 gcc_assert (newdr != NULL && DR_REF (newdr));
3423 if (DR_BASE_ADDRESS (newdr)
3424 && DR_OFFSET (newdr)
3425 && DR_INIT (newdr)
3426 && DR_STEP (newdr)
3427 && integer_zerop (DR_STEP (newdr)))
3428 {
3429 if (maybe_simd_lane_access)
3430 {
3431 tree off = DR_OFFSET (newdr);
3432 STRIP_NOPS (off);
3433 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3434 && TREE_CODE (off) == MULT_EXPR
3435 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3436 {
3437 tree step = TREE_OPERAND (off, 1);
3438 off = TREE_OPERAND (off, 0);
3439 STRIP_NOPS (off);
3440 if (CONVERT_EXPR_P (off)
3441 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3442 0)))
3443 < TYPE_PRECISION (TREE_TYPE (off)))
3444 off = TREE_OPERAND (off, 0);
3445 if (TREE_CODE (off) == SSA_NAME)
3446 {
3447 gimple *def = SSA_NAME_DEF_STMT (off);
3448 tree reft = TREE_TYPE (DR_REF (newdr));
3449 if (is_gimple_call (def)
3450 && gimple_call_internal_p (def)
3451 && (gimple_call_internal_fn (def)
3452 == IFN_GOMP_SIMD_LANE))
3453 {
3454 tree arg = gimple_call_arg (def, 0);
3455 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3456 arg = SSA_NAME_VAR (arg);
3457 if (arg == loop->simduid
3458 /* For now. */
3459 && tree_int_cst_equal
3460 (TYPE_SIZE_UNIT (reft),
3461 step))
3462 {
3463 DR_OFFSET (newdr) = ssize_int (0);
3464 DR_STEP (newdr) = step;
3465 DR_ALIGNED_TO (newdr)
3466 = size_int (BIGGEST_ALIGNMENT);
3467 dr = newdr;
3468 simd_lane_access = true;
3469 }
3470 }
3471 }
3472 }
3473 }
3474 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3475 {
3476 dr = newdr;
3477 if (maybe_gather)
3478 gatherscatter = GATHER;
3479 else
3480 gatherscatter = SCATTER;
3481 }
3482 }
3483 if (gatherscatter == SG_NONE && !simd_lane_access)
3484 free_data_ref (newdr);
3485 }
3486
3487 if (gatherscatter == SG_NONE && !simd_lane_access)
3488 {
3489 if (dump_enabled_p ())
3490 {
3491 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3492 "not vectorized: data ref analysis "
3493 "failed ");
3494 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3495 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3496 }
3497
3498 if (is_a <bb_vec_info> (vinfo))
3499 break;
3500
3501 return false;
3502 }
3503 }
3504
3505 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3506 {
3507 if (dump_enabled_p ())
3508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3509 "not vectorized: base addr of dr is a "
3510 "constant\n");
3511
3512 if (is_a <bb_vec_info> (vinfo))
3513 break;
3514
3515 if (gatherscatter != SG_NONE || simd_lane_access)
3516 free_data_ref (dr);
3517 return false;
3518 }
3519
3520 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3521 {
3522 if (dump_enabled_p ())
3523 {
3524 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3525 "not vectorized: volatile type ");
3526 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3527 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3528 }
3529
3530 if (is_a <bb_vec_info> (vinfo))
3531 break;
3532
3533 return false;
3534 }
3535
3536 if (stmt_can_throw_internal (stmt))
3537 {
3538 if (dump_enabled_p ())
3539 {
3540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3541 "not vectorized: statement can throw an "
3542 "exception ");
3543 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3544 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3545 }
3546
3547 if (is_a <bb_vec_info> (vinfo))
3548 break;
3549
3550 if (gatherscatter != SG_NONE || simd_lane_access)
3551 free_data_ref (dr);
3552 return false;
3553 }
3554
3555 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3556 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3557 {
3558 if (dump_enabled_p ())
3559 {
3560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3561 "not vectorized: statement is bitfield "
3562 "access ");
3563 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3564 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3565 }
3566
3567 if (is_a <bb_vec_info> (vinfo))
3568 break;
3569
3570 if (gatherscatter != SG_NONE || simd_lane_access)
3571 free_data_ref (dr);
3572 return false;
3573 }
3574
3575 base = unshare_expr (DR_BASE_ADDRESS (dr));
3576 offset = unshare_expr (DR_OFFSET (dr));
3577 init = unshare_expr (DR_INIT (dr));
3578
3579 if (is_gimple_call (stmt)
3580 && (!gimple_call_internal_p (stmt)
3581 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3582 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3583 {
3584 if (dump_enabled_p ())
3585 {
3586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3587 "not vectorized: dr in a call ");
3588 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3589 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3590 }
3591
3592 if (is_a <bb_vec_info> (vinfo))
3593 break;
3594
3595 if (gatherscatter != SG_NONE || simd_lane_access)
3596 free_data_ref (dr);
3597 return false;
3598 }
3599
3600 /* Update DR field in stmt_vec_info struct. */
3601
3602 /* If the dataref is in an inner-loop of the loop that is considered
3603 for vectorization, we also want to analyze the access relative to
3604 the outer-loop (DR contains information only relative to the
3605 inner-most enclosing loop). We do that by building a reference to the
3606 first location accessed by the inner-loop, and analyze it relative to
3607 the outer-loop. */
3608 if (loop && nested_in_vect_loop_p (loop, stmt))
3609 {
3610 tree outer_step, outer_base, outer_init;
3611 HOST_WIDE_INT pbitsize, pbitpos;
3612 tree poffset;
3613 machine_mode pmode;
3614 int punsignedp, pvolatilep;
3615 affine_iv base_iv, offset_iv;
3616 tree dinit;
3617
3618 /* Build a reference to the first location accessed by the
3619 inner-loop: *(BASE+INIT). (The first location is actually
3620 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3621 tree inner_base = build_fold_indirect_ref
3622 (fold_build_pointer_plus (base, init));
3623
3624 if (dump_enabled_p ())
3625 {
3626 dump_printf_loc (MSG_NOTE, vect_location,
3627 "analyze in outer-loop: ");
3628 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3629 dump_printf (MSG_NOTE, "\n");
3630 }
3631
3632 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3633 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3634 gcc_assert (outer_base != NULL_TREE);
3635
3636 if (pbitpos % BITS_PER_UNIT != 0)
3637 {
3638 if (dump_enabled_p ())
3639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3640 "failed: bit offset alignment.\n");
3641 return false;
3642 }
3643
3644 outer_base = build_fold_addr_expr (outer_base);
3645 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3646 &base_iv, false))
3647 {
3648 if (dump_enabled_p ())
3649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3650 "failed: evolution of base is not affine.\n");
3651 return false;
3652 }
3653
3654 if (offset)
3655 {
3656 if (poffset)
3657 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3658 poffset);
3659 else
3660 poffset = offset;
3661 }
3662
3663 if (!poffset)
3664 {
3665 offset_iv.base = ssize_int (0);
3666 offset_iv.step = ssize_int (0);
3667 }
3668 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3669 &offset_iv, false))
3670 {
3671 if (dump_enabled_p ())
3672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3673 "evolution of offset is not affine.\n");
3674 return false;
3675 }
3676
3677 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3678 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3679 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3680 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3681 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3682
3683 outer_step = size_binop (PLUS_EXPR,
3684 fold_convert (ssizetype, base_iv.step),
3685 fold_convert (ssizetype, offset_iv.step));
3686
3687 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3688 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3689 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3690 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3691 STMT_VINFO_DR_OFFSET (stmt_info) =
3692 fold_convert (ssizetype, offset_iv.base);
3693 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3694 size_int (highest_pow2_factor (offset_iv.base));
3695
3696 if (dump_enabled_p ())
3697 {
3698 dump_printf_loc (MSG_NOTE, vect_location,
3699 "\touter base_address: ");
3700 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3701 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3702 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3703 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3704 STMT_VINFO_DR_OFFSET (stmt_info));
3705 dump_printf (MSG_NOTE,
3706 "\n\touter constant offset from base address: ");
3707 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3708 STMT_VINFO_DR_INIT (stmt_info));
3709 dump_printf (MSG_NOTE, "\n\touter step: ");
3710 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3711 STMT_VINFO_DR_STEP (stmt_info));
3712 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3713 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3714 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3715 dump_printf (MSG_NOTE, "\n");
3716 }
3717 }
3718
3719 if (STMT_VINFO_DATA_REF (stmt_info))
3720 {
3721 if (dump_enabled_p ())
3722 {
3723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3724 "not vectorized: more than one data ref "
3725 "in stmt: ");
3726 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3727 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3728 }
3729
3730 if (is_a <bb_vec_info> (vinfo))
3731 break;
3732
3733 if (gatherscatter != SG_NONE || simd_lane_access)
3734 free_data_ref (dr);
3735 return false;
3736 }
3737
3738 STMT_VINFO_DATA_REF (stmt_info) = dr;
3739 if (simd_lane_access)
3740 {
3741 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3742 free_data_ref (datarefs[i]);
3743 datarefs[i] = dr;
3744 }
3745
3746 /* Set vectype for STMT. */
3747 scalar_type = TREE_TYPE (DR_REF (dr));
3748 STMT_VINFO_VECTYPE (stmt_info)
3749 = get_vectype_for_scalar_type (scalar_type);
3750 if (!STMT_VINFO_VECTYPE (stmt_info))
3751 {
3752 if (dump_enabled_p ())
3753 {
3754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3755 "not vectorized: no vectype for stmt: ");
3756 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3757 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3758 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3759 scalar_type);
3760 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3761 }
3762
3763 if (is_a <bb_vec_info> (vinfo))
3764 break;
3765
3766 if (gatherscatter != SG_NONE || simd_lane_access)
3767 {
3768 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3769 if (gatherscatter != SG_NONE)
3770 free_data_ref (dr);
3771 }
3772 return false;
3773 }
3774 else
3775 {
3776 if (dump_enabled_p ())
3777 {
3778 dump_printf_loc (MSG_NOTE, vect_location,
3779 "got vectype for stmt: ");
3780 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3781 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3782 STMT_VINFO_VECTYPE (stmt_info));
3783 dump_printf (MSG_NOTE, "\n");
3784 }
3785 }
3786
3787 /* Adjust the minimal vectorization factor according to the
3788 vector type. */
3789 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3790 if (vf > *min_vf)
3791 *min_vf = vf;
3792
3793 if (gatherscatter != SG_NONE)
3794 {
3795 tree off;
3796 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3797 NULL, &off, NULL)
3798 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3799 {
3800 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3801 free_data_ref (dr);
3802 if (dump_enabled_p ())
3803 {
3804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3805 (gatherscatter == GATHER) ?
3806 "not vectorized: not suitable for gather "
3807 "load " :
3808 "not vectorized: not suitable for scatter "
3809 "store ");
3810 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3811 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3812 }
3813 return false;
3814 }
3815
3816 datarefs[i] = dr;
3817 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3818 }
3819
3820 else if (is_a <loop_vec_info> (vinfo)
3821 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3822 {
3823 if (nested_in_vect_loop_p (loop, stmt))
3824 {
3825 if (dump_enabled_p ())
3826 {
3827 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3828 "not vectorized: not suitable for strided "
3829 "load ");
3830 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3831 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3832 }
3833 return false;
3834 }
3835 STMT_VINFO_STRIDED_P (stmt_info) = true;
3836 }
3837 }
3838
3839 /* If we stopped analysis at the first dataref we could not analyze
3840 when trying to vectorize a basic-block, mark the rest of the datarefs
3841 as not vectorizable and truncate the vector of datarefs. That
3842 avoids spending useless time in analyzing their dependence. */
3843 if (i != datarefs.length ())
3844 {
3845 gcc_assert (is_a <bb_vec_info> (vinfo));
3846 for (unsigned j = i; j < datarefs.length (); ++j)
3847 {
3848 data_reference_p dr = datarefs[j];
3849 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3850 free_data_ref (dr);
3851 }
3852 datarefs.truncate (i);
3853 }
3854
3855 return true;
3856 }
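
/* Illustrative sketch, not part of GCC: example scalar loops whose data
   references correspond to the cases handled above.  The classification in
   the comments assumes a target that provides gather and scatter support;
   the loop and variable names are hypothetical and the snippet is kept out
   of the build with #if 0.  */
#if 0
void
example_data_ref_kinds (int *a, int *b, int *idx, int stride, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = b[i] + 1;        /* constant unit step: ordinary data-ref.  */

  for (int i = 0; i < n; i++)
    a[i * stride] = b[i];   /* DR_STEP is not an INTEGER_CST, so
                               STMT_VINFO_STRIDED_P would be set.  */

  for (int i = 0; i < n; i++)
    a[i] = b[idx[i]];       /* load through a variable index: candidate
                               for a gather (GATHER).  */

  for (int i = 0; i < n; i++)
    a[idx[i]] = b[i];       /* store through a variable index: candidate
                               for a scatter (SCATTER).  */
}
#endif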
3857
3858
3859 /* Function vect_get_new_vect_var.
3860
3861 Returns a name for a new variable. The current naming scheme prepends
3862 the prefix "vect_", "stmp_" or "vectp_" (depending on the value of
3863 VAR_KIND) to NAME if provided, and otherwise uses the bare prefix for
3864 vectorizer generated variables. */
3865
3866 tree
3867 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3868 {
3869 const char *prefix;
3870 tree new_vect_var;
3871
3872 switch (var_kind)
3873 {
3874 case vect_simple_var:
3875 prefix = "vect";
3876 break;
3877 case vect_scalar_var:
3878 prefix = "stmp";
3879 break;
3880 case vect_pointer_var:
3881 prefix = "vectp";
3882 break;
3883 default:
3884 gcc_unreachable ();
3885 }
3886
3887 if (name)
3888 {
3889 char* tmp = concat (prefix, "_", name, NULL);
3890 new_vect_var = create_tmp_reg (type, tmp);
3891 free (tmp);
3892 }
3893 else
3894 new_vect_var = create_tmp_reg (type, prefix);
3895
3896 return new_vect_var;
3897 }
3898
3899 /* Like vect_get_new_vect_var but return an SSA name. */
3900
3901 tree
3902 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3903 {
3904 const char *prefix;
3905 tree new_vect_var;
3906
3907 switch (var_kind)
3908 {
3909 case vect_simple_var:
3910 prefix = "vect";
3911 break;
3912 case vect_scalar_var:
3913 prefix = "stmp";
3914 break;
3915 case vect_pointer_var:
3916 prefix = "vectp";
3917 break;
3918 default:
3919 gcc_unreachable ();
3920 }
3921
3922 if (name)
3923 {
3924 char* tmp = concat (prefix, "_", name, NULL);
3925 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3926 free (tmp);
3927 }
3928 else
3929 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3930
3931 return new_vect_var;
3932 }
3933
3934 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3935
3936 static void
3937 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3938 stmt_vec_info stmt_info)
3939 {
3940 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3941 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3942 int misalign = DR_MISALIGNMENT (dr);
3943 if (misalign == -1)
3944 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3945 else
3946 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3947 }
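
/* Illustrative sketch, not part of GCC: what the (align, misalign) pair
   recorded above means for a run-time pointer value.  A DR_MISALIGNMENT of
   -1 means "unknown"; otherwise the pointer satisfies
   (uintptr_t) ptr % align == misalign.  The helper below is a hypothetical
   standalone check and is kept out of the build with #if 0.  */
#if 0
#include <stdint.h>
#include <assert.h>

static void
check_recorded_misalignment (const void *ptr, unsigned int align, int misalign)
{
  if (misalign == -1)
    return;                 /* alignment unknown: nothing is guaranteed.  */
  assert ((uintptr_t) ptr % align == (uintptr_t) misalign);
}
#endif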
3948
3949 /* Function vect_create_addr_base_for_vector_ref.
3950
3951 Create an expression that computes the address of the first memory location
3952 that will be accessed for a data reference.
3953
3954 Input:
3955 STMT: The statement containing the data reference.
3956 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3957 OFFSET: Optional. If supplied, it is added to the initial address.
3958 LOOP: Specify relative to which loop-nest the address should be computed.
3959 For example, when the dataref is in an inner-loop nested in an
3960 outer-loop that is now being vectorized, LOOP can be either the
3961 outer-loop, or the inner-loop. The first memory location accessed
3962 by the following dataref ('in' points to short):
3963
3964 for (i=0; i<N; i++)
3965 for (j=0; j<M; j++)
3966 s += in[i+j]
3967
3968 is as follows:
3969 if LOOP=i_loop: &in (relative to i_loop)
3970 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3971 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3972 initial address. Unlike OFFSET, which is number of elements to
3973 be added, BYTE_OFFSET is measured in bytes.
3974
3975 Output:
3976 1. Return an SSA_NAME whose value is the address of the memory location of
3977 the first vector of the data reference.
3978 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3979 these statement(s) which define the returned SSA_NAME.
3980
3981 FORNOW: We are only handling array accesses with step 1. */
3982
3983 tree
3984 vect_create_addr_base_for_vector_ref (gimple *stmt,
3985 gimple_seq *new_stmt_list,
3986 tree offset,
3987 struct loop *loop,
3988 tree byte_offset)
3989 {
3990 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3991 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3992 tree data_ref_base;
3993 const char *base_name;
3994 tree addr_base;
3995 tree dest;
3996 gimple_seq seq = NULL;
3997 tree base_offset;
3998 tree init;
3999 tree vect_ptr_type;
4000 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4001 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4002
4003 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4004 {
4005 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4006
4007 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4008
4009 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4010 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4011 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4012 }
4013 else
4014 {
4015 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4016 base_offset = unshare_expr (DR_OFFSET (dr));
4017 init = unshare_expr (DR_INIT (dr));
4018 }
4019
4020 if (loop_vinfo)
4021 base_name = get_name (data_ref_base);
4022 else
4023 {
4024 base_offset = ssize_int (0);
4025 init = ssize_int (0);
4026 base_name = get_name (DR_REF (dr));
4027 }
4028
4029 /* Create base_offset */
4030 base_offset = size_binop (PLUS_EXPR,
4031 fold_convert (sizetype, base_offset),
4032 fold_convert (sizetype, init));
4033
4034 if (offset)
4035 {
4036 offset = fold_build2 (MULT_EXPR, sizetype,
4037 fold_convert (sizetype, offset), step);
4038 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4039 base_offset, offset);
4040 }
4041 if (byte_offset)
4042 {
4043 byte_offset = fold_convert (sizetype, byte_offset);
4044 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4045 base_offset, byte_offset);
4046 }
4047
4048 /* base + base_offset */
4049 if (loop_vinfo)
4050 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4051 else
4052 {
4053 addr_base = build1 (ADDR_EXPR,
4054 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4055 unshare_expr (DR_REF (dr)));
4056 }
4057
4058 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4059 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4060 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4061 gimple_seq_add_seq (new_stmt_list, seq);
4062
4063 if (DR_PTR_INFO (dr)
4064 && TREE_CODE (addr_base) == SSA_NAME
4065 && !SSA_NAME_PTR_INFO (addr_base))
4066 {
4067 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4068 if (offset || byte_offset)
4069 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4070 }
4071
4072 if (dump_enabled_p ())
4073 {
4074 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4075 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4076 dump_printf (MSG_NOTE, "\n");
4077 }
4078
4079 return addr_base;
4080 }
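
/* Illustrative sketch, not part of GCC: the address arithmetic performed by
   vect_create_addr_base_for_vector_ref, written as plain pointer math.  The
   parameter names are hypothetical stand-ins: 'base', 'offset' and 'init'
   for DR_BASE_ADDRESS, DR_OFFSET and DR_INIT, 'elt_offset' for the optional
   OFFSET (counted in elements of size 'elt_size') and 'byte_offset' for the
   optional BYTE_OFFSET.  Kept out of the build with #if 0.  */
#if 0
#include <stddef.h>

static char *
example_addr_base (char *base, ptrdiff_t offset, ptrdiff_t init,
                   ptrdiff_t elt_offset, size_t elt_size,
                   ptrdiff_t byte_offset)
{
  ptrdiff_t base_offset = offset + init;             /* base_offset        */
  base_offset += elt_offset * (ptrdiff_t) elt_size;  /* OFFSET * step      */
  base_offset += byte_offset;                        /* BYTE_OFFSET        */
  return base + base_offset;                         /* base + base_offset */
}
#endif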
4081
4082
4083 /* Function vect_create_data_ref_ptr.
4084
4085 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4086 location accessed in the loop by STMT, along with the def-use update
4087 chain to appropriately advance the pointer through the loop iterations.
4088 Also set aliasing information for the pointer. This pointer is used by
4089 the callers to this function to create a memory reference expression for
4090 vector load/store access.
4091
4092 Input:
4093 1. STMT: a stmt that references memory. Expected to be of the form
4094 GIMPLE_ASSIGN <name, data-ref> or
4095 GIMPLE_ASSIGN <data-ref, name>.
4096 2. AGGR_TYPE: the type of the reference, which should be either a vector
4097 or an array.
4098 3. AT_LOOP: the loop where the vector memref is to be created.
4099 4. OFFSET (optional): an offset to be added to the initial address accessed
4100 by the data-ref in STMT.
4101 5. BSI: location where the new stmts are to be placed if there is no loop.
4102 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4103 pointing to the initial address.
4104 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4105 to the initial address accessed by the data-ref in STMT. This is
4106 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4107 in bytes.
4108
4109 Output:
4110 1. Declare a new ptr to vector_type, and have it point to the base of the
4111 data reference (initial address accessed by the data reference).
4112 For example, for vector of type V8HI, the following code is generated:
4113
4114 v8hi *ap;
4115 ap = (v8hi *)initial_address;
4116
4117 if OFFSET is not supplied:
4118 initial_address = &a[init];
4119 if OFFSET is supplied:
4120 initial_address = &a[init + OFFSET];
4121 if BYTE_OFFSET is supplied:
4122 initial_address = &a[init] + BYTE_OFFSET;
4123
4124 Return the initial_address in INITIAL_ADDRESS.
4125
4126 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4127 update the pointer in each iteration of the loop.
4128
4129 Return the increment stmt that updates the pointer in PTR_INCR.
4130
4131 3. Set INV_P to true if the access pattern of the data reference in the
4132 vectorized loop is invariant. Set it to false otherwise.
4133
4134 4. Return the pointer. */
4135
4136 tree
4137 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4138 tree offset, tree *initial_address,
4139 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4140 bool only_init, bool *inv_p, tree byte_offset)
4141 {
4142 const char *base_name;
4143 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4144 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4145 struct loop *loop = NULL;
4146 bool nested_in_vect_loop = false;
4147 struct loop *containing_loop = NULL;
4148 tree aggr_ptr_type;
4149 tree aggr_ptr;
4150 tree new_temp;
4151 gimple_seq new_stmt_list = NULL;
4152 edge pe = NULL;
4153 basic_block new_bb;
4154 tree aggr_ptr_init;
4155 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4156 tree aptr;
4157 gimple_stmt_iterator incr_gsi;
4158 bool insert_after;
4159 tree indx_before_incr, indx_after_incr;
4160 gimple *incr;
4161 tree step;
4162 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4163
4164 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4165 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4166
4167 if (loop_vinfo)
4168 {
4169 loop = LOOP_VINFO_LOOP (loop_vinfo);
4170 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4171 containing_loop = (gimple_bb (stmt))->loop_father;
4172 pe = loop_preheader_edge (loop);
4173 }
4174 else
4175 {
4176 gcc_assert (bb_vinfo);
4177 only_init = true;
4178 *ptr_incr = NULL;
4179 }
4180
4181 /* Check the step (evolution) of the load in LOOP, and record
4182 whether it's invariant. */
4183 if (nested_in_vect_loop)
4184 step = STMT_VINFO_DR_STEP (stmt_info);
4185 else
4186 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4187
4188 if (integer_zerop (step))
4189 *inv_p = true;
4190 else
4191 *inv_p = false;
4192
4193 /* Create an expression for the first address accessed by this load
4194 in LOOP. */
4195 base_name = get_name (DR_BASE_ADDRESS (dr));
4196
4197 if (dump_enabled_p ())
4198 {
4199 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4200 dump_printf_loc (MSG_NOTE, vect_location,
4201 "create %s-pointer variable to type: ",
4202 get_tree_code_name (TREE_CODE (aggr_type)));
4203 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4204 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4205 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4206 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4207 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4208 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4209 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4210 else
4211 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4212 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4213 dump_printf (MSG_NOTE, "\n");
4214 }
4215
4216 /* (1) Create the new aggregate-pointer variable.
4217 Vector and array types inherit the alias set of their component
4218 type by default so we need to use a ref-all pointer if the data
4219 reference does not conflict with the created aggregated data
4220 reference because it is not addressable. */
4221 bool need_ref_all = false;
4222 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4223 get_alias_set (DR_REF (dr))))
4224 need_ref_all = true;
4225 /* Likewise for any of the data references in the stmt group. */
4226 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4227 {
4228 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4229 do
4230 {
4231 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4232 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4233 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4234 get_alias_set (DR_REF (sdr))))
4235 {
4236 need_ref_all = true;
4237 break;
4238 }
4239 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4240 }
4241 while (orig_stmt);
4242 }
4243 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4244 need_ref_all);
4245 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4246
4247
4248 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4249 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4250 def-use update cycles for the pointer: one relative to the outer-loop
4251 (LOOP), which is what steps (2) and (3) below do. The other is relative
4252 to the inner-loop (which is the inner-most loop containing the dataref),
4253 and this is done by step (4) below.
4254
4255 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4256 inner-most loop, and so steps (2),(3) work the same, and step (4) is
4257 redundant. Steps (2),(3) create the following:
4258
4259 vp0 = &base_addr;
4260 LOOP: vp1 = phi(vp0,vp2)
4261 ...
4262 ...
4263 vp2 = vp1 + step
4264 goto LOOP
4265
4266 If there is an inner-loop nested in LOOP, then step (4) will also be
4267 applied, and an additional update in the inner-loop will be created:
4268
4269 vp0 = &base_addr;
4270 LOOP: vp1 = phi(vp0,vp2)
4271 ...
4272 inner: vp3 = phi(vp1,vp4)
4273 vp4 = vp3 + inner_step
4274 if () goto inner
4275 ...
4276 vp2 = vp1 + step
4277 if () goto LOOP */
4278
4279 /* (2) Calculate the initial address of the aggregate-pointer, and set
4280 the aggregate-pointer to point to it before the loop. */
4281
4282 /* Create: &(base[init_val+offset]+byte_offset) in the loop preheader. */
4283
4284 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4285 offset, loop, byte_offset);
4286 if (new_stmt_list)
4287 {
4288 if (pe)
4289 {
4290 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4291 gcc_assert (!new_bb);
4292 }
4293 else
4294 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4295 }
4296
4297 *initial_address = new_temp;
4298 aggr_ptr_init = new_temp;
4299
4300 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4301 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4302 inner-loop nested in LOOP (during outer-loop vectorization). */
4303
4304 /* No update in loop is required. */
4305 if (only_init && (!loop_vinfo || at_loop == loop))
4306 aptr = aggr_ptr_init;
4307 else
4308 {
4309 /* The step of the aggregate pointer is the type size. */
4310 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4311 /* One exception to the above is when the scalar step of the load in
4312 LOOP is zero. In this case the step here is also zero. */
4313 if (*inv_p)
4314 iv_step = size_zero_node;
4315 else if (tree_int_cst_sgn (step) == -1)
4316 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4317
4318 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4319
4320 create_iv (aggr_ptr_init,
4321 fold_convert (aggr_ptr_type, iv_step),
4322 aggr_ptr, loop, &incr_gsi, insert_after,
4323 &indx_before_incr, &indx_after_incr);
4324 incr = gsi_stmt (incr_gsi);
4325 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4326
4327 /* Copy the points-to information if it exists. */
4328 if (DR_PTR_INFO (dr))
4329 {
4330 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4331 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4332 }
4333 if (ptr_incr)
4334 *ptr_incr = incr;
4335
4336 aptr = indx_before_incr;
4337 }
4338
4339 if (!nested_in_vect_loop || only_init)
4340 return aptr;
4341
4342
4343 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4344 nested in LOOP, if it exists. */
4345
4346 gcc_assert (nested_in_vect_loop);
4347 if (!only_init)
4348 {
4349 standard_iv_increment_position (containing_loop, &incr_gsi,
4350 &insert_after);
4351 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4352 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4353 &indx_after_incr);
4354 incr = gsi_stmt (incr_gsi);
4355 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4356
4357 /* Copy the points-to information if it exists. */
4358 if (DR_PTR_INFO (dr))
4359 {
4360 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4361 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4362 }
4363 if (ptr_incr)
4364 *ptr_incr = incr;
4365
4366 return indx_before_incr;
4367 }
4368 else
4369 gcc_unreachable ();
4370 }
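
/* Illustrative sketch, not part of GCC: the def-use cycle created by
   vect_create_data_ref_ptr, expressed as the plain C a vectorized loop
   conceptually executes.  The names are hypothetical; 'vecsize' plays the
   role of TYPE_SIZE_UNIT (AGGR_TYPE), i.e. the IV step.  Kept out of the
   build with #if 0.  */
#if 0
#include <stddef.h>
#include <string.h>

static void
example_pointer_iv (char *dst, const char *src, size_t nbytes, size_t vecsize)
{
  const char *vp = src;                  /* vp0 = &base_addr          */
  for (size_t done = 0; done + vecsize <= nbytes; done += vecsize)
    {
      memcpy (dst + done, vp, vecsize);  /* one vector-sized access   */
      vp += vecsize;                     /* vp2 = vp1 + step          */
    }
}
#endif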
4371
4372
4373 /* Function bump_vector_ptr
4374
4375 Increment a pointer (to a vector type) by vector-size. If requested,
4376 i.e. if PTR-INCR is given, then also connect the new increment stmt
4377 to the existing def-use update-chain of the pointer, by modifying
4378 the PTR_INCR as illustrated below:
4379
4380 The pointer def-use update-chain before this function:
4381 DATAREF_PTR = phi (p_0, p_2)
4382 ....
4383 PTR_INCR: p_2 = DATAREF_PTR + step
4384
4385 The pointer def-use update-chain after this function:
4386 DATAREF_PTR = phi (p_0, p_2)
4387 ....
4388 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4389 ....
4390 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4391
4392 Input:
4393 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4394 in the loop.
4395 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4396 the loop. The increment amount across iterations is expected
4397 to be vector_size.
4398 BSI - location where the new update stmt is to be placed.
4399 STMT - the original scalar memory-access stmt that is being vectorized.
4400 BUMP - optional. The offset by which to bump the pointer. If not given,
4401 the offset is assumed to be vector_size.
4402
4403 Output: Return NEW_DATAREF_PTR as illustrated above.
4404
4405 */
4406
4407 tree
4408 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4409 gimple *stmt, tree bump)
4410 {
4411 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4412 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4413 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4414 tree update = TYPE_SIZE_UNIT (vectype);
4415 gassign *incr_stmt;
4416 ssa_op_iter iter;
4417 use_operand_p use_p;
4418 tree new_dataref_ptr;
4419
4420 if (bump)
4421 update = bump;
4422
4423 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4424 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4425 else
4426 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4427 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4428 dataref_ptr, update);
4429 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4430
4431 /* Copy the points-to information if it exists. */
4432 if (DR_PTR_INFO (dr))
4433 {
4434 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4435 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4436 }
4437
4438 if (!ptr_incr)
4439 return new_dataref_ptr;
4440
4441 /* Update the vector-pointer's cross-iteration increment. */
4442 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4443 {
4444 tree use = USE_FROM_PTR (use_p);
4445
4446 if (use == dataref_ptr)
4447 SET_USE (use_p, new_dataref_ptr);
4448 else
4449 gcc_assert (tree_int_cst_compare (use, update) == 0);
4450 }
4451
4452 return new_dataref_ptr;
4453 }
4454
4455
4456 /* Function vect_create_destination_var.
4457
4458 Create a new temporary of type VECTYPE. */
4459
4460 tree
4461 vect_create_destination_var (tree scalar_dest, tree vectype)
4462 {
4463 tree vec_dest;
4464 const char *name;
4465 char *new_name;
4466 tree type;
4467 enum vect_var_kind kind;
4468
4469 kind = vectype ? vect_simple_var : vect_scalar_var;
4470 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4471
4472 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4473
4474 name = get_name (scalar_dest);
4475 if (name)
4476 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4477 else
4478 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4479 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4480 free (new_name);
4481
4482 return vec_dest;
4483 }
4484
4485 /* Function vect_grouped_store_supported.
4486
4487 Returns TRUE if interleave high and interleave low permutations
4488 are supported, and FALSE otherwise. */
4489
4490 bool
4491 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4492 {
4493 machine_mode mode = TYPE_MODE (vectype);
4494
4495 /* vect_permute_store_chain requires the group size to be equal to 3 or
4496 be a power of two. */
4497 if (count != 3 && exact_log2 (count) == -1)
4498 {
4499 if (dump_enabled_p ())
4500 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4501 "the size of the group of accesses"
4502 " is not a power of 2 or not eqaul to 3\n");
4503 return false;
4504 }
4505
4506 /* Check that the permutation is supported. */
4507 if (VECTOR_MODE_P (mode))
4508 {
4509 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4510 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4511
4512 if (count == 3)
4513 {
4514 unsigned int j0 = 0, j1 = 0, j2 = 0;
4515 unsigned int i, j;
4516
4517 for (j = 0; j < 3; j++)
4518 {
4519 int nelt0 = ((3 - j) * nelt) % 3;
4520 int nelt1 = ((3 - j) * nelt + 1) % 3;
4521 int nelt2 = ((3 - j) * nelt + 2) % 3;
4522 for (i = 0; i < nelt; i++)
4523 {
4524 if (3 * i + nelt0 < nelt)
4525 sel[3 * i + nelt0] = j0++;
4526 if (3 * i + nelt1 < nelt)
4527 sel[3 * i + nelt1] = nelt + j1++;
4528 if (3 * i + nelt2 < nelt)
4529 sel[3 * i + nelt2] = 0;
4530 }
4531 if (!can_vec_perm_p (mode, false, sel))
4532 {
4533 if (dump_enabled_p ())
4534 dump_printf (MSG_MISSED_OPTIMIZATION,
4535 "permutaion op not supported by target.\n");
4536 return false;
4537 }
4538
4539 for (i = 0; i < nelt; i++)
4540 {
4541 if (3 * i + nelt0 < nelt)
4542 sel[3 * i + nelt0] = 3 * i + nelt0;
4543 if (3 * i + nelt1 < nelt)
4544 sel[3 * i + nelt1] = 3 * i + nelt1;
4545 if (3 * i + nelt2 < nelt)
4546 sel[3 * i + nelt2] = nelt + j2++;
4547 }
4548 if (!can_vec_perm_p (mode, false, sel))
4549 {
4550 if (dump_enabled_p ())
4551 dump_printf (MSG_MISSED_OPTIMIZATION,
4552 "permutaion op not supported by target.\n");
4553 return false;
4554 }
4555 }
4556 return true;
4557 }
4558 else
4559 {
4560 /* If length is not equal to 3 then only power of 2 is supported. */
4561 gcc_assert (exact_log2 (count) != -1);
4562
4563 for (i = 0; i < nelt / 2; i++)
4564 {
4565 sel[i * 2] = i;
4566 sel[i * 2 + 1] = i + nelt;
4567 }
4568 if (can_vec_perm_p (mode, false, sel))
4569 {
4570 for (i = 0; i < nelt; i++)
4571 sel[i] += nelt / 2;
4572 if (can_vec_perm_p (mode, false, sel))
4573 return true;
4574 }
4575 }
4576 }
4577
4578 if (dump_enabled_p ())
4579 dump_printf (MSG_MISSED_OPTIMIZATION,
4580 "permutaion op not supported by target.\n");
4581 return false;
4582 }
4583
4584
4585 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4586 type VECTYPE. */
4587
4588 bool
4589 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4590 {
4591 return vect_lanes_optab_supported_p ("vec_store_lanes",
4592 vec_store_lanes_optab,
4593 vectype, count);
4594 }
4595
4596
4597 /* Function vect_permute_store_chain.
4598
4599 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4600 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4601 the data correctly for the stores. Return the final references for stores
4602 in RESULT_CHAIN.
4603
4604 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4605 The input is 4 vectors each containing 8 elements. We assign a number to
4606 each element, the input sequence is:
4607
4608 1st vec: 0 1 2 3 4 5 6 7
4609 2nd vec: 8 9 10 11 12 13 14 15
4610 3rd vec: 16 17 18 19 20 21 22 23
4611 4th vec: 24 25 26 27 28 29 30 31
4612
4613 The output sequence should be:
4614
4615 1st vec: 0 8 16 24 1 9 17 25
4616 2nd vec: 2 10 18 26 3 11 19 27
4617 3rd vec: 4 12 20 28 5 13 21 29
4618 4th vec: 6 14 22 30 7 15 23 31
4619
4620 i.e., we interleave the contents of the four vectors in their order.
4621
4622 We use interleave_high/low instructions to create such output. The input of
4623 each interleave_high/low operation is two vectors:
4624 1st vec 2nd vec
4625 0 1 2 3 4 5 6 7
4626 the even elements of the result vector are obtained left-to-right from the
4627 high/low elements of the first vector. The odd elements of the result are
4628 obtained left-to-right from the high/low elements of the second vector.
4629 The output of interleave_high will be: 0 4 1 5
4630 and of interleave_low: 2 6 3 7
4631
4632
4633 The permutation is done in log LENGTH stages. In each stage interleave_high
4634 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4635 where the first argument is taken from the first half of DR_CHAIN and the
4636 second argument from its second half.
4637 In our example,
4638
4639 I1: interleave_high (1st vec, 3rd vec)
4640 I2: interleave_low (1st vec, 3rd vec)
4641 I3: interleave_high (2nd vec, 4th vec)
4642 I4: interleave_low (2nd vec, 4th vec)
4643
4644 The output for the first stage is:
4645
4646 I1: 0 16 1 17 2 18 3 19
4647 I2: 4 20 5 21 6 22 7 23
4648 I3: 8 24 9 25 10 26 11 27
4649 I4: 12 28 13 29 14 30 15 31
4650
4651 The output of the second stage, i.e. the final result is:
4652
4653 I1: 0 8 16 24 1 9 17 25
4654 I2: 2 10 18 26 3 11 19 27
4655 I3: 4 12 20 28 5 13 21 29
4656 I4: 6 14 22 30 7 15 23 31. */
4657
4658 void
4659 vect_permute_store_chain (vec<tree> dr_chain,
4660 unsigned int length,
4661 gimple *stmt,
4662 gimple_stmt_iterator *gsi,
4663 vec<tree> *result_chain)
4664 {
4665 tree vect1, vect2, high, low;
4666 gimple *perm_stmt;
4667 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4668 tree perm_mask_low, perm_mask_high;
4669 tree data_ref;
4670 tree perm3_mask_low, perm3_mask_high;
4671 unsigned int i, n, log_length = exact_log2 (length);
4672 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4673 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4674
4675 result_chain->quick_grow (length);
4676 memcpy (result_chain->address (), dr_chain.address (),
4677 length * sizeof (tree));
4678
4679 if (length == 3)
4680 {
4681 unsigned int j0 = 0, j1 = 0, j2 = 0;
4682
4683 for (j = 0; j < 3; j++)
4684 {
4685 int nelt0 = ((3 - j) * nelt) % 3;
4686 int nelt1 = ((3 - j) * nelt + 1) % 3;
4687 int nelt2 = ((3 - j) * nelt + 2) % 3;
4688
4689 for (i = 0; i < nelt; i++)
4690 {
4691 if (3 * i + nelt0 < nelt)
4692 sel[3 * i + nelt0] = j0++;
4693 if (3 * i + nelt1 < nelt)
4694 sel[3 * i + nelt1] = nelt + j1++;
4695 if (3 * i + nelt2 < nelt)
4696 sel[3 * i + nelt2] = 0;
4697 }
4698 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4699
4700 for (i = 0; i < nelt; i++)
4701 {
4702 if (3 * i + nelt0 < nelt)
4703 sel[3 * i + nelt0] = 3 * i + nelt0;
4704 if (3 * i + nelt1 < nelt)
4705 sel[3 * i + nelt1] = 3 * i + nelt1;
4706 if (3 * i + nelt2 < nelt)
4707 sel[3 * i + nelt2] = nelt + j2++;
4708 }
4709 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4710
4711 vect1 = dr_chain[0];
4712 vect2 = dr_chain[1];
4713
4714 /* Create interleaving stmt:
4715 low = VEC_PERM_EXPR <vect1, vect2,
4716 {j, nelt, *, j + 1, nelt + j + 1, *,
4717 j + 2, nelt + j + 2, *, ...}> */
4718 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4719 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4720 vect2, perm3_mask_low);
4721 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4722
4723 vect1 = data_ref;
4724 vect2 = dr_chain[2];
4725 /* Create interleaving stmt:
4726 low = VEC_PERM_EXPR <vect1, vect2,
4727 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4728 6, 7, nelt + j + 2, ...}> */
4729 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4730 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4731 vect2, perm3_mask_high);
4732 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4733 (*result_chain)[j] = data_ref;
4734 }
4735 }
4736 else
4737 {
4738 /* If length is not equal to 3 then only power of 2 is supported. */
4739 gcc_assert (exact_log2 (length) != -1);
4740
4741 for (i = 0, n = nelt / 2; i < n; i++)
4742 {
4743 sel[i * 2] = i;
4744 sel[i * 2 + 1] = i + nelt;
4745 }
4746 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4747
4748 for (i = 0; i < nelt; i++)
4749 sel[i] += nelt / 2;
4750 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4751
4752 for (i = 0, n = log_length; i < n; i++)
4753 {
4754 for (j = 0; j < length/2; j++)
4755 {
4756 vect1 = dr_chain[j];
4757 vect2 = dr_chain[j+length/2];
4758
4759 /* Create interleaving stmt:
4760 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4761 ...}> */
4762 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4763 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4764 vect2, perm_mask_high);
4765 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4766 (*result_chain)[2*j] = high;
4767
4768 /* Create interleaving stmt:
4769 low = VEC_PERM_EXPR <vect1, vect2,
4770 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4771 ...}> */
4772 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4773 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4774 vect2, perm_mask_low);
4775 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4776 (*result_chain)[2*j+1] = low;
4777 }
4778 memcpy (dr_chain.address (), result_chain->address (),
4779 length * sizeof (tree));
4780 }
4781 }
4782 }
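
/* Illustrative sketch, not part of GCC: the interleave masks built above for
   the power-of-two case, printed for a hypothetical 8-element vector.  It
   reproduces the two selector loops and prints {0,8,1,9,2,10,3,11} for
   perm_mask_high and {4,12,5,13,6,14,7,15} for perm_mask_low.  Kept out of
   the build with #if 0.  */
#if 0
#include <stdio.h>

int
main (void)
{
  enum { nelt = 8 };
  unsigned char sel[nelt];
  unsigned int i;

  for (i = 0; i < nelt / 2; i++)
    {
      sel[i * 2] = i;                /* element i of the first vector   */
      sel[i * 2 + 1] = i + nelt;     /* element i of the second vector  */
    }
  printf ("perm_mask_high:");
  for (i = 0; i < nelt; i++)
    printf (" %d", sel[i]);

  for (i = 0; i < nelt; i++)
    sel[i] += nelt / 2;              /* switch to the low halves        */
  printf ("\nperm_mask_low: ");
  for (i = 0; i < nelt; i++)
    printf (" %d", sel[i]);
  printf ("\n");
  return 0;
}
#endif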
4783
4784 /* Function vect_setup_realignment
4785
4786 This function is called when vectorizing an unaligned load using
4787 the dr_explicit_realign[_optimized] scheme.
4788 This function generates the following code at the loop prolog:
4789
4790 p = initial_addr;
4791 x msq_init = *(floor(p)); # prolog load
4792 realignment_token = call target_builtin;
4793 loop:
4794 x msq = phi (msq_init, ---)
4795
4796 The stmts marked with x are generated only for the case of
4797 dr_explicit_realign_optimized.
4798
4799 The code above sets up a new (vector) pointer, pointing to the first
4800 location accessed by STMT, and a "floor-aligned" load using that pointer.
4801 It also generates code to compute the "realignment-token" (if the relevant
4802 target hook was defined), and creates a phi-node at the loop-header bb
4803 whose arguments are the result of the prolog-load (created by this
4804 function) and the result of a load that takes place in the loop (to be
4805 created by the caller to this function).
4806
4807 For the case of dr_explicit_realign_optimized:
4808 The caller to this function uses the phi-result (msq) to create the
4809 realignment code inside the loop, and sets up the missing phi argument,
4810 as follows:
4811 loop:
4812 msq = phi (msq_init, lsq)
4813 lsq = *(floor(p')); # load in loop
4814 result = realign_load (msq, lsq, realignment_token);
4815
4816 For the case of dr_explicit_realign:
4817 loop:
4818 msq = *(floor(p)); # load in loop
4819 p' = p + (VS-1);
4820 lsq = *(floor(p')); # load in loop
4821 result = realign_load (msq, lsq, realignment_token);
4822
4823 Input:
4824 STMT - (scalar) load stmt to be vectorized. This load accesses
4825 a memory location that may be unaligned.
4826 BSI - place where new code is to be inserted.
4827 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4828 is used.
4829
4830 Output:
4831 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4832 target hook, if defined.
4833 Return value - the result of the loop-header phi node. */
4834
4835 tree
4836 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4837 tree *realignment_token,
4838 enum dr_alignment_support alignment_support_scheme,
4839 tree init_addr,
4840 struct loop **at_loop)
4841 {
4842 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4843 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4844 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4845 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4846 struct loop *loop = NULL;
4847 edge pe = NULL;
4848 tree scalar_dest = gimple_assign_lhs (stmt);
4849 tree vec_dest;
4850 gimple *inc;
4851 tree ptr;
4852 tree data_ref;
4853 basic_block new_bb;
4854 tree msq_init = NULL_TREE;
4855 tree new_temp;
4856 gphi *phi_stmt;
4857 tree msq = NULL_TREE;
4858 gimple_seq stmts = NULL;
4859 bool inv_p;
4860 bool compute_in_loop = false;
4861 bool nested_in_vect_loop = false;
4862 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4863 struct loop *loop_for_initial_load = NULL;
4864
4865 if (loop_vinfo)
4866 {
4867 loop = LOOP_VINFO_LOOP (loop_vinfo);
4868 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4869 }
4870
4871 gcc_assert (alignment_support_scheme == dr_explicit_realign
4872 || alignment_support_scheme == dr_explicit_realign_optimized);
4873
4874 /* We need to generate three things:
4875 1. the misalignment computation
4876 2. the extra vector load (for the optimized realignment scheme).
4877 3. the phi node for the two vectors from which the realignment is
4878 done (for the optimized realignment scheme). */
4879
4880 /* 1. Determine where to generate the misalignment computation.
4881
4882 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4883 calculation will be generated by this function, outside the loop (in the
4884 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4885 caller, inside the loop.
4886
4887 Background: If the misalignment remains fixed throughout the iterations of
4888 the loop, then both realignment schemes are applicable, and also the
4889 misalignment computation can be done outside LOOP. This is because we are
4890 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4891 are a multiple of VS (the Vector Size), and therefore the misalignment in
4892 different vectorized LOOP iterations is always the same.
4893 The problem arises only if the memory access is in an inner-loop nested
4894 inside LOOP, which is now being vectorized using outer-loop vectorization.
4895 This is the only case when the misalignment of the memory access may not
4896 remain fixed throughout the iterations of the inner-loop (as explained in
4897 detail in vect_supportable_dr_alignment). In this case, not only is the
4898 optimized realignment scheme not applicable, but also the misalignment
4899 computation (and generation of the realignment token that is passed to
4900 REALIGN_LOAD) have to be done inside the loop.
4901
4902 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4903 or not, which in turn determines if the misalignment is computed inside
4904 the inner-loop, or outside LOOP. */
4905
4906 if (init_addr != NULL_TREE || !loop_vinfo)
4907 {
4908 compute_in_loop = true;
4909 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4910 }
4911
4912
4913 /* 2. Determine where to generate the extra vector load.
4914
4915 For the optimized realignment scheme, instead of generating two vector
4916 loads in each iteration, we generate a single extra vector load in the
4917 preheader of the loop, and in each iteration reuse the result of the
4918 vector load from the previous iteration. In case the memory access is in
4919 an inner-loop nested inside LOOP, which is now being vectorized using
4920 outer-loop vectorization, we need to determine whether this initial vector
4921 load should be generated at the preheader of the inner-loop, or can be
4922 generated at the preheader of LOOP. If the memory access has no evolution
4923 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4924 to be generated inside LOOP (in the preheader of the inner-loop). */
4925
4926 if (nested_in_vect_loop)
4927 {
4928 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4929 bool invariant_in_outerloop =
4930 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4931 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4932 }
4933 else
4934 loop_for_initial_load = loop;
4935 if (at_loop)
4936 *at_loop = loop_for_initial_load;
4937
4938 if (loop_for_initial_load)
4939 pe = loop_preheader_edge (loop_for_initial_load);
4940
4941 /* 3. For the case of the optimized realignment, create the first vector
4942 load at the loop preheader. */
4943
4944 if (alignment_support_scheme == dr_explicit_realign_optimized)
4945 {
4946 /* Create msq_init = *(floor(p1)) in the loop preheader */
4947 gassign *new_stmt;
4948
4949 gcc_assert (!compute_in_loop);
4950 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4951 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4952 NULL_TREE, &init_addr, NULL, &inc,
4953 true, &inv_p);
4954 if (TREE_CODE (ptr) == SSA_NAME)
4955 new_temp = copy_ssa_name (ptr);
4956 else
4957 new_temp = make_ssa_name (TREE_TYPE (ptr));
4958 new_stmt = gimple_build_assign
4959 (new_temp, BIT_AND_EXPR, ptr,
4960 build_int_cst (TREE_TYPE (ptr),
4961 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4962 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4963 gcc_assert (!new_bb);
4964 data_ref
4965 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4966 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4967 new_stmt = gimple_build_assign (vec_dest, data_ref);
4968 new_temp = make_ssa_name (vec_dest, new_stmt);
4969 gimple_assign_set_lhs (new_stmt, new_temp);
4970 if (pe)
4971 {
4972 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4973 gcc_assert (!new_bb);
4974 }
4975 else
4976 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4977
4978 msq_init = gimple_assign_lhs (new_stmt);
4979 }
4980
4981 /* 4. Create realignment token using a target builtin, if available.
4982 It is done either inside the containing loop, or before LOOP (as
4983 determined above). */
4984
4985 if (targetm.vectorize.builtin_mask_for_load)
4986 {
4987 gcall *new_stmt;
4988 tree builtin_decl;
4989
4990 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4991 if (!init_addr)
4992 {
4993 /* Generate the INIT_ADDR computation outside LOOP. */
4994 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4995 NULL_TREE, loop);
4996 if (loop)
4997 {
4998 pe = loop_preheader_edge (loop);
4999 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5000 gcc_assert (!new_bb);
5001 }
5002 else
5003 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5004 }
5005
5006 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5007 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5008 vec_dest =
5009 vect_create_destination_var (scalar_dest,
5010 gimple_call_return_type (new_stmt));
5011 new_temp = make_ssa_name (vec_dest, new_stmt);
5012 gimple_call_set_lhs (new_stmt, new_temp);
5013
5014 if (compute_in_loop)
5015 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5016 else
5017 {
5018 /* Generate the misalignment computation outside LOOP. */
5019 pe = loop_preheader_edge (loop);
5020 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5021 gcc_assert (!new_bb);
5022 }
5023
5024 *realignment_token = gimple_call_lhs (new_stmt);
5025
5026 /* The result of the CALL_EXPR to this builtin is determined from
5027 the value of the parameter and no global variables are touched
5028 which makes the builtin a "const" function. Requiring the
5029 builtin to have the "const" attribute makes it unnecessary
5030 to call mark_call_clobbered. */
5031 gcc_assert (TREE_READONLY (builtin_decl));
5032 }
5033
5034 if (alignment_support_scheme == dr_explicit_realign)
5035 return msq;
5036
5037 gcc_assert (!compute_in_loop);
5038 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5039
5040
5041 /* 5. Create msq = phi <msq_init, lsq> in loop */
5042
5043 pe = loop_preheader_edge (containing_loop);
5044 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5045 msq = make_ssa_name (vec_dest);
5046 phi_stmt = create_phi_node (msq, containing_loop->header);
5047 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5048
5049 return msq;
5050 }
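
/* Illustrative sketch, not part of GCC: the explicit realignment idea above
   written in plain C for a hypothetical 16-byte vector.  Two aligned loads
   bracket the misaligned address and the wanted bytes are pieced together
   from them, which is what REALIGN_LOAD does using the realignment token.
   It assumes the buffer extends through the second aligned chunk.  Kept out
   of the build with #if 0.  */
#if 0
#include <stdint.h>
#include <string.h>

#define VS 16  /* hypothetical vector size in bytes */

static void
example_realign_load (unsigned char *dst, const unsigned char *p)
{
  uintptr_t addr = (uintptr_t) p;
  const unsigned char *floor_p
    = (const unsigned char *) (addr & -(uintptr_t) VS);
  unsigned int shift = addr & (VS - 1);       /* misalignment in bytes    */
  unsigned char msq[VS], lsq[VS];

  memcpy (msq, floor_p, VS);                  /* msq = *(floor (p))       */
  memcpy (lsq, floor_p + VS, VS);             /* lsq = *(floor (p + VS))  */

  memcpy (dst, msq + shift, VS - shift);      /* realign_load (msq, lsq,  */
  memcpy (dst + (VS - shift), lsq, shift);    /*    realignment_token)    */
}
#endif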
5051
5052
5053 /* Function vect_grouped_load_supported.
5054
5055 Returns TRUE if even and odd permutations are supported,
5056 and FALSE otherwise. */
5057
5058 bool
5059 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5060 {
5061 machine_mode mode = TYPE_MODE (vectype);
5062
5063 /* vect_permute_load_chain requires the group size to be equal to 3 or
5064 be a power of two. */
5065 if (count != 3 && exact_log2 (count) == -1)
5066 {
5067 if (dump_enabled_p ())
5068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5069 "the size of the group of accesses"
5070 " is not a power of 2 or not equal to 3\n");
5071 return false;
5072 }
5073
5074 /* Check that the permutation is supported. */
5075 if (VECTOR_MODE_P (mode))
5076 {
5077 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5078 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5079
5080 if (count == 3)
5081 {
5082 unsigned int k;
5083 for (k = 0; k < 3; k++)
5084 {
5085 for (i = 0; i < nelt; i++)
5086 if (3 * i + k < 2 * nelt)
5087 sel[i] = 3 * i + k;
5088 else
5089 sel[i] = 0;
5090 if (!can_vec_perm_p (mode, false, sel))
5091 {
5092 if (dump_enabled_p ())
5093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5094 "shuffle of 3 loads is not supported by"
5095 " target\n");
5096 return false;
5097 }
5098 for (i = 0, j = 0; i < nelt; i++)
5099 if (3 * i + k < 2 * nelt)
5100 sel[i] = i;
5101 else
5102 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5103 if (!can_vec_perm_p (mode, false, sel))
5104 {
5105 if (dump_enabled_p ())
5106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5107 "shuffle of 3 loads is not supported by"
5108 " target\n");
5109 return false;
5110 }
5111 }
5112 return true;
5113 }
5114 else
5115 {
5116 /* If length is not equal to 3 then only power of 2 is supported. */
5117 gcc_assert (exact_log2 (count) != -1);
5118 for (i = 0; i < nelt; i++)
5119 sel[i] = i * 2;
5120 if (can_vec_perm_p (mode, false, sel))
5121 {
5122 for (i = 0; i < nelt; i++)
5123 sel[i] = i * 2 + 1;
5124 if (can_vec_perm_p (mode, false, sel))
5125 return true;
5126 }
5127 }
5128 }
5129
5130 if (dump_enabled_p ())
5131 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5132 "extract even/odd not supported by target\n");
5133 return false;
5134 }
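
/* Illustrative sketch, not part of GCC: the two selection masks the
   count == 3 test above builds for each K, printed for a hypothetical
   4-element vector.  The first mask pulls every third element (starting at
   K) out of the first two input vectors; the second keeps those elements
   and patches in the remaining ones from the third vector.  Kept out of
   the build with #if 0.  */
#if 0
#include <stdio.h>

int
main (void)
{
  enum { nelt = 4 };
  unsigned char sel[nelt];
  unsigned int i, j, k;

  for (k = 0; k < 3; k++)
    {
      for (i = 0; i < nelt; i++)
        sel[i] = (3 * i + k < 2 * nelt) ? 3 * i + k : 0;
      printf ("k=%u first mask: ", k);
      for (i = 0; i < nelt; i++)
        printf (" %d", sel[i]);

      for (i = 0, j = 0; i < nelt; i++)
        sel[i] = (3 * i + k < 2 * nelt)
                 ? i : nelt + ((nelt + k) % 3) + 3 * j++;
      printf ("\nk=%u second mask:", k);
      for (i = 0; i < nelt; i++)
        printf (" %d", sel[i]);
      printf ("\n");
    }
  return 0;
}
#endif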
5135
5136 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5137 type VECTYPE. */
5138
5139 bool
5140 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5141 {
5142 return vect_lanes_optab_supported_p ("vec_load_lanes",
5143 vec_load_lanes_optab,
5144 vectype, count);
5145 }
5146
5147 /* Function vect_permute_load_chain.
5148
5149 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5150 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5151 the input data correctly. Return the final references for loads in
5152 RESULT_CHAIN.
5153
5154 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5155 The input is 4 vectors each containing 8 elements. We assign a number to each
5156 element; the input sequence is:
5157
5158 1st vec: 0 1 2 3 4 5 6 7
5159 2nd vec: 8 9 10 11 12 13 14 15
5160 3rd vec: 16 17 18 19 20 21 22 23
5161 4th vec: 24 25 26 27 28 29 30 31
5162
5163 The output sequence should be:
5164
5165 1st vec: 0 4 8 12 16 20 24 28
5166 2nd vec: 1 5 9 13 17 21 25 29
5167 3rd vec: 2 6 10 14 18 22 26 30
5168 4th vec: 3 7 11 15 19 23 27 31
5169
5170 i.e., the first output vector should contain the first elements of each
5171 interleaving group, etc.
5172
5173 We use extract_even/odd instructions to create such output. The input of
5174 each extract_even/odd operation is two vectors
5175 1st vec 2nd vec
5176 0 1 2 3 4 5 6 7
5177
5178 and the output is the vector of extracted even/odd elements. The output of
5179 extract_even will be: 0 2 4 6
5180 and of extract_odd: 1 3 5 7
5181
5182
5183 The permutation is done in log LENGTH stages. In each stage extract_even
5184 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5185 their order. In our example,
5186
5187 E1: extract_even (1st vec, 2nd vec)
5188 E2: extract_odd (1st vec, 2nd vec)
5189 E3: extract_even (3rd vec, 4th vec)
5190 E4: extract_odd (3rd vec, 4th vec)
5191
5192 The output for the first stage will be:
5193
5194 E1: 0 2 4 6 8 10 12 14
5195 E2: 1 3 5 7 9 11 13 15
5196 E3: 16 18 20 22 24 26 28 30
5197 E4: 17 19 21 23 25 27 29 31
5198
5199 In order to proceed and create the correct sequence for the next stage (or
5200 for the correct output, if the second stage is the last one, as in our
5201 example), we first put the output of the extract_even operation and then the
5202 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5203 The input for the second stage is:
5204
5205 1st vec (E1): 0 2 4 6 8 10 12 14
5206 2nd vec (E3): 16 18 20 22 24 26 28 30
5207 3rd vec (E2): 1 3 5 7 9 11 13 15
5208 4th vec (E4): 17 19 21 23 25 27 29 31
5209
5210 The output of the second stage:
5211
5212 E1: 0 4 8 12 16 20 24 28
5213 E2: 2 6 10 14 18 22 26 30
5214 E3: 1 5 9 13 17 21 25 29
5215 E4: 3 7 11 15 19 23 27 31
5216
5217 And RESULT_CHAIN after reordering:
5218
5219 1st vec (E1): 0 4 8 12 16 20 24 28
5220 2nd vec (E3): 1 5 9 13 17 21 25 29
5221 3rd vec (E2): 2 6 10 14 18 22 26 30
5222 4th vec (E4): 3 7 11 15 19 23 27 31. */
5223
5224 static void
5225 vect_permute_load_chain (vec<tree> dr_chain,
5226 unsigned int length,
5227 gimple *stmt,
5228 gimple_stmt_iterator *gsi,
5229 vec<tree> *result_chain)
5230 {
5231 tree data_ref, first_vect, second_vect;
5232 tree perm_mask_even, perm_mask_odd;
5233 tree perm3_mask_low, perm3_mask_high;
5234 gimple *perm_stmt;
5235 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5236 unsigned int i, j, log_length = exact_log2 (length);
5237 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5238 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5239
5240 result_chain->quick_grow (length);
5241 memcpy (result_chain->address (), dr_chain.address (),
5242 length * sizeof (tree));
5243
5244 if (length == 3)
5245 {
5246 unsigned int k;
5247
5248 for (k = 0; k < 3; k++)
5249 {
5250 for (i = 0; i < nelt; i++)
5251 if (3 * i + k < 2 * nelt)
5252 sel[i] = 3 * i + k;
5253 else
5254 sel[i] = 0;
5255 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5256
5257 for (i = 0, j = 0; i < nelt; i++)
5258 if (3 * i + k < 2 * nelt)
5259 sel[i] = i;
5260 else
5261 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5262
5263 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5264
5265 first_vect = dr_chain[0];
5266 second_vect = dr_chain[1];
5267
5268 /* Create interleaving stmt (low part of):
5269 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5270 ...}> */
5271 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5272 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5273 second_vect, perm3_mask_low);
5274 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5275
5276 /* Create interleaving stmt (high part of):
5277 high = VEC_PERM_EXPR <low, dr_chain[2], perm3_mask_high>,
5278 where LOW is the result of the previous (low-part) permutation.  */
5279 first_vect = data_ref;
5280 second_vect = dr_chain[2];
5281 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5282 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5283 second_vect, perm3_mask_high);
5284 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5285 (*result_chain)[k] = data_ref;
5286 }
5287 }
5288 else
5289 {
5290 /* If length is not equal to 3 then only a power of 2 is supported. */
5291 gcc_assert (exact_log2 (length) != -1);
5292
5293 for (i = 0; i < nelt; ++i)
5294 sel[i] = i * 2;
5295 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5296
5297 for (i = 0; i < nelt; ++i)
5298 sel[i] = i * 2 + 1;
5299 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5300
5301 for (i = 0; i < log_length; i++)
5302 {
5303 for (j = 0; j < length; j += 2)
5304 {
5305 first_vect = dr_chain[j];
5306 second_vect = dr_chain[j+1];
5307
5308 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5309 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5310 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5311 first_vect, second_vect,
5312 perm_mask_even);
5313 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5314 (*result_chain)[j/2] = data_ref;
5315
5316 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5317 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5318 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5319 first_vect, second_vect,
5320 perm_mask_odd);
5321 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5322 (*result_chain)[j/2+length/2] = data_ref;
5323 }
5324 memcpy (dr_chain.address (), result_chain->address (),
5325 length * sizeof (tree));
5326 }
5327 }
5328 }
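
/* Editorial note: a standalone model (not GCC code) of the power-of-2 path
   above, reproducing the LENGTH == 4, VF == 8 example from the function
   comment.  Vectors are plain int arrays and VEC_PERM_EXPR is modelled by
   indexing into the concatenation of its two operands; the two stages below
   correspond to the log2 (LENGTH) iterations of the loop above.

     #include <stdio.h>
     #include <string.h>

     #define NELT 8
     #define LENGTH 4

     static void
     vec_perm (const int *a, const int *b, const unsigned char *sel, int *out)
     {
       for (unsigned int i = 0; i < NELT; i++)
         out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
     }

     int
     main (void)
     {
       int chain[LENGTH][NELT], result[LENGTH][NELT];
       unsigned char sel_even[NELT], sel_odd[NELT];
       unsigned int i, j, stage;

       for (i = 0; i < LENGTH; i++)
         for (j = 0; j < NELT; j++)
           chain[i][j] = i * NELT + j;          // elements 0 .. 31

       for (i = 0; i < NELT; i++)
         {
           sel_even[i] = i * 2;
           sel_odd[i] = i * 2 + 1;
         }

       for (stage = 0; stage < 2; stage++)      // log2 (LENGTH) stages
         {
           for (j = 0; j < LENGTH; j += 2)
             {
               vec_perm (chain[j], chain[j + 1], sel_even, result[j / 2]);
               vec_perm (chain[j], chain[j + 1], sel_odd,
                         result[j / 2 + LENGTH / 2]);
             }
           memcpy (chain, result, sizeof (chain));
         }

       // Prints the strided groups from the comment above:
       // 0 4 8 ... 28 / 1 5 9 ... 29 / 2 6 10 ... 30 / 3 7 11 ... 31
       for (i = 0; i < LENGTH; i++)
         {
           for (j = 0; j < NELT; j++)
             printf ("%d ", chain[i][j]);
           printf ("\n");
         }
       return 0;
     }
*/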
5329
5330 /* Function vect_shift_permute_load_chain.
5331
5332 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5333 sequence of stmts to reorder the input data accordingly.
5334 Return the final references for loads in RESULT_CHAIN.
5335 Return true if successful, false otherwise.
5336
5337 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5338 The input is 3 vectors, each containing 8 elements.  We assign a
5339 number to each element; the input sequence is:
5340
5341 1st vec: 0 1 2 3 4 5 6 7
5342 2nd vec: 8 9 10 11 12 13 14 15
5343 3rd vec: 16 17 18 19 20 21 22 23
5344
5345 The output sequence should be:
5346
5347 1st vec: 0 3 6 9 12 15 18 21
5348 2nd vec: 1 4 7 10 13 16 19 22
5349 3rd vec: 2 5 8 11 14 17 20 23
5350
5351 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5352
5353 First we shuffle all 3 vectors to get the correct element order:
5354
5355 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5356 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5357 3rd vec: (16 19 22) (17 20 23) (18 21)
5358
5359 Next we concatenate and shift the vectors 3 times:
5360
5361 1st step:
5362 shift right by 6 the concatenation of:
5363 "1st vec" and "2nd vec"
5364 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5365 "2nd vec" and "3rd vec"
5366 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5367 "3rd vec" and "1st vec"
5368 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5369 | New vectors |
5370
5371 So now the new vectors are:
5372
5373 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5374 2nd vec: (10 13) (16 19 22) (17 20 23)
5375 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5376
5377 2nd step:
5378 shift right by 5 the concatenation of:
5379 "1st vec" and "3rd vec"
5380 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5381 "2nd vec" and "1st vec"
5382 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5383 "3rd vec" and "2nd vec"
5384 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5385 | New vectors |
5386
5387 So now the new vectors are:
5388
5389 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5390 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5391 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5392
5393 3rd step:
5394 shift right by 5 the concatenation of:
5395 "1st vec" and "1st vec"
5396 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5397 shift right by 3 the concatenation of:
5398 "2nd vec" and "2nd vec"
5399 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5400 | New vectors |
5401
5402 So now all vectors are READY:
5403 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5404 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5405 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5406
5407 This algorithm is faster than the one in vect_permute_load_chain if:
5408 1. "shift of a concatenation" is faster than general permutation.
5409 This is usually so.
5410 2. The TARGET machine can't execute vector instructions in parallel.
5411 This matters because each step of this algorithm depends on the previous
5412 one; the algorithm in vect_permute_load_chain is much more parallel.
5413
5414 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5415 */
5416
5417 static bool
5418 vect_shift_permute_load_chain (vec<tree> dr_chain,
5419 unsigned int length,
5420 gimple *stmt,
5421 gimple_stmt_iterator *gsi,
5422 vec<tree> *result_chain)
5423 {
5424 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5425 tree perm2_mask1, perm2_mask2, perm3_mask;
5426 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5427 gimple *perm_stmt;
5428
5429 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5430 unsigned int i;
5431 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5432 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5433 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5434 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5435
5436 result_chain->quick_grow (length);
5437 memcpy (result_chain->address (), dr_chain.address (),
5438 length * sizeof (tree));
5439
5440 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5441 {
5442 unsigned int j, log_length = exact_log2 (length);
5443 for (i = 0; i < nelt / 2; ++i)
5444 sel[i] = i * 2;
5445 for (i = 0; i < nelt / 2; ++i)
5446 sel[nelt / 2 + i] = i * 2 + 1;
5447 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5448 {
5449 if (dump_enabled_p ())
5450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5451 "shuffle of 2 fields structure is not \
5452 supported by target\n");
5453 return false;
5454 }
5455 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5456
5457 for (i = 0; i < nelt / 2; ++i)
5458 sel[i] = i * 2 + 1;
5459 for (i = 0; i < nelt / 2; ++i)
5460 sel[nelt / 2 + i] = i * 2;
5461 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5462 {
5463 if (dump_enabled_p ())
5464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5465 "shuffle of 2 fields structure is not \
5466 supported by target\n");
5467 return false;
5468 }
5469 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5470
5471 /* Generating permutation constant to shift all elements.
5472 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5473 for (i = 0; i < nelt; i++)
5474 sel[i] = nelt / 2 + i;
5475 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5476 {
5477 if (dump_enabled_p ())
5478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5479 "shift permutation is not supported by target\n");
5480 return false;
5481 }
5482 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5483
5484 /* Generating permutation constant to select a vector from two.
5485 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5486 for (i = 0; i < nelt / 2; i++)
5487 sel[i] = i;
5488 for (i = nelt / 2; i < nelt; i++)
5489 sel[i] = nelt + i;
5490 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5491 {
5492 if (dump_enabled_p ())
5493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5494 "select is not supported by target\n");
5495 return false;
5496 }
5497 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5498
5499 for (i = 0; i < log_length; i++)
5500 {
5501 for (j = 0; j < length; j += 2)
5502 {
5503 first_vect = dr_chain[j];
5504 second_vect = dr_chain[j + 1];
5505
5506 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5507 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5508 first_vect, first_vect,
5509 perm2_mask1);
5510 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5511 vect[0] = data_ref;
5512
5513 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5514 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5515 second_vect, second_vect,
5516 perm2_mask2);
5517 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5518 vect[1] = data_ref;
5519
5520 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5521 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5522 vect[0], vect[1], shift1_mask);
5523 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5524 (*result_chain)[j/2 + length/2] = data_ref;
5525
5526 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5527 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5528 vect[0], vect[1], select_mask);
5529 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5530 (*result_chain)[j/2] = data_ref;
5531 }
5532 memcpy (dr_chain.address (), result_chain->address (),
5533 length * sizeof (tree));
5534 }
5535 return true;
5536 }
5537 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5538 {
5539 unsigned int k = 0, l = 0;
5540
5541 /* Generating permutation constant to get all elements in the right order.
5542 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5543 for (i = 0; i < nelt; i++)
5544 {
5545 if (3 * k + (l % 3) >= nelt)
5546 {
5547 k = 0;
5548 l += (3 - (nelt % 3));
5549 }
5550 sel[i] = 3 * k + (l % 3);
5551 k++;
5552 }
5553 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5554 {
5555 if (dump_enabled_p ())
5556 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5557 "shuffle of 3 fields structure is not \
5558 supported by target\n");
5559 return false;
5560 }
5561 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5562
5563 /* Generating permutation constant to shift all elements.
5564 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5565 for (i = 0; i < nelt; i++)
5566 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5567 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5568 {
5569 if (dump_enabled_p ())
5570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5571 "shift permutation is not supported by target\n");
5572 return false;
5573 }
5574 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5575
5576 /* Generating permutation constant to shift all elements.
5577 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5578 for (i = 0; i < nelt; i++)
5579 sel[i] = 2 * (nelt / 3) + 1 + i;
5580 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5581 {
5582 if (dump_enabled_p ())
5583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5584 "shift permutation is not supported by target\n");
5585 return false;
5586 }
5587 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5588
5589 /* Generating permutation constant to shift all elements.
5590 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5591 for (i = 0; i < nelt; i++)
5592 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5593 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5594 {
5595 if (dump_enabled_p ())
5596 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5597 "shift permutation is not supported by target\n");
5598 return false;
5599 }
5600 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5601
5602 /* Generating permutation constant to shift all elements.
5603 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5604 for (i = 0; i < nelt; i++)
5605 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5606 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5607 {
5608 if (dump_enabled_p ())
5609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5610 "shift permutation is not supported by target\n");
5611 return false;
5612 }
5613 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5614
5615 for (k = 0; k < 3; k++)
5616 {
5617 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5618 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5619 dr_chain[k], dr_chain[k],
5620 perm3_mask);
5621 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5622 vect[k] = data_ref;
5623 }
5624
5625 for (k = 0; k < 3; k++)
5626 {
5627 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5628 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5629 vect[k % 3], vect[(k + 1) % 3],
5630 shift1_mask);
5631 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5632 vect_shift[k] = data_ref;
5633 }
5634
5635 for (k = 0; k < 3; k++)
5636 {
5637 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5638 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5639 vect_shift[(4 - k) % 3],
5640 vect_shift[(3 - k) % 3],
5641 shift2_mask);
5642 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5643 vect[k] = data_ref;
5644 }
5645
5646 (*result_chain)[3 - (nelt % 3)] = vect[2];
5647
5648 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5649 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5650 vect[0], shift3_mask);
5651 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5652 (*result_chain)[nelt % 3] = data_ref;
5653
5654 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5655 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5656 vect[1], shift4_mask);
5657 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5658 (*result_chain)[0] = data_ref;
5659 return true;
5660 }
5661 return false;
5662 }
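
/* Editorial note: the selector constants used in the LENGTH == 3 path above
   are derived arithmetically from NELT, which is easy to get wrong by hand.
   The standalone sketch below (not GCC code) recomputes them for NELT == 8
   so they can be compared against the values quoted in the comments:
   {0 3 6 1 4 7 2 5}, {6 .. 13}, {5 .. 12}, {3 .. 10} and {5 .. 12}.

     #include <stdio.h>

     #define NELT 8

     static void
     print_sel (const char *name, const unsigned char *sel)
     {
       printf ("%s:", name);
       for (unsigned int i = 0; i < NELT; i++)
         printf (" %u", sel[i]);
       printf ("\n");
     }

     int
     main (void)
     {
       unsigned char sel[NELT];
       unsigned int i, k = 0, l = 0;

       for (i = 0; i < NELT; i++)       // perm3_mask
         {
           if (3 * k + (l % 3) >= NELT)
             {
               k = 0;
               l += 3 - (NELT % 3);
             }
           sel[i] = 3 * k + (l % 3);
           k++;
         }
       print_sel ("perm3 ", sel);

       for (i = 0; i < NELT; i++)       // shift1_mask
         sel[i] = 2 * (NELT / 3) + (NELT % 3) + i;
       print_sel ("shift1", sel);

       for (i = 0; i < NELT; i++)       // shift2_mask
         sel[i] = 2 * (NELT / 3) + 1 + i;
       print_sel ("shift2", sel);

       for (i = 0; i < NELT; i++)       // shift3_mask
         sel[i] = NELT / 3 + (NELT % 3) / 2 + i;
       print_sel ("shift3", sel);

       for (i = 0; i < NELT; i++)       // shift4_mask
         sel[i] = 2 * (NELT / 3) + (NELT % 3) / 2 + i;
       print_sel ("shift4", sel);
       return 0;
     }
*/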
5663
5664 /* Function vect_transform_grouped_load.
5665
5666 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5667 to perform their permutation and ascribe the resulting vectorized statements
5668 to the scalar statements.
5669 */
5670
5671 void
5672 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5673 gimple_stmt_iterator *gsi)
5674 {
5675 machine_mode mode;
5676 vec<tree> result_chain = vNULL;
5677
5678 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5679 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5680 vectors that are ready for vector computation. */
5681 result_chain.create (size);
5682
5683 /* If the reassociation width for the vector type is 2 or greater, the target
5684 machine can execute 2 or more vector instructions in parallel.  Otherwise
5685 try to reorder the load group using vect_shift_permute_load_chain. */
5686 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5687 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5688 || exact_log2 (size) != -1
5689 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5690 gsi, &result_chain))
5691 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5692 vect_record_grouped_load_vectors (stmt, result_chain);
5693 result_chain.release ();
5694 }
5695
5696 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5697 generated as part of the vectorization of STMT. Assign the statement
5698 for each vector to the associated scalar statement. */
5699
5700 void
5701 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5702 {
5703 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5704 gimple *next_stmt, *new_stmt;
5705 unsigned int i, gap_count;
5706 tree tmp_data_ref;
5707
5708 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5709 Since we scan the chain starting from its first node, their order
5710 corresponds to the order of data-refs in RESULT_CHAIN. */
5711 next_stmt = first_stmt;
5712 gap_count = 1;
5713 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5714 {
5715 if (!next_stmt)
5716 break;
5717
5718 /* Skip the gaps.  Loads created for the gaps will be removed by the dead
5719 code elimination pass later.  No need to check for the first stmt in
5720 the group, since it always exists.
5721 GROUP_GAP is the number of steps in elements from the previous
5722 access (if there is no gap GROUP_GAP is 1). We skip loads that
5723 correspond to the gaps. */
5724 if (next_stmt != first_stmt
5725 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5726 {
5727 gap_count++;
5728 continue;
5729 }
5730
5731 while (next_stmt)
5732 {
5733 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5734 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5735 copies, and we put the new vector statement in the first available
5736 RELATED_STMT. */
5737 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5738 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5739 else
5740 {
5741 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5742 {
5743 gimple *prev_stmt =
5744 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5745 gimple *rel_stmt =
5746 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5747 while (rel_stmt)
5748 {
5749 prev_stmt = rel_stmt;
5750 rel_stmt =
5751 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5752 }
5753
5754 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5755 new_stmt;
5756 }
5757 }
5758
5759 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5760 gap_count = 1;
5761 /* If NEXT_STMT accesses the same DR as the previous statement,
5762 put the same TMP_DATA_REF as its vectorized statement; otherwise
5763 get the next data-ref from RESULT_CHAIN. */
5764 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5765 break;
5766 }
5767 }
5768 }
5769
5770 /* Function vect_can_force_dr_alignment_p.
5771
5772 Returns whether the alignment of DECL can be forced to be aligned
5773 on an ALIGNMENT-bit boundary. */
5774
5775 bool
5776 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5777 {
5778 if (TREE_CODE (decl) != VAR_DECL)
5779 return false;
5780
5781 if (decl_in_symtab_p (decl)
5782 && !symtab_node::get (decl)->can_increase_alignment_p ())
5783 return false;
5784
5785 if (TREE_STATIC (decl))
5786 return (alignment <= MAX_OFILE_ALIGNMENT);
5787 else
5788 return (alignment <= MAX_STACK_ALIGNMENT);
5789 }
5790
5791
5792 /* Return whether the data reference DR is supported with respect to its
5793 alignment.
5794 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5795 if it is aligned, i.e., check if it is possible to vectorize it with a
5796 different alignment. */
5797
5798 enum dr_alignment_support
5799 vect_supportable_dr_alignment (struct data_reference *dr,
5800 bool check_aligned_accesses)
5801 {
5802 gimple *stmt = DR_STMT (dr);
5803 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5804 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5805 machine_mode mode = TYPE_MODE (vectype);
5806 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5807 struct loop *vect_loop = NULL;
5808 bool nested_in_vect_loop = false;
5809
5810 if (aligned_access_p (dr) && !check_aligned_accesses)
5811 return dr_aligned;
5812
5813 /* For now assume all conditional loads/stores support unaligned
5814 access without any special code. */
5815 if (is_gimple_call (stmt)
5816 && gimple_call_internal_p (stmt)
5817 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5818 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5819 return dr_unaligned_supported;
5820
5821 if (loop_vinfo)
5822 {
5823 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5824 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5825 }
5826
5827 /* Possibly unaligned access. */
5828
5829 /* We can choose between using the implicit realignment scheme (generating
5830 a misaligned_move stmt) and the explicit realignment scheme (generating
5831 aligned loads with a REALIGN_LOAD). There are two variants to the
5832 explicit realignment scheme: optimized, and unoptimized.
5833 We can optimize the realignment only if the step between consecutive
5834 vector loads is equal to the vector size. Since the vector memory
5835 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5836 is guaranteed that the misalignment amount remains the same throughout the
5837 execution of the vectorized loop. Therefore, we can create the
5838 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5839 at the loop preheader.
5840
5841 However, in the case of outer-loop vectorization, when vectorizing a
5842 memory access in the inner-loop nested within the LOOP that is now being
5843 vectorized, while it is guaranteed that the misalignment of the
5844 vectorized memory access will remain the same in different outer-loop
5845 iterations, it is *not* guaranteed that is will remain the same throughout
5846 the execution of the inner-loop. This is because the inner-loop advances
5847 with the original scalar step (and not in steps of VS). If the inner-loop
5848 step happens to be a multiple of VS, then the misalignment remains fixed
5849 and we can use the optimized realignment scheme. For example:
5850
5851 for (i=0; i<N; i++)
5852 for (j=0; j<M; j++)
5853 s += a[i+j];
5854
5855 When vectorizing the i-loop in the above example, the step between
5856 consecutive vector loads is 1, and so the misalignment does not remain
5857 fixed across the execution of the inner-loop, and the realignment cannot
5858 be optimized (as illustrated in the following pseudo vectorized loop):
5859
5860 for (i=0; i<N; i+=4)
5861 for (j=0; j<M; j++){
5862 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5863 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5864 // (assuming that we start from an aligned address).
5865 }
5866
5867 We therefore have to use the unoptimized realignment scheme:
5868
5869 for (i=0; i<N; i+=4)
5870 for (j=k; j<M; j+=4)
5871 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5872 // that the misalignment of the initial address is
5873 // 0).
5874
5875 The loop can then be vectorized as follows:
5876
5877 for (k=0; k<4; k++){
5878 rt = get_realignment_token (&vp[k]);
5879 for (i=0; i<N; i+=4){
5880 v1 = vp[i+k];
5881 for (j=k; j<M; j+=4){
5882 v2 = vp[i+j+VS-1];
5883 va = REALIGN_LOAD <v1,v2,rt>;
5884 vs += va;
5885 v1 = v2;
5886 }
5887 }
5888 } */
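
/* Editorial note: a scalar model (not GCC code, and not the target-specific
   semantics of REALIGN_LOAD) of the idea sketched above: an unaligned vector
   load at misalignment RT can be composed from two aligned loads by taking
   the tail of the first and the head of the second.  VS and the helper name
   are made up for the illustration.

     #include <stdio.h>

     #define VS 4                       // vector size in elements

     static void
     realign_load (const int *v1, const int *v2, int rt, int *va)
     {
       for (int i = 0; i < VS; i++)
         va[i] = rt + i < VS ? v1[rt + i] : v2[rt + i - VS];
     }

     int
     main (void)
     {
       int a[2 * VS], va[VS];

       for (int i = 0; i < 2 * VS; i++)
         a[i] = i;

       // Emulate a vector load from &a[1] (misalignment 1) using the two
       // surrounding aligned loads at &a[0] and &a[VS].
       realign_load (&a[0], &a[VS], 1, va);
       for (int i = 0; i < VS; i++)
         printf ("%d ", va[i]);         // prints: 1 2 3 4
       printf ("\n");
       return 0;
     }
*/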
5889
5890 if (DR_IS_READ (dr))
5891 {
5892 bool is_packed = false;
5893 tree type = (TREE_TYPE (DR_REF (dr)));
5894
5895 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5896 && (!targetm.vectorize.builtin_mask_for_load
5897 || targetm.vectorize.builtin_mask_for_load ()))
5898 {
5899 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5900 if ((nested_in_vect_loop
5901 && (TREE_INT_CST_LOW (DR_STEP (dr))
5902 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5903 || !loop_vinfo)
5904 return dr_explicit_realign;
5905 else
5906 return dr_explicit_realign_optimized;
5907 }
5908 if (!known_alignment_for_access_p (dr))
5909 is_packed = not_size_aligned (DR_REF (dr));
5910
5911 if ((TYPE_USER_ALIGN (type) && !is_packed)
5912 || targetm.vectorize.
5913 support_vector_misalignment (mode, type,
5914 DR_MISALIGNMENT (dr), is_packed))
5915 /* Can't software pipeline the loads, but can at least do them. */
5916 return dr_unaligned_supported;
5917 }
5918 else
5919 {
5920 bool is_packed = false;
5921 tree type = (TREE_TYPE (DR_REF (dr)));
5922
5923 if (!known_alignment_for_access_p (dr))
5924 is_packed = not_size_aligned (DR_REF (dr));
5925
5926 if ((TYPE_USER_ALIGN (type) && !is_packed)
5927 || targetm.vectorize.
5928 support_vector_misalignment (mode, type,
5929 DR_MISALIGNMENT (dr), is_packed))
5930 return dr_unaligned_supported;
5931 }
5932
5933 /* Unsupported. */
5934 return dr_unaligned_unsupported;
5935 }