gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2014 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "stor-layout.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-ssa-alias.h"
34 #include "internal-fn.h"
35 #include "tree-eh.h"
36 #include "gimple-expr.h"
37 #include "is-a.h"
38 #include "gimple.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "gimple-ssa.h"
43 #include "tree-phinodes.h"
44 #include "ssa-iterators.h"
45 #include "stringpool.h"
46 #include "tree-ssanames.h"
47 #include "tree-ssa-loop-ivopts.h"
48 #include "tree-ssa-loop-manip.h"
49 #include "tree-ssa-loop.h"
51 #include "cfgloop.h"
52 #include "tree-chrec.h"
53 #include "tree-scalar-evolution.h"
54 #include "tree-vectorizer.h"
55 #include "diagnostic-core.h"
56 #include "cgraph.h"
57 /* Need to include rtl.h, expr.h, etc. for optabs. */
58 #include "expr.h"
59 #include "optabs.h"
60 #include "builtins.h"
61
62 /* Return true if load- or store-lanes optab OPTAB is implemented for
63 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
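/* Example (illustrative values only): for VECTYPE of mode V4SImode
   (128 bits) and COUNT == 2, the code below asks mode_for_size for a
   256-bit MODE_INT mode and, if one exists, checks that the target
   registers a handler for the load/store-lanes optab on that
   <array mode, V4SImode> pair; a BLKmode result or a missing handler
   means the lanes optab cannot be used for this group.  */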
64
65 static bool
66 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
67 tree vectype, unsigned HOST_WIDE_INT count)
68 {
69 enum machine_mode mode, array_mode;
70 bool limit_p;
71
72 mode = TYPE_MODE (vectype);
73 limit_p = !targetm.array_mode_supported_p (mode, count);
74 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
75 MODE_INT, limit_p);
76
77 if (array_mode == BLKmode)
78 {
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
82 GET_MODE_NAME (mode), count);
83 return false;
84 }
85
86 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
87 {
88 if (dump_enabled_p ())
89 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
90 "cannot use %s<%s><%s>\n", name,
91 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
92 return false;
93 }
94
95 if (dump_enabled_p ())
96 dump_printf_loc (MSG_NOTE, vect_location,
97 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
98 GET_MODE_NAME (mode));
99
100 return true;
101 }
102
103
104 /* Return the smallest scalar part of STMT.
105 This is used to determine the vectype of the stmt. We generally set the
106 vectype according to the type of the result (lhs). For stmts whose
107 result-type is different from the type of the arguments (e.g., demotion,
108 promotion), vectype will be reset appropriately (later). Note that we have
109 to visit the smallest datatype in this function, because that determines the
110 VF. If the smallest datatype in the loop were present only as the rhs of a
111 promotion operation, we would otherwise miss it.
112 Such a case, where a variable of this datatype does not appear in the lhs
113 anywhere in the loop, can only occur if it's an invariant: e.g.:
114 'int_x = (int) short_inv', which we'd expect to have been optimized away by
115 invariant motion. However, we cannot rely on invariant motion to always
116 take invariants out of the loop, and so in the case of promotion we also
117 have to check the rhs.
118 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
119 types. */
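/* Example: for 'int_x = (int) short_y' the result type is 4 bytes and the
   rhs type is 2 bytes, so *LHS_SIZE_UNIT is 4, *RHS_SIZE_UNIT is 2 and the
   2-byte type is returned, letting the caller base the VF on the smaller
   element.  */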
120
121 tree
122 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
123 HOST_WIDE_INT *rhs_size_unit)
124 {
125 tree scalar_type = gimple_expr_type (stmt);
126 HOST_WIDE_INT lhs, rhs;
127
128 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
129
130 if (is_gimple_assign (stmt)
131 && (gimple_assign_cast_p (stmt)
132 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
133 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
134 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
135 {
136 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
137
138 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
139 if (rhs < lhs)
140 scalar_type = rhs_type;
141 }
142
143 *lhs_size_unit = lhs;
144 *rhs_size_unit = rhs;
145 return scalar_type;
146 }
147
148
149 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
150 tested at run-time. Return TRUE if DDR was successfully inserted.
151 Return false if versioning is not supported. */
152
153 static bool
154 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
155 {
156 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
157
158 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
159 return false;
160
161 if (dump_enabled_p ())
162 {
163 dump_printf_loc (MSG_NOTE, vect_location,
164 "mark for run-time aliasing test between ");
165 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
166 dump_printf (MSG_NOTE, " and ");
167 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
168 dump_printf (MSG_NOTE, "\n");
169 }
170
171 if (optimize_loop_nest_for_size_p (loop))
172 {
173 if (dump_enabled_p ())
174 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
175 "versioning not supported when optimizing"
176 " for size.\n");
177 return false;
178 }
179
180 /* FORNOW: We don't support versioning with outer-loop vectorization. */
181 if (loop->inner)
182 {
183 if (dump_enabled_p ())
184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
185 "versioning not yet supported for outer-loops.\n");
186 return false;
187 }
188
189 /* FORNOW: We don't support creating runtime alias tests for non-constant
190 step. */
191 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
192 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
193 {
194 if (dump_enabled_p ())
195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
196 "versioning not yet supported for non-constant "
197 "step\n");
198 return false;
199 }
200
201 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
202 return true;
203 }
204
205
206 /* Function vect_analyze_data_ref_dependence.
207
208 Return TRUE if there (might) exist a dependence between a memory-reference
209 DRA and a memory-reference DRB. When the dependence can instead be
210 checked by a run-time alias test (versioning for alias), return FALSE.
211 Adjust *MAX_VF according to the data dependence. */
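/* Example (assuming the distance vector was not recorded as reversed):
   a true dependence with distance 3, e.g. between a store to a[i+3] and a
   load from a[i], makes the code below clamp *MAX_VF to 3 because
   abs (dist) >= 2 && abs (dist) < *MAX_VF; a distance of 0 is handled
   specially for grouped accesses, and a distance >= *MAX_VF does not
   constrain vectorization.  */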
212
213 static bool
214 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
215 loop_vec_info loop_vinfo, int *max_vf)
216 {
217 unsigned int i;
218 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
219 struct data_reference *dra = DDR_A (ddr);
220 struct data_reference *drb = DDR_B (ddr);
221 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
222 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
223 lambda_vector dist_v;
224 unsigned int loop_depth;
225
226 /* In loop analysis all data references should be vectorizable. */
227 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
228 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
229 gcc_unreachable ();
230
231 /* Independent data accesses. */
232 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
233 return false;
234
235 if (dra == drb
236 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
237 return false;
238
239 /* Even if we have an anti-dependence then, as the vectorized loop covers at
240 least two scalar iterations, there is always also a true dependence.
241 As the vectorizer does not re-order loads and stores we can ignore
242 the anti-dependence if TBAA can disambiguate both DRs similar to the
243 case with known negative distance anti-dependences (positive
244 distance anti-dependences would violate TBAA constraints). */
245 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
246 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
247 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
248 get_alias_set (DR_REF (drb))))
249 return false;
250
251 /* Unknown data dependence. */
252 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
253 {
254 /* If user asserted safelen consecutive iterations can be
255 executed concurrently, assume independence. */
256 if (loop->safelen >= 2)
257 {
258 if (loop->safelen < *max_vf)
259 *max_vf = loop->safelen;
260 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
261 return false;
262 }
263
264 if (STMT_VINFO_GATHER_P (stmtinfo_a)
265 || STMT_VINFO_GATHER_P (stmtinfo_b))
266 {
267 if (dump_enabled_p ())
268 {
269 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
270 "versioning for alias not supported for: "
271 "can't determine dependence between ");
272 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
273 DR_REF (dra));
274 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
275 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
276 DR_REF (drb));
277 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
278 }
279 return true;
280 }
281
282 if (dump_enabled_p ())
283 {
284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
285 "versioning for alias required: "
286 "can't determine dependence between ");
287 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
288 DR_REF (dra));
289 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
290 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
291 DR_REF (drb));
292 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
293 }
294
295 /* Add to list of ddrs that need to be tested at run-time. */
296 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
297 }
298
299 /* Known data dependence. */
300 if (DDR_NUM_DIST_VECTS (ddr) == 0)
301 {
302 /* If user asserted safelen consecutive iterations can be
303 executed concurrently, assume independence. */
304 if (loop->safelen >= 2)
305 {
306 if (loop->safelen < *max_vf)
307 *max_vf = loop->safelen;
308 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
309 return false;
310 }
311
312 if (STMT_VINFO_GATHER_P (stmtinfo_a)
313 || STMT_VINFO_GATHER_P (stmtinfo_b))
314 {
315 if (dump_enabled_p ())
316 {
317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
318 "versioning for alias not supported for: "
319 "bad dist vector for ");
320 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
321 DR_REF (dra));
322 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
323 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
324 DR_REF (drb));
325 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
326 }
327 return true;
328 }
329
330 if (dump_enabled_p ())
331 {
332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
333 "versioning for alias required: "
334 "bad dist vector for ");
335 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
336 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
337 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
338 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
339 }
340 /* Add to list of ddrs that need to be tested at run-time. */
341 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
342 }
343
344 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
345 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
346 {
347 int dist = dist_v[loop_depth];
348
349 if (dump_enabled_p ())
350 dump_printf_loc (MSG_NOTE, vect_location,
351 "dependence distance = %d.\n", dist);
352
353 if (dist == 0)
354 {
355 if (dump_enabled_p ())
356 {
357 dump_printf_loc (MSG_NOTE, vect_location,
358 "dependence distance == 0 between ");
359 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
360 dump_printf (MSG_NOTE, " and ");
361 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
362 dump_printf (MSG_NOTE, "\n");
363 }
364
365 /* When we perform grouped accesses and perform implicit CSE
366 by detecting equal accesses and doing disambiguation with
367 runtime alias tests like for
368 .. = a[i];
369 .. = a[i+1];
370 a[i] = ..;
371 a[i+1] = ..;
372 *p = ..;
373 .. = a[i];
374 .. = a[i+1];
375 where we will end up loading { a[i], a[i+1] } once, make
376 sure that inserting group loads before the first load and
377 stores after the last store will do the right thing. */
378 if ((STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
379 && GROUP_SAME_DR_STMT (stmtinfo_a))
380 || (STMT_VINFO_GROUPED_ACCESS (stmtinfo_b)
381 && GROUP_SAME_DR_STMT (stmtinfo_b)))
382 {
383 gimple earlier_stmt;
384 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
385 if (DR_IS_WRITE
386 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
387 {
388 if (dump_enabled_p ())
389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
390 "READ_WRITE dependence in interleaving."
391 "\n");
392 return true;
393 }
394 }
395
396 continue;
397 }
398
399 if (dist > 0 && DDR_REVERSED_P (ddr))
400 {
401 /* If DDR_REVERSED_P the order of the data-refs in DDR was
402 reversed (to make distance vector positive), and the actual
403 distance is negative. */
404 if (dump_enabled_p ())
405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
406 "dependence distance negative.\n");
407 /* Record a negative dependence distance to later limit the
408 amount of stmt copying / unrolling we can perform.
409 Only need to handle read-after-write dependence. */
410 if (DR_IS_READ (drb)
411 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
412 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
413 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
414 continue;
415 }
416
417 if (abs (dist) >= 2
418 && abs (dist) < *max_vf)
419 {
420 /* The dependence distance requires reduction of the maximal
421 vectorization factor. */
422 *max_vf = abs (dist);
423 if (dump_enabled_p ())
424 dump_printf_loc (MSG_NOTE, vect_location,
425 "adjusting maximal vectorization factor to %i\n",
426 *max_vf);
427 }
428
429 if (abs (dist) >= *max_vf)
430 {
431 /* Dependence distance does not create dependence, as far as
432 vectorization is concerned, in this case. */
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_NOTE, vect_location,
435 "dependence distance >= VF.\n");
436 continue;
437 }
438
439 if (dump_enabled_p ())
440 {
441 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
442 "not vectorized, possible dependence "
443 "between data-refs ");
444 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
445 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
446 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
447 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
448 }
449
450 return true;
451 }
452
453 return false;
454 }
455
456 /* Function vect_analyze_data_ref_dependences.
457
458 Examine all the data references in the loop, and make sure there do not
459 exist any data dependences between them. Set *MAX_VF according to
460 the maximum vectorization factor the data dependences allow. */
461
462 bool
463 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
464 {
465 unsigned int i;
466 struct data_dependence_relation *ddr;
467
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "=== vect_analyze_data_ref_dependences ===\n");
471
472 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
473 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
474 &LOOP_VINFO_DDRS (loop_vinfo),
475 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
476 return false;
477
478 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
479 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
480 return false;
481
482 return true;
483 }
484
485
486 /* Function vect_slp_analyze_data_ref_dependence.
487
488 Return TRUE if there (might) exist a dependence between a memory-reference
489 DRA and a memory-reference DRB in the basic block under SLP analysis that
490 would prevent vectorization. Unlike the loop variant above, there is no
491 run-time alias versioning and no vectorization factor to adjust here. */
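/* Example: in a basic block containing

     x = a[0];
     a[0] = y;

   the earlier statement is the load, so the pair does not block SLP
   (provided both statements take part in vectorization); with the store
   first, the order could not be preserved and TRUE is returned.  */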
492
493 static bool
494 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
495 {
496 struct data_reference *dra = DDR_A (ddr);
497 struct data_reference *drb = DDR_B (ddr);
498
499 /* We need to check dependences of statements marked as unvectorizable
500 as well; they can still prohibit vectorization. */
501
502 /* Independent data accesses. */
503 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
504 return false;
505
506 if (dra == drb)
507 return false;
508
509 /* Read-read is OK. */
510 if (DR_IS_READ (dra) && DR_IS_READ (drb))
511 return false;
512
513 /* If dra and drb are part of the same interleaving chain consider
514 them independent. */
515 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
516 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
517 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
518 return false;
519
520 /* Unknown data dependence. */
521 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
522 {
523 if (dump_enabled_p ())
524 {
525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
526 "can't determine dependence between ");
527 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
528 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
529 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
530 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
531 }
532 }
533 else if (dump_enabled_p ())
534 {
535 dump_printf_loc (MSG_NOTE, vect_location,
536 "determined dependence between ");
537 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
538 dump_printf (MSG_NOTE, " and ");
539 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
540 dump_printf (MSG_NOTE, "\n");
541 }
542
543 /* We do not vectorize basic blocks with write-write dependencies. */
544 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
545 return true;
546
547 /* If we have a read-write dependence, check that the load comes before the
548 store. When we vectorize basic blocks, a vector load can only be placed
549 before the corresponding scalar load, and a vector store only after its
550 corresponding scalar store, so the order of the accesses is preserved
551 when the load precedes the store. */
552 gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
553 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
554 {
555 /* That only holds for load-store pairs taking part in vectorization. */
556 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
557 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
558 return false;
559 }
560
561 return true;
562 }
563
564
565 /* Function vect_slp_analyze_data_ref_dependences.
566
567 Examine all the data references in the basic-block, and make sure there
568 do not exist any data dependences between them that would prevent
569 SLP vectorization. */
570
571 bool
572 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
573 {
574 struct data_dependence_relation *ddr;
575 unsigned int i;
576
577 if (dump_enabled_p ())
578 dump_printf_loc (MSG_NOTE, vect_location,
579 "=== vect_slp_analyze_data_ref_dependences ===\n");
580
581 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
582 &BB_VINFO_DDRS (bb_vinfo),
583 vNULL, true))
584 return false;
585
586 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
587 if (vect_slp_analyze_data_ref_dependence (ddr))
588 return false;
589
590 return true;
591 }
592
593
594 /* Function vect_compute_data_ref_alignment
595
596 Compute the misalignment of the data reference DR.
597
598 Output:
599 1. If during the misalignment computation it is found that the data reference
600 cannot be vectorized then false is returned.
601 2. DR_MISALIGNMENT (DR) is defined.
602
603 FOR NOW: No analysis is actually performed. Misalignment is calculated
604 only for trivial cases. TODO. */
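/* Example (assuming a 16-byte aligned base and a vector type requiring
   16-byte alignment): for a V4SF access with DR_INIT == 4 the code below
   computes 4 % 16 == 4, i.e. the access starts one float past a vector
   boundary, and records that value via SET_DR_MISALIGNMENT.  */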
605
606 static bool
607 vect_compute_data_ref_alignment (struct data_reference *dr)
608 {
609 gimple stmt = DR_STMT (dr);
610 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
611 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
612 struct loop *loop = NULL;
613 tree ref = DR_REF (dr);
614 tree vectype;
615 tree base, base_addr;
616 bool base_aligned;
617 tree misalign;
618 tree aligned_to, alignment;
619
620 if (dump_enabled_p ())
621 dump_printf_loc (MSG_NOTE, vect_location,
622 "vect_compute_data_ref_alignment:\n");
623
624 if (loop_vinfo)
625 loop = LOOP_VINFO_LOOP (loop_vinfo);
626
627 /* Initialize misalignment to unknown. */
628 SET_DR_MISALIGNMENT (dr, -1);
629
630 /* Strided loads perform only component accesses, misalignment information
631 is irrelevant for them. */
632 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
633 return true;
634
635 misalign = DR_INIT (dr);
636 aligned_to = DR_ALIGNED_TO (dr);
637 base_addr = DR_BASE_ADDRESS (dr);
638 vectype = STMT_VINFO_VECTYPE (stmt_info);
639
640 /* In case the dataref is in an inner-loop of the loop that is being
641 vectorized (LOOP), we use the base and misalignment information
642 relative to the outer-loop (LOOP). This is ok only if the misalignment
643 stays the same throughout the execution of the inner-loop, which is why
644 we have to check that the stride of the dataref in the inner-loop is a
645 multiple of the vector size. */
646 if (loop && nested_in_vect_loop_p (loop, stmt))
647 {
648 tree step = DR_STEP (dr);
649 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
650
651 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
652 {
653 if (dump_enabled_p ())
654 dump_printf_loc (MSG_NOTE, vect_location,
655 "inner step divides the vector-size.\n");
656 misalign = STMT_VINFO_DR_INIT (stmt_info);
657 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
658 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
659 }
660 else
661 {
662 if (dump_enabled_p ())
663 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
664 "inner step doesn't divide the vector-size.\n");
665 misalign = NULL_TREE;
666 }
667 }
668
669 /* Similarly, if we're doing basic-block vectorization, we can only use
670 base and misalignment information relative to an innermost loop if the
671 misalignment stays the same throughout the execution of the loop.
672 As above, this is the case if the stride of the dataref is a multiple of
673 the vector size. */
674 if (!loop)
675 {
676 tree step = DR_STEP (dr);
677 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
678
679 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
680 {
681 if (dump_enabled_p ())
682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
683 "SLP: step doesn't divide the vector-size.\n");
684 misalign = NULL_TREE;
685 }
686 }
687
688 base = build_fold_indirect_ref (base_addr);
689 alignment = ssize_int (TYPE_ALIGN (vectype) / BITS_PER_UNIT);
690
691 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
692 || !misalign)
693 {
694 if (dump_enabled_p ())
695 {
696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
697 "Unknown alignment for access: ");
698 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
699 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
700 }
701 return true;
702 }
703
704 if ((DECL_P (base)
705 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
706 alignment) >= 0)
707 || (TREE_CODE (base_addr) == SSA_NAME
708 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
709 TREE_TYPE (base_addr)))),
710 alignment) >= 0)
711 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
712 base_aligned = true;
713 else
714 base_aligned = false;
715
716 if (!base_aligned)
717 {
718 /* Do not change the alignment of global variables here if
719 flag_section_anchors is enabled as we already generated
720 RTL for other functions. Most global variables should
721 have been aligned during the IPA increase_alignment pass. */
722 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
723 || (TREE_STATIC (base) && flag_section_anchors))
724 {
725 if (dump_enabled_p ())
726 {
727 dump_printf_loc (MSG_NOTE, vect_location,
728 "can't force alignment of ref: ");
729 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
730 dump_printf (MSG_NOTE, "\n");
731 }
732 return true;
733 }
734
735 /* Force the alignment of the decl.
736 NOTE: This is the only change to the code we make during
737 the analysis phase, before deciding to vectorize the loop. */
738 if (dump_enabled_p ())
739 {
740 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
741 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
742 dump_printf (MSG_NOTE, "\n");
743 }
744
745 ((dataref_aux *)dr->aux)->base_decl = base;
746 ((dataref_aux *)dr->aux)->base_misaligned = true;
747 }
748
749 /* If this is a backward running DR then first access in the larger
750 vectype actually is N-1 elements before the address in the DR.
751 Adjust misalign accordingly. */
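/* Worked example (V4SI, DR_STEP == -4, DR_INIT == 0, 16-byte vector
   alignment): OFFSET is 3 * -4 == -12, MISALIGN becomes -12, and the
   FLOOR_MOD below yields 4, i.e. the lowest-addressed element of the
   vector access sits 4 bytes past a 16-byte boundary (assuming an
   aligned base).  */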
752 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
753 {
754 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
755 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
756 otherwise we wouldn't be here. */
757 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
758 /* PLUS because DR_STEP was negative. */
759 misalign = size_binop (PLUS_EXPR, misalign, offset);
760 }
761
762 /* Modulo alignment. */
763 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
764
765 if (!tree_fits_uhwi_p (misalign))
766 {
767 /* Negative or overflowed misalignment value. */
768 if (dump_enabled_p ())
769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
770 "unexpected misalign value\n");
771 return false;
772 }
773
774 SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));
775
776 if (dump_enabled_p ())
777 {
778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
779 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
780 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
781 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
782 }
783
784 return true;
785 }
786
787
788 /* Function vect_compute_data_refs_alignment
789
790 Compute the misalignment of data references in the loop.
791 Return FALSE if a data reference is found that cannot be vectorized. */
792
793 static bool
794 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
795 bb_vec_info bb_vinfo)
796 {
797 vec<data_reference_p> datarefs;
798 struct data_reference *dr;
799 unsigned int i;
800
801 if (loop_vinfo)
802 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
803 else
804 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
805
806 FOR_EACH_VEC_ELT (datarefs, i, dr)
807 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
808 && !vect_compute_data_ref_alignment (dr))
809 {
810 if (bb_vinfo)
811 {
812 /* Mark unsupported statement as unvectorizable. */
813 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
814 continue;
815 }
816 else
817 return false;
818 }
819
820 return true;
821 }
822
823
824 /* Function vect_update_misalignment_for_peel
825
826 DR - the data reference whose misalignment is to be adjusted.
827 DR_PEEL - the data reference whose misalignment is being made
828 zero in the vector loop by the peel.
829 NPEEL - the number of iterations in the peel loop if the misalignment
830 of DR_PEEL is known at compile time. */
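/* Worked example (illustrative numbers): if DR_PEEL is a V4SF access with
   misalignment 4 and NPEEL == 3, then a second V4SF data-ref with element
   size 4 and misalignment 8 becomes (8 + 3 * 4) & 15 == 4 after the peel;
   when either misalignment is unknown the result is simply reset to -1
   below.  */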
831
832 static void
833 vect_update_misalignment_for_peel (struct data_reference *dr,
834 struct data_reference *dr_peel, int npeel)
835 {
836 unsigned int i;
837 vec<dr_p> same_align_drs;
838 struct data_reference *current_dr;
839 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
840 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
841 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
842 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
843
844 /* For interleaved data accesses the step in the loop must be multiplied by
845 the size of the interleaving group. */
846 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
847 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
848 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
849 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
850
851 /* It can be assumed that the data refs with the same alignment as dr_peel
852 are aligned in the vector loop. */
853 same_align_drs
854 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
855 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
856 {
857 if (current_dr != dr)
858 continue;
859 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
860 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
861 SET_DR_MISALIGNMENT (dr, 0);
862 return;
863 }
864
865 if (known_alignment_for_access_p (dr)
866 && known_alignment_for_access_p (dr_peel))
867 {
868 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
869 int misal = DR_MISALIGNMENT (dr);
870 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
871 misal += negative ? -npeel * dr_size : npeel * dr_size;
872 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
873 SET_DR_MISALIGNMENT (dr, misal);
874 return;
875 }
876
877 if (dump_enabled_p ())
878 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
879 SET_DR_MISALIGNMENT (dr, -1);
880 }
881
882
883 /* Function vect_verify_datarefs_alignment
884
885 Return TRUE if all data references in the loop can be
886 handled with respect to alignment. */
887
888 bool
889 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
890 {
891 vec<data_reference_p> datarefs;
892 struct data_reference *dr;
893 enum dr_alignment_support supportable_dr_alignment;
894 unsigned int i;
895
896 if (loop_vinfo)
897 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
898 else
899 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
900
901 FOR_EACH_VEC_ELT (datarefs, i, dr)
902 {
903 gimple stmt = DR_STMT (dr);
904 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
905
906 if (!STMT_VINFO_RELEVANT_P (stmt_info))
907 continue;
908
909 /* For interleaving, only the alignment of the first access matters.
910 Skip statements marked as not vectorizable. */
911 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
912 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
913 || !STMT_VINFO_VECTORIZABLE (stmt_info))
914 continue;
915
916 /* Strided loads perform only component accesses, alignment is
917 irrelevant for them. */
918 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
919 continue;
920
921 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
922 if (!supportable_dr_alignment)
923 {
924 if (dump_enabled_p ())
925 {
926 if (DR_IS_READ (dr))
927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
928 "not vectorized: unsupported unaligned load.");
929 else
930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
931 "not vectorized: unsupported unaligned "
932 "store.");
933
934 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
935 DR_REF (dr));
936 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
937 }
938 return false;
939 }
940 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
941 dump_printf_loc (MSG_NOTE, vect_location,
942 "Vectorizing an unaligned access.\n");
943 }
944 return true;
945 }
946
947 /* Given a memory reference EXP, return whether its alignment is less
948 than its size. */
949
950 static bool
951 not_size_aligned (tree exp)
952 {
953 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
954 return true;
955
956 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
957 > get_object_alignment (exp));
958 }
959
960 /* Function vector_alignment_reachable_p
961
962 Return true if vector alignment for DR is reachable by peeling
963 a few loop iterations. Return false otherwise. */
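/* Example: a V4SI access with a known misalignment of 6 bytes can never be
   brought to vector alignment by peeling whole 4-byte iterations
   (6 % 4 != 0), so the check below rejects it; a misalignment of 8 bytes
   is reachable, and for a unit-stride access two peeled iterations would
   align it.  */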
964
965 static bool
966 vector_alignment_reachable_p (struct data_reference *dr)
967 {
968 gimple stmt = DR_STMT (dr);
969 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
970 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
971
972 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
973 {
974 /* For interleaved accesses we peel only if the number of iterations in
975 the prolog loop (VF - misalignment, counted in elements) is a multiple
976 of the interleaving group size. */
977 int elem_size, mis_in_elements;
978 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
979
980 /* FORNOW: handle only known alignment. */
981 if (!known_alignment_for_access_p (dr))
982 return false;
983
984 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
985 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
986
987 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
988 return false;
989 }
990
991 /* If the misalignment is known at compile time then allow peeling
992 only if natural alignment is reachable through peeling. */
993 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
994 {
995 HOST_WIDE_INT elmsize =
996 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
997 if (dump_enabled_p ())
998 {
999 dump_printf_loc (MSG_NOTE, vect_location,
1000 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1001 dump_printf (MSG_NOTE,
1002 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1003 }
1004 if (DR_MISALIGNMENT (dr) % elmsize)
1005 {
1006 if (dump_enabled_p ())
1007 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1008 "data size does not divide the misalignment.\n");
1009 return false;
1010 }
1011 }
1012
1013 if (!known_alignment_for_access_p (dr))
1014 {
1015 tree type = TREE_TYPE (DR_REF (dr));
1016 bool is_packed = not_size_aligned (DR_REF (dr));
1017 if (dump_enabled_p ())
1018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1019 "Unknown misalignment, is_packed = %d\n",is_packed);
1020 if ((TYPE_USER_ALIGN (type) && !is_packed)
1021 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1022 return true;
1023 else
1024 return false;
1025 }
1026
1027 return true;
1028 }
1029
1030
1031 /* Calculate the cost of the memory access represented by DR. */
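/* Example: with a vectorization factor of 8 and a data-ref whose vector
   type holds 4 elements, NCOPIES below is 2, so the cost of two vector
   loads (or stores) per vector iteration is accumulated.  */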
1032
1033 static void
1034 vect_get_data_access_cost (struct data_reference *dr,
1035 unsigned int *inside_cost,
1036 unsigned int *outside_cost,
1037 stmt_vector_for_cost *body_cost_vec)
1038 {
1039 gimple stmt = DR_STMT (dr);
1040 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1041 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1042 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1043 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1044 int ncopies = vf / nunits;
1045
1046 if (DR_IS_READ (dr))
1047 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1048 NULL, body_cost_vec, false);
1049 else
1050 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1051
1052 if (dump_enabled_p ())
1053 dump_printf_loc (MSG_NOTE, vect_location,
1054 "vect_get_data_access_cost: inside_cost = %d, "
1055 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1056 }
1057
1058
1059 /* Insert DR into peeling hash table with NPEEL as key. */
1060
1061 static void
1062 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1063 int npeel)
1064 {
1065 struct _vect_peel_info elem, *slot;
1066 _vect_peel_info **new_slot;
1067 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1068
1069 elem.npeel = npeel;
1070 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find (&elem);
1071 if (slot)
1072 slot->count++;
1073 else
1074 {
1075 slot = XNEW (struct _vect_peel_info);
1076 slot->npeel = npeel;
1077 slot->dr = dr;
1078 slot->count = 1;
1079 new_slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo).find_slot (slot, INSERT);
1080 *new_slot = slot;
1081 }
1082
1083 if (!supportable_dr_alignment
1084 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1085 slot->count += VECT_MAX_COST;
1086 }
1087
1088
1089 /* Traverse peeling hash table to find peeling option that aligns maximum
1090 number of data accesses. */
1091
1092 int
1093 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1094 _vect_peel_extended_info *max)
1095 {
1096 vect_peel_info elem = *slot;
1097
1098 if (elem->count > max->peel_info.count
1099 || (elem->count == max->peel_info.count
1100 && max->peel_info.npeel > elem->npeel))
1101 {
1102 max->peel_info.npeel = elem->npeel;
1103 max->peel_info.count = elem->count;
1104 max->peel_info.dr = elem->dr;
1105 }
1106
1107 return 1;
1108 }
1109
1110
1111 /* Traverse peeling hash table and calculate cost for each peeling option.
1112 Find the one with the lowest cost. */
1113
1114 int
1115 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1116 _vect_peel_extended_info *min)
1117 {
1118 vect_peel_info elem = *slot;
1119 int save_misalignment, dummy;
1120 unsigned int inside_cost = 0, outside_cost = 0, i;
1121 gimple stmt = DR_STMT (elem->dr);
1122 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1123 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1124 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1125 struct data_reference *dr;
1126 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1127 int single_iter_cost;
1128
1129 prologue_cost_vec.create (2);
1130 body_cost_vec.create (2);
1131 epilogue_cost_vec.create (2);
1132
1133 FOR_EACH_VEC_ELT (datarefs, i, dr)
1134 {
1135 stmt = DR_STMT (dr);
1136 stmt_info = vinfo_for_stmt (stmt);
1137 /* For interleaving, only the alignment of the first access
1138 matters. */
1139 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1140 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1141 continue;
1142
1143 save_misalignment = DR_MISALIGNMENT (dr);
1144 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1145 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1146 &body_cost_vec);
1147 SET_DR_MISALIGNMENT (dr, save_misalignment);
1148 }
1149
1150 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1151 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1152 &dummy, single_iter_cost,
1153 &prologue_cost_vec,
1154 &epilogue_cost_vec);
1155
1156 /* Prologue and epilogue costs are added to the target model later.
1157 These costs depend only on the scalar iteration cost, the
1158 number of peeling iterations finally chosen, and the number of
1159 misaligned statements. So discard the information found here. */
1160 prologue_cost_vec.release ();
1161 epilogue_cost_vec.release ();
1162
1163 if (inside_cost < min->inside_cost
1164 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1165 {
1166 min->inside_cost = inside_cost;
1167 min->outside_cost = outside_cost;
1168 min->body_cost_vec.release ();
1169 min->body_cost_vec = body_cost_vec;
1170 min->peel_info.dr = elem->dr;
1171 min->peel_info.npeel = elem->npeel;
1172 }
1173 else
1174 body_cost_vec.release ();
1175
1176 return 1;
1177 }
1178
1179
1180 /* Choose best peeling option by traversing peeling hash table and either
1181 choosing an option with the lowest cost (if cost model is enabled) or the
1182 option that aligns as many accesses as possible. */
1183
1184 static struct data_reference *
1185 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1186 unsigned int *npeel,
1187 stmt_vector_for_cost *body_cost_vec)
1188 {
1189 struct _vect_peel_extended_info res;
1190
1191 res.peel_info.dr = NULL;
1192 res.body_cost_vec = stmt_vector_for_cost ();
1193
1194 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1195 {
1196 res.inside_cost = INT_MAX;
1197 res.outside_cost = INT_MAX;
1198 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1199 .traverse <_vect_peel_extended_info *,
1200 vect_peeling_hash_get_lowest_cost> (&res);
1201 }
1202 else
1203 {
1204 res.peel_info.count = 0;
1205 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1206 .traverse <_vect_peel_extended_info *,
1207 vect_peeling_hash_get_most_frequent> (&res);
1208 }
1209
1210 *npeel = res.peel_info.npeel;
1211 *body_cost_vec = res.body_cost_vec;
1212 return res.peel_info.dr;
1213 }
1214
1215
1216 /* Function vect_enhance_data_refs_alignment
1217
1218 This pass will use loop versioning and loop peeling in order to enhance
1219 the alignment of data references in the loop.
1220
1221 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1222 original loop is to be vectorized. Any other loops that are created by
1223 the transformations performed in this pass - are not supposed to be
1224 vectorized. This restriction will be relaxed.
1225
1226 This pass will require a cost model to guide it whether to apply peeling
1227 or versioning or a combination of the two. For example, the scheme that
1228 Intel uses when given a loop with several memory accesses is as follows:
1229 choose one memory access ('p') whose alignment you want to force by doing
1230 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1231 other accesses are not necessarily aligned, or (2) use loop versioning to
1232 generate one loop in which all accesses are aligned, and another loop in
1233 which only 'p' is necessarily aligned.
1234
1235 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1236 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1237 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1238
1239 Devising a cost model is the most critical aspect of this work. It will
1240 guide us on which access to peel for, whether to use loop versioning, how
1241 many versions to create, etc. The cost model will probably consist of
1242 generic considerations as well as target specific considerations (on
1243 powerpc for example, misaligned stores are more painful than misaligned
1244 loads).
1245
1246 Here are the general steps involved in alignment enhancements:
1247
1248 -- original loop, before alignment analysis:
1249 for (i=0; i<N; i++){
1250 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1251 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1252 }
1253
1254 -- After vect_compute_data_refs_alignment:
1255 for (i=0; i<N; i++){
1256 x = q[i]; # DR_MISALIGNMENT(q) = 3
1257 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1258 }
1259
1260 -- Possibility 1: we do loop versioning:
1261 if (p is aligned) {
1262 for (i=0; i<N; i++){ # loop 1A
1263 x = q[i]; # DR_MISALIGNMENT(q) = 3
1264 p[i] = y; # DR_MISALIGNMENT(p) = 0
1265 }
1266 }
1267 else {
1268 for (i=0; i<N; i++){ # loop 1B
1269 x = q[i]; # DR_MISALIGNMENT(q) = 3
1270 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1271 }
1272 }
1273
1274 -- Possibility 2: we do loop peeling:
1275 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1276 x = q[i];
1277 p[i] = y;
1278 }
1279 for (i = 3; i < N; i++){ # loop 2A
1280 x = q[i]; # DR_MISALIGNMENT(q) = 0
1281 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1282 }
1283
1284 -- Possibility 3: combination of loop peeling and versioning:
1285 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1286 x = q[i];
1287 p[i] = y;
1288 }
1289 if (p is aligned) {
1290 for (i = 3; i<N; i++){ # loop 3A
1291 x = q[i]; # DR_MISALIGNMENT(q) = 0
1292 p[i] = y; # DR_MISALIGNMENT(p) = 0
1293 }
1294 }
1295 else {
1296 for (i = 3; i<N; i++){ # loop 3B
1297 x = q[i]; # DR_MISALIGNMENT(q) = 0
1298 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1299 }
1300 }
1301
1302 These loops are later passed to loop_transform to be vectorized. The
1303 vectorizer will use the alignment information to guide the transformation
1304 (whether to generate regular loads/stores, or with special handling for
1305 misalignment). */
1306
1307 bool
1308 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1309 {
1310 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1311 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1312 enum dr_alignment_support supportable_dr_alignment;
1313 struct data_reference *dr0 = NULL, *first_store = NULL;
1314 struct data_reference *dr;
1315 unsigned int i, j;
1316 bool do_peeling = false;
1317 bool do_versioning = false;
1318 bool stat;
1319 gimple stmt;
1320 stmt_vec_info stmt_info;
1321 unsigned int npeel = 0;
1322 bool all_misalignments_unknown = true;
1323 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1324 unsigned possible_npeel_number = 1;
1325 tree vectype;
1326 unsigned int nelements, mis, same_align_drs_max = 0;
1327 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1328
1329 if (dump_enabled_p ())
1330 dump_printf_loc (MSG_NOTE, vect_location,
1331 "=== vect_enhance_data_refs_alignment ===\n");
1332
1333 /* While cost model enhancements are expected in the future, the high level
1334 view of the code at this time is as follows:
1335
1336 A) If there is a misaligned access then see if peeling to align
1337 this access can make all data references satisfy
1338 vect_supportable_dr_alignment. If so, update data structures
1339 as needed and return true.
1340
1341 B) If peeling wasn't possible and there is a data reference with an
1342 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1343 then see if loop versioning checks can be used to make all data
1344 references satisfy vect_supportable_dr_alignment. If so, update
1345 data structures as needed and return true.
1346
1347 C) If neither peeling nor versioning were successful then return false if
1348 any data reference does not satisfy vect_supportable_dr_alignment.
1349
1350 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1351
1352 Note, Possibility 3 above (which is peeling and versioning together) is not
1353 being done at this time. */
1354
1355 /* (1) Peeling to force alignment. */
1356
1357 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1358 Considerations:
1359 + How many accesses will become aligned due to the peeling
1360 - How many accesses will become unaligned due to the peeling,
1361 and the cost of misaligned accesses.
1362 - The cost of peeling (the extra runtime checks, the increase
1363 in code size). */
1364
1365 FOR_EACH_VEC_ELT (datarefs, i, dr)
1366 {
1367 stmt = DR_STMT (dr);
1368 stmt_info = vinfo_for_stmt (stmt);
1369
1370 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1371 continue;
1372
1373 /* For interleaving, only the alignment of the first access
1374 matters. */
1375 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1376 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1377 continue;
1378
1379 /* For invariant accesses there is nothing to enhance. */
1380 if (integer_zerop (DR_STEP (dr)))
1381 continue;
1382
1383 /* Strided loads perform only component accesses, alignment is
1384 irrelevant for them. */
1385 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1386 continue;
1387
1388 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1389 do_peeling = vector_alignment_reachable_p (dr);
1390 if (do_peeling)
1391 {
1392 if (known_alignment_for_access_p (dr))
1393 {
1394 unsigned int npeel_tmp;
1395 bool negative = tree_int_cst_compare (DR_STEP (dr),
1396 size_zero_node) < 0;
1397
1398 /* Save info about DR in the hash table. */
1399 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo).is_created ())
1400 LOOP_VINFO_PEELING_HTAB (loop_vinfo).create (1);
1401
1402 vectype = STMT_VINFO_VECTYPE (stmt_info);
1403 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1404 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1405 TREE_TYPE (DR_REF (dr))));
1406 npeel_tmp = (negative
1407 ? (mis - nelements) : (nelements - mis))
1408 & (nelements - 1);
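   /* Illustrative arithmetic: for a V4SI access misaligned by two
      elements and a forward (non-negative) step, npeel_tmp above is
      (4 - 2) & 3 == 2; with a backward step the (mis - nelements)
      form wraps to the same value 2 in this example.  */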
1409
1410 /* For multiple types, it is possible that the bigger type access
1411 will have more than one peeling option. E.g., a loop with two
1412 types: one of size (vector size / 4), and the other one of
1413 size (vector size / 8). The vectorization factor will be 8. If both
1414 accesses are misaligned by 3, the first one needs one scalar
1415 iteration to be aligned, and the second one needs 5. But the
1416 first one will also be aligned by peeling 5 scalar
1417 iterations, and in that case both accesses will be aligned.
1418 Hence, besides the immediate peeling amount, we also want
1419 to try adding a full vector size, as long as we don't exceed
1420 the vectorization factor.
1421 We do this automatically for the cost model, since we calculate
1422 the cost for every peeling option. */
1423 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1424 possible_npeel_number = vf / nelements;
1425
1426 /* Handle the aligned case. We may decide to align some other
1427 access, making DR unaligned. */
1428 if (DR_MISALIGNMENT (dr) == 0)
1429 {
1430 npeel_tmp = 0;
1431 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1432 possible_npeel_number++;
1433 }
1434
1435 for (j = 0; j < possible_npeel_number; j++)
1436 {
1437 gcc_assert (npeel_tmp <= vf);
1438 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1439 npeel_tmp += nelements;
1440 }
1441
1442 all_misalignments_unknown = false;
1443 /* Data-ref that was chosen for the case that all the
1444 misalignments are unknown is not relevant anymore, since we
1445 have a data-ref with known alignment. */
1446 dr0 = NULL;
1447 }
1448 else
1449 {
1450 /* If we don't know any misalignment values, we prefer
1451 peeling for the data-ref that has the maximum number of data-refs
1452 with the same alignment, unless the target prefers to align
1453 stores over loads. */
1454 if (all_misalignments_unknown)
1455 {
1456 unsigned same_align_drs
1457 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1458 if (!dr0
1459 || same_align_drs_max < same_align_drs)
1460 {
1461 same_align_drs_max = same_align_drs;
1462 dr0 = dr;
1463 }
1464 /* For data-refs with the same number of related
1465 accesses prefer the one where the misalign
1466 computation will be invariant in the outermost loop. */
1467 else if (same_align_drs_max == same_align_drs)
1468 {
1469 struct loop *ivloop0, *ivloop;
1470 ivloop0 = outermost_invariant_loop_for_expr
1471 (loop, DR_BASE_ADDRESS (dr0));
1472 ivloop = outermost_invariant_loop_for_expr
1473 (loop, DR_BASE_ADDRESS (dr));
1474 if ((ivloop && !ivloop0)
1475 || (ivloop && ivloop0
1476 && flow_loop_nested_p (ivloop, ivloop0)))
1477 dr0 = dr;
1478 }
1479
1480 if (!first_store && DR_IS_WRITE (dr))
1481 first_store = dr;
1482 }
1483
1484 /* If there are both known and unknown misaligned accesses in the
1485 loop, we choose peeling amount according to the known
1486 accesses. */
1487 if (!supportable_dr_alignment)
1488 {
1489 dr0 = dr;
1490 if (!first_store && DR_IS_WRITE (dr))
1491 first_store = dr;
1492 }
1493 }
1494 }
1495 else
1496 {
1497 if (!aligned_access_p (dr))
1498 {
1499 if (dump_enabled_p ())
1500 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1501 "vector alignment may not be reachable\n");
1502 break;
1503 }
1504 }
1505 }
1506
1507 /* Check if we can possibly peel the loop. */
1508 if (!vect_can_advance_ivs_p (loop_vinfo)
1509 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1510 do_peeling = false;
1511
1512 if (do_peeling && all_misalignments_unknown
1513 && vect_supportable_dr_alignment (dr0, false))
1514 {
1515
1516 /* Check if the target prefers aligning stores over loads, i.e., if
1517 misaligned stores are more expensive than misaligned loads (taking
1518 drs with same alignment into account). */
1519 if (first_store && DR_IS_READ (dr0))
1520 {
1521 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1522 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1523 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1524 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1525 stmt_vector_for_cost dummy;
1526 dummy.create (2);
1527
1528 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1529 &dummy);
1530 vect_get_data_access_cost (first_store, &store_inside_cost,
1531 &store_outside_cost, &dummy);
1532
1533 dummy.release ();
1534
1535 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1536 aligning the load DR0). */
1537 load_inside_penalty = store_inside_cost;
1538 load_outside_penalty = store_outside_cost;
1539 for (i = 0;
1540 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1541 DR_STMT (first_store))).iterate (i, &dr);
1542 i++)
1543 if (DR_IS_READ (dr))
1544 {
1545 load_inside_penalty += load_inside_cost;
1546 load_outside_penalty += load_outside_cost;
1547 }
1548 else
1549 {
1550 load_inside_penalty += store_inside_cost;
1551 load_outside_penalty += store_outside_cost;
1552 }
1553
1554 /* Calculate the penalty for leaving DR0 unaligned (by
1555 aligning the FIRST_STORE). */
1556 store_inside_penalty = load_inside_cost;
1557 store_outside_penalty = load_outside_cost;
1558 for (i = 0;
1559 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1560 DR_STMT (dr0))).iterate (i, &dr);
1561 i++)
1562 if (DR_IS_READ (dr))
1563 {
1564 store_inside_penalty += load_inside_cost;
1565 store_outside_penalty += load_outside_cost;
1566 }
1567 else
1568 {
1569 store_inside_penalty += store_inside_cost;
1570 store_outside_penalty += store_outside_cost;
1571 }
1572
1573 if (load_inside_penalty > store_inside_penalty
1574 || (load_inside_penalty == store_inside_penalty
1575 && load_outside_penalty > store_outside_penalty))
1576 dr0 = first_store;
1577 }
1578
1579 /* In case there are only loads with different unknown misalignments, use
1580 peeling only if it may help to align other accesses in the loop. */
1581 if (!first_store
1582 && !STMT_VINFO_SAME_ALIGN_REFS (
1583 vinfo_for_stmt (DR_STMT (dr0))).length ()
1584 && vect_supportable_dr_alignment (dr0, false)
1585 != dr_unaligned_supported)
1586 do_peeling = false;
1587 }
1588
1589 if (do_peeling && !dr0)
1590 {
1591 /* Peeling is possible, but there is no data access that is not supported
1592 unless aligned. So we try to choose the best possible peeling. */
1593
1594 /* We should get here only if there are drs with known misalignment. */
1595 gcc_assert (!all_misalignments_unknown);
1596
1597 /* Choose the best peeling from the hash table. */
1598 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1599 &body_cost_vec);
1600 if (!dr0 || !npeel)
1601 do_peeling = false;
1602 }
1603
1604 if (do_peeling)
1605 {
1606 stmt = DR_STMT (dr0);
1607 stmt_info = vinfo_for_stmt (stmt);
1608 vectype = STMT_VINFO_VECTYPE (stmt_info);
1609 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1610
1611 if (known_alignment_for_access_p (dr0))
1612 {
1613 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1614 size_zero_node) < 0;
1615 if (!npeel)
1616 {
1617 /* Since it's known at compile time, compute the number of
1618 iterations in the peeled loop (the peeling factor) for use in
1619 updating DR_MISALIGNMENT values. The peeling factor is the
1620 vectorization factor minus the misalignment as an element
1621 count. */
1622 mis = DR_MISALIGNMENT (dr0);
1623 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1624 npeel = ((negative ? mis - nelements : nelements - mis)
1625 & (nelements - 1));
1626 }
1627
1628 /* For interleaved data access every iteration accesses all the
1629 members of the group, therefore we divide the number of iterations
1630 by the group size. */
1631 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1632 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1633 npeel /= GROUP_SIZE (stmt_info);
1634
1635 if (dump_enabled_p ())
1636 dump_printf_loc (MSG_NOTE, vect_location,
1637 "Try peeling by %d\n", npeel);
1638 }
1639
1640 /* Ensure that all data refs can be vectorized after the peel. */
1641 FOR_EACH_VEC_ELT (datarefs, i, dr)
1642 {
1643 int save_misalignment;
1644
1645 if (dr == dr0)
1646 continue;
1647
1648 stmt = DR_STMT (dr);
1649 stmt_info = vinfo_for_stmt (stmt);
1650 /* For interleaving, only the alignment of the first access
1651 matters. */
1652 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1653 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1654 continue;
1655
1656 /* Strided loads perform only component accesses, alignment is
1657 irrelevant for them. */
1658 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1659 continue;
1660
1661 save_misalignment = DR_MISALIGNMENT (dr);
1662 vect_update_misalignment_for_peel (dr, dr0, npeel);
1663 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1664 SET_DR_MISALIGNMENT (dr, save_misalignment);
1665
1666 if (!supportable_dr_alignment)
1667 {
1668 do_peeling = false;
1669 break;
1670 }
1671 }
1672
1673 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1674 {
1675 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1676 if (!stat)
1677 do_peeling = false;
1678 else
1679 {
1680 body_cost_vec.release ();
1681 return stat;
1682 }
1683 }
1684
1685 if (do_peeling)
1686 {
1687 unsigned max_allowed_peel
1688 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1689 if (max_allowed_peel != (unsigned)-1)
1690 {
1691 unsigned max_peel = npeel;
1692 if (max_peel == 0)
1693 {
1694 gimple dr_stmt = DR_STMT (dr0);
1695 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1696 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1697 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1698 }
1699 if (max_peel > max_allowed_peel)
1700 {
1701 do_peeling = false;
1702 if (dump_enabled_p ())
1703 dump_printf_loc (MSG_NOTE, vect_location,
1704 "Disable peeling, max peels reached: %d\n", max_peel);
1705 }
1706 }
1707 }
1708
1709 if (do_peeling)
1710 {
1711 stmt_info_for_cost *si;
1712 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1713
1714 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1715 If the misalignment of DR_i is identical to that of dr0 then set
1716 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1717 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1718 by the peeling factor times the element size of DR_i (MOD the
1719 vectorization factor times the size). Otherwise, the
1720 misalignment of DR_i must be set to unknown. */
1721 FOR_EACH_VEC_ELT (datarefs, i, dr)
1722 if (dr != dr0)
1723 vect_update_misalignment_for_peel (dr, dr0, npeel);
1724
1725 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1726 if (npeel)
1727 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1728 else
1729 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1730 = DR_MISALIGNMENT (dr0);
1731 SET_DR_MISALIGNMENT (dr0, 0);
1732 if (dump_enabled_p ())
1733 {
1734 dump_printf_loc (MSG_NOTE, vect_location,
1735 "Alignment of access forced using peeling.\n");
1736 dump_printf_loc (MSG_NOTE, vect_location,
1737 "Peeling for alignment will be applied.\n");
1738 }
1739 /* We've delayed passing the inside-loop peeling costs to the
1740 target cost model until we were sure peeling would happen.
1741 Do so now. */
1742 if (body_cost_vec.exists ())
1743 {
1744 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1745 {
1746 struct _stmt_vec_info *stmt_info
1747 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1748 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1749 si->misalign, vect_body);
1750 }
1751 body_cost_vec.release ();
1752 }
1753
1754 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1755 gcc_assert (stat);
1756 return stat;
1757 }
1758 }
1759
1760 body_cost_vec.release ();
1761
1762 /* (2) Versioning to force alignment. */
1763
1764 /* Try versioning if:
1765 1) optimize loop for speed
1766 2) there is at least one unsupported misaligned data ref with an unknown
1767 misalignment, and
1768 3) all misaligned data refs with a known misalignment are supported, and
1769 4) the number of runtime alignment checks is within reason. */
1770
1771 do_versioning =
1772 optimize_loop_nest_for_speed_p (loop)
1773 && (!loop->inner); /* FORNOW */
1774
1775 if (do_versioning)
1776 {
1777 FOR_EACH_VEC_ELT (datarefs, i, dr)
1778 {
1779 stmt = DR_STMT (dr);
1780 stmt_info = vinfo_for_stmt (stmt);
1781
1782 /* For interleaving, only the alignment of the first access
1783 matters. */
1784 if (aligned_access_p (dr)
1785 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1786 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1787 continue;
1788
1789 /* Strided loads perform only component accesses, alignment is
1790 irrelevant for them. */
1791 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1792 continue;
1793
1794 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1795
1796 if (!supportable_dr_alignment)
1797 {
1798 gimple stmt;
1799 int mask;
1800 tree vectype;
1801
1802 if (known_alignment_for_access_p (dr)
1803 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1804 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1805 {
1806 do_versioning = false;
1807 break;
1808 }
1809
1810 stmt = DR_STMT (dr);
1811 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1812 gcc_assert (vectype);
1813
1814 /* The rightmost bits of an aligned address must be zeros.
1815 Construct the mask needed for this test. For example,
1816 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1817 mask must be 15 = 0xf. */
1818 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1819
1820 /* FORNOW: use the same mask to test all potentially unaligned
1821 references in the loop. The vectorizer currently supports
1822 a single vector size, see the reference to
1823 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1824 vectorization factor is computed. */
1825 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1826 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1827 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1828 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1829 DR_STMT (dr));
1830 }
1831 }
1832
1833 /* Versioning requires at least one misaligned data reference. */
1834 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1835 do_versioning = false;
1836 else if (!do_versioning)
1837 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1838 }
1839
1840 if (do_versioning)
1841 {
1842 vec<gimple> may_misalign_stmts
1843 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1844 gimple stmt;
1845
1846 /* It can now be assumed that the data references in the statements
1847 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1848 of the loop being vectorized. */
1849 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1850 {
1851 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1852 dr = STMT_VINFO_DATA_REF (stmt_info);
1853 SET_DR_MISALIGNMENT (dr, 0);
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_NOTE, vect_location,
1856 "Alignment of access forced using versioning.\n");
1857 }
1858
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_NOTE, vect_location,
1861 "Versioning for alignment will be applied.\n");
1862
1863 /* Peeling and versioning can't be done together at this time. */
1864 gcc_assert (! (do_peeling && do_versioning));
1865
1866 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1867 gcc_assert (stat);
1868 return stat;
1869 }
1870
1871 /* This point is reached if neither peeling nor versioning is being done. */
1872 gcc_assert (! (do_peeling || do_versioning));
1873
1874 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1875 return stat;
1876 }
1877
1878
1879 /* Function vect_find_same_alignment_drs.
1880
1881 Update group and alignment relations according to the chosen
1882 vectorization factor. */
1883
1884 static void
1885 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1886 loop_vec_info loop_vinfo)
1887 {
1888 unsigned int i;
1889 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1890 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1891 struct data_reference *dra = DDR_A (ddr);
1892 struct data_reference *drb = DDR_B (ddr);
1893 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1894 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1895 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1896 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1897 lambda_vector dist_v;
1898 unsigned int loop_depth;
1899
1900 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1901 return;
1902
1903 if (dra == drb)
1904 return;
1905
1906 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1907 return;
1908
1909 /* Loop-based vectorization and known data dependence. */
1910 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1911 return;
1912
1913 /* Data-dependence analysis reports a distance vector of zero
1914 for data-references that overlap only in the first iteration
1915 but have steps of different sign (see PR45764).
1916 So as a sanity check require equal DR_STEP. */
1917 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1918 return;
1919
1920 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1921 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1922 {
1923 int dist = dist_v[loop_depth];
1924
1925 if (dump_enabled_p ())
1926 dump_printf_loc (MSG_NOTE, vect_location,
1927 "dependence distance = %d.\n", dist);
1928
1929 /* Same loop iteration. */
1930 if (dist == 0
1931 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1932 {
1933 /* Two references with distance zero have the same alignment. */
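/* E.g., with a vectorization factor of 4, a dependence distance of 8
   between two 4-byte accesses also implies equal alignment, since the
   addresses then differ by a multiple of the vector size.  */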
1934 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1935 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1936 if (dump_enabled_p ())
1937 {
1938 dump_printf_loc (MSG_NOTE, vect_location,
1939 "accesses have the same alignment.\n");
1940 dump_printf (MSG_NOTE,
1941 "dependence distance modulo vf == 0 between ");
1942 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1943 dump_printf (MSG_NOTE, " and ");
1944 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1945 dump_printf (MSG_NOTE, "\n");
1946 }
1947 }
1948 }
1949 }
1950
1951
1952 /* Function vect_analyze_data_refs_alignment
1953
1954 Analyze the alignment of the data-references in the loop.
1955 Return FALSE if a data reference is found that cannot be vectorized. */
1956
1957 bool
1958 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1959 bb_vec_info bb_vinfo)
1960 {
1961 if (dump_enabled_p ())
1962 dump_printf_loc (MSG_NOTE, vect_location,
1963 "=== vect_analyze_data_refs_alignment ===\n");
1964
1965 /* Mark groups of data references with same alignment using
1966 data dependence information. */
1967 if (loop_vinfo)
1968 {
1969 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1970 struct data_dependence_relation *ddr;
1971 unsigned int i;
1972
1973 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1974 vect_find_same_alignment_drs (ddr, loop_vinfo);
1975 }
1976
1977 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1978 {
1979 if (dump_enabled_p ())
1980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1981 "not vectorized: can't calculate alignment "
1982 "for data ref.\n");
1983 return false;
1984 }
1985
1986 return true;
1987 }
1988
1989
1990 /* Analyze groups of accesses: check that DR belongs to a group of
1991 accesses of legal size, step, etc. Detect gaps, single element
1992 interleaving, and other special cases. Set grouped access info.
1993 Collect groups of strided stores for further use in SLP analysis. */
1994
1995 static bool
1996 vect_analyze_group_access (struct data_reference *dr)
1997 {
1998 tree step = DR_STEP (dr);
1999 tree scalar_type = TREE_TYPE (DR_REF (dr));
2000 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2001 gimple stmt = DR_STMT (dr);
2002 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2003 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2004 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2005 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2006 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2007 bool slp_impossible = false;
2008 struct loop *loop = NULL;
2009
2010 if (loop_vinfo)
2011 loop = LOOP_VINFO_LOOP (loop_vinfo);
2012
2013 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2014 size of the interleaving group (including gaps). */
2015 groupsize = absu_hwi (dr_step) / type_size;
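/* E.g., a step of 32 bytes over 4-byte elements yields GROUPSIZE = 8,
   counting any gaps within the group.  */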
2016
2017 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2018 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2019 {
2020 /* Check if this DR is a part of interleaving, and is a single
2021 element of the group that is accessed in the loop. */
2022
2023 /* Gaps are supported only for loads. STEP must be a multiple of the type
2024 size. The size of the group must be a power of 2. */
2025 if (DR_IS_READ (dr)
2026 && (dr_step % type_size) == 0
2027 && groupsize > 0
2028 && exact_log2 (groupsize) != -1)
2029 {
2030 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2031 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2032 if (dump_enabled_p ())
2033 {
2034 dump_printf_loc (MSG_NOTE, vect_location,
2035 "Detected single element interleaving ");
2036 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2037 dump_printf (MSG_NOTE, " step ");
2038 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2039 dump_printf (MSG_NOTE, "\n");
2040 }
2041
2042 if (loop_vinfo)
2043 {
2044 if (dump_enabled_p ())
2045 dump_printf_loc (MSG_NOTE, vect_location,
2046 "Data access with gaps requires scalar "
2047 "epilogue loop\n");
2048 if (loop->inner)
2049 {
2050 if (dump_enabled_p ())
2051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2052 "Peeling for outer loop is not"
2053 " supported\n");
2054 return false;
2055 }
2056
2057 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2058 }
2059
2060 return true;
2061 }
2062
2063 if (dump_enabled_p ())
2064 {
2065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2066 "not consecutive access ");
2067 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2068 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2069 }
2070
2071 if (bb_vinfo)
2072 {
2073 /* Mark the statement as unvectorizable. */
2074 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2075 return true;
2076 }
2077
2078 return false;
2079 }
2080
2081 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2082 {
2083 /* First stmt in the interleaving chain. Check the chain. */
2084 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2085 struct data_reference *data_ref = dr;
2086 unsigned int count = 1;
2087 tree prev_init = DR_INIT (data_ref);
2088 gimple prev = stmt;
2089 HOST_WIDE_INT diff, gaps = 0;
2090 unsigned HOST_WIDE_INT count_in_bytes;
2091
2092 while (next)
2093 {
2094 /* Skip same data-refs. If two or more stmts share a
2095 data-ref (supported only for loads), we vectorize only the first
2096 stmt, and the rest get their vectorized loads from the first
2097 one. */
2098 if (!tree_int_cst_compare (DR_INIT (data_ref),
2099 DR_INIT (STMT_VINFO_DATA_REF (
2100 vinfo_for_stmt (next)))))
2101 {
2102 if (DR_IS_WRITE (data_ref))
2103 {
2104 if (dump_enabled_p ())
2105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2106 "Two store stmts share the same dr.\n");
2107 return false;
2108 }
2109
2110 /* For load use the same data-ref load. */
2111 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2112
2113 prev = next;
2114 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2115 continue;
2116 }
2117
2118 prev = next;
2119 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2120
2121 /* All group members have the same STEP by construction. */
2122 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2123
2124 /* Check that the distance between two accesses is equal to the type
2125 size. Otherwise, we have gaps. */
2126 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2127 - TREE_INT_CST_LOW (prev_init)) / type_size;
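/* E.g., DR_INITs of 0 and 8 with a 4-byte type give DIFF = 2, i.e. a
   gap of one element between the two group members.  */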
2128 if (diff != 1)
2129 {
2130 /* FORNOW: SLP of accesses with gaps is not supported. */
2131 slp_impossible = true;
2132 if (DR_IS_WRITE (data_ref))
2133 {
2134 if (dump_enabled_p ())
2135 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2136 "interleaved store with gaps\n");
2137 return false;
2138 }
2139
2140 gaps += diff - 1;
2141 }
2142
2143 last_accessed_element += diff;
2144
2145 /* Store the gap from the previous member of the group. If there is no
2146 gap in the access, GROUP_GAP is always 1. */
2147 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2148
2149 prev_init = DR_INIT (data_ref);
2150 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2151 /* Count the number of data-refs in the chain. */
2152 count++;
2153 }
2154
2155 /* COUNT is the number of accesses found; we multiply it by the size of
2156 the type to get COUNT_IN_BYTES. */
2157 count_in_bytes = type_size * count;
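/* E.g., three 4-byte members give COUNT_IN_BYTES = 12; with one
   one-element gap the group spans 16 bytes, so a step of at least
   16 bytes is required by the check below.  */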
2158
2159 /* Check that the size of the interleaving (including gaps) is not
2160 greater than STEP. */
2161 if (dr_step != 0
2162 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2163 {
2164 if (dump_enabled_p ())
2165 {
2166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2167 "interleaving size is greater than step for ");
2168 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2169 DR_REF (dr));
2170 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2171 }
2172 return false;
2173 }
2174
2175 /* Check that the size of the interleaving is equal to STEP for stores,
2176 i.e., that there are no gaps. */
2177 if (dr_step != 0
2178 && absu_hwi (dr_step) != count_in_bytes)
2179 {
2180 if (DR_IS_READ (dr))
2181 {
2182 slp_impossible = true;
2183 /* There is a gap after the last load in the group. This gap is a
2184 difference between the groupsize and the number of elements.
2185 When there is no gap, this difference should be 0. */
2186 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2187 }
2188 else
2189 {
2190 if (dump_enabled_p ())
2191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2192 "interleaved store with gaps\n");
2193 return false;
2194 }
2195 }
2196
2197 /* Check that STEP is a multiple of type size. */
2198 if (dr_step != 0
2199 && (dr_step % type_size) != 0)
2200 {
2201 if (dump_enabled_p ())
2202 {
2203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2204 "step is not a multiple of type size: step ");
2205 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2206 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2207 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2208 TYPE_SIZE_UNIT (scalar_type));
2209 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2210 }
2211 return false;
2212 }
2213
2214 if (groupsize == 0)
2215 groupsize = count;
2216
2217 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2218 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_NOTE, vect_location,
2220 "Detected interleaving of size %d\n", (int)groupsize);
2221
2222 /* SLP: create an SLP data structure for every interleaving group of
2223 stores for further analysis in vect_analyse_slp. */
2224 if (DR_IS_WRITE (dr) && !slp_impossible)
2225 {
2226 if (loop_vinfo)
2227 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2228 if (bb_vinfo)
2229 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2230 }
2231
2232 /* There is a gap in the end of the group. */
2233 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2234 {
2235 if (dump_enabled_p ())
2236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2237 "Data access with gaps requires scalar "
2238 "epilogue loop\n");
2239 if (loop->inner)
2240 {
2241 if (dump_enabled_p ())
2242 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2243 "Peeling for outer loop is not supported\n");
2244 return false;
2245 }
2246
2247 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2248 }
2249 }
2250
2251 return true;
2252 }
2253
2254
2255 /* Analyze the access pattern of the data-reference DR.
2256 In case of non-consecutive accesses call vect_analyze_group_access() to
2257 analyze groups of accesses. */
2258
2259 static bool
2260 vect_analyze_data_ref_access (struct data_reference *dr)
2261 {
2262 tree step = DR_STEP (dr);
2263 tree scalar_type = TREE_TYPE (DR_REF (dr));
2264 gimple stmt = DR_STMT (dr);
2265 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2266 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2267 struct loop *loop = NULL;
2268
2269 if (loop_vinfo)
2270 loop = LOOP_VINFO_LOOP (loop_vinfo);
2271
2272 if (loop_vinfo && !step)
2273 {
2274 if (dump_enabled_p ())
2275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2276 "bad data-ref access in loop\n");
2277 return false;
2278 }
2279
2280 /* Allow invariant loads in loops that are not nested. */
2281 if (loop_vinfo && integer_zerop (step))
2282 {
2283 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2284 if (nested_in_vect_loop_p (loop, stmt))
2285 {
2286 if (dump_enabled_p ())
2287 dump_printf_loc (MSG_NOTE, vect_location,
2288 "zero step in inner loop of nest\n");
2289 return false;
2290 }
2291 return DR_IS_READ (dr);
2292 }
2293
2294 if (loop && nested_in_vect_loop_p (loop, stmt))
2295 {
2296 /* Interleaved accesses are not yet supported within outer-loop
2297 vectorization for references in the inner-loop. */
2298 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2299
2300 /* For the rest of the analysis we use the outer-loop step. */
2301 step = STMT_VINFO_DR_STEP (stmt_info);
2302 if (integer_zerop (step))
2303 {
2304 if (dump_enabled_p ())
2305 dump_printf_loc (MSG_NOTE, vect_location,
2306 "zero step in outer loop.\n");
2307 if (DR_IS_READ (dr))
2308 return true;
2309 else
2310 return false;
2311 }
2312 }
2313
2314 /* Consecutive? */
2315 if (TREE_CODE (step) == INTEGER_CST)
2316 {
2317 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
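/* A step equal to the scalar type size (or its negation for a negative
   step) means the access is consecutive, e.g. 4-byte ints accessed
   with step 4 or -4.  */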
2318 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2319 || (dr_step < 0
2320 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2321 {
2322 /* Mark that it is not interleaving. */
2323 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2324 return true;
2325 }
2326 }
2327
2328 if (loop && nested_in_vect_loop_p (loop, stmt))
2329 {
2330 if (dump_enabled_p ())
2331 dump_printf_loc (MSG_NOTE, vect_location,
2332 "grouped access in outer loop.\n");
2333 return false;
2334 }
2335
2336 /* Assume this is a DR handled by non-constant strided load case. */
2337 if (TREE_CODE (step) != INTEGER_CST)
2338 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2339
2340 /* Not a consecutive access - check if it's part of an interleaving group. */
2341 return vect_analyze_group_access (dr);
2342 }
2343
2344
2345
2346 /* A helper function used in the comparator function to sort data
2347 references. T1 and T2 are the two trees to be compared.
2348 The function returns -1, 0, or 1. */
2349
2350 static int
2351 compare_tree (tree t1, tree t2)
2352 {
2353 int i, cmp;
2354 enum tree_code code;
2355 char tclass;
2356
2357 if (t1 == t2)
2358 return 0;
2359 if (t1 == NULL)
2360 return -1;
2361 if (t2 == NULL)
2362 return 1;
2363
2364
2365 if (TREE_CODE (t1) != TREE_CODE (t2))
2366 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2367
2368 code = TREE_CODE (t1);
2369 switch (code)
2370 {
2371 /* For const values, we can just use hash values for comparisons. */
2372 case INTEGER_CST:
2373 case REAL_CST:
2374 case FIXED_CST:
2375 case STRING_CST:
2376 case COMPLEX_CST:
2377 case VECTOR_CST:
2378 {
2379 hashval_t h1 = iterative_hash_expr (t1, 0);
2380 hashval_t h2 = iterative_hash_expr (t2, 0);
2381 if (h1 != h2)
2382 return h1 < h2 ? -1 : 1;
2383 break;
2384 }
2385
2386 case SSA_NAME:
2387 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2388 if (cmp != 0)
2389 return cmp;
2390
2391 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2392 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2393 break;
2394
2395 default:
2396 tclass = TREE_CODE_CLASS (code);
2397
2398 /* For var-decls, we can compare their UIDs. */
2399 if (tclass == tcc_declaration)
2400 {
2401 if (DECL_UID (t1) != DECL_UID (t2))
2402 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2403 break;
2404 }
2405
2406 /* For expressions with operands, compare their operands recursively. */
2407 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2408 {
2409 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2410 if (cmp != 0)
2411 return cmp;
2412 }
2413 }
2414
2415 return 0;
2416 }
2417
2418
2419 /* Compare two data-references DRA and DRB to sort them into chunks
2420 suitable for grouping. */
2421
2422 static int
2423 dr_group_sort_cmp (const void *dra_, const void *drb_)
2424 {
2425 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2426 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2427 int cmp;
2428
2429 /* Stabilize sort. */
2430 if (dra == drb)
2431 return 0;
2432
2433 /* Ordering of DRs according to base. */
2434 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2435 {
2436 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2437 if (cmp != 0)
2438 return cmp;
2439 }
2440
2441 /* And according to DR_OFFSET. */
2442 if (!dr_equal_offsets_p (dra, drb))
2443 {
2444 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2445 if (cmp != 0)
2446 return cmp;
2447 }
2448
2449 /* Put reads before writes. */
2450 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2451 return DR_IS_READ (dra) ? -1 : 1;
2452
2453 /* Then sort after access size. */
2454 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2455 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2456 {
2457 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2458 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2459 if (cmp != 0)
2460 return cmp;
2461 }
2462
2463 /* And after step. */
2464 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2465 {
2466 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2467 if (cmp != 0)
2468 return cmp;
2469 }
2470
2471 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2472 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2473 if (cmp == 0)
2474 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2475 return cmp;
2476 }
2477
2478 /* Function vect_analyze_data_ref_accesses.
2479
2480 Analyze the access pattern of all the data references in the loop.
2481
2482 FORNOW: the only access pattern that is considered vectorizable is a
2483 simple step 1 (consecutive) access.
2484
2485 FORNOW: handle only arrays and pointer accesses. */
2486
2487 bool
2488 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2489 {
2490 unsigned int i;
2491 vec<data_reference_p> datarefs;
2492 struct data_reference *dr;
2493
2494 if (dump_enabled_p ())
2495 dump_printf_loc (MSG_NOTE, vect_location,
2496 "=== vect_analyze_data_ref_accesses ===\n");
2497
2498 if (loop_vinfo)
2499 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2500 else
2501 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2502
2503 if (datarefs.is_empty ())
2504 return true;
2505
2506 /* Sort the array of datarefs to make building the interleaving chains
2507 linear. Don't modify the original vector's order, it is needed for
2508 determining what dependencies are reversed. */
2509 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2510 qsort (datarefs_copy.address (), datarefs_copy.length (),
2511 sizeof (data_reference_p), dr_group_sort_cmp);
2512
2513 /* Build the interleaving chains. */
2514 for (i = 0; i < datarefs_copy.length () - 1;)
2515 {
2516 data_reference_p dra = datarefs_copy[i];
2517 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2518 stmt_vec_info lastinfo = NULL;
2519 for (i = i + 1; i < datarefs_copy.length (); ++i)
2520 {
2521 data_reference_p drb = datarefs_copy[i];
2522 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2523
2524 /* ??? Imperfect sorting (non-compatible types, non-modulo
2525 accesses, same accesses) can lead to a group being artificially
2526 split here as we don't just skip over those. If it really
2527 matters we can push those to a worklist and re-iterate
2528 over them. Then we can just skip ahead to the next DR here. */
2529
2530 /* Check that the data-refs have the same first location (except init)
2531 and that they are both stores or both loads (not one load and one store). */
2532 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2533 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2534 DR_BASE_ADDRESS (drb), 0)
2535 || !dr_equal_offsets_p (dra, drb))
2536 break;
2537
2538 /* Check that the data-refs have the same constant size and step. */
2539 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2540 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2541 if (!tree_fits_uhwi_p (sza)
2542 || !tree_fits_uhwi_p (szb)
2543 || !tree_int_cst_equal (sza, szb)
2544 || !tree_fits_shwi_p (DR_STEP (dra))
2545 || !tree_fits_shwi_p (DR_STEP (drb))
2546 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2547 break;
2548
2549 /* Do not place the same access in the interleaving chain twice. */
2550 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2551 break;
2552
2553 /* Check the types are compatible.
2554 ??? We don't distinguish this during sorting. */
2555 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2556 TREE_TYPE (DR_REF (drb))))
2557 break;
2558
2559 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2560 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2561 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2562 gcc_assert (init_a < init_b);
2563
2564 /* If init_b == init_a + the size of the type * k, we have an
2565 interleaving, and DRA is accessed before DRB. */
2566 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2567 if ((init_b - init_a) % type_size_a != 0)
2568 break;
2569
2570 /* Require the step (if not zero) to be greater than the difference
2571 between the data-refs' inits. This splits groups into suitable sizes. */
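/* E.g., with a step of 16 bytes, inits of 0 and 16 fall into different
   groups (the second access belongs to the next iteration), while
   inits of 0 and 8 may still interleave.  */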
2572 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2573 if (step != 0 && step <= (init_b - init_a))
2574 break;
2575
2576 if (dump_enabled_p ())
2577 {
2578 dump_printf_loc (MSG_NOTE, vect_location,
2579 "Detected interleaving ");
2580 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2581 dump_printf (MSG_NOTE, " and ");
2582 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2583 dump_printf (MSG_NOTE, "\n");
2584 }
2585
2586 /* Link the found element into the group list. */
2587 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2588 {
2589 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2590 lastinfo = stmtinfo_a;
2591 }
2592 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2593 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2594 lastinfo = stmtinfo_b;
2595 }
2596 }
2597
2598 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2599 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2600 && !vect_analyze_data_ref_access (dr))
2601 {
2602 if (dump_enabled_p ())
2603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2604 "not vectorized: complicated access pattern.\n");
2605
2606 if (bb_vinfo)
2607 {
2608 /* Mark the statement as not vectorizable. */
2609 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2610 continue;
2611 }
2612 else
2613 {
2614 datarefs_copy.release ();
2615 return false;
2616 }
2617 }
2618
2619 datarefs_copy.release ();
2620 return true;
2621 }
2622
2623
2624 /* Operator == between two dr_with_seg_len objects.
2625
2626 This equality operator is used to check whether two data refs
2627 are the same, so that we can consider combining the
2628 aliasing checks of those two pairs of data-dependent data
2629 refs. */
2630
2631 static bool
2632 operator == (const dr_with_seg_len& d1,
2633 const dr_with_seg_len& d2)
2634 {
2635 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2636 DR_BASE_ADDRESS (d2.dr), 0)
2637 && compare_tree (d1.offset, d2.offset) == 0
2638 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2639 }
2640
2641 /* Function comp_dr_with_seg_len_pair.
2642
2643 Comparison function for sorting objects of dr_with_seg_len_pair_t
2644 so that we can combine aliasing checks in one scan. */
2645
2646 static int
2647 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2648 {
2649 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2650 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2651
2652 const dr_with_seg_len &p11 = p1->first,
2653 &p12 = p1->second,
2654 &p21 = p2->first,
2655 &p22 = p2->second;
2656
2657 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2658 if a and c have the same base address and step, and b and d have the same
2659 address and step. Therefore, if either a&c or b&d don't have the same address
2660 and step, we don't care about the order of those two pairs after sorting. */
2661 int comp_res;
2662
2663 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2664 DR_BASE_ADDRESS (p21.dr))) != 0)
2665 return comp_res;
2666 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2667 DR_BASE_ADDRESS (p22.dr))) != 0)
2668 return comp_res;
2669 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2670 return comp_res;
2671 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2672 return comp_res;
2673 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2674 return comp_res;
2675 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2676 return comp_res;
2677
2678 return 0;
2679 }
2680
2681 template <class T> static void
2682 swap (T& a, T& b)
2683 {
2684 T c (a);
2685 a = b;
2686 b = c;
2687 }
2688
2689 /* Function vect_vfa_segment_size.
2690
2691 Create an expression that computes the size of segment
2692 that will be accessed for a data reference. The function takes into
2693 account that realignment loads may access one more vector.
2694
2695 Input:
2696 DR: The data reference.
2697 LENGTH_FACTOR: segment length to consider.
2698
2699 Return an expression whose value is the size of segment which will be
2700 accessed by DR. */
2701
2702 static tree
2703 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2704 {
2705 tree segment_length;
2706
2707 if (integer_zerop (DR_STEP (dr)))
2708 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2709 else
2710 segment_length = size_binop (MULT_EXPR,
2711 fold_convert (sizetype, DR_STEP (dr)),
2712 fold_convert (sizetype, length_factor));
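/* E.g., a 4-byte step with LENGTH_FACTOR equal to a vectorization
   factor of 4 gives a 16-byte segment; one extra vector size is added
   below for optimized realignment loads.  */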
2713
2714 if (vect_supportable_dr_alignment (dr, false)
2715 == dr_explicit_realign_optimized)
2716 {
2717 tree vector_size = TYPE_SIZE_UNIT
2718 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2719
2720 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2721 }
2722 return segment_length;
2723 }
2724
2725 /* Function vect_prune_runtime_alias_test_list.
2726
2727 Prune a list of ddrs to be tested at run-time by versioning for alias.
2728 Merge several alias checks into one if possible.
2729 Return FALSE if the resulting list of ddrs is longer than allowed by
2730 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2731
2732 bool
2733 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2734 {
2735 vec<ddr_p> may_alias_ddrs =
2736 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2737 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2738 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2739 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2740 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2741
2742 ddr_p ddr;
2743 unsigned int i;
2744 tree length_factor;
2745
2746 if (dump_enabled_p ())
2747 dump_printf_loc (MSG_NOTE, vect_location,
2748 "=== vect_prune_runtime_alias_test_list ===\n");
2749
2750 if (may_alias_ddrs.is_empty ())
2751 return true;
2752
2753 /* Basically, for each pair of dependent data refs store_ptr_0
2754 and load_ptr_0, we create an expression:
2755
2756 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2757 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2758
2759 for aliasing checks. However, in some cases we can decrease
2760 the number of checks by combining two checks into one. For
2761 example, suppose we have another pair of data refs store_ptr_0
2762 and load_ptr_1, and if the following condition is satisfied:
2763
2764 load_ptr_0 < load_ptr_1 &&
2765 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2766
2767 (this condition means that, in each iteration of the vectorized loop,
2768 the accessed memory of store_ptr_0 cannot be between the memory
2769 of load_ptr_0 and load_ptr_1.)
2770
2771 we can then use only the following expression to finish the
2772 aliasing checks between store_ptr_0 & load_ptr_0 and
2773 store_ptr_0 & load_ptr_1:
2774
2775 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2776 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2777
2778 Note that we only consider the case where load_ptr_0 and load_ptr_1
2779 have the same base address. */
2780
2781 comp_alias_ddrs.create (may_alias_ddrs.length ());
2782
2783 /* First, we collect all data ref pairs for aliasing checks. */
2784 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2785 {
2786 struct data_reference *dr_a, *dr_b;
2787 gimple dr_group_first_a, dr_group_first_b;
2788 tree segment_length_a, segment_length_b;
2789 gimple stmt_a, stmt_b;
2790
2791 dr_a = DDR_A (ddr);
2792 stmt_a = DR_STMT (DDR_A (ddr));
2793 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2794 if (dr_group_first_a)
2795 {
2796 stmt_a = dr_group_first_a;
2797 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2798 }
2799
2800 dr_b = DDR_B (ddr);
2801 stmt_b = DR_STMT (DDR_B (ddr));
2802 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2803 if (dr_group_first_b)
2804 {
2805 stmt_b = dr_group_first_b;
2806 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2807 }
2808
2809 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2810 length_factor = scalar_loop_iters;
2811 else
2812 length_factor = size_int (vect_factor);
2813 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2814 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2815
2816 dr_with_seg_len_pair_t dr_with_seg_len_pair
2817 (dr_with_seg_len (dr_a, segment_length_a),
2818 dr_with_seg_len (dr_b, segment_length_b));
2819
2820 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2821 swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2822
2823 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2824 }
2825
2826 /* Second, we sort the collected data ref pairs so that we can scan
2827 them once to combine all possible aliasing checks. */
2828 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2829
2830 /* Third, we scan the sorted dr pairs and check if we can combine
2831 alias checks of two neighbouring dr pairs. */
2832 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2833 {
2834 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2835 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2836 *dr_b1 = &comp_alias_ddrs[i-1].second,
2837 *dr_a2 = &comp_alias_ddrs[i].first,
2838 *dr_b2 = &comp_alias_ddrs[i].second;
2839
2840 /* Remove duplicate data ref pairs. */
2841 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2842 {
2843 if (dump_enabled_p ())
2844 {
2845 dump_printf_loc (MSG_NOTE, vect_location,
2846 "found equal ranges ");
2847 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2848 DR_REF (dr_a1->dr));
2849 dump_printf (MSG_NOTE, ", ");
2850 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2851 DR_REF (dr_b1->dr));
2852 dump_printf (MSG_NOTE, " and ");
2853 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2854 DR_REF (dr_a2->dr));
2855 dump_printf (MSG_NOTE, ", ");
2856 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2857 DR_REF (dr_b2->dr));
2858 dump_printf (MSG_NOTE, "\n");
2859 }
2860
2861 comp_alias_ddrs.ordered_remove (i--);
2862 continue;
2863 }
2864
2865 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2866 {
2867 /* We consider the case that DR_B1 and DR_B2 are the same memref,
2868 and DR_A1 and DR_A2 are two consecutive memrefs. */
2869 if (*dr_a1 == *dr_a2)
2870 {
2871 swap (dr_a1, dr_b1);
2872 swap (dr_a2, dr_b2);
2873 }
2874
2875 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2876 DR_BASE_ADDRESS (dr_a2->dr),
2877 0)
2878 || !tree_fits_shwi_p (dr_a1->offset)
2879 || !tree_fits_shwi_p (dr_a2->offset))
2880 continue;
2881
2882 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2883 - tree_to_shwi (dr_a1->offset));
2884
2885
2886 /* Now we check if the following condition is satisfied:
2887
2888 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2889
2890 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2891 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2892 have to make a best estimate. We can get the minimum value
2893 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2894 then either of the following two conditions can guarantee the
2895 one above:
2896
2897 1: DIFF <= MIN_SEG_LEN_B
2898 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2899
2900 */
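/* E.g., with DIFF = 16, MIN_SEG_LEN_B = 8 and SEGMENT_LENGTH_A = 16,
   condition 2 holds (16 - 16 = 0 < 8), so the two checks can be
   merged below.  */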
2901
2902 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2903 ? tree_to_shwi (dr_b1->seg_len)
2904 : vect_factor);
2905
2906 if (diff <= min_seg_len_b
2907 || (tree_fits_shwi_p (dr_a1->seg_len)
2908 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2909 {
2910 if (dump_enabled_p ())
2911 {
2912 dump_printf_loc (MSG_NOTE, vect_location,
2913 "merging ranges for ");
2914 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2915 DR_REF (dr_a1->dr));
2916 dump_printf (MSG_NOTE, ", ");
2917 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2918 DR_REF (dr_b1->dr));
2919 dump_printf (MSG_NOTE, " and ");
2920 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2921 DR_REF (dr_a2->dr));
2922 dump_printf (MSG_NOTE, ", ");
2923 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2924 DR_REF (dr_b2->dr));
2925 dump_printf (MSG_NOTE, "\n");
2926 }
2927
2928 dr_a1->seg_len = size_binop (PLUS_EXPR,
2929 dr_a2->seg_len, size_int (diff));
2930 comp_alias_ddrs.ordered_remove (i--);
2931 }
2932 }
2933 }
2934
2935 dump_printf_loc (MSG_NOTE, vect_location,
2936 "improved number of alias checks from %d to %d\n",
2937 may_alias_ddrs.length (), comp_alias_ddrs.length ());
2938 if ((int) comp_alias_ddrs.length () >
2939 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2940 return false;
2941
2942 return true;
2943 }
2944
2945 /* Check whether a non-affine read in stmt is suitable for gather load
2946 and if so, return a builtin decl for that operation. */
2947
2948 tree
2949 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2950 tree *offp, int *scalep)
2951 {
2952 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2953 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2954 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2955 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2956 tree offtype = NULL_TREE;
2957 tree decl, base, off;
2958 enum machine_mode pmode;
2959 int punsignedp, pvolatilep;
2960
2961 base = DR_REF (dr);
2962 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF;
2963 see if we can use the def stmt of the address. */
2964 if (is_gimple_call (stmt)
2965 && gimple_call_internal_p (stmt)
2966 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2967 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2968 && TREE_CODE (base) == MEM_REF
2969 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
2970 && integer_zerop (TREE_OPERAND (base, 1))
2971 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
2972 {
2973 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
2974 if (is_gimple_assign (def_stmt)
2975 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
2976 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
2977 }
2978
2979 /* The gather builtins need an address of the form
2980 loop_invariant + vector * {1, 2, 4, 8}
2981 or
2982 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2983 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2984 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2985 multiplications and additions in it. To get a vector, we need
2986 a single SSA_NAME that will be defined in the loop and will
2987 contain everything that is not loop invariant and that can be
2988 vectorized. The following code attempts to find such a preexisting
2989 SSA_NAME OFF and put the loop invariants into a tree BASE
2990 that can be gimplified before the loop. */
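/* E.g., for a gather-style read such as p[idx[i]] with loop-invariant P,
   the loop-variant index is what should end up in the SSA_NAME OFF
   (with the element size as the scale), while P plus any constant
   offset goes into BASE.  */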
2991 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
2992 &pmode, &punsignedp, &pvolatilep, false);
2993 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2994
2995 if (TREE_CODE (base) == MEM_REF)
2996 {
2997 if (!integer_zerop (TREE_OPERAND (base, 1)))
2998 {
2999 if (off == NULL_TREE)
3000 {
3001 offset_int moff = mem_ref_offset (base);
3002 off = wide_int_to_tree (sizetype, moff);
3003 }
3004 else
3005 off = size_binop (PLUS_EXPR, off,
3006 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3007 }
3008 base = TREE_OPERAND (base, 0);
3009 }
3010 else
3011 base = build_fold_addr_expr (base);
3012
3013 if (off == NULL_TREE)
3014 off = size_zero_node;
3015
3016 /* If base is not loop invariant, then either OFF is 0 and we start with
3017 just the constant offset in the loop invariant BASE and continue with
3018 base as OFF, or we give up.
3019 We could handle that case by gimplifying the addition of base + off
3020 into some SSA_NAME and use that as off, but for now punt. */
3021 if (!expr_invariant_in_loop_p (loop, base))
3022 {
3023 if (!integer_zerop (off))
3024 return NULL_TREE;
3025 off = base;
3026 base = size_int (pbitpos / BITS_PER_UNIT);
3027 }
3028 /* Otherwise put base + constant offset into the loop invariant BASE
3029 and continue with OFF. */
3030 else
3031 {
3032 base = fold_convert (sizetype, base);
3033 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3034 }
3035
3036 /* OFF at this point may be either a SSA_NAME or some tree expression
3037 from get_inner_reference. Try to peel off loop invariants from it
3038 into BASE as long as possible. */
3039 STRIP_NOPS (off);
3040 while (offtype == NULL_TREE)
3041 {
3042 enum tree_code code;
3043 tree op0, op1, add = NULL_TREE;
3044
3045 if (TREE_CODE (off) == SSA_NAME)
3046 {
3047 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3048
3049 if (expr_invariant_in_loop_p (loop, off))
3050 return NULL_TREE;
3051
3052 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3053 break;
3054
3055 op0 = gimple_assign_rhs1 (def_stmt);
3056 code = gimple_assign_rhs_code (def_stmt);
3057 op1 = gimple_assign_rhs2 (def_stmt);
3058 }
3059 else
3060 {
3061 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3062 return NULL_TREE;
3063 code = TREE_CODE (off);
3064 extract_ops_from_tree (off, &code, &op0, &op1);
3065 }
3066 switch (code)
3067 {
3068 case POINTER_PLUS_EXPR:
3069 case PLUS_EXPR:
3070 if (expr_invariant_in_loop_p (loop, op0))
3071 {
3072 add = op0;
3073 off = op1;
3074 do_add:
3075 add = fold_convert (sizetype, add);
3076 if (scale != 1)
3077 add = size_binop (MULT_EXPR, add, size_int (scale));
3078 base = size_binop (PLUS_EXPR, base, add);
3079 continue;
3080 }
3081 if (expr_invariant_in_loop_p (loop, op1))
3082 {
3083 add = op1;
3084 off = op0;
3085 goto do_add;
3086 }
3087 break;
3088 case MINUS_EXPR:
3089 if (expr_invariant_in_loop_p (loop, op1))
3090 {
3091 add = fold_convert (sizetype, op1);
3092 add = size_binop (MINUS_EXPR, size_zero_node, add);
3093 off = op0;
3094 goto do_add;
3095 }
3096 break;
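/* A multiplication by a compile-time constant supplies the gather
   scale, e.g. OFF = idx * 4 is peeled to OFF = idx with SCALE = 4.  */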
3097 case MULT_EXPR:
3098 if (scale == 1 && tree_fits_shwi_p (op1))
3099 {
3100 scale = tree_to_shwi (op1);
3101 off = op0;
3102 continue;
3103 }
3104 break;
3105 case SSA_NAME:
3106 off = op0;
3107 continue;
3108 CASE_CONVERT:
3109 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3110 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3111 break;
3112 if (TYPE_PRECISION (TREE_TYPE (op0))
3113 == TYPE_PRECISION (TREE_TYPE (off)))
3114 {
3115 off = op0;
3116 continue;
3117 }
3118 if (TYPE_PRECISION (TREE_TYPE (op0))
3119 < TYPE_PRECISION (TREE_TYPE (off)))
3120 {
3121 off = op0;
3122 offtype = TREE_TYPE (off);
3123 STRIP_NOPS (off);
3124 continue;
3125 }
3126 break;
3127 default:
3128 break;
3129 }
3130 break;
3131 }
3132
3133 /* If at the end OFF still isn't a SSA_NAME or isn't
3134 defined in the loop, punt. */
3135 if (TREE_CODE (off) != SSA_NAME
3136 || expr_invariant_in_loop_p (loop, off))
3137 return NULL_TREE;
3138
3139 if (offtype == NULL_TREE)
3140 offtype = TREE_TYPE (off);
3141
3142 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3143 offtype, scale);
3144 if (decl == NULL_TREE)
3145 return NULL_TREE;
3146
3147 if (basep)
3148 *basep = base;
3149 if (offp)
3150 *offp = off;
3151 if (scalep)
3152 *scalep = scale;
3153 return decl;
3154 }
3155
3156 /* Function vect_analyze_data_refs.
3157
3158 Find all the data references in the loop or basic block.
3159
3160 The general structure of the analysis of data refs in the vectorizer is as
3161 follows:
3162 1- vect_analyze_data_refs(loop/bb): call
3163 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3164 in the loop/bb and their dependences.
3165 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3166 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3167 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3168
3169 */
3170
3171 bool
3172 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3173 bb_vec_info bb_vinfo,
3174 int *min_vf, unsigned *n_stmts)
3175 {
3176 struct loop *loop = NULL;
3177 basic_block bb = NULL;
3178 unsigned int i;
3179 vec<data_reference_p> datarefs;
3180 struct data_reference *dr;
3181 tree scalar_type;
3182
3183 if (dump_enabled_p ())
3184 dump_printf_loc (MSG_NOTE, vect_location,
3185 "=== vect_analyze_data_refs ===\n");
3186
3187 if (loop_vinfo)
3188 {
3189 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3190
3191 loop = LOOP_VINFO_LOOP (loop_vinfo);
3192 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3193 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3194 {
3195 if (dump_enabled_p ())
3196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3197 "not vectorized: loop contains function calls"
3198 " or data references that cannot be analyzed\n");
3199 return false;
3200 }
3201
3202 for (i = 0; i < loop->num_nodes; i++)
3203 {
3204 gimple_stmt_iterator gsi;
3205
3206 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3207 {
3208 gimple stmt = gsi_stmt (gsi);
3209 if (is_gimple_debug (stmt))
3210 continue;
3211 ++*n_stmts;
3212 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3213 {
3214 if (is_gimple_call (stmt) && loop->safelen)
3215 {
3216 tree fndecl = gimple_call_fndecl (stmt), op;
3217 if (fndecl != NULL_TREE)
3218 {
3219 struct cgraph_node *node = cgraph_get_node (fndecl);
3220 if (node != NULL && node->simd_clones != NULL)
3221 {
3222 unsigned int j, n = gimple_call_num_args (stmt);
3223 for (j = 0; j < n; j++)
3224 {
3225 op = gimple_call_arg (stmt, j);
3226 if (DECL_P (op)
3227 || (REFERENCE_CLASS_P (op)
3228 && get_base_address (op)))
3229 break;
3230 }
3231 op = gimple_call_lhs (stmt);
3232 /* Ignore #pragma omp declare simd functions
3233 if they don't have data references in the
3234 call stmt itself. */
3235 if (j == n
3236 && !(op
3237 && (DECL_P (op)
3238 || (REFERENCE_CLASS_P (op)
3239 && get_base_address (op)))))
3240 continue;
3241 }
3242 }
3243 }
3244 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3245 if (dump_enabled_p ())
3246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3247 "not vectorized: loop contains function "
3248 "calls or data references that cannot "
3249 "be analyzed\n");
3250 return false;
3251 }
3252 }
3253 }
3254
3255 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3256 }
3257 else
3258 {
3259 gimple_stmt_iterator gsi;
3260
3261 bb = BB_VINFO_BB (bb_vinfo);
3262 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3263 {
3264 gimple stmt = gsi_stmt (gsi);
3265 if (is_gimple_debug (stmt))
3266 continue;
3267 ++*n_stmts;
3268 if (!find_data_references_in_stmt (NULL, stmt,
3269 &BB_VINFO_DATAREFS (bb_vinfo)))
3270 {
3271 /* Mark the rest of the basic-block as unvectorizable. */
3272 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3273 {
3274 stmt = gsi_stmt (gsi);
3275 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3276 }
3277 break;
3278 }
3279 }
3280
3281 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3282 }
3283
3284 /* Go through the data-refs and check that the analysis succeeded. Update
3285 pointer from stmt_vec_info struct to DR and vectype. */
3286
3287 FOR_EACH_VEC_ELT (datarefs, i, dr)
3288 {
3289 gimple stmt;
3290 stmt_vec_info stmt_info;
3291 tree base, offset, init;
3292 bool gather = false;
3293 bool simd_lane_access = false;
3294 int vf;
3295
3296 again:
3297 if (!dr || !DR_REF (dr))
3298 {
3299 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3301 "not vectorized: unhandled data-ref\n");
3302 return false;
3303 }
3304
3305 stmt = DR_STMT (dr);
3306 stmt_info = vinfo_for_stmt (stmt);
3307
3308 /* Discard clobbers from the dataref vector. We will remove
3309 clobber stmts during vectorization. */
3310 if (gimple_clobber_p (stmt))
3311 {
3312 free_data_ref (dr);
3313 if (i == datarefs.length () - 1)
3314 {
3315 datarefs.pop ();
3316 break;
3317 }
3318 datarefs.ordered_remove (i);
3319 dr = datarefs[i];
3320 goto again;
3321 }
3322
3323 /* Check that analysis of the data-ref succeeded. */
3324 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3325 || !DR_STEP (dr))
3326 {
3327 bool maybe_gather
3328 = DR_IS_READ (dr)
3329 && !TREE_THIS_VOLATILE (DR_REF (dr))
3330 && targetm.vectorize.builtin_gather != NULL;
3331 bool maybe_simd_lane_access
3332 = loop_vinfo && loop->simduid;
3333
3334 /* If the target supports vector gather loads, or if this might be
3335 a SIMD lane access, see if they can be used. */
3336 if (loop_vinfo
3337 && (maybe_gather || maybe_simd_lane_access)
3338 && !nested_in_vect_loop_p (loop, stmt))
3339 {
3340 struct data_reference *newdr
3341 = create_data_ref (NULL, loop_containing_stmt (stmt),
3342 DR_REF (dr), stmt, true);
3343 gcc_assert (newdr != NULL && DR_REF (newdr));
3344 if (DR_BASE_ADDRESS (newdr)
3345 && DR_OFFSET (newdr)
3346 && DR_INIT (newdr)
3347 && DR_STEP (newdr)
3348 && integer_zerop (DR_STEP (newdr)))
3349 {
3350 if (maybe_simd_lane_access)
3351 {
3352 tree off = DR_OFFSET (newdr);
3353 STRIP_NOPS (off);
3354 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3355 && TREE_CODE (off) == MULT_EXPR
3356 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3357 {
3358 tree step = TREE_OPERAND (off, 1);
3359 off = TREE_OPERAND (off, 0);
3360 STRIP_NOPS (off);
3361 if (CONVERT_EXPR_P (off)
3362 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3363 0)))
3364 < TYPE_PRECISION (TREE_TYPE (off)))
3365 off = TREE_OPERAND (off, 0);
3366 if (TREE_CODE (off) == SSA_NAME)
3367 {
3368 gimple def = SSA_NAME_DEF_STMT (off);
3369 tree reft = TREE_TYPE (DR_REF (newdr));
3370 if (is_gimple_call (def)
3371 && gimple_call_internal_p (def)
3372 && (gimple_call_internal_fn (def)
3373 == IFN_GOMP_SIMD_LANE))
3374 {
3375 tree arg = gimple_call_arg (def, 0);
3376 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3377 arg = SSA_NAME_VAR (arg);
3378 if (arg == loop->simduid
3379 /* For now. */
3380 && tree_int_cst_equal
3381 (TYPE_SIZE_UNIT (reft),
3382 step))
3383 {
3384 DR_OFFSET (newdr) = ssize_int (0);
3385 DR_STEP (newdr) = step;
3386 DR_ALIGNED_TO (newdr)
3387 = size_int (BIGGEST_ALIGNMENT);
3388 dr = newdr;
3389 simd_lane_access = true;
3390 }
3391 }
3392 }
3393 }
3394 }
3395 if (!simd_lane_access && maybe_gather)
3396 {
3397 dr = newdr;
3398 gather = true;
3399 }
3400 }
3401 if (!gather && !simd_lane_access)
3402 free_data_ref (newdr);
3403 }
3404
3405 if (!gather && !simd_lane_access)
3406 {
3407 if (dump_enabled_p ())
3408 {
3409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3410 "not vectorized: data ref analysis "
3411 "failed ");
3412 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3413 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3414 }
3415
3416 if (bb_vinfo)
3417 break;
3418
3419 return false;
3420 }
3421 }
3422
3423 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3424 {
3425 if (dump_enabled_p ())
3426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3427 "not vectorized: base addr of dr is a "
3428 "constant\n");
3429
3430 if (bb_vinfo)
3431 break;
3432
3433 if (gather || simd_lane_access)
3434 free_data_ref (dr);
3435 return false;
3436 }
3437
3438 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3439 {
3440 if (dump_enabled_p ())
3441 {
3442 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3443 "not vectorized: volatile type ");
3444 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3445 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3446 }
3447
3448 if (bb_vinfo)
3449 break;
3450
3451 return false;
3452 }
3453
3454 if (stmt_can_throw_internal (stmt))
3455 {
3456 if (dump_enabled_p ())
3457 {
3458 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3459 "not vectorized: statement can throw an "
3460 "exception ");
3461 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3462 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3463 }
3464
3465 if (bb_vinfo)
3466 break;
3467
3468 if (gather || simd_lane_access)
3469 free_data_ref (dr);
3470 return false;
3471 }
3472
3473 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3474 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3475 {
3476 if (dump_enabled_p ())
3477 {
3478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3479 "not vectorized: statement is bitfield "
3480 "access ");
3481 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3482 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3483 }
3484
3485 if (bb_vinfo)
3486 break;
3487
3488 if (gather || simd_lane_access)
3489 free_data_ref (dr);
3490 return false;
3491 }
3492
3493 base = unshare_expr (DR_BASE_ADDRESS (dr));
3494 offset = unshare_expr (DR_OFFSET (dr));
3495 init = unshare_expr (DR_INIT (dr));
3496
3497 if (is_gimple_call (stmt)
3498 && (!gimple_call_internal_p (stmt)
3499 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3500 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3501 {
3502 if (dump_enabled_p ())
3503 {
3504 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3505 "not vectorized: dr in a call ");
3506 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3507 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3508 }
3509
3510 if (bb_vinfo)
3511 break;
3512
3513 if (gather || simd_lane_access)
3514 free_data_ref (dr);
3515 return false;
3516 }
3517
3518 /* Update DR field in stmt_vec_info struct. */
3519
3520 /* If the dataref is in an inner-loop of the loop that is considered
3521 for vectorization, we also want to analyze the access relative to
3522 the outer-loop (DR contains information only relative to the
3523 inner-most enclosing loop). We do that by building a reference to the
3524 first location accessed by the inner-loop, and analyzing it relative to
3525 the outer-loop. */
3526 if (loop && nested_in_vect_loop_p (loop, stmt))
3527 {
3528 tree outer_step, outer_base, outer_init;
3529 HOST_WIDE_INT pbitsize, pbitpos;
3530 tree poffset;
3531 enum machine_mode pmode;
3532 int punsignedp, pvolatilep;
3533 affine_iv base_iv, offset_iv;
3534 tree dinit;
3535
3536 /* Build a reference to the first location accessed by the
3537 inner-loop: *(BASE+INIT). (The first location is actually
3538 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3539 tree inner_base = build_fold_indirect_ref
3540 (fold_build_pointer_plus (base, init));
3541
3542 if (dump_enabled_p ())
3543 {
3544 dump_printf_loc (MSG_NOTE, vect_location,
3545 "analyze in outer-loop: ");
3546 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3547 dump_printf (MSG_NOTE, "\n");
3548 }
3549
3550 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3551 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3552 gcc_assert (outer_base != NULL_TREE);
3553
3554 if (pbitpos % BITS_PER_UNIT != 0)
3555 {
3556 if (dump_enabled_p ())
3557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3558 "failed: bit offset alignment.\n");
3559 return false;
3560 }
3561
3562 outer_base = build_fold_addr_expr (outer_base);
3563 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3564 &base_iv, false))
3565 {
3566 if (dump_enabled_p ())
3567 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3568 "failed: evolution of base is not affine.\n");
3569 return false;
3570 }
3571
3572 if (offset)
3573 {
3574 if (poffset)
3575 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3576 poffset);
3577 else
3578 poffset = offset;
3579 }
3580
3581 if (!poffset)
3582 {
3583 offset_iv.base = ssize_int (0);
3584 offset_iv.step = ssize_int (0);
3585 }
3586 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3587 &offset_iv, false))
3588 {
3589 if (dump_enabled_p ())
3590 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3591 "evolution of offset is not affine.\n");
3592 return false;
3593 }
3594
3595 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3596 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3597 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3598 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3599 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3600
3601 outer_step = size_binop (PLUS_EXPR,
3602 fold_convert (ssizetype, base_iv.step),
3603 fold_convert (ssizetype, offset_iv.step));
3604
3605 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3606 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3607 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3608 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3609 STMT_VINFO_DR_OFFSET (stmt_info) =
3610 fold_convert (ssizetype, offset_iv.base);
3611 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3612 size_int (highest_pow2_factor (offset_iv.base));
3613
3614 if (dump_enabled_p ())
3615 {
3616 dump_printf_loc (MSG_NOTE, vect_location,
3617 "\touter base_address: ");
3618 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3619 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3620 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3621 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3622 STMT_VINFO_DR_OFFSET (stmt_info));
3623 dump_printf (MSG_NOTE,
3624 "\n\touter constant offset from base address: ");
3625 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3626 STMT_VINFO_DR_INIT (stmt_info));
3627 dump_printf (MSG_NOTE, "\n\touter step: ");
3628 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3629 STMT_VINFO_DR_STEP (stmt_info));
3630 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3631 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3632 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3633 dump_printf (MSG_NOTE, "\n");
3634 }
3635 }
3636
3637 if (STMT_VINFO_DATA_REF (stmt_info))
3638 {
3639 if (dump_enabled_p ())
3640 {
3641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3642 "not vectorized: more than one data ref "
3643 "in stmt: ");
3644 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3645 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3646 }
3647
3648 if (bb_vinfo)
3649 break;
3650
3651 if (gather || simd_lane_access)
3652 free_data_ref (dr);
3653 return false;
3654 }
3655
3656 STMT_VINFO_DATA_REF (stmt_info) = dr;
3657 if (simd_lane_access)
3658 {
3659 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3660 free_data_ref (datarefs[i]);
3661 datarefs[i] = dr;
3662 }
3663
3664 /* Set vectype for STMT. */
3665 scalar_type = TREE_TYPE (DR_REF (dr));
3666 STMT_VINFO_VECTYPE (stmt_info)
3667 = get_vectype_for_scalar_type (scalar_type);
3668 if (!STMT_VINFO_VECTYPE (stmt_info))
3669 {
3670 if (dump_enabled_p ())
3671 {
3672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3673 "not vectorized: no vectype for stmt: ");
3674 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3675 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3676 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3677 scalar_type);
3678 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3679 }
3680
3681 if (bb_vinfo)
3682 break;
3683
3684 if (gather || simd_lane_access)
3685 {
3686 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3687 if (gather)
3688 free_data_ref (dr);
3689 }
3690 return false;
3691 }
3692 else
3693 {
3694 if (dump_enabled_p ())
3695 {
3696 dump_printf_loc (MSG_NOTE, vect_location,
3697 "got vectype for stmt: ");
3698 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3699 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3700 STMT_VINFO_VECTYPE (stmt_info));
3701 dump_printf (MSG_NOTE, "\n");
3702 }
3703 }
3704
3705 /* Adjust the minimal vectorization factor according to the
3706 vector type. */
3707 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3708 if (vf > *min_vf)
3709 *min_vf = vf;
3710
3711 if (gather)
3712 {
3713 tree off;
3714
3715 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3716 if (gather
3717 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3718 gather = false;
3719 if (!gather)
3720 {
3721 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3722 free_data_ref (dr);
3723 if (dump_enabled_p ())
3724 {
3725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3726 "not vectorized: not suitable for gather "
3727 "load ");
3728 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3729 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3730 }
3731 return false;
3732 }
3733
3734 datarefs[i] = dr;
3735 STMT_VINFO_GATHER_P (stmt_info) = true;
3736 }
3737 else if (loop_vinfo
3738 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3739 {
3740 if (nested_in_vect_loop_p (loop, stmt)
3741 || !DR_IS_READ (dr))
3742 {
3743 if (dump_enabled_p ())
3744 {
3745 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3746 "not vectorized: not suitable for strided "
3747 "load ");
3748 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3749 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3750 }
3751 return false;
3752 }
3753 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3754 }
3755 }
3756
3757 /* If we stopped analysis at the first dataref we could not analyze
3758 when trying to vectorize a basic-block, mark the rest of the datarefs
3759 as not vectorizable and truncate the vector of datarefs. That
3760 avoids wasting time analyzing their dependences. */
3761 if (i != datarefs.length ())
3762 {
3763 gcc_assert (bb_vinfo != NULL);
3764 for (unsigned j = i; j < datarefs.length (); ++j)
3765 {
3766 data_reference_p dr = datarefs[j];
3767 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3768 free_data_ref (dr);
3769 }
3770 datarefs.truncate (i);
3771 }
3772
3773 return true;
3774 }
3775
3776
3777 /* Function vect_get_new_vect_var.
3778
3779 Returns a new temporary variable. The current naming scheme uses the
3780 prefix "vect", "stmp" or "vectp" (depending on the value of VAR_KIND)
3781 for vectorizer generated variables, and appends NAME to that prefix
3782 if provided. */
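
/* For illustration only: the prefix chosen for each VAR_KIND is the one
   in the switch below, so a request such as

     vect_get_new_vect_var (ptr_type, vect_pointer_var, "in")

   creates a temporary whose name starts with "vectp_in", while passing a
   NULL name yields one named just after the prefix (e.g. "vect").  */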
3783
3784 tree
3785 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3786 {
3787 const char *prefix;
3788 tree new_vect_var;
3789
3790 switch (var_kind)
3791 {
3792 case vect_simple_var:
3793 prefix = "vect";
3794 break;
3795 case vect_scalar_var:
3796 prefix = "stmp";
3797 break;
3798 case vect_pointer_var:
3799 prefix = "vectp";
3800 break;
3801 default:
3802 gcc_unreachable ();
3803 }
3804
3805 if (name)
3806 {
3807 char* tmp = concat (prefix, "_", name, NULL);
3808 new_vect_var = create_tmp_reg (type, tmp);
3809 free (tmp);
3810 }
3811 else
3812 new_vect_var = create_tmp_reg (type, prefix);
3813
3814 return new_vect_var;
3815 }
3816
3817
3818 /* Function vect_create_addr_base_for_vector_ref.
3819
3820 Create an expression that computes the address of the first memory location
3821 that will be accessed for a data reference.
3822
3823 Input:
3824 STMT: The statement containing the data reference.
3825 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3826 OFFSET: Optional. If supplied, it is added to the initial address.
3827 LOOP: Specify relative to which loop-nest should the address be computed.
3828 For example, when the dataref is in an inner-loop nested in an
3829 outer-loop that is now being vectorized, LOOP can be either the
3830 outer-loop, or the inner-loop. The first memory location accessed
3831 by the following dataref ('in' points to short):
3832
3833 for (i=0; i<N; i++)
3834 for (j=0; j<M; j++)
3835 s += in[i+j]
3836
3837 is as follows:
3838 if LOOP=i_loop: &in (relative to i_loop)
3839 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3840
3841 Output:
3842 1. Return an SSA_NAME whose value is the address of the memory location of
3843 the first vector of the data reference.
3844 2. If new_stmt_list is not NULL_TREE after return, the caller must insert
3845 these statement(s), which define the returned SSA_NAME.
3846
3847 FORNOW: We are only handling array accesses with step 1. */
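
/* For illustration only (names and SSA versions are made up, and the
   exact statements depend on the data reference): for an access a[i]
   over 4-byte elements, with DR_OFFSET and DR_INIT both zero and an
   OFFSET argument supplied, the statements appended to NEW_STMT_LIST
   look roughly like

     _1 = OFFSET * 4;
     vectp_a_2 = &a + _1;

   where vectp_a_2 has the vector pointer type and is the returned
   SSA_NAME.  */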
3848
3849 tree
3850 vect_create_addr_base_for_vector_ref (gimple stmt,
3851 gimple_seq *new_stmt_list,
3852 tree offset,
3853 struct loop *loop)
3854 {
3855 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3856 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3857 tree data_ref_base;
3858 const char *base_name;
3859 tree addr_base;
3860 tree dest;
3861 gimple_seq seq = NULL;
3862 tree base_offset;
3863 tree init;
3864 tree vect_ptr_type;
3865 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3866 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3867
3868 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3869 {
3870 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3871
3872 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3873
3874 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3875 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3876 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3877 }
3878 else
3879 {
3880 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3881 base_offset = unshare_expr (DR_OFFSET (dr));
3882 init = unshare_expr (DR_INIT (dr));
3883 }
3884
3885 if (loop_vinfo)
3886 base_name = get_name (data_ref_base);
3887 else
3888 {
3889 base_offset = ssize_int (0);
3890 init = ssize_int (0);
3891 base_name = get_name (DR_REF (dr));
3892 }
3893
3894 /* Create base_offset */
3895 base_offset = size_binop (PLUS_EXPR,
3896 fold_convert (sizetype, base_offset),
3897 fold_convert (sizetype, init));
3898
3899 if (offset)
3900 {
3901 offset = fold_build2 (MULT_EXPR, sizetype,
3902 fold_convert (sizetype, offset), step);
3903 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3904 base_offset, offset);
3905 }
3906
3907 /* base + base_offset */
3908 if (loop_vinfo)
3909 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3910 else
3911 {
3912 addr_base = build1 (ADDR_EXPR,
3913 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3914 unshare_expr (DR_REF (dr)));
3915 }
3916
3917 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3918 addr_base = fold_convert (vect_ptr_type, addr_base);
3919 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3920 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3921 gimple_seq_add_seq (new_stmt_list, seq);
3922
3923 if (DR_PTR_INFO (dr)
3924 && TREE_CODE (addr_base) == SSA_NAME)
3925 {
3926 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3927 if (offset)
3928 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3929 }
3930
3931 if (dump_enabled_p ())
3932 {
3933 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3934 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3935 dump_printf (MSG_NOTE, "\n");
3936 }
3937
3938 return addr_base;
3939 }
3940
3941
3942 /* Function vect_create_data_ref_ptr.
3943
3944 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3945 location accessed in the loop by STMT, along with the def-use update
3946 chain to appropriately advance the pointer through the loop iterations.
3947 Also set aliasing information for the pointer. This pointer is used by
3948 the callers to this function to create a memory reference expression for
3949 vector load/store access.
3950
3951 Input:
3952 1. STMT: a stmt that references memory. Expected to be of the form
3953 GIMPLE_ASSIGN <name, data-ref> or
3954 GIMPLE_ASSIGN <data-ref, name>.
3955 2. AGGR_TYPE: the type of the reference, which should be either a vector
3956 or an array.
3957 3. AT_LOOP: the loop where the vector memref is to be created.
3958 4. OFFSET (optional): an offset to be added to the initial address accessed
3959 by the data-ref in STMT.
3960 5. BSI: location where the new stmts are to be placed if there is no loop.
3961 6. ONLY_INIT: indicates whether ap is to be updated in the loop, or remain
3962 pointing to the initial address.
3963
3964 Output:
3965 1. Declare a new ptr to vector_type, and have it point to the base of the
3966 data reference (initial address accessed by the data reference).
3967 For example, for vector of type V8HI, the following code is generated:
3968
3969 v8hi *ap;
3970 ap = (v8hi *)initial_address;
3971
3972 if OFFSET is not supplied:
3973 initial_address = &a[init];
3974 if OFFSET is supplied:
3975 initial_address = &a[init + OFFSET];
3976
3977 Return the initial_address in INITIAL_ADDRESS.
3978
3979 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3980 update the pointer in each iteration of the loop.
3981
3982 Return the increment stmt that updates the pointer in PTR_INCR.
3983
3984 3. Set INV_P to true if the access pattern of the data reference in the
3985 vectorized loop is invariant. Set it to false otherwise.
3986
3987 4. Return the pointer. */
3988
3989 tree
3990 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3991 tree offset, tree *initial_address,
3992 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3993 bool only_init, bool *inv_p)
3994 {
3995 const char *base_name;
3996 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3997 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3998 struct loop *loop = NULL;
3999 bool nested_in_vect_loop = false;
4000 struct loop *containing_loop = NULL;
4001 tree aggr_ptr_type;
4002 tree aggr_ptr;
4003 tree new_temp;
4004 gimple vec_stmt;
4005 gimple_seq new_stmt_list = NULL;
4006 edge pe = NULL;
4007 basic_block new_bb;
4008 tree aggr_ptr_init;
4009 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4010 tree aptr;
4011 gimple_stmt_iterator incr_gsi;
4012 bool insert_after;
4013 tree indx_before_incr, indx_after_incr;
4014 gimple incr;
4015 tree step;
4016 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4017
4018 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4019 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4020
4021 if (loop_vinfo)
4022 {
4023 loop = LOOP_VINFO_LOOP (loop_vinfo);
4024 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4025 containing_loop = (gimple_bb (stmt))->loop_father;
4026 pe = loop_preheader_edge (loop);
4027 }
4028 else
4029 {
4030 gcc_assert (bb_vinfo);
4031 only_init = true;
4032 *ptr_incr = NULL;
4033 }
4034
4035 /* Check the step (evolution) of the load in LOOP, and record
4036 whether it's invariant. */
4037 if (nested_in_vect_loop)
4038 step = STMT_VINFO_DR_STEP (stmt_info);
4039 else
4040 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4041
4042 if (integer_zerop (step))
4043 *inv_p = true;
4044 else
4045 *inv_p = false;
4046
4047 /* Create an expression for the first address accessed by this load
4048 in LOOP. */
4049 base_name = get_name (DR_BASE_ADDRESS (dr));
4050
4051 if (dump_enabled_p ())
4052 {
4053 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4054 dump_printf_loc (MSG_NOTE, vect_location,
4055 "create %s-pointer variable to type: ",
4056 get_tree_code_name (TREE_CODE (aggr_type)));
4057 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4058 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4059 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4060 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4061 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4062 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4063 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4064 else
4065 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4066 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4067 dump_printf (MSG_NOTE, "\n");
4068 }
4069
4070 /* (1) Create the new aggregate-pointer variable.
4071 Vector and array types inherit the alias set of their component
4072 type by default so we need to use a ref-all pointer if the data
4073 reference does not conflict with the created aggregated data
4074 reference because it is not addressable. */
4075 bool need_ref_all = false;
4076 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4077 get_alias_set (DR_REF (dr))))
4078 need_ref_all = true;
4079 /* Likewise for any of the data references in the stmt group. */
4080 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4081 {
4082 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4083 do
4084 {
4085 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4086 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4087 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4088 get_alias_set (DR_REF (sdr))))
4089 {
4090 need_ref_all = true;
4091 break;
4092 }
4093 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4094 }
4095 while (orig_stmt);
4096 }
4097 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4098 need_ref_all);
4099 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4100
4101
4102 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4103 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4104 def-use update cycles for the pointer: one relative to the outer-loop
4105 (LOOP), which is what steps (3) and (4) below do. The other is relative
4106 to the inner-loop (which is the inner-most loop containing the dataref),
4107 and this is done by step (5) below.
4108
4109 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4110 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4111 redundant. Steps (3),(4) create the following:
4112
4113 vp0 = &base_addr;
4114 LOOP: vp1 = phi(vp0,vp2)
4115 ...
4116 ...
4117 vp2 = vp1 + step
4118 goto LOOP
4119
4120 If there is an inner-loop nested in LOOP, then step (5) will also be
4121 applied, and an additional update in the inner-loop will be created:
4122
4123 vp0 = &base_addr;
4124 LOOP: vp1 = phi(vp0,vp2)
4125 ...
4126 inner: vp3 = phi(vp1,vp4)
4127 vp4 = vp3 + inner_step
4128 if () goto inner
4129 ...
4130 vp2 = vp1 + step
4131 if () goto LOOP */
4132
4133 /* (2) Calculate the initial address of the aggregate-pointer, and set
4134 the aggregate-pointer to point to it before the loop. */
4135
4136 /* Create: &(base[init_val+offset]) in the loop preheader. */
4137
4138 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4139 offset, loop);
4140 if (new_stmt_list)
4141 {
4142 if (pe)
4143 {
4144 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4145 gcc_assert (!new_bb);
4146 }
4147 else
4148 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4149 }
4150
4151 *initial_address = new_temp;
4152
4153 /* Create: p = (aggr_type *) initial_base */
4154 if (TREE_CODE (new_temp) != SSA_NAME
4155 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4156 {
4157 vec_stmt = gimple_build_assign (aggr_ptr,
4158 fold_convert (aggr_ptr_type, new_temp));
4159 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4160 /* Copy the points-to information if it exists. */
4161 if (DR_PTR_INFO (dr))
4162 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
4163 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4164 if (pe)
4165 {
4166 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4167 gcc_assert (!new_bb);
4168 }
4169 else
4170 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4171 }
4172 else
4173 aggr_ptr_init = new_temp;
4174
4175 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4176 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4177 inner-loop nested in LOOP (during outer-loop vectorization). */
4178
4179 /* No update in loop is required. */
4180 if (only_init && (!loop_vinfo || at_loop == loop))
4181 aptr = aggr_ptr_init;
4182 else
4183 {
4184 /* The step of the aggregate pointer is the type size. */
4185 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4186 /* One exception to the above is when the scalar step of the load in
4187 LOOP is zero. In this case the step here is also zero. */
4188 if (*inv_p)
4189 iv_step = size_zero_node;
4190 else if (tree_int_cst_sgn (step) == -1)
4191 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4192
4193 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4194
4195 create_iv (aggr_ptr_init,
4196 fold_convert (aggr_ptr_type, iv_step),
4197 aggr_ptr, loop, &incr_gsi, insert_after,
4198 &indx_before_incr, &indx_after_incr);
4199 incr = gsi_stmt (incr_gsi);
4200 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4201
4202 /* Copy the points-to information if it exists. */
4203 if (DR_PTR_INFO (dr))
4204 {
4205 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4206 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4207 }
4208 if (ptr_incr)
4209 *ptr_incr = incr;
4210
4211 aptr = indx_before_incr;
4212 }
4213
4214 if (!nested_in_vect_loop || only_init)
4215 return aptr;
4216
4217
4218 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4219 nested in LOOP, if it exists. */
4220
4221 gcc_assert (nested_in_vect_loop);
4222 if (!only_init)
4223 {
4224 standard_iv_increment_position (containing_loop, &incr_gsi,
4225 &insert_after);
4226 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4227 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4228 &indx_after_incr);
4229 incr = gsi_stmt (incr_gsi);
4230 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4231
4232 /* Copy the points-to information if it exists. */
4233 if (DR_PTR_INFO (dr))
4234 {
4235 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4236 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4237 }
4238 if (ptr_incr)
4239 *ptr_incr = incr;
4240
4241 return indx_before_incr;
4242 }
4243 else
4244 gcc_unreachable ();
4245 }
4246
4247
4248 /* Function bump_vector_ptr
4249
4250 Increment a pointer (to a vector type) by vector-size. If requested,
4251 i.e. if PTR_INCR is given, then also connect the new increment stmt
4252 to the existing def-use update-chain of the pointer, by modifying
4253 the PTR_INCR as illustrated below:
4254
4255 The pointer def-use update-chain before this function:
4256 DATAREF_PTR = phi (p_0, p_2)
4257 ....
4258 PTR_INCR: p_2 = DATAREF_PTR + step
4259
4260 The pointer def-use update-chain after this function:
4261 DATAREF_PTR = phi (p_0, p_2)
4262 ....
4263 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4264 ....
4265 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4266
4267 Input:
4268 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4269 in the loop.
4270 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4271 the loop. The increment amount across iterations is expected
4272 to be vector_size.
4273 BSI - location where the new update stmt is to be placed.
4274 STMT - the original scalar memory-access stmt that is being vectorized.
4275 BUMP - optional. The offset by which to bump the pointer. If not given,
4276 the offset is assumed to be vector_size.
4277
4278 Output: Return NEW_DATAREF_PTR as illustrated above.
4279
4280 */
4281
4282 tree
4283 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4284 gimple stmt, tree bump)
4285 {
4286 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4287 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4288 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4289 tree update = TYPE_SIZE_UNIT (vectype);
4290 gimple incr_stmt;
4291 ssa_op_iter iter;
4292 use_operand_p use_p;
4293 tree new_dataref_ptr;
4294
4295 if (bump)
4296 update = bump;
4297
4298 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
4299 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
4300 dataref_ptr, update);
4301 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4302
4303 /* Copy the points-to information if it exists. */
4304 if (DR_PTR_INFO (dr))
4305 {
4306 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4307 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4308 }
4309
4310 if (!ptr_incr)
4311 return new_dataref_ptr;
4312
4313 /* Update the vector-pointer's cross-iteration increment. */
4314 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4315 {
4316 tree use = USE_FROM_PTR (use_p);
4317
4318 if (use == dataref_ptr)
4319 SET_USE (use_p, new_dataref_ptr);
4320 else
4321 gcc_assert (tree_int_cst_compare (use, update) == 0);
4322 }
4323
4324 return new_dataref_ptr;
4325 }
4326
4327
4328 /* Function vect_create_destination_var.
4329
4330 Create a new temporary of type VECTYPE. */
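
/* For illustration only (the SSA version is made up): for a scalar
   destination a_5 whose base name is "a" and a non-NULL VECTYPE, the
   name passed down is "a_5", so the new temporary's name starts with
   "vect_a_5"; with a NULL VECTYPE the prefix is "stmp" instead.  */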
4331
4332 tree
4333 vect_create_destination_var (tree scalar_dest, tree vectype)
4334 {
4335 tree vec_dest;
4336 const char *name;
4337 char *new_name;
4338 tree type;
4339 enum vect_var_kind kind;
4340
4341 kind = vectype ? vect_simple_var : vect_scalar_var;
4342 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4343
4344 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4345
4346 name = get_name (scalar_dest);
4347 if (name)
4348 asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4349 else
4350 asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
4351 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4352 free (new_name);
4353
4354 return vec_dest;
4355 }
4356
4357 /* Function vect_grouped_store_supported.
4358
4359 Returns TRUE if interleave high and interleave low permutations
4360 are supported, and FALSE otherwise. */
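
/* For illustration only: with 8 elements per vector (e.g. V8HI), the two
   permutations checked below select, as VEC_PERM_EXPR indices into the
   concatenation of the two input vectors,

     interleave high: { 0, 8, 1, 9, 2, 10, 3, 11 }
     interleave low:  { 4, 12, 5, 13, 6, 14, 7, 15 }

   matching the masks that vect_permute_store_chain generates.  */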
4361
4362 bool
4363 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4364 {
4365 enum machine_mode mode = TYPE_MODE (vectype);
4366
4367 /* vect_permute_store_chain requires the group size to be a power of two. */
4368 if (exact_log2 (count) == -1)
4369 {
4370 if (dump_enabled_p ())
4371 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4372 "the size of the group of accesses"
4373 " is not a power of 2\n");
4374 return false;
4375 }
4376
4377 /* Check that the permutation is supported. */
4378 if (VECTOR_MODE_P (mode))
4379 {
4380 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4381 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4382 for (i = 0; i < nelt / 2; i++)
4383 {
4384 sel[i * 2] = i;
4385 sel[i * 2 + 1] = i + nelt;
4386 }
4387 if (can_vec_perm_p (mode, false, sel))
4388 {
4389 for (i = 0; i < nelt; i++)
4390 sel[i] += nelt / 2;
4391 if (can_vec_perm_p (mode, false, sel))
4392 return true;
4393 }
4394 }
4395
4396 if (dump_enabled_p ())
4397 dump_printf (MSG_MISSED_OPTIMIZATION,
4398 "interleave op not supported by target.\n");
4399 return false;
4400 }
4401
4402
4403 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4404 type VECTYPE. */
4405
4406 bool
4407 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4408 {
4409 return vect_lanes_optab_supported_p ("vec_store_lanes",
4410 vec_store_lanes_optab,
4411 vectype, count);
4412 }
4413
4414
4415 /* Function vect_permute_store_chain.
4416
4417 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4418 a power of 2, generate interleave_high/low stmts to reorder the data
4419 correctly for the stores. Return the final references for stores in
4420 RESULT_CHAIN.
4421
4422 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4423 The input is 4 vectors each containing 8 elements. We assign a number to
4424 each element, the input sequence is:
4425
4426 1st vec: 0 1 2 3 4 5 6 7
4427 2nd vec: 8 9 10 11 12 13 14 15
4428 3rd vec: 16 17 18 19 20 21 22 23
4429 4th vec: 24 25 26 27 28 29 30 31
4430
4431 The output sequence should be:
4432
4433 1st vec: 0 8 16 24 1 9 17 25
4434 2nd vec: 2 10 18 26 3 11 19 27
4435 3rd vec: 4 12 20 28 5 13 21 29
4436 4th vec: 6 14 22 30 7 15 23 31
4437
4438 i.e., we interleave the contents of the four vectors in their order.
4439
4440 We use interleave_high/low instructions to create such output. The input of
4441 each interleave_high/low operation is two vectors:
4442 1st vec 2nd vec
4443 0 1 2 3 4 5 6 7
4444 the even elements of the result vector are obtained left-to-right from the
4445 high/low elements of the first vector. The odd elements of the result are
4446 obtained left-to-right from the high/low elements of the second vector.
4447 The output of interleave_high will be: 0 4 1 5
4448 and of interleave_low: 2 6 3 7
4449
4450
4451 The permutation is done in log LENGTH stages. In each stage interleave_high
4452 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4453 where the first argument is taken from the first half of DR_CHAIN and the
4454 second argument from its second half.
4455 In our example,
4456
4457 I1: interleave_high (1st vec, 3rd vec)
4458 I2: interleave_low (1st vec, 3rd vec)
4459 I3: interleave_high (2nd vec, 4th vec)
4460 I4: interleave_low (2nd vec, 4th vec)
4461
4462 The output for the first stage is:
4463
4464 I1: 0 16 1 17 2 18 3 19
4465 I2: 4 20 5 21 6 22 7 23
4466 I3: 8 24 9 25 10 26 11 27
4467 I4: 12 28 13 29 14 30 15 31
4468
4469 The output of the second stage, i.e. the final result is:
4470
4471 I1: 0 8 16 24 1 9 17 25
4472 I2: 2 10 18 26 3 11 19 27
4473 I3: 4 12 20 28 5 13 21 29
4474 I4: 6 14 22 30 7 15 23 31. */
4475
4476 void
4477 vect_permute_store_chain (vec<tree> dr_chain,
4478 unsigned int length,
4479 gimple stmt,
4480 gimple_stmt_iterator *gsi,
4481 vec<tree> *result_chain)
4482 {
4483 tree vect1, vect2, high, low;
4484 gimple perm_stmt;
4485 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4486 tree perm_mask_low, perm_mask_high;
4487 unsigned int i, n;
4488 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4489 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4490
4491 result_chain->quick_grow (length);
4492 memcpy (result_chain->address (), dr_chain.address (),
4493 length * sizeof (tree));
4494
4495 for (i = 0, n = nelt / 2; i < n; i++)
4496 {
4497 sel[i * 2] = i;
4498 sel[i * 2 + 1] = i + nelt;
4499 }
4500 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4501 gcc_assert (perm_mask_high != NULL);
4502
4503 for (i = 0; i < nelt; i++)
4504 sel[i] += nelt / 2;
4505 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4506 gcc_assert (perm_mask_low != NULL);
4507
4508 for (i = 0, n = exact_log2 (length); i < n; i++)
4509 {
4510 for (j = 0; j < length/2; j++)
4511 {
4512 vect1 = dr_chain[j];
4513 vect2 = dr_chain[j+length/2];
4514
4515 /* Create interleaving stmt:
4516 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4517 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4518 perm_stmt
4519 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4520 vect1, vect2, perm_mask_high);
4521 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4522 (*result_chain)[2*j] = high;
4523
4524 /* Create interleaving stmt:
4525 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4526 nelt*3/2+1, ...}> */
4527 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4528 perm_stmt
4529 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4530 vect1, vect2, perm_mask_low);
4531 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4532 (*result_chain)[2*j+1] = low;
4533 }
4534 memcpy (dr_chain.address (), result_chain->address (),
4535 length * sizeof (tree));
4536 }
4537 }
4538
4539 /* Function vect_setup_realignment
4540
4541 This function is called when vectorizing an unaligned load using
4542 the dr_explicit_realign[_optimized] scheme.
4543 This function generates the following code at the loop prolog:
4544
4545 p = initial_addr;
4546 x msq_init = *(floor(p)); # prolog load
4547 realignment_token = call target_builtin;
4548 loop:
4549 x msq = phi (msq_init, ---)
4550
4551 The stmts marked with x are generated only for the case of
4552 dr_explicit_realign_optimized.
4553
4554 The code above sets up a new (vector) pointer, pointing to the first
4555 location accessed by STMT, and a "floor-aligned" load using that pointer.
4556 It also generates code to compute the "realignment-token" (if the relevant
4557 target hook was defined), and creates a phi-node at the loop-header bb
4558 whose arguments are the result of the prolog-load (created by this
4559 function) and the result of a load that takes place in the loop (to be
4560 created by the caller to this function).
4561
4562 For the case of dr_explicit_realign_optimized:
4563 The caller to this function uses the phi-result (msq) to create the
4564 realignment code inside the loop, and sets up the missing phi argument,
4565 as follows:
4566 loop:
4567 msq = phi (msq_init, lsq)
4568 lsq = *(floor(p')); # load in loop
4569 result = realign_load (msq, lsq, realignment_token);
4570
4571 For the case of dr_explicit_realign:
4572 loop:
4573 msq = *(floor(p)); # load in loop
4574 p' = p + (VS-1);
4575 lsq = *(floor(p')); # load in loop
4576 result = realign_load (msq, lsq, realignment_token);
4577
4578 Input:
4579 STMT - (scalar) load stmt to be vectorized. This load accesses
4580 a memory location that may be unaligned.
4581 BSI - place where new code is to be inserted.
4582 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4583 is used.
4584
4585 Output:
4586 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4587 target hook, if defined.
4588 Return value - the result of the loop-header phi node. */
4589
4590 tree
4591 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4592 tree *realignment_token,
4593 enum dr_alignment_support alignment_support_scheme,
4594 tree init_addr,
4595 struct loop **at_loop)
4596 {
4597 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4598 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4599 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4600 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4601 struct loop *loop = NULL;
4602 edge pe = NULL;
4603 tree scalar_dest = gimple_assign_lhs (stmt);
4604 tree vec_dest;
4605 gimple inc;
4606 tree ptr;
4607 tree data_ref;
4608 gimple new_stmt;
4609 basic_block new_bb;
4610 tree msq_init = NULL_TREE;
4611 tree new_temp;
4612 gimple phi_stmt;
4613 tree msq = NULL_TREE;
4614 gimple_seq stmts = NULL;
4615 bool inv_p;
4616 bool compute_in_loop = false;
4617 bool nested_in_vect_loop = false;
4618 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4619 struct loop *loop_for_initial_load = NULL;
4620
4621 if (loop_vinfo)
4622 {
4623 loop = LOOP_VINFO_LOOP (loop_vinfo);
4624 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4625 }
4626
4627 gcc_assert (alignment_support_scheme == dr_explicit_realign
4628 || alignment_support_scheme == dr_explicit_realign_optimized);
4629
4630 /* We need to generate three things:
4631 1. the misalignment computation
4632 2. the extra vector load (for the optimized realignment scheme).
4633 3. the phi node for the two vectors from which the realignment is
4634 done (for the optimized realignment scheme). */
4635
4636 /* 1. Determine where to generate the misalignment computation.
4637
4638 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4639 calculation will be generated by this function, outside the loop (in the
4640 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4641 caller, inside the loop.
4642
4643 Background: If the misalignment remains fixed throughout the iterations of
4644 the loop, then both realignment schemes are applicable, and also the
4645 misalignment computation can be done outside LOOP. This is because we are
4646 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4647 are a multiple of VS (the Vector Size), and therefore the misalignment in
4648 different vectorized LOOP iterations is always the same.
4649 The problem arises only if the memory access is in an inner-loop nested
4650 inside LOOP, which is now being vectorized using outer-loop vectorization.
4651 This is the only case when the misalignment of the memory access may not
4652 remain fixed throughout the iterations of the inner-loop (as explained in
4653 detail in vect_supportable_dr_alignment). In this case, not only is the
4654 optimized realignment scheme not applicable, but also the misalignment
4655 computation (and generation of the realignment token that is passed to
4656 REALIGN_LOAD) has to be done inside the loop.
4657
4658 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4659 or not, which in turn determines if the misalignment is computed inside
4660 the inner-loop, or outside LOOP. */
4661
4662 if (init_addr != NULL_TREE || !loop_vinfo)
4663 {
4664 compute_in_loop = true;
4665 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4666 }
4667
4668
4669 /* 2. Determine where to generate the extra vector load.
4670
4671 For the optimized realignment scheme, instead of generating two vector
4672 loads in each iteration, we generate a single extra vector load in the
4673 preheader of the loop, and in each iteration reuse the result of the
4674 vector load from the previous iteration. In case the memory access is in
4675 an inner-loop nested inside LOOP, which is now being vectorized using
4676 outer-loop vectorization, we need to determine whether this initial vector
4677 load should be generated at the preheader of the inner-loop, or can be
4678 generated at the preheader of LOOP. If the memory access has no evolution
4679 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4680 to be generated inside LOOP (in the preheader of the inner-loop). */
4681
4682 if (nested_in_vect_loop)
4683 {
4684 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4685 bool invariant_in_outerloop =
4686 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4687 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4688 }
4689 else
4690 loop_for_initial_load = loop;
4691 if (at_loop)
4692 *at_loop = loop_for_initial_load;
4693
4694 if (loop_for_initial_load)
4695 pe = loop_preheader_edge (loop_for_initial_load);
4696
4697 /* 3. For the case of the optimized realignment, create the first vector
4698 load at the loop preheader. */
4699
4700 if (alignment_support_scheme == dr_explicit_realign_optimized)
4701 {
4702 /* Create msq_init = *(floor(p1)) in the loop preheader */
4703
4704 gcc_assert (!compute_in_loop);
4705 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4706 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4707 NULL_TREE, &init_addr, NULL, &inc,
4708 true, &inv_p);
4709 new_temp = copy_ssa_name (ptr, NULL);
4710 new_stmt = gimple_build_assign_with_ops
4711 (BIT_AND_EXPR, new_temp, ptr,
4712 build_int_cst (TREE_TYPE (ptr),
4713 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4714 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4715 gcc_assert (!new_bb);
4716 data_ref
4717 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4718 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4719 new_stmt = gimple_build_assign (vec_dest, data_ref);
4720 new_temp = make_ssa_name (vec_dest, new_stmt);
4721 gimple_assign_set_lhs (new_stmt, new_temp);
4722 if (pe)
4723 {
4724 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4725 gcc_assert (!new_bb);
4726 }
4727 else
4728 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4729
4730 msq_init = gimple_assign_lhs (new_stmt);
4731 }
4732
4733 /* 4. Create realignment token using a target builtin, if available.
4734 It is done either inside the containing loop, or before LOOP (as
4735 determined above). */
4736
4737 if (targetm.vectorize.builtin_mask_for_load)
4738 {
4739 tree builtin_decl;
4740
4741 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4742 if (!init_addr)
4743 {
4744 /* Generate the INIT_ADDR computation outside LOOP. */
4745 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4746 NULL_TREE, loop);
4747 if (loop)
4748 {
4749 pe = loop_preheader_edge (loop);
4750 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4751 gcc_assert (!new_bb);
4752 }
4753 else
4754 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4755 }
4756
4757 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4758 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4759 vec_dest =
4760 vect_create_destination_var (scalar_dest,
4761 gimple_call_return_type (new_stmt));
4762 new_temp = make_ssa_name (vec_dest, new_stmt);
4763 gimple_call_set_lhs (new_stmt, new_temp);
4764
4765 if (compute_in_loop)
4766 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4767 else
4768 {
4769 /* Generate the misalignment computation outside LOOP. */
4770 pe = loop_preheader_edge (loop);
4771 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4772 gcc_assert (!new_bb);
4773 }
4774
4775 *realignment_token = gimple_call_lhs (new_stmt);
4776
4777 /* The result of the CALL_EXPR to this builtin is determined from
4778 the value of the parameter and no global variables are touched,
4779 which makes the builtin a "const" function. Requiring the
4780 builtin to have the "const" attribute makes it unnecessary
4781 to call mark_call_clobbered. */
4782 gcc_assert (TREE_READONLY (builtin_decl));
4783 }
4784
4785 if (alignment_support_scheme == dr_explicit_realign)
4786 return msq;
4787
4788 gcc_assert (!compute_in_loop);
4789 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4790
4791
4792 /* 5. Create msq = phi <msq_init, lsq> in loop */
4793
4794 pe = loop_preheader_edge (containing_loop);
4795 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4796 msq = make_ssa_name (vec_dest, NULL);
4797 phi_stmt = create_phi_node (msq, containing_loop->header);
4798 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4799
4800 return msq;
4801 }
4802
4803
4804 /* Function vect_grouped_load_supported.
4805
4806 Returns TRUE if even and odd permutations are supported,
4807 and FALSE otherwise. */
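
/* For illustration only: with 8 elements per vector (e.g. V8HI) and a
   power-of-2 group size, the two permutations checked below are the
   extract-even and extract-odd selections

     even: { 0, 2, 4, 6, 8, 10, 12, 14 }
     odd:  { 1, 3, 5, 7, 9, 11, 13, 15 }

   For a group size of 3, a different pair of shuffle masks is checked,
   mirroring what vect_permute_load_chain generates for that case.  */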
4808
4809 bool
4810 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4811 {
4812 enum machine_mode mode = TYPE_MODE (vectype);
4813
4814 /* vect_permute_load_chain requires the group size to be equal to 3 or
4815 be a power of two. */
4816 if (count != 3 && exact_log2 (count) == -1)
4817 {
4818 if (dump_enabled_p ())
4819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4820 "the size of the group of accesses"
4821 " is not a power of 2 or not equal to 3\n");
4822 return false;
4823 }
4824
4825 /* Check that the permutation is supported. */
4826 if (VECTOR_MODE_P (mode))
4827 {
4828 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
4829 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4830
4831 if (count == 3)
4832 {
4833 unsigned int k;
4834 for (k = 0; k < 3; k++)
4835 {
4836 for (i = 0; i < nelt; i++)
4837 if (3 * i + k < 2 * nelt)
4838 sel[i] = 3 * i + k;
4839 else
4840 sel[i] = 0;
4841 if (!can_vec_perm_p (mode, false, sel))
4842 {
4843 if (dump_enabled_p ())
4844 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4845 "shuffle of 3 loads is not supported by"
4846 " target\n");
4847 return false;
4848 }
4849 for (i = 0, j = 0; i < nelt; i++)
4850 if (3 * i + k < 2 * nelt)
4851 sel[i] = i;
4852 else
4853 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
4854 if (!can_vec_perm_p (mode, false, sel))
4855 {
4856 if (dump_enabled_p ())
4857 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4858 "shuffle of 3 loads is not supported by"
4859 " target\n");
4860 return false;
4861 }
4862 }
4863 return true;
4864 }
4865 else
4866 {
4867 /* If length is not equal to 3 then only a power of 2 is supported. */
4868 gcc_assert (exact_log2 (count) != -1);
4869 for (i = 0; i < nelt; i++)
4870 sel[i] = i * 2;
4871 if (can_vec_perm_p (mode, false, sel))
4872 {
4873 for (i = 0; i < nelt; i++)
4874 sel[i] = i * 2 + 1;
4875 if (can_vec_perm_p (mode, false, sel))
4876 return true;
4877 }
4878 }
4879 }
4880
4881 if (dump_enabled_p ())
4882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4883 "extract even/odd not supported by target\n");
4884 return false;
4885 }
4886
4887 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4888 type VECTYPE. */
4889
4890 bool
4891 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4892 {
4893 return vect_lanes_optab_supported_p ("vec_load_lanes",
4894 vec_load_lanes_optab,
4895 vectype, count);
4896 }
4897
4898 /* Function vect_permute_load_chain.
4899
4900 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4901 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
4902 the input data correctly. Return the final references for loads in
4903 RESULT_CHAIN.
4904
4905 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4906 The input is 4 vectors each containing 8 elements. We assign a number to each
4907 element, the input sequence is:
4908
4909 1st vec: 0 1 2 3 4 5 6 7
4910 2nd vec: 8 9 10 11 12 13 14 15
4911 3rd vec: 16 17 18 19 20 21 22 23
4912 4th vec: 24 25 26 27 28 29 30 31
4913
4914 The output sequence should be:
4915
4916 1st vec: 0 4 8 12 16 20 24 28
4917 2nd vec: 1 5 9 13 17 21 25 29
4918 3rd vec: 2 6 10 14 18 22 26 30
4919 4th vec: 3 7 11 15 19 23 27 31
4920
4921 i.e., the first output vector should contain the first elements of each
4922 interleaving group, etc.
4923
4924 We use extract_even/odd instructions to create such output. The input of
4925 each extract_even/odd operation is two vectors
4926 1st vec 2nd vec
4927 0 1 2 3 4 5 6 7
4928
4929 and the output is the vector of extracted even/odd elements. The output of
4930 extract_even will be: 0 2 4 6
4931 and of extract_odd: 1 3 5 7
4932
4933
4934 The permutation is done in log LENGTH stages. In each stage extract_even
4935 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4936 their order. In our example,
4937
4938 E1: extract_even (1st vec, 2nd vec)
4939 E2: extract_odd (1st vec, 2nd vec)
4940 E3: extract_even (3rd vec, 4th vec)
4941 E4: extract_odd (3rd vec, 4th vec)
4942
4943 The output for the first stage will be:
4944
4945 E1: 0 2 4 6 8 10 12 14
4946 E2: 1 3 5 7 9 11 13 15
4947 E3: 16 18 20 22 24 26 28 30
4948 E4: 17 19 21 23 25 27 29 31
4949
4950 In order to proceed and create the correct sequence for the next stage (or
4951 for the correct output, if the second stage is the last one, as in our
4952 example), we first put the output of the extract_even operation and then the
4953 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4954 The input for the second stage is:
4955
4956 1st vec (E1): 0 2 4 6 8 10 12 14
4957 2nd vec (E3): 16 18 20 22 24 26 28 30
4958 3rd vec (E2): 1 3 5 7 9 11 13 15
4959 4th vec (E4): 17 19 21 23 25 27 29 31
4960
4961 The output of the second stage:
4962
4963 E1: 0 4 8 12 16 20 24 28
4964 E2: 2 6 10 14 18 22 26 30
4965 E3: 1 5 9 13 17 21 25 29
4966 E4: 3 7 11 15 19 23 27 31
4967
4968 And RESULT_CHAIN after reordering:
4969
4970 1st vec (E1): 0 4 8 12 16 20 24 28
4971 2nd vec (E3): 1 5 9 13 17 21 25 29
4972 3rd vec (E2): 2 6 10 14 18 22 26 30
4973 4th vec (E4): 3 7 11 15 19 23 27 31. */
4974
4975 static void
4976 vect_permute_load_chain (vec<tree> dr_chain,
4977 unsigned int length,
4978 gimple stmt,
4979 gimple_stmt_iterator *gsi,
4980 vec<tree> *result_chain)
4981 {
4982 tree data_ref, first_vect, second_vect;
4983 tree perm_mask_even, perm_mask_odd;
4984 tree perm3_mask_low, perm3_mask_high;
4985 gimple perm_stmt;
4986 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4987 unsigned int i, j, log_length = exact_log2 (length);
4988 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4989 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4990
4991 result_chain->quick_grow (length);
4992 memcpy (result_chain->address (), dr_chain.address (),
4993 length * sizeof (tree));
4994
4995 if (length == 3)
4996 {
4997 unsigned int k;
4998
4999 for (k = 0; k < 3; k++)
5000 {
5001 for (i = 0; i < nelt; i++)
5002 if (3 * i + k < 2 * nelt)
5003 sel[i] = 3 * i + k;
5004 else
5005 sel[i] = 0;
5006 perm3_mask_low = vect_gen_perm_mask (vectype, sel);
5007 gcc_assert (perm3_mask_low != NULL);
5008
5009 for (i = 0, j = 0; i < nelt; i++)
5010 if (3 * i + k < 2 * nelt)
5011 sel[i] = i;
5012 else
5013 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5014
5015 perm3_mask_high = vect_gen_perm_mask (vectype, sel);
5016 gcc_assert (perm3_mask_high != NULL);
5017
5018 first_vect = dr_chain[0];
5019 second_vect = dr_chain[1];
5020
5021 /* Create interleaving stmt (low part of):
5022 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5023 ...}> */
5024 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5025 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5026 first_vect, second_vect,
5027 perm3_mask_low);
5028 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5029
5030 /* Create interleaving stmt (high part of):
5031 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5032 ...}> */
5033 first_vect = data_ref;
5034 second_vect = dr_chain[2];
5035 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5036 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5037 first_vect, second_vect,
5038 perm3_mask_high);
5039 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5040 (*result_chain)[k] = data_ref;
5041 }
5042 }
5043 else
5044 {
5045 /* If length is not equal to 3 then only a power of 2 is supported. */
5046 gcc_assert (exact_log2 (length) != -1);
5047
5048 for (i = 0; i < nelt; ++i)
5049 sel[i] = i * 2;
5050 perm_mask_even = vect_gen_perm_mask (vectype, sel);
5051 gcc_assert (perm_mask_even != NULL);
5052
5053 for (i = 0; i < nelt; ++i)
5054 sel[i] = i * 2 + 1;
5055 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
5056 gcc_assert (perm_mask_odd != NULL);
5057
5058 for (i = 0; i < log_length; i++)
5059 {
5060 for (j = 0; j < length; j += 2)
5061 {
5062 first_vect = dr_chain[j];
5063 second_vect = dr_chain[j+1];
5064
5065 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5066 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5067 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5068 first_vect, second_vect,
5069 perm_mask_even);
5070 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5071 (*result_chain)[j/2] = data_ref;
5072
5073 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5074 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5075 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
5076 first_vect, second_vect,
5077 perm_mask_odd);
5078 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5079 (*result_chain)[j/2+length/2] = data_ref;
5080 }
5081 memcpy (dr_chain.address (), result_chain->address (),
5082 length * sizeof (tree));
5083 }
5084 }
5085 }
5086
5087 /* Function vect_transform_grouped_load.
5088
5089 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5090 to perform their permutation and ascribe the resulting vectorized statements to
5091 the scalar statements.
5092 */
5093
5094 void
5095 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5096 gimple_stmt_iterator *gsi)
5097 {
5098 vec<tree> result_chain = vNULL;
5099
5100 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5101 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5102 vectors that are ready for vector computation. */
5103 result_chain.create (size);
5104 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5105 vect_record_grouped_load_vectors (stmt, result_chain);
5106 result_chain.release ();
5107 }
5108
5109 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5110 generated as part of the vectorization of STMT. Assign the statement
5111 for each vector to the associated scalar statement. */
5112
5113 void
5114 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5115 {
5116 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5117 gimple next_stmt, new_stmt;
5118 unsigned int i, gap_count;
5119 tree tmp_data_ref;
5120
5121 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5122 Since we scan the chain starting from its first node, their order
5123 corresponds to the order of data-refs in RESULT_CHAIN. */
5124 next_stmt = first_stmt;
5125 gap_count = 1;
5126 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5127 {
5128 if (!next_stmt)
5129 break;
5130
5131 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5132 code elimination pass later. No need to check for the first stmt in
5133 the group, since it always exists.
5134 GROUP_GAP is the number of steps in elements from the previous
5135 access (if there is no gap GROUP_GAP is 1). We skip loads that
5136 correspond to the gaps. */
5137 if (next_stmt != first_stmt
5138 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5139 {
5140 gap_count++;
5141 continue;
5142 }
5143
5144 while (next_stmt)
5145 {
5146 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5147 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5148 copies, and we put the new vector statement in the first available
5149 RELATED_STMT. */
5150 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5151 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5152 else
5153 {
5154 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5155 {
5156 gimple prev_stmt =
5157 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5158 gimple rel_stmt =
5159 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5160 while (rel_stmt)
5161 {
5162 prev_stmt = rel_stmt;
5163 rel_stmt =
5164 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5165 }
5166
5167 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5168 new_stmt;
5169 }
5170 }
5171
5172 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5173 gap_count = 1;
5174 /* If NEXT_STMT accesses the same DR as the previous statement,
5175 put the same TMP_DATA_REF as its vectorized statement; otherwise
5176 get the next data-ref from RESULT_CHAIN. */
5177 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5178 break;
5179 }
5180 }
5181 }
5182
5183 /* Function vect_can_force_dr_alignment_p.
5184 
5185 Returns whether the alignment of a DECL can be forced to an
5186 ALIGNMENT-bit boundary. */
5187
5188 bool
5189 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5190 {
5191 if (TREE_CODE (decl) != VAR_DECL)
5192 return false;
5193
5194 /* We cannot change the alignment of common or external symbols, as another
5195 translation unit may contain a definition with lower alignment.
5196 The rules of common symbol linking mean that the definition
5197 will override the common symbol. The same is true for constant
5198 pool entries, which may be shared and are not properly merged
5199 by LTO. */
5200 if (DECL_EXTERNAL (decl)
5201 || DECL_COMMON (decl)
5202 || DECL_IN_CONSTANT_POOL (decl))
5203 return false;
5204
5205 if (TREE_ASM_WRITTEN (decl))
5206 return false;
5207
5208 /* Do not override the alignment as specified by the ABI when the used
5209 attribute is set. */
5210 if (DECL_PRESERVE_P (decl))
5211 return false;
5212
5213 /* Do not override explicit alignment set by the user when an explicit
5214 section name is also used. This is a common idiom used by many
5215 software projects. */
5216 if (TREE_STATIC (decl)
5217 && DECL_SECTION_NAME (decl) != NULL_TREE
5218 && !symtab_get_node (decl)->implicit_section)
5219 return false;
5220
5221 if (TREE_STATIC (decl))
5222 return (alignment <= MAX_OFILE_ALIGNMENT);
5223 else
5224 return (alignment <= MAX_STACK_ALIGNMENT);
5225 }
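/* Illustrative sketch (a hypothetical caller, shown only to make the
   contract concrete): a pass that wants DECL aligned for a vector type
   VECTYPE would check the predicate before touching the decl, e.g.

       if (vect_can_force_dr_alignment_p (decl, TYPE_ALIGN (vectype)))
         {
           DECL_ALIGN (decl) = TYPE_ALIGN (vectype);
           DECL_USER_ALIGN (decl) = 1;
         }

   The predicate only answers whether forcing the alignment is safe; it
   does not modify DECL itself.  */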
5226
5227
5228 /* Return whether the data reference DR is supported with respect to its
5229 alignment.
5230 If CHECK_ALIGNED_ACCESSES is TRUE, check whether the access is supported
5231 even if it is aligned, i.e., check whether it would be possible to
5232 vectorize it with a different alignment. */
5233
5234 enum dr_alignment_support
5235 vect_supportable_dr_alignment (struct data_reference *dr,
5236 bool check_aligned_accesses)
5237 {
5238 gimple stmt = DR_STMT (dr);
5239 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5240 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5241 enum machine_mode mode = TYPE_MODE (vectype);
5242 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5243 struct loop *vect_loop = NULL;
5244 bool nested_in_vect_loop = false;
5245
5246 if (aligned_access_p (dr) && !check_aligned_accesses)
5247 return dr_aligned;
5248
5249 /* For now assume all conditional loads/stores support unaligned
5250 access without any special code. */
5251 if (is_gimple_call (stmt)
5252 && gimple_call_internal_p (stmt)
5253 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5254 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5255 return dr_unaligned_supported;
5256
5257 if (loop_vinfo)
5258 {
5259 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5260 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5261 }
5262
5263 /* Possibly unaligned access. */
5264
5265 /* We can choose between using the implicit realignment scheme (generating
5266 a misaligned_move stmt) and the explicit realignment scheme (generating
5267 aligned loads with a REALIGN_LOAD). There are two variants to the
5268 explicit realignment scheme: optimized, and unoptimized.
5269 We can optimize the realignment only if the step between consecutive
5270 vector loads is equal to the vector size. Since the vector memory
5271 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5272 is guaranteed that the misalignment amount remains the same throughout the
5273 execution of the vectorized loop. Therefore, we can create the
5274 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5275 at the loop preheader.
5276
5277 However, in the case of outer-loop vectorization, when vectorizing a
5278 memory access in the inner-loop nested within the LOOP that is now being
5279 vectorized, while it is guaranteed that the misalignment of the
5280 vectorized memory access will remain the same in different outer-loop
5281 iterations, it is *not* guaranteed that it will remain the same throughout
5282 the execution of the inner-loop. This is because the inner-loop advances
5283 with the original scalar step (and not in steps of VS). If the inner-loop
5284 step happens to be a multiple of VS, then the misalignment remains fixed
5285 and we can use the optimized realignment scheme. For example:
5286
5287 for (i=0; i<N; i++)
5288 for (j=0; j<M; j++)
5289 s += a[i+j];
5290
5291 When vectorizing the i-loop in the above example, the step between
5292 consecutive vector loads is 1, and so the misalignment does not remain
5293 fixed across the execution of the inner-loop, and the realignment cannot
5294 be optimized (as illustrated in the following pseudo vectorized loop):
5295
5296 for (i=0; i<N; i+=4)
5297 for (j=0; j<M; j++){
5298 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5299 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5300 // (assuming that we start from an aligned address).
5301 }
5302
5303 We therefore have to use the unoptimized realignment scheme:
5304
5305 for (i=0; i<N; i+=4)
5306 for (j=k; j<M; j+=4)
5307 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5308 // that the misalignment of the initial address is
5309 // 0).
5310
5311 The loop can then be vectorized as follows:
5312
5313 for (k=0; k<4; k++){
5314 rt = get_realignment_token (&vp[k]);
5315 for (i=0; i<N; i+=4){
5316 v1 = vp[i+k];
5317 for (j=k; j<M; j+=4){
5318 v2 = vp[i+j+VS-1];
5319 va = REALIGN_LOAD <v1,v2,rt>;
5320 vs += va;
5321 v1 = v2;
5322 }
5323 }
5324 } */
5325
5326 if (DR_IS_READ (dr))
5327 {
5328 bool is_packed = false;
5329 tree type = (TREE_TYPE (DR_REF (dr)));
5330
5331 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5332 && (!targetm.vectorize.builtin_mask_for_load
5333 || targetm.vectorize.builtin_mask_for_load ()))
5334 {
5335 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5336 if ((nested_in_vect_loop
5337 && (TREE_INT_CST_LOW (DR_STEP (dr))
5338 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5339 || !loop_vinfo)
5340 return dr_explicit_realign;
5341 else
5342 return dr_explicit_realign_optimized;
5343 }
5344 if (!known_alignment_for_access_p (dr))
5345 is_packed = not_size_aligned (DR_REF (dr));
5346
5347 if ((TYPE_USER_ALIGN (type) && !is_packed)
5348 || targetm.vectorize.
5349 support_vector_misalignment (mode, type,
5350 DR_MISALIGNMENT (dr), is_packed))
5351 /* Can't software pipeline the loads, but can at least do them. */
5352 return dr_unaligned_supported;
5353 }
5354 else
5355 {
5356 bool is_packed = false;
5357 tree type = (TREE_TYPE (DR_REF (dr)));
5358
5359 if (!known_alignment_for_access_p (dr))
5360 is_packed = not_size_aligned (DR_REF (dr));
5361
5362 if ((TYPE_USER_ALIGN (type) && !is_packed)
5363 || targetm.vectorize.
5364 support_vector_misalignment (mode, type,
5365 DR_MISALIGNMENT (dr), is_packed))
5366 return dr_unaligned_supported;
5367 }
5368
5369 /* Unsupported. */
5370 return dr_unaligned_unsupported;
5371 }
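/* Illustrative sketch (a hypothetical caller, written as the kind of pseudo
   code used in the comments above): the returned value is typically treated
   as a three-way decision,

       switch (vect_supportable_dr_alignment (dr, false))
         {
         case dr_aligned:
         case dr_unaligned_supported:
           emit a plain, possibly misaligned, vector access;
           break;
         case dr_explicit_realign:
         case dr_explicit_realign_optimized:
           emit aligned loads plus a REALIGN_LOAD with a realignment token;
           break;
         case dr_unaligned_unsupported:
           do not vectorize this access;
           break;
         }

   with the two explicit-realignment cases differing only in whether the
   realignment token can be computed once in the loop preheader.  */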