tree-vect-data-refs.c (vect_enhance_data_refs_alignment): Reset info at start.
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "backend.h"
27 #include "predict.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "rtl.h"
31 #include "ssa.h"
32 #include "alias.h"
33 #include "fold-const.h"
34 #include "stor-layout.h"
35 #include "tm_p.h"
36 #include "target.h"
37 #include "gimple-pretty-print.h"
38 #include "internal-fn.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-chrec.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "diagnostic-core.h"
51 #include "cgraph.h"
52 #include "expr.h"
53 #include "insn-codes.h"
54 #include "optabs-tree.h"
55 #include "builtins.h"
56 #include "params.h"
57
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
60
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
64 {
65 machine_mode mode, array_mode;
66 bool limit_p;
67
68 mode = TYPE_MODE (vectype);
69 limit_p = !targetm.array_mode_supported_p (mode, count);
70 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
71 MODE_INT, limit_p);
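  /* Illustrative example: assuming a 16-byte vector mode and COUNT == 3,
     this asks for a 384-bit integer mode; a target that defines no such
     mode gets BLKmode back, which the check below rejects.  */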
72
73 if (array_mode == BLKmode)
74 {
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
80 }
81
82 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
83 {
84 if (dump_enabled_p ())
85 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
86 "cannot use %s<%s><%s>\n", name,
87 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
88 return false;
89 }
90
91 if (dump_enabled_p ())
92 dump_printf_loc (MSG_NOTE, vect_location,
93 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
94 GET_MODE_NAME (mode));
95
96 return true;
97 }
98
99
100 /* Return the smallest scalar part of STMT.
101 This is used to determine the vectype of the stmt. We generally set the
102 vectype according to the type of the result (lhs). For stmts whose
103 result-type is different than the type of the arguments (e.g., demotion,
104 promotion), vectype will be reset appropriately (later). Note that we have
105 to visit the smallest datatype in this function, because that determines the
106 VF. If the smallest datatype in the loop is present only as the rhs of a
107 promotion operation - we'd miss it.
108 Such a case, where a variable of this datatype does not appear in the lhs
109 anywhere in the loop, can only occur if it's an invariant: e.g.:
110 'int_x = (int) short_inv', which we'd expect to have been optimized away by
111 invariant motion. However, we cannot rely on invariant motion to always
112 take invariants out of the loop, and so in the case of promotion we also
113 have to check the rhs.
114 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
115 types. */
116
117 tree
118 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
119 HOST_WIDE_INT *rhs_size_unit)
120 {
121 tree scalar_type = gimple_expr_type (stmt);
122 HOST_WIDE_INT lhs, rhs;
123
124 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
125
126 if (is_gimple_assign (stmt)
127 && (gimple_assign_cast_p (stmt)
128 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
129 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
130 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
131 {
132 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
133
134 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
135 if (rhs < lhs)
136 scalar_type = rhs_type;
137 }
138
139 *lhs_size_unit = lhs;
140 *rhs_size_unit = rhs;
141 return scalar_type;
142 }
143
144
145 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
146 tested at run-time. Return TRUE if DDR was successfully inserted.
147 Return false if versioning is not supported. */
148
149 static bool
150 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
151 {
152 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
153
154 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
155 return false;
156
157 if (dump_enabled_p ())
158 {
159 dump_printf_loc (MSG_NOTE, vect_location,
160 "mark for run-time aliasing test between ");
161 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
162 dump_printf (MSG_NOTE, " and ");
163 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
164 dump_printf (MSG_NOTE, "\n");
165 }
166
167 if (optimize_loop_nest_for_size_p (loop))
168 {
169 if (dump_enabled_p ())
170 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
171 "versioning not supported when optimizing"
172 " for size.\n");
173 return false;
174 }
175
176 /* FORNOW: We don't support versioning with outer-loop vectorization. */
177 if (loop->inner)
178 {
179 if (dump_enabled_p ())
180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
181 "versioning not yet supported for outer-loops.\n");
182 return false;
183 }
184
185 /* FORNOW: We don't support creating runtime alias tests for non-constant
186 step. */
187 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
188 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
189 {
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
192 "versioning not yet supported for non-constant "
193 "step\n");
194 return false;
195 }
196
197 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
198 return true;
199 }
200
201
202 /* Function vect_analyze_data_ref_dependence.
203
204 Return TRUE if there (might) exist a dependence between a memory-reference
205 DRA and a memory-reference DRB. Return FALSE when the dependence can
206 instead be checked at run-time by versioning for alias. Adjust *MAX_VF
207 according to the data dependence. */
208
209 static bool
210 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
211 loop_vec_info loop_vinfo, int *max_vf)
212 {
213 unsigned int i;
214 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
215 struct data_reference *dra = DDR_A (ddr);
216 struct data_reference *drb = DDR_B (ddr);
217 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
218 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
219 lambda_vector dist_v;
220 unsigned int loop_depth;
221
222 /* In loop analysis all data references should be vectorizable. */
223 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
224 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
225 gcc_unreachable ();
226
227 /* Independent data accesses. */
228 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
229 return false;
230
231 if (dra == drb
232 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
233 return false;
234
235 /* Even if we have an anti-dependence then, as the vectorized loop covers at
236 least two scalar iterations, there is always also a true dependence.
237 As the vectorizer does not re-order loads and stores we can ignore
238 the anti-dependence if TBAA can disambiguate both DRs similar to the
239 case with known negative distance anti-dependences (positive
240 distance anti-dependences would violate TBAA constraints). */
241 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
242 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
243 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
244 get_alias_set (DR_REF (drb))))
245 return false;
246
247 /* Unknown data dependence. */
248 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
249 {
250 /* If user asserted safelen consecutive iterations can be
251 executed concurrently, assume independence. */
252 if (loop->safelen >= 2)
253 {
254 if (loop->safelen < *max_vf)
255 *max_vf = loop->safelen;
256 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
257 return false;
258 }
259
260 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
261 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
262 {
263 if (dump_enabled_p ())
264 {
265 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
266 "versioning for alias not supported for: "
267 "can't determine dependence between ");
268 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
269 DR_REF (dra));
270 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
271 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
272 DR_REF (drb));
273 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
274 }
275 return true;
276 }
277
278 if (dump_enabled_p ())
279 {
280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
281 "versioning for alias required: "
282 "can't determine dependence between ");
283 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
284 DR_REF (dra));
285 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
286 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
287 DR_REF (drb));
288 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
289 }
290
291 /* Add to list of ddrs that need to be tested at run-time. */
292 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
293 }
294
295 /* Known data dependence. */
296 if (DDR_NUM_DIST_VECTS (ddr) == 0)
297 {
298 /* If user asserted safelen consecutive iterations can be
299 executed concurrently, assume independence. */
300 if (loop->safelen >= 2)
301 {
302 if (loop->safelen < *max_vf)
303 *max_vf = loop->safelen;
304 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
305 return false;
306 }
307
308 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
309 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
310 {
311 if (dump_enabled_p ())
312 {
313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
314 "versioning for alias not supported for: "
315 "bad dist vector for ");
316 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
317 DR_REF (dra));
318 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
319 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
320 DR_REF (drb));
321 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
322 }
323 return true;
324 }
325
326 if (dump_enabled_p ())
327 {
328 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
329 "versioning for alias required: "
330 "bad dist vector for ");
331 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
332 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
333 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
334 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
335 }
336 /* Add to list of ddrs that need to be tested at run-time. */
337 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
338 }
339
340 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
341 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
342 {
343 int dist = dist_v[loop_depth];
344
345 if (dump_enabled_p ())
346 dump_printf_loc (MSG_NOTE, vect_location,
347 "dependence distance = %d.\n", dist);
348
349 if (dist == 0)
350 {
351 if (dump_enabled_p ())
352 {
353 dump_printf_loc (MSG_NOTE, vect_location,
354 "dependence distance == 0 between ");
355 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
356 dump_printf (MSG_NOTE, " and ");
357 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
358 dump_printf (MSG_NOTE, "\n");
359 }
360
361 /* When we perform grouped accesses and perform implicit CSE
362 by detecting equal accesses and doing disambiguation with
363 runtime alias tests like for
364 .. = a[i];
365 .. = a[i+1];
366 a[i] = ..;
367 a[i+1] = ..;
368 *p = ..;
369 .. = a[i];
370 .. = a[i+1];
371 where we will end up loading { a[i], a[i+1] } once, make
372 sure that inserting group loads before the first load and
373 stores after the last store will do the right thing.
374 Similar for groups like
375 a[i] = ...;
376 ... = a[i];
377 a[i+1] = ...;
378 where loads from the group interleave with the store. */
379 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
380 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
381 {
382 gimple *earlier_stmt;
383 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
384 if (DR_IS_WRITE
385 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
386 {
387 if (dump_enabled_p ())
388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
389 "READ_WRITE dependence in interleaving."
390 "\n");
391 return true;
392 }
393 }
394
395 continue;
396 }
397
398 if (dist > 0 && DDR_REVERSED_P (ddr))
399 {
400 /* If DDR_REVERSED_P the order of the data-refs in DDR was
401 reversed (to make distance vector positive), and the actual
402 distance is negative. */
403 if (dump_enabled_p ())
404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
405 "dependence distance negative.\n");
406 /* Record a negative dependence distance to later limit the
407 amount of stmt copying / unrolling we can perform.
408 Only need to handle read-after-write dependence. */
409 if (DR_IS_READ (drb)
410 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
411 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
412 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
413 continue;
414 }
415
416 if (abs (dist) >= 2
417 && abs (dist) < *max_vf)
418 {
419 /* The dependence distance requires reduction of the maximal
420 vectorization factor. */
421 *max_vf = abs (dist);
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_NOTE, vect_location,
424 "adjusting maximal vectorization factor to %i\n",
425 *max_vf);
426 }
427
428 if (abs (dist) >= *max_vf)
429 {
430 /* Dependence distance does not create dependence, as far as
431 vectorization is concerned, in this case. */
432 if (dump_enabled_p ())
433 dump_printf_loc (MSG_NOTE, vect_location,
434 "dependence distance >= VF.\n");
435 continue;
436 }
437
438 if (dump_enabled_p ())
439 {
440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
441 "not vectorized, possible dependence "
442 "between data-refs ");
443 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
444 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
445 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
446 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
447 }
448
449 return true;
450 }
451
452 return false;
453 }
454
455 /* Function vect_analyze_data_ref_dependences.
456
457 Examine all the data references in the loop, and make sure there do not
458 exist any data dependences between them. Set *MAX_VF according to
459 the maximum vectorization factor the data dependences allow. */
460
461 bool
462 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
463 {
464 unsigned int i;
465 struct data_dependence_relation *ddr;
466
467 if (dump_enabled_p ())
468 dump_printf_loc (MSG_NOTE, vect_location,
469 "=== vect_analyze_data_ref_dependences ===\n");
470
471 LOOP_VINFO_DDRS (loop_vinfo)
472 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
473 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
474 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
475 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
476 &LOOP_VINFO_DDRS (loop_vinfo),
477 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
478 return false;
479
480 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
481 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
482 return false;
483
484 return true;
485 }
486
487
488 /* Function vect_slp_analyze_data_ref_dependence.
489
490 Return TRUE if there (might) exist a dependence between a memory-reference
491 DRA and a memory-reference DRB that prevents vectorization of the basic
492 block. Unlike the loop variant there is no run-time alias versioning
493 and no MAX_VF to adjust. */
494
495 static bool
496 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
497 {
498 struct data_reference *dra = DDR_A (ddr);
499 struct data_reference *drb = DDR_B (ddr);
500
501 /* We need to check dependences of statements marked as unvectorizable
502 as well, they still can prohibit vectorization. */
503
504 /* Independent data accesses. */
505 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
506 return false;
507
508 if (dra == drb)
509 return false;
510
511 /* Read-read is OK. */
512 if (DR_IS_READ (dra) && DR_IS_READ (drb))
513 return false;
514
515 /* If dra and drb are part of the same interleaving chain consider
516 them independent. */
517 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
518 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
519 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
520 return false;
521
522 /* Unknown data dependence. */
523 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
524 {
525 if (dump_enabled_p ())
526 {
527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
528 "can't determine dependence between ");
529 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
530 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
531 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
532 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
533 }
534 }
535 else if (dump_enabled_p ())
536 {
537 dump_printf_loc (MSG_NOTE, vect_location,
538 "determined dependence between ");
539 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
540 dump_printf (MSG_NOTE, " and ");
541 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
542 dump_printf (MSG_NOTE, "\n");
543 }
544
545 /* We do not vectorize basic blocks with write-write dependencies. */
546 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
547 return true;
548
549 /* If we have a read-write dependence check that the load is before the store.
550 When we vectorize basic blocks, vector load can be only before
551 corresponding scalar load, and vector store can be only after its
552 corresponding scalar store. So the order of the accesses is preserved in
553 case the load is before the store. */
554 gimple *earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
555 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
556 {
557 /* That only holds for load-store pairs taking part in vectorization. */
558 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
559 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
560 return false;
561 }
562
563 return true;
564 }
565
566
567 /* Function vect_slp_analyze_data_ref_dependences.
568
569 Examine all the data references in the basic-block, and make sure there
570 do not exist any data dependences between them that would prevent
571 vectorization of the basic-block. */
572
573 bool
574 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
575 {
576 struct data_dependence_relation *ddr;
577 unsigned int i;
578
579 if (dump_enabled_p ())
580 dump_printf_loc (MSG_NOTE, vect_location,
581 "=== vect_slp_analyze_data_ref_dependences ===\n");
582
583 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
584 &BB_VINFO_DDRS (bb_vinfo),
585 vNULL, true))
586 return false;
587
588 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
589 if (vect_slp_analyze_data_ref_dependence (ddr))
590 return false;
591
592 return true;
593 }
594
595
596 /* Function vect_compute_data_ref_alignment
597
598 Compute the misalignment of the data reference DR.
599
600 Output:
601 1. If during the misalignment computation it is found that the data reference
602 cannot be vectorized then false is returned.
603 2. DR_MISALIGNMENT (DR) is defined.
604
605 FOR NOW: No analysis is actually performed. Misalignment is calculated
606 only for trivial cases. TODO. */
607
608 static bool
609 vect_compute_data_ref_alignment (struct data_reference *dr)
610 {
611 gimple *stmt = DR_STMT (dr);
612 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
613 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
614 struct loop *loop = NULL;
615 tree ref = DR_REF (dr);
616 tree vectype;
617 tree base, base_addr;
618 tree misalign = NULL_TREE;
619 tree aligned_to;
620 unsigned HOST_WIDE_INT alignment;
621
622 if (dump_enabled_p ())
623 dump_printf_loc (MSG_NOTE, vect_location,
624 "vect_compute_data_ref_alignment:\n");
625
626 if (loop_vinfo)
627 loop = LOOP_VINFO_LOOP (loop_vinfo);
628
629 /* Initialize misalignment to unknown. */
630 SET_DR_MISALIGNMENT (dr, -1);
631
632 /* Strided accesses perform only component accesses, misalignment information
633 is irrelevant for them. */
634 if (STMT_VINFO_STRIDED_P (stmt_info)
635 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
636 return true;
637
638 if (tree_fits_shwi_p (DR_STEP (dr)))
639 misalign = DR_INIT (dr);
640 aligned_to = DR_ALIGNED_TO (dr);
641 base_addr = DR_BASE_ADDRESS (dr);
642 vectype = STMT_VINFO_VECTYPE (stmt_info);
643
644 /* In case the dataref is in an inner-loop of the loop that is being
645 vectorized (LOOP), we use the base and misalignment information
646 relative to the outer-loop (LOOP). This is ok only if the misalignment
647 stays the same throughout the execution of the inner-loop, which is why
648 we have to check that the stride of the dataref in the inner-loop is a
649 multiple of the vector size. */
650 if (loop && nested_in_vect_loop_p (loop, stmt))
651 {
652 tree step = DR_STEP (dr);
653
654 if (tree_fits_shwi_p (step)
655 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
656 {
657 if (dump_enabled_p ())
658 dump_printf_loc (MSG_NOTE, vect_location,
659 "inner step divides the vector-size.\n");
660 misalign = STMT_VINFO_DR_INIT (stmt_info);
661 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
662 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
663 }
664 else
665 {
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
668 "inner step doesn't divide the vector-size.\n");
669 misalign = NULL_TREE;
670 }
671 }
672
673 /* Similarly we can only use base and misalignment information relative to
674 an innermost loop if the misalignment stays the same throughout the
675 execution of the loop. As above, this is the case if the stride of the
676 dataref times the vectorization factor is a multiple of the vector size. */
677 else
678 {
679 tree step = DR_STEP (dr);
680 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
681
682 if (tree_fits_shwi_p (step)
683 && ((tree_to_shwi (step) * vf)
684 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
685 {
686 if (dump_enabled_p ())
687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
688 "step doesn't divide the vector-size.\n");
689 misalign = NULL_TREE;
690 }
691 }
692
693 /* To look at alignment of the base we have to preserve an inner MEM_REF
694 as that carries alignment information of the actual access. */
695 base = ref;
696 while (handled_component_p (base))
697 base = TREE_OPERAND (base, 0);
698 if (TREE_CODE (base) == MEM_REF)
699 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
700 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
701 unsigned int base_alignment = get_object_alignment (base);
702
703 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
704 DR_VECT_AUX (dr)->base_element_aligned = true;
705
706 alignment = TYPE_ALIGN_UNIT (vectype);
707
708 if ((compare_tree_int (aligned_to, alignment) < 0)
709 || !misalign)
710 {
711 if (dump_enabled_p ())
712 {
713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
714 "Unknown alignment for access: ");
715 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
716 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
717 }
718 return true;
719 }
720
721 if (base_alignment < TYPE_ALIGN (vectype))
722 {
723 /* Strip an inner MEM_REF to a bare decl if possible. */
724 if (TREE_CODE (base) == MEM_REF
725 && integer_zerop (TREE_OPERAND (base, 1))
726 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
727 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
728
729 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
730 {
731 if (dump_enabled_p ())
732 {
733 dump_printf_loc (MSG_NOTE, vect_location,
734 "can't force alignment of ref: ");
735 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
736 dump_printf (MSG_NOTE, "\n");
737 }
738 return true;
739 }
740
741 /* Force the alignment of the decl.
742 NOTE: This is the only change to the code we make during
743 the analysis phase, before deciding to vectorize the loop. */
744 if (dump_enabled_p ())
745 {
746 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
747 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
748 dump_printf (MSG_NOTE, "\n");
749 }
750
751 DR_VECT_AUX (dr)->base_decl = base;
752 DR_VECT_AUX (dr)->base_misaligned = true;
753 DR_VECT_AUX (dr)->base_element_aligned = true;
754 }
755
756 /* If this is a backward running DR then first access in the larger
757 vectype actually is N-1 elements before the address in the DR.
758 Adjust misalign accordingly. */
759 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
760 {
761 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
762 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
763 otherwise we wouldn't be here. */
764 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
765 /* PLUS because DR_STEP was negative. */
766 misalign = size_binop (PLUS_EXPR, misalign, offset);
767 }
768
769 SET_DR_MISALIGNMENT (dr,
770 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
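  /* Illustrative example: assuming a backward V4SI access with
     DR_STEP == -4, the offset above is 3 * -4 = -12; a DR whose plain
     misalignment would be 0 therefore gets misalign == -12, and the
     floor-mod by the 16-byte vector alignment records 4, the offset at
     which the vector access actually starts.  */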
771
772 if (dump_enabled_p ())
773 {
774 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
775 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
776 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
777 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
778 }
779
780 return true;
781 }
782
783
784 /* Function vect_compute_data_refs_alignment
785
786 Compute the misalignment of the data references in VINFO.
787 Return FALSE if a data reference is found that cannot be vectorized. */
788
789 static bool
790 vect_compute_data_refs_alignment (vec_info *vinfo)
791 {
792 vec<data_reference_p> datarefs = vinfo->datarefs;
793 struct data_reference *dr;
794 unsigned int i;
795
796 FOR_EACH_VEC_ELT (datarefs, i, dr)
797 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
798 && !vect_compute_data_ref_alignment (dr))
799 {
800 if (is_a <bb_vec_info> (vinfo))
801 {
802 /* Mark unsupported statement as unvectorizable. */
803 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
804 continue;
805 }
806 else
807 return false;
808 }
809
810 return true;
811 }
812
813
814 /* Function vect_update_misalignment_for_peel
815
816 DR - the data reference whose misalignment is to be adjusted.
817 DR_PEEL - the data reference whose misalignment is being made
818 zero in the vector loop by the peel.
819 NPEEL - the number of iterations in the peel loop if the misalignment
820 of DR_PEEL is known at compile time. */
821
822 static void
823 vect_update_misalignment_for_peel (struct data_reference *dr,
824 struct data_reference *dr_peel, int npeel)
825 {
826 unsigned int i;
827 vec<dr_p> same_align_drs;
828 struct data_reference *current_dr;
829 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
830 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
831 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
832 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
833
834 /* For interleaved data accesses the step in the loop must be multiplied by
835 the size of the interleaving group. */
836 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
837 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
838 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
839 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
840
841 /* It can be assumed that the data refs with the same alignment as dr_peel
842 are aligned in the vector loop. */
843 same_align_drs
844 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
845 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
846 {
847 if (current_dr != dr)
848 continue;
849 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
850 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
851 SET_DR_MISALIGNMENT (dr, 0);
852 return;
853 }
854
855 if (known_alignment_for_access_p (dr)
856 && known_alignment_for_access_p (dr_peel))
857 {
858 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
859 int misal = DR_MISALIGNMENT (dr);
860 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
861 misal += negative ? -npeel * dr_size : npeel * dr_size;
862 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
863 SET_DR_MISALIGNMENT (dr, misal);
864 return;
865 }
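  /* Illustrative example: assuming a forward int access (dr_size == 4)
     that starts out misaligned by 8 bytes against a 16-byte vectype
     alignment, peeling NPEEL == 3 iterations gives (8 + 3 * 4) & 15 == 4
     as the new misalignment.  */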
866
867 if (dump_enabled_p ())
868 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
869 SET_DR_MISALIGNMENT (dr, -1);
870 }
871
872
873 /* Function vect_verify_datarefs_alignment
874
875 Return TRUE if all data references in the loop can be
876 handled with respect to alignment. */
877
878 bool
879 vect_verify_datarefs_alignment (vec_info *vinfo)
880 {
881 vec<data_reference_p> datarefs = vinfo->datarefs;
882 struct data_reference *dr;
883 enum dr_alignment_support supportable_dr_alignment;
884 unsigned int i;
885
886 FOR_EACH_VEC_ELT (datarefs, i, dr)
887 {
888 gimple *stmt = DR_STMT (dr);
889 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
890
891 if (!STMT_VINFO_RELEVANT_P (stmt_info))
892 continue;
893
894 /* For interleaving, only the alignment of the first access matters.
895 Skip statements marked as not vectorizable. */
896 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
897 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
898 || !STMT_VINFO_VECTORIZABLE (stmt_info))
899 continue;
900
901 /* Strided accesses perform only component accesses, alignment is
902 irrelevant for them. */
903 if (STMT_VINFO_STRIDED_P (stmt_info)
904 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
905 continue;
906
907 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
908 if (!supportable_dr_alignment)
909 {
910 if (dump_enabled_p ())
911 {
912 if (DR_IS_READ (dr))
913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
914 "not vectorized: unsupported unaligned load.");
915 else
916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
917 "not vectorized: unsupported unaligned "
918 "store.");
919
920 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
921 DR_REF (dr));
922 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
923 }
924 return false;
925 }
926 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
927 dump_printf_loc (MSG_NOTE, vect_location,
928 "Vectorizing an unaligned access.\n");
929 }
930 return true;
931 }
932
933 /* Given a memory reference EXP, return whether its alignment is less
934 than its size. */
935
936 static bool
937 not_size_aligned (tree exp)
938 {
939 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
940 return true;
941
942 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
943 > get_object_alignment (exp));
944 }
945
946 /* Function vector_alignment_reachable_p
947
948 Return true if vector alignment for DR is reachable by peeling
949 a few loop iterations. Return false otherwise. */
950
951 static bool
952 vector_alignment_reachable_p (struct data_reference *dr)
953 {
954 gimple *stmt = DR_STMT (dr);
955 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
956 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
957
958 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
959 {
960 /* For interleaved accesses we peel only if the number of iterations
961 in the prolog loop (VF - misalignment) is a multiple of the
962 number of interleaved accesses. */
963 int elem_size, mis_in_elements;
964 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
965
966 /* FORNOW: handle only known alignment. */
967 if (!known_alignment_for_access_p (dr))
968 return false;
969
970 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
971 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
972
973 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
974 return false;
975 }
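  /* Illustrative example: assuming V4SI (nelements == 4, elem_size == 4)
     and DR_MISALIGNMENT == 4 bytes, mis_in_elements is 1 and the prologue
     would peel 4 - 1 = 3 scalar iterations; with GROUP_SIZE == 2 that is
     not a multiple of the group size, so the check above gives up.  */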
976
977 /* If misalignment is known at the compile time then allow peeling
978 only if natural alignment is reachable through peeling. */
979 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
980 {
981 HOST_WIDE_INT elmsize =
982 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
983 if (dump_enabled_p ())
984 {
985 dump_printf_loc (MSG_NOTE, vect_location,
986 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
987 dump_printf (MSG_NOTE,
988 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
989 }
990 if (DR_MISALIGNMENT (dr) % elmsize)
991 {
992 if (dump_enabled_p ())
993 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
994 "data size does not divide the misalignment.\n");
995 return false;
996 }
997 }
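  /* Illustrative example: assuming 4-byte elements misaligned by 6 bytes,
     no number of whole-element peels can reach a 16-byte boundary
     (6 + 4 * k is never a multiple of 16), which is what the modulo
     check above rejects.  */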
998
999 if (!known_alignment_for_access_p (dr))
1000 {
1001 tree type = TREE_TYPE (DR_REF (dr));
1002 bool is_packed = not_size_aligned (DR_REF (dr));
1003 if (dump_enabled_p ())
1004 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1005 "Unknown misalignment, is_packed = %d\n",is_packed);
1006 if ((TYPE_USER_ALIGN (type) && !is_packed)
1007 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1008 return true;
1009 else
1010 return false;
1011 }
1012
1013 return true;
1014 }
1015
1016
1017 /* Calculate the cost of the memory access represented by DR. */
1018
1019 static void
1020 vect_get_data_access_cost (struct data_reference *dr,
1021 unsigned int *inside_cost,
1022 unsigned int *outside_cost,
1023 stmt_vector_for_cost *body_cost_vec)
1024 {
1025 gimple *stmt = DR_STMT (dr);
1026 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1027 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1028 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1029 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1030 int ncopies = vf / nunits;
1031
1032 if (DR_IS_READ (dr))
1033 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1034 NULL, body_cost_vec, false);
1035 else
1036 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1037
1038 if (dump_enabled_p ())
1039 dump_printf_loc (MSG_NOTE, vect_location,
1040 "vect_get_data_access_cost: inside_cost = %d, "
1041 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1042 }
1043
1044
1045 typedef struct _vect_peel_info
1046 {
1047 int npeel;
1048 struct data_reference *dr;
1049 unsigned int count;
1050 } *vect_peel_info;
1051
1052 typedef struct _vect_peel_extended_info
1053 {
1054 struct _vect_peel_info peel_info;
1055 unsigned int inside_cost;
1056 unsigned int outside_cost;
1057 stmt_vector_for_cost body_cost_vec;
1058 } *vect_peel_extended_info;
1059
1060
1061 /* Peeling hashtable helpers. */
1062
1063 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1064 {
1065 static inline hashval_t hash (const _vect_peel_info *);
1066 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1067 };
1068
1069 inline hashval_t
1070 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1071 {
1072 return (hashval_t) peel_info->npeel;
1073 }
1074
1075 inline bool
1076 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1077 {
1078 return (a->npeel == b->npeel);
1079 }
1080
1081
1082 /* Insert DR into peeling hash table with NPEEL as key. */
1083
1084 static void
1085 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1086 loop_vec_info loop_vinfo, struct data_reference *dr,
1087 int npeel)
1088 {
1089 struct _vect_peel_info elem, *slot;
1090 _vect_peel_info **new_slot;
1091 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1092
1093 elem.npeel = npeel;
1094 slot = peeling_htab->find (&elem);
1095 if (slot)
1096 slot->count++;
1097 else
1098 {
1099 slot = XNEW (struct _vect_peel_info);
1100 slot->npeel = npeel;
1101 slot->dr = dr;
1102 slot->count = 1;
1103 new_slot = peeling_htab->find_slot (slot, INSERT);
1104 *new_slot = slot;
1105 }
1106
1107 if (!supportable_dr_alignment
1108 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1109 slot->count += VECT_MAX_COST;
1110 }
1111
1112
1113 /* Traverse the peeling hash table to find the peeling option that aligns
1114 the maximum number of data accesses. */
1115
1116 int
1117 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1118 _vect_peel_extended_info *max)
1119 {
1120 vect_peel_info elem = *slot;
1121
1122 if (elem->count > max->peel_info.count
1123 || (elem->count == max->peel_info.count
1124 && max->peel_info.npeel > elem->npeel))
1125 {
1126 max->peel_info.npeel = elem->npeel;
1127 max->peel_info.count = elem->count;
1128 max->peel_info.dr = elem->dr;
1129 }
1130
1131 return 1;
1132 }
1133
1134
1135 /* Traverse peeling hash table and calculate cost for each peeling option.
1136 Find the one with the lowest cost. */
1137
1138 int
1139 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1140 _vect_peel_extended_info *min)
1141 {
1142 vect_peel_info elem = *slot;
1143 int save_misalignment, dummy;
1144 unsigned int inside_cost = 0, outside_cost = 0, i;
1145 gimple *stmt = DR_STMT (elem->dr);
1146 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1147 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1148 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1149 struct data_reference *dr;
1150 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1151
1152 prologue_cost_vec.create (2);
1153 body_cost_vec.create (2);
1154 epilogue_cost_vec.create (2);
1155
1156 FOR_EACH_VEC_ELT (datarefs, i, dr)
1157 {
1158 stmt = DR_STMT (dr);
1159 stmt_info = vinfo_for_stmt (stmt);
1160 /* For interleaving, only the alignment of the first access
1161 matters. */
1162 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1163 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1164 continue;
1165
1166 save_misalignment = DR_MISALIGNMENT (dr);
1167 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1168 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1169 &body_cost_vec);
1170 SET_DR_MISALIGNMENT (dr, save_misalignment);
1171 }
1172
1173 outside_cost += vect_get_known_peeling_cost
1174 (loop_vinfo, elem->npeel, &dummy,
1175 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1176 &prologue_cost_vec, &epilogue_cost_vec);
1177
1178 /* Prologue and epilogue costs are added to the target model later.
1179 These costs depend only on the scalar iteration cost, the
1180 number of peeling iterations finally chosen, and the number of
1181 misaligned statements. So discard the information found here. */
1182 prologue_cost_vec.release ();
1183 epilogue_cost_vec.release ();
1184
1185 if (inside_cost < min->inside_cost
1186 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1187 {
1188 min->inside_cost = inside_cost;
1189 min->outside_cost = outside_cost;
1190 min->body_cost_vec.release ();
1191 min->body_cost_vec = body_cost_vec;
1192 min->peel_info.dr = elem->dr;
1193 min->peel_info.npeel = elem->npeel;
1194 }
1195 else
1196 body_cost_vec.release ();
1197
1198 return 1;
1199 }
1200
1201
1202 /* Choose best peeling option by traversing peeling hash table and either
1203 choosing an option with the lowest cost (if cost model is enabled) or the
1204 option that aligns as many accesses as possible. */
1205
1206 static struct data_reference *
1207 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1208 loop_vec_info loop_vinfo,
1209 unsigned int *npeel,
1210 stmt_vector_for_cost *body_cost_vec)
1211 {
1212 struct _vect_peel_extended_info res;
1213
1214 res.peel_info.dr = NULL;
1215 res.body_cost_vec = stmt_vector_for_cost ();
1216
1217 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1218 {
1219 res.inside_cost = INT_MAX;
1220 res.outside_cost = INT_MAX;
1221 peeling_htab->traverse <_vect_peel_extended_info *,
1222 vect_peeling_hash_get_lowest_cost> (&res);
1223 }
1224 else
1225 {
1226 res.peel_info.count = 0;
1227 peeling_htab->traverse <_vect_peel_extended_info *,
1228 vect_peeling_hash_get_most_frequent> (&res);
1229 }
1230
1231 *npeel = res.peel_info.npeel;
1232 *body_cost_vec = res.body_cost_vec;
1233 return res.peel_info.dr;
1234 }
1235
1236
1237 /* Function vect_enhance_data_refs_alignment
1238
1239 This pass will use loop versioning and loop peeling in order to enhance
1240 the alignment of data references in the loop.
1241
1242 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1243 original loop is to be vectorized. Any other loops that are created by
1244 the transformations performed in this pass are not supposed to be
1245 vectorized. This restriction will be relaxed.
1246
1247 This pass will require a cost model to guide it on whether to apply peeling,
1248 versioning, or a combination of the two. For example, the scheme that
1249 Intel uses when given a loop with several memory accesses is as follows:
1250 choose one memory access ('p') whose alignment you want to force by doing
1251 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1252 other accesses are not necessarily aligned, or (2) use loop versioning to
1253 generate one loop in which all accesses are aligned, and another loop in
1254 which only 'p' is necessarily aligned.
1255
1256 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1257 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1258 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1259
1260 Devising a cost model is the most critical aspect of this work. It will
1261 guide us on which access to peel for, whether to use loop versioning, how
1262 many versions to create, etc. The cost model will probably consist of
1263 generic considerations as well as target specific considerations (on
1264 powerpc for example, misaligned stores are more painful than misaligned
1265 loads).
1266
1267 Here are the general steps involved in alignment enhancements:
1268
1269 -- original loop, before alignment analysis:
1270 for (i=0; i<N; i++){
1271 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1272 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1273 }
1274
1275 -- After vect_compute_data_refs_alignment:
1276 for (i=0; i<N; i++){
1277 x = q[i]; # DR_MISALIGNMENT(q) = 3
1278 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1279 }
1280
1281 -- Possibility 1: we do loop versioning:
1282 if (p is aligned) {
1283 for (i=0; i<N; i++){ # loop 1A
1284 x = q[i]; # DR_MISALIGNMENT(q) = 3
1285 p[i] = y; # DR_MISALIGNMENT(p) = 0
1286 }
1287 }
1288 else {
1289 for (i=0; i<N; i++){ # loop 1B
1290 x = q[i]; # DR_MISALIGNMENT(q) = 3
1291 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1292 }
1293 }
1294
1295 -- Possibility 2: we do loop peeling:
1296 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1297 x = q[i];
1298 p[i] = y;
1299 }
1300 for (i = 3; i < N; i++){ # loop 2A
1301 x = q[i]; # DR_MISALIGNMENT(q) = 0
1302 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1303 }
1304
1305 -- Possibility 3: combination of loop peeling and versioning:
1306 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1307 x = q[i];
1308 p[i] = y;
1309 }
1310 if (p is aligned) {
1311 for (i = 3; i<N; i++){ # loop 3A
1312 x = q[i]; # DR_MISALIGNMENT(q) = 0
1313 p[i] = y; # DR_MISALIGNMENT(p) = 0
1314 }
1315 }
1316 else {
1317 for (i = 3; i<N; i++){ # loop 3B
1318 x = q[i]; # DR_MISALIGNMENT(q) = 0
1319 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1320 }
1321 }
1322
1323 These loops are later passed to loop_transform to be vectorized. The
1324 vectorizer will use the alignment information to guide the transformation
1325 (whether to generate regular loads/stores, or with special handling for
1326 misalignment). */
1327
1328 bool
1329 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1330 {
1331 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1332 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1333 enum dr_alignment_support supportable_dr_alignment;
1334 struct data_reference *dr0 = NULL, *first_store = NULL;
1335 struct data_reference *dr;
1336 unsigned int i, j;
1337 bool do_peeling = false;
1338 bool do_versioning = false;
1339 bool stat;
1340 gimple *stmt;
1341 stmt_vec_info stmt_info;
1342 unsigned int npeel = 0;
1343 bool all_misalignments_unknown = true;
1344 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1345 unsigned possible_npeel_number = 1;
1346 tree vectype;
1347 unsigned int nelements, mis, same_align_drs_max = 0;
1348 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1349 hash_table<peel_info_hasher> peeling_htab (1);
1350
1351 if (dump_enabled_p ())
1352 dump_printf_loc (MSG_NOTE, vect_location,
1353 "=== vect_enhance_data_refs_alignment ===\n");
1354
1355 /* Reset data so we can safely be called multiple times. */
1356 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1357 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1358
1359 /* While cost model enhancements are expected in the future, the high level
1360 view of the code at this time is as follows:
1361
1362 A) If there is a misaligned access then see if peeling to align
1363 this access can make all data references satisfy
1364 vect_supportable_dr_alignment. If so, update data structures
1365 as needed and return true.
1366
1367 B) If peeling wasn't possible and there is a data reference with an
1368 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1369 then see if loop versioning checks can be used to make all data
1370 references satisfy vect_supportable_dr_alignment. If so, update
1371 data structures as needed and return true.
1372
1373 C) If neither peeling nor versioning were successful then return false if
1374 any data reference does not satisfy vect_supportable_dr_alignment.
1375
1376 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1377
1378 Note, Possibility 3 above (which is peeling and versioning together) is not
1379 being done at this time. */
1380
1381 /* (1) Peeling to force alignment. */
1382
1383 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1384 Considerations:
1385 + How many accesses will become aligned due to the peeling
1386 - How many accesses will become unaligned due to the peeling,
1387 and the cost of misaligned accesses.
1388 - The cost of peeling (the extra runtime checks, the increase
1389 in code size). */
1390
1391 FOR_EACH_VEC_ELT (datarefs, i, dr)
1392 {
1393 stmt = DR_STMT (dr);
1394 stmt_info = vinfo_for_stmt (stmt);
1395
1396 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1397 continue;
1398
1399 /* For interleaving, only the alignment of the first access
1400 matters. */
1401 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1402 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1403 continue;
1404
1405 /* For invariant accesses there is nothing to enhance. */
1406 if (integer_zerop (DR_STEP (dr)))
1407 continue;
1408
1409 /* Strided accesses perform only component accesses, alignment is
1410 irrelevant for them. */
1411 if (STMT_VINFO_STRIDED_P (stmt_info)
1412 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1413 continue;
1414
1415 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1416 do_peeling = vector_alignment_reachable_p (dr);
1417 if (do_peeling)
1418 {
1419 if (known_alignment_for_access_p (dr))
1420 {
1421 unsigned int npeel_tmp;
1422 bool negative = tree_int_cst_compare (DR_STEP (dr),
1423 size_zero_node) < 0;
1424
1425 /* Save info about DR in the hash table. */
1426 vectype = STMT_VINFO_VECTYPE (stmt_info);
1427 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1428 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1429 TREE_TYPE (DR_REF (dr))));
1430 npeel_tmp = (negative
1431 ? (mis - nelements) : (nelements - mis))
1432 & (nelements - 1);
1433
1434 /* For multiple types, it is possible that the bigger type access
1435 will have more than one peeling option. E.g., a loop with two
1436 types: one of size (vector size / 4), and the other one of
1437 size (vector size / 8). The vectorization factor will be 8. If both
1438 accesses are misaligned by 3, the first one needs one scalar
1439 iteration to be aligned, and the second one needs 5. But the
1440 first one will also be aligned by peeling 5 scalar
1441 iterations, and in that case both accesses will be aligned.
1442 Hence, except for the immediate peeling amount, we also want
1443 to try adding a full vector size, as long as we don't exceed the
1444 vectorization factor.
1445 We do this automatically for the cost model, since we calculate
1446 the cost for every peeling option. */
1447 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1448 {
1449 if (STMT_SLP_TYPE (stmt_info))
1450 possible_npeel_number
1451 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1452 else
1453 possible_npeel_number = vf / nelements;
1454 }
1455
1456 /* Handle the aligned case. We may decide to align some other
1457 access, making DR unaligned. */
1458 if (DR_MISALIGNMENT (dr) == 0)
1459 {
1460 npeel_tmp = 0;
1461 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1462 possible_npeel_number++;
1463 }
1464
1465 for (j = 0; j < possible_npeel_number; j++)
1466 {
1467 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1468 dr, npeel_tmp);
1469 npeel_tmp += nelements;
1470 }
1471
1472 all_misalignments_unknown = false;
1473 /* The data-ref that was chosen for the case where all the
1474 misalignments are unknown is no longer relevant, since we
1475 now have a data-ref with known alignment. */
1476 dr0 = NULL;
1477 }
1478 else
1479 {
1480 /* If we don't know any misalignment values, we prefer
1481 peeling for the data-ref that has the maximum number of data-refs
1482 with the same alignment, unless the target prefers to align
1483 stores over loads. */
1484 if (all_misalignments_unknown)
1485 {
1486 unsigned same_align_drs
1487 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1488 if (!dr0
1489 || same_align_drs_max < same_align_drs)
1490 {
1491 same_align_drs_max = same_align_drs;
1492 dr0 = dr;
1493 }
1494 /* For data-refs with the same number of related
1495 accesses prefer the one where the misalign
1496 computation will be invariant in the outermost loop. */
1497 else if (same_align_drs_max == same_align_drs)
1498 {
1499 struct loop *ivloop0, *ivloop;
1500 ivloop0 = outermost_invariant_loop_for_expr
1501 (loop, DR_BASE_ADDRESS (dr0));
1502 ivloop = outermost_invariant_loop_for_expr
1503 (loop, DR_BASE_ADDRESS (dr));
1504 if ((ivloop && !ivloop0)
1505 || (ivloop && ivloop0
1506 && flow_loop_nested_p (ivloop, ivloop0)))
1507 dr0 = dr;
1508 }
1509
1510 if (!first_store && DR_IS_WRITE (dr))
1511 first_store = dr;
1512 }
1513
1514 /* If there are both known and unknown misaligned accesses in the
1515 loop, we choose peeling amount according to the known
1516 accesses. */
1517 if (!supportable_dr_alignment)
1518 {
1519 dr0 = dr;
1520 if (!first_store && DR_IS_WRITE (dr))
1521 first_store = dr;
1522 }
1523 }
1524 }
1525 else
1526 {
1527 if (!aligned_access_p (dr))
1528 {
1529 if (dump_enabled_p ())
1530 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1531 "vector alignment may not be reachable\n");
1532 break;
1533 }
1534 }
1535 }
1536
1537 /* Check if we can possibly peel the loop. */
1538 if (!vect_can_advance_ivs_p (loop_vinfo)
1539 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1540 || loop->inner)
1541 do_peeling = false;
1542
1543 if (do_peeling
1544 && all_misalignments_unknown
1545 && vect_supportable_dr_alignment (dr0, false))
1546 {
1547 /* Check whether the target prefers aligning stores over loads, i.e., if
1548 misaligned stores are more expensive than misaligned loads (taking
1549 drs with the same alignment into account). */
1550 if (first_store && DR_IS_READ (dr0))
1551 {
1552 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1553 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1554 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1555 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1556 stmt_vector_for_cost dummy;
1557 dummy.create (2);
1558
1559 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1560 &dummy);
1561 vect_get_data_access_cost (first_store, &store_inside_cost,
1562 &store_outside_cost, &dummy);
1563
1564 dummy.release ();
1565
1566 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1567 aligning the load DR0). */
1568 load_inside_penalty = store_inside_cost;
1569 load_outside_penalty = store_outside_cost;
1570 for (i = 0;
1571 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1572 DR_STMT (first_store))).iterate (i, &dr);
1573 i++)
1574 if (DR_IS_READ (dr))
1575 {
1576 load_inside_penalty += load_inside_cost;
1577 load_outside_penalty += load_outside_cost;
1578 }
1579 else
1580 {
1581 load_inside_penalty += store_inside_cost;
1582 load_outside_penalty += store_outside_cost;
1583 }
1584
1585 /* Calculate the penalty for leaving DR0 unaligned (by
1586 aligning the FIRST_STORE). */
1587 store_inside_penalty = load_inside_cost;
1588 store_outside_penalty = load_outside_cost;
1589 for (i = 0;
1590 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1591 DR_STMT (dr0))).iterate (i, &dr);
1592 i++)
1593 if (DR_IS_READ (dr))
1594 {
1595 store_inside_penalty += load_inside_cost;
1596 store_outside_penalty += load_outside_cost;
1597 }
1598 else
1599 {
1600 store_inside_penalty += store_inside_cost;
1601 store_outside_penalty += store_outside_cost;
1602 }
1603
1604 if (load_inside_penalty > store_inside_penalty
1605 || (load_inside_penalty == store_inside_penalty
1606 && load_outside_penalty > store_outside_penalty))
1607 dr0 = first_store;
1608 }
1609
1610 /* In case there are only loads with different unknown misalignments, use
1611 peeling only if it may help to align other accesses in the loop or
1612 if it may help improve load bandwidth when we'd end up using
1613 unaligned loads. */
1614 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1615 if (!first_store
1616 && !STMT_VINFO_SAME_ALIGN_REFS (
1617 vinfo_for_stmt (DR_STMT (dr0))).length ()
1618 && (vect_supportable_dr_alignment (dr0, false)
1619 != dr_unaligned_supported
1620 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1621 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1622 do_peeling = false;
1623 }
1624
1625 if (do_peeling && !dr0)
1626 {
1627 /* Peeling is possible, but there is no data access that is supported
1628 only if aligned. So we try to choose the best possible peeling. */
1629
1630 /* We should get here only if there are drs with known misalignment. */
1631 gcc_assert (!all_misalignments_unknown);
1632
1633 /* Choose the best peeling from the hash table. */
1634 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1635 loop_vinfo, &npeel,
1636 &body_cost_vec);
1637 if (!dr0 || !npeel)
1638 do_peeling = false;
1639 }
1640
1641 if (do_peeling)
1642 {
1643 stmt = DR_STMT (dr0);
1644 stmt_info = vinfo_for_stmt (stmt);
1645 vectype = STMT_VINFO_VECTYPE (stmt_info);
1646 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1647
1648 if (known_alignment_for_access_p (dr0))
1649 {
1650 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1651 size_zero_node) < 0;
1652 if (!npeel)
1653 {
1654 /* Since it's known at compile time, compute the number of
1655 iterations in the peeled loop (the peeling factor) for use in
1656 updating DR_MISALIGNMENT values. The peeling factor is the
1657 vectorization factor minus the misalignment as an element
1658 count. */
1659 mis = DR_MISALIGNMENT (dr0);
1660 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1661 npeel = ((negative ? mis - nelements : nelements - mis)
1662 & (nelements - 1));
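  /* Illustrative example: assuming a forward V4SI access (nelements == 4,
     4-byte elements) with DR_MISALIGNMENT (dr0) == 12, mis is 3 elements
     and npeel becomes (4 - 3) & 3 == 1; one peeled scalar iteration moves
     the access forward 4 bytes onto the 16-byte boundary.  */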
1663 }
1664
1665 /* For interleaved data access every iteration accesses all the
1666 members of the group, therefore we divide the number of iterations
1667 by the group size. */
1668 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1669 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1670 npeel /= GROUP_SIZE (stmt_info);
1671
1672 if (dump_enabled_p ())
1673 dump_printf_loc (MSG_NOTE, vect_location,
1674 "Try peeling by %d\n", npeel);
1675 }
1676
1677 /* Ensure that all data refs can be vectorized after the peel. */
1678 FOR_EACH_VEC_ELT (datarefs, i, dr)
1679 {
1680 int save_misalignment;
1681
1682 if (dr == dr0)
1683 continue;
1684
1685 stmt = DR_STMT (dr);
1686 stmt_info = vinfo_for_stmt (stmt);
1687 /* For interleaving, only the alignment of the first access
1688 matters. */
1689 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1690 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1691 continue;
1692
1693 /* Strided accesses perform only component accesses, alignment is
1694 irrelevant for them. */
1695 if (STMT_VINFO_STRIDED_P (stmt_info)
1696 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1697 continue;
1698
1699 save_misalignment = DR_MISALIGNMENT (dr);
1700 vect_update_misalignment_for_peel (dr, dr0, npeel);
1701 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1702 SET_DR_MISALIGNMENT (dr, save_misalignment);
1703
1704 if (!supportable_dr_alignment)
1705 {
1706 do_peeling = false;
1707 break;
1708 }
1709 }
1710
1711 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1712 {
1713 stat = vect_verify_datarefs_alignment (loop_vinfo);
1714 if (!stat)
1715 do_peeling = false;
1716 else
1717 {
1718 body_cost_vec.release ();
1719 return stat;
1720 }
1721 }
1722
1723 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1724 if (do_peeling)
1725 {
1726 unsigned max_allowed_peel
1727 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1728 if (max_allowed_peel != (unsigned)-1)
1729 {
1730 unsigned max_peel = npeel;
1731 if (max_peel == 0)
1732 {
1733 gimple *dr_stmt = DR_STMT (dr0);
1734 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1735 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1736 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1737 }
1738 if (max_peel > max_allowed_peel)
1739 {
1740 do_peeling = false;
1741 if (dump_enabled_p ())
1742 dump_printf_loc (MSG_NOTE, vect_location,
1743 "Disable peeling, max peels reached: %d\n", max_peel);
1744 }
1745 }
1746 }
1747
1748 /* Cost model #2 - if peeling may result in a remaining loop not
1749 iterating enough to be vectorized then do not peel. */
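/* Worked example with assumed numbers (illustration only): with a
   vectorization factor of 4 and npeel == 3, a loop known to run 6
   iterations would keep only 6 - 3 = 3 iterations, less than one full
   vector, so the check below (6 < 4 + 3) disables peeling.  */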
1750 if (do_peeling
1751 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1752 {
1753 unsigned max_peel
1754 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1755 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1756 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1757 do_peeling = false;
1758 }
1759
1760 if (do_peeling)
1761 {
1762 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1763 If the misalignment of DR_i is identical to that of dr0 then set
1764 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1765 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1766 by the peeling factor times the element size of DR_i (MOD the
1767 vectorization factor times the size). Otherwise, the
1768 misalignment of DR_i must be set to unknown. */
1769 FOR_EACH_VEC_ELT (datarefs, i, dr)
1770 if (dr != dr0)
1771 vect_update_misalignment_for_peel (dr, dr0, npeel);
1772
1773 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1774 if (npeel)
1775 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1776 else
1777 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1778 = DR_MISALIGNMENT (dr0);
1779 SET_DR_MISALIGNMENT (dr0, 0);
1780 if (dump_enabled_p ())
1781 {
1782 dump_printf_loc (MSG_NOTE, vect_location,
1783 "Alignment of access forced using peeling.\n");
1784 dump_printf_loc (MSG_NOTE, vect_location,
1785 "Peeling for alignment will be applied.\n");
1786 }
1787 /* The inside-loop cost will be accounted for in vectorizable_load
1788 and vectorizable_store correctly with adjusted alignments.
1789 Drop the body_cost_vec on the floor here. */
1790 body_cost_vec.release ();
1791
1792 stat = vect_verify_datarefs_alignment (loop_vinfo);
1793 gcc_assert (stat);
1794 return stat;
1795 }
1796 }
1797
1798 body_cost_vec.release ();
1799
1800 /* (2) Versioning to force alignment. */
1801
1802 /* Try versioning if:
1803 1) the loop is being optimized for speed, and
1804 2) there is at least one unsupported misaligned data ref with an unknown
1805 misalignment, and
1806 3) all misaligned data refs with a known misalignment are supported, and
1807 4) the number of runtime alignment checks is within reason. */
1808
1809 do_versioning =
1810 optimize_loop_nest_for_speed_p (loop)
1811 && (!loop->inner); /* FORNOW */
1812
1813 if (do_versioning)
1814 {
1815 FOR_EACH_VEC_ELT (datarefs, i, dr)
1816 {
1817 stmt = DR_STMT (dr);
1818 stmt_info = vinfo_for_stmt (stmt);
1819
1820 /* For interleaving, only the alignment of the first access
1821 matters. */
1822 if (aligned_access_p (dr)
1823 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1824 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1825 continue;
1826
1827 if (STMT_VINFO_STRIDED_P (stmt_info))
1828 {
1829 /* Strided loads perform only component accesses, alignment is
1830 irrelevant for them. */
1831 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1832 continue;
1833 do_versioning = false;
1834 break;
1835 }
1836
1837 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1838
1839 if (!supportable_dr_alignment)
1840 {
1841 gimple *stmt;
1842 int mask;
1843 tree vectype;
1844
1845 if (known_alignment_for_access_p (dr)
1846 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1847 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1848 {
1849 do_versioning = false;
1850 break;
1851 }
1852
1853 stmt = DR_STMT (dr);
1854 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1855 gcc_assert (vectype);
1856
1857 /* The rightmost bits of an aligned address must be zeros.
1858 Construct the mask needed for this test. For example,
1859 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1860 mask must be 15 = 0xf. */
1861 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
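/* Illustration with an assumed vector type (not from the original
   source): for V4SI the mask is 16 - 1 = 0xf, and the runtime guard of
   the versioned loop conceptually tests that (addr & 0xf) == 0 for each
   may-misalign address collected below; all such references share this
   single mask (FORNOW).  */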
1862
1863 /* FORNOW: use the same mask to test all potentially unaligned
1864 references in the loop. The vectorizer currently supports
1865 a single vector size, see the reference to
1866 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1867 vectorization factor is computed. */
1868 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1869 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1870 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1871 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1872 DR_STMT (dr));
1873 }
1874 }
1875
1876 /* Versioning requires at least one misaligned data reference. */
1877 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1878 do_versioning = false;
1879 else if (!do_versioning)
1880 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1881 }
1882
1883 if (do_versioning)
1884 {
1885 vec<gimple *> may_misalign_stmts
1886 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1887 gimple *stmt;
1888
1889 /* It can now be assumed that the data references in the statements
1890 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1891 of the loop being vectorized. */
1892 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1893 {
1894 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1895 dr = STMT_VINFO_DATA_REF (stmt_info);
1896 SET_DR_MISALIGNMENT (dr, 0);
1897 if (dump_enabled_p ())
1898 dump_printf_loc (MSG_NOTE, vect_location,
1899 "Alignment of access forced using versioning.\n");
1900 }
1901
1902 if (dump_enabled_p ())
1903 dump_printf_loc (MSG_NOTE, vect_location,
1904 "Versioning for alignment will be applied.\n");
1905
1906 /* Peeling and versioning can't be done together at this time. */
1907 gcc_assert (! (do_peeling && do_versioning));
1908
1909 stat = vect_verify_datarefs_alignment (loop_vinfo);
1910 gcc_assert (stat);
1911 return stat;
1912 }
1913
1914 /* This point is reached if neither peeling nor versioning is being done. */
1915 gcc_assert (! (do_peeling || do_versioning));
1916
1917 stat = vect_verify_datarefs_alignment (loop_vinfo);
1918 return stat;
1919 }
1920
1921
1922 /* Function vect_find_same_alignment_drs.
1923
1924 Update group and alignment relations according to the chosen
1925 vectorization factor. */
1926
1927 static void
1928 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1929 loop_vec_info loop_vinfo)
1930 {
1931 unsigned int i;
1932 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1933 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1934 struct data_reference *dra = DDR_A (ddr);
1935 struct data_reference *drb = DDR_B (ddr);
1936 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1937 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1938 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1939 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1940 lambda_vector dist_v;
1941 unsigned int loop_depth;
1942
1943 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1944 return;
1945
1946 if (dra == drb)
1947 return;
1948
1949 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1950 return;
1951
1952 /* Loop-based vectorization and known data dependence. */
1953 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1954 return;
1955
1956 /* Data-dependence analysis reports a distance vector of zero
1957 for data-references that overlap only in the first iteration
1958 but have steps of different sign (see PR45764).
1959 So as a sanity check require equal DR_STEP. */
1960 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1961 return;
1962
1963 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1964 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1965 {
1966 int dist = dist_v[loop_depth];
1967
1968 if (dump_enabled_p ())
1969 dump_printf_loc (MSG_NOTE, vect_location,
1970 "dependence distance = %d.\n", dist);
1971
1972 /* Same loop iteration. */
1973 if (dist == 0
1974 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1975 {
1976 /* Two references with distance zero have the same alignment. */
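/* The same holds for a nonzero distance that is a multiple of the
   vectorization factor; e.g. (illustrative values), for unit-stride int
   accesses with VF == 4, a dependence distance of 8 iterations is 32
   bytes, a whole number of 16-byte vectors, so both references keep the
   same misalignment.  */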
1977 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1978 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1979 if (dump_enabled_p ())
1980 {
1981 dump_printf_loc (MSG_NOTE, vect_location,
1982 "accesses have the same alignment.\n");
1983 dump_printf (MSG_NOTE,
1984 "dependence distance modulo vf == 0 between ");
1985 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1986 dump_printf (MSG_NOTE, " and ");
1987 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1988 dump_printf (MSG_NOTE, "\n");
1989 }
1990 }
1991 }
1992 }
1993
1994
1995 /* Function vect_analyze_data_refs_alignment
1996
1997 Analyze the alignment of the data-references in the loop.
1998 Return FALSE if a data reference is found that cannot be vectorized. */
1999
2000 bool
2001 vect_analyze_data_refs_alignment (vec_info *vinfo)
2002 {
2003 if (dump_enabled_p ())
2004 dump_printf_loc (MSG_NOTE, vect_location,
2005 "=== vect_analyze_data_refs_alignment ===\n");
2006
2007 /* Mark groups of data references with same alignment using
2008 data dependence information. */
2009 if (is_a <loop_vec_info> (vinfo))
2010 {
2011 vec<ddr_p> ddrs = vinfo->ddrs;
2012 struct data_dependence_relation *ddr;
2013 unsigned int i;
2014
2015 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2016 vect_find_same_alignment_drs (ddr, as_a <loop_vec_info> (vinfo));
2017 }
2018
2019 if (!vect_compute_data_refs_alignment (vinfo))
2020 {
2021 if (dump_enabled_p ())
2022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2023 "not vectorized: can't calculate alignment "
2024 "for data ref.\n");
2025 return false;
2026 }
2027
2028 return true;
2029 }
2030
2031
2032 /* Analyze groups of accesses: check that DR belongs to a group of
2033 accesses of legal size, step, etc. Detect gaps, single element
2034 interleaving, and other special cases. Set grouped access info.
2035 Collect groups of strided stores for further use in SLP analysis.
2036 Worker for vect_analyze_group_access. */
2037
2038 static bool
2039 vect_analyze_group_access_1 (struct data_reference *dr)
2040 {
2041 tree step = DR_STEP (dr);
2042 tree scalar_type = TREE_TYPE (DR_REF (dr));
2043 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2044 gimple *stmt = DR_STMT (dr);
2045 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2046 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2047 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2048 HOST_WIDE_INT dr_step = -1;
2049 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2050 bool slp_impossible = false;
2051 struct loop *loop = NULL;
2052
2053 if (loop_vinfo)
2054 loop = LOOP_VINFO_LOOP (loop_vinfo);
2055
2056 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2057 size of the interleaving group (including gaps). */
2058 if (tree_fits_shwi_p (step))
2059 {
2060 dr_step = tree_to_shwi (step);
2061 groupsize = absu_hwi (dr_step) / type_size;
2062 }
2063 else
2064 groupsize = 0;
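/* Example with assumed values (illustration only): the interleaved
   accesses a[2*i] and a[2*i+1] to a "double" array have DR_STEP 16 and
   TYPE_SIZE_UNIT 8, giving groupsize = 16 / 8 = 2, i.e. a group of two
   elements per scalar iteration.  */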
2065
2066 /* A non-consecutive access is possible only as part of an interleaving group. */
2067 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2068 {
2069 /* Check if this DR is a part of interleaving, and is a single
2070 element of the group that is accessed in the loop. */
2071
2072 /* Gaps are supported only for loads. STEP must be a multiple of the type
2073 size. The size of the group must be a power of 2. */
2074 if (DR_IS_READ (dr)
2075 && (dr_step % type_size) == 0
2076 && groupsize > 0
2077 && exact_log2 (groupsize) != -1)
2078 {
2079 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2080 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2081 if (dump_enabled_p ())
2082 {
2083 dump_printf_loc (MSG_NOTE, vect_location,
2084 "Detected single element interleaving ");
2085 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2086 dump_printf (MSG_NOTE, " step ");
2087 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2088 dump_printf (MSG_NOTE, "\n");
2089 }
2090
2091 if (loop_vinfo)
2092 {
2093 if (dump_enabled_p ())
2094 dump_printf_loc (MSG_NOTE, vect_location,
2095 "Data access with gaps requires scalar "
2096 "epilogue loop\n");
2097 if (loop->inner)
2098 {
2099 if (dump_enabled_p ())
2100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2101 "Peeling for outer loop is not"
2102 " supported\n");
2103 return false;
2104 }
2105
2106 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2107 }
2108
2109 return true;
2110 }
2111
2112 if (dump_enabled_p ())
2113 {
2114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2115 "not consecutive access ");
2116 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2117 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2118 }
2119
2120 if (bb_vinfo)
2121 {
2122 /* Mark the statement as unvectorizable. */
2123 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2124 return true;
2125 }
2126
2127 return false;
2128 }
2129
2130 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2131 {
2132 /* First stmt in the interleaving chain. Check the chain. */
2133 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2134 struct data_reference *data_ref = dr;
2135 unsigned int count = 1;
2136 tree prev_init = DR_INIT (data_ref);
2137 gimple *prev = stmt;
2138 HOST_WIDE_INT diff, gaps = 0;
2139
2140 while (next)
2141 {
2142 /* Skip same data-refs. In case two or more stmts share a
2143 data-ref (supported only for loads), we vectorize only the first
2144 stmt, and the rest get their vectorized loads from the first
2145 one. */
2146 if (!tree_int_cst_compare (DR_INIT (data_ref),
2147 DR_INIT (STMT_VINFO_DATA_REF (
2148 vinfo_for_stmt (next)))))
2149 {
2150 if (DR_IS_WRITE (data_ref))
2151 {
2152 if (dump_enabled_p ())
2153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2154 "Two store stmts share the same dr.\n");
2155 return false;
2156 }
2157
2158 if (dump_enabled_p ())
2159 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2160 "Two or more load stmts share the same dr.\n");
2161
2162 /* For load use the same data-ref load. */
2163 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2164
2165 prev = next;
2166 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2167 continue;
2168 }
2169
2170 prev = next;
2171 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2172
2173 /* All group members have the same STEP by construction. */
2174 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2175
2176 /* Check that the distance between two accesses is equal to the type
2177 size. Otherwise, we have gaps. */
2178 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2179 - TREE_INT_CST_LOW (prev_init)) / type_size;
2180 if (diff != 1)
2181 {
2182 /* FORNOW: SLP of accesses with gaps is not supported. */
2183 slp_impossible = true;
2184 if (DR_IS_WRITE (data_ref))
2185 {
2186 if (dump_enabled_p ())
2187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2188 "interleaved store with gaps\n");
2189 return false;
2190 }
2191
2192 gaps += diff - 1;
2193 }
2194
2195 last_accessed_element += diff;
2196
2197 /* Store the gap from the previous member of the group. If there is no
2198 gap in the access, GROUP_GAP is always 1. */
2199 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2200
2201 prev_init = DR_INIT (data_ref);
2202 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2203 /* Count the number of data-refs in the chain. */
2204 count++;
2205 }
2206
2207 if (groupsize == 0)
2208 groupsize = count + gaps;
2209
2210 if (groupsize > UINT_MAX)
2211 {
2212 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2214 "group is too large\n");
2215 return false;
2216 }
2217
2218 /* Check that the size of the interleaving is equal to count for stores,
2219 i.e., that there are no gaps. */
2220 if (groupsize != count
2221 && !DR_IS_READ (dr))
2222 {
2223 if (dump_enabled_p ())
2224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2225 "interleaved store with gaps\n");
2226 return false;
2227 }
2228
2229 /* If there is a gap after the last load in the group it is the
2230 difference between the groupsize and the last accessed
2231 element.
2232 When there is no gap, this difference should be 0. */
2233 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
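/* Example with assumed values (illustration only): a load group
   {a[4*i], a[4*i+1], a[4*i+2]} with groupsize 4 has
   last_accessed_element == 3, so the trailing GROUP_GAP stored above is
   4 - 3 = 1 element.  */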
2234
2235 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2236 if (dump_enabled_p ())
2237 {
2238 dump_printf_loc (MSG_NOTE, vect_location,
2239 "Detected interleaving ");
2240 if (DR_IS_READ (dr))
2241 dump_printf (MSG_NOTE, "load ");
2242 else
2243 dump_printf (MSG_NOTE, "store ");
2244 dump_printf (MSG_NOTE, "of size %u starting with ",
2245 (unsigned)groupsize);
2246 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2247 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2248 dump_printf_loc (MSG_NOTE, vect_location,
2249 "There is a gap of %u elements after the group\n",
2250 GROUP_GAP (vinfo_for_stmt (stmt)));
2251 }
2252
2253 /* SLP: create an SLP data structure for every interleaving group of
2254 stores for further analysis in vect_analyze_slp. */
2255 if (DR_IS_WRITE (dr) && !slp_impossible)
2256 {
2257 if (loop_vinfo)
2258 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2259 if (bb_vinfo)
2260 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2261 }
2262
2263 /* If there is a gap at the end of the group or the group size cannot
2264 be made a multiple of the vector element count then we access excess
2265 elements in the last iteration and thus need to peel that off. */
2266 if (loop_vinfo
2267 && (groupsize - last_accessed_element > 0
2268 || exact_log2 (groupsize) == -1))
2269
2270 {
2271 if (dump_enabled_p ())
2272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2273 "Data access with gaps requires scalar "
2274 "epilogue loop\n");
2275 if (loop->inner)
2276 {
2277 if (dump_enabled_p ())
2278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2279 "Peeling for outer loop is not supported\n");
2280 return false;
2281 }
2282
2283 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2284 }
2285 }
2286
2287 return true;
2288 }
2289
2290 /* Analyze groups of accesses: check that DR belongs to a group of
2291 accesses of legal size, step, etc. Detect gaps, single element
2292 interleaving, and other special cases. Set grouped access info.
2293 Collect groups of strided stores for further use in SLP analysis. */
2294
2295 static bool
2296 vect_analyze_group_access (struct data_reference *dr)
2297 {
2298 if (!vect_analyze_group_access_1 (dr))
2299 {
2300 /* Dissolve the group if present. */
2301 gimple *next;
2302 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2303 while (stmt)
2304 {
2305 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2306 next = GROUP_NEXT_ELEMENT (vinfo);
2307 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2308 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2309 stmt = next;
2310 }
2311 return false;
2312 }
2313 return true;
2314 }
2315
2316 /* Analyze the access pattern of the data-reference DR.
2317 In case of non-consecutive accesses call vect_analyze_group_access() to
2318 analyze groups of accesses. */
2319
2320 static bool
2321 vect_analyze_data_ref_access (struct data_reference *dr)
2322 {
2323 tree step = DR_STEP (dr);
2324 tree scalar_type = TREE_TYPE (DR_REF (dr));
2325 gimple *stmt = DR_STMT (dr);
2326 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2327 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2328 struct loop *loop = NULL;
2329
2330 if (loop_vinfo)
2331 loop = LOOP_VINFO_LOOP (loop_vinfo);
2332
2333 if (loop_vinfo && !step)
2334 {
2335 if (dump_enabled_p ())
2336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2337 "bad data-ref access in loop\n");
2338 return false;
2339 }
2340
2341 /* Allow loads with zero step in inner-loop vectorization. */
2342 if (loop_vinfo && integer_zerop (step))
2343 {
2344 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2345 if (!nested_in_vect_loop_p (loop, stmt))
2346 return DR_IS_READ (dr);
2347 /* Allow references with zero step for outer loops marked
2348 with pragma omp simd only - it guarantees absence of
2349 loop-carried dependencies between inner loop iterations. */
2350 if (!loop->force_vectorize)
2351 {
2352 if (dump_enabled_p ())
2353 dump_printf_loc (MSG_NOTE, vect_location,
2354 "zero step in inner loop of nest\n");
2355 return false;
2356 }
2357 }
2358
2359 if (loop && nested_in_vect_loop_p (loop, stmt))
2360 {
2361 /* Interleaved accesses are not yet supported within outer-loop
2362 vectorization for references in the inner-loop. */
2363 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2364
2365 /* For the rest of the analysis we use the outer-loop step. */
2366 step = STMT_VINFO_DR_STEP (stmt_info);
2367 if (integer_zerop (step))
2368 {
2369 if (dump_enabled_p ())
2370 dump_printf_loc (MSG_NOTE, vect_location,
2371 "zero step in outer loop.\n");
2372 return DR_IS_READ (dr);
2373 }
2374 }
2375
2376 /* Consecutive? */
2377 if (TREE_CODE (step) == INTEGER_CST)
2378 {
2379 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2380 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2381 || (dr_step < 0
2382 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2383 {
2384 /* Mark that it is not interleaving. */
2385 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2386 return true;
2387 }
2388 }
2389
2390 if (loop && nested_in_vect_loop_p (loop, stmt))
2391 {
2392 if (dump_enabled_p ())
2393 dump_printf_loc (MSG_NOTE, vect_location,
2394 "grouped access in outer loop.\n");
2395 return false;
2396 }
2397
2398
2399 /* Assume this is a DR handled by non-constant strided load case. */
2400 if (TREE_CODE (step) != INTEGER_CST)
2401 return (STMT_VINFO_STRIDED_P (stmt_info)
2402 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2403 || vect_analyze_group_access (dr)));
2404
2405 /* Not a consecutive access - check if it's part of an interleaving group. */
2406 return vect_analyze_group_access (dr);
2407 }
2408
2409
2410
2411 /* A helper function used in the comparator function to sort data
2412 references. T1 and T2 are two trees to be compared.
2413 The function returns -1, 0, or 1. */
2414
2415 static int
2416 compare_tree (tree t1, tree t2)
2417 {
2418 int i, cmp;
2419 enum tree_code code;
2420 char tclass;
2421
2422 if (t1 == t2)
2423 return 0;
2424 if (t1 == NULL)
2425 return -1;
2426 if (t2 == NULL)
2427 return 1;
2428
2429
2430 if (TREE_CODE (t1) != TREE_CODE (t2))
2431 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2432
2433 code = TREE_CODE (t1);
2434 switch (code)
2435 {
2436 /* For const values, we can just use hash values for comparisons. */
2437 case INTEGER_CST:
2438 case REAL_CST:
2439 case FIXED_CST:
2440 case STRING_CST:
2441 case COMPLEX_CST:
2442 case VECTOR_CST:
2443 {
2444 hashval_t h1 = iterative_hash_expr (t1, 0);
2445 hashval_t h2 = iterative_hash_expr (t2, 0);
2446 if (h1 != h2)
2447 return h1 < h2 ? -1 : 1;
2448 break;
2449 }
2450
2451 case SSA_NAME:
2452 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2453 if (cmp != 0)
2454 return cmp;
2455
2456 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2457 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2458 break;
2459
2460 default:
2461 tclass = TREE_CODE_CLASS (code);
2462
2463 /* For var-decl, we could compare their UIDs. */
2464 if (tclass == tcc_declaration)
2465 {
2466 if (DECL_UID (t1) != DECL_UID (t2))
2467 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2468 break;
2469 }
2470
2471 /* For expressions with operands, compare their operands recursively. */
2472 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2473 {
2474 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2475 if (cmp != 0)
2476 return cmp;
2477 }
2478 }
2479
2480 return 0;
2481 }
2482
2483
2484 /* Compare two data-references DRA and DRB so that they can be sorted
2485 into chunks suitable for grouping. */
2486
2487 static int
2488 dr_group_sort_cmp (const void *dra_, const void *drb_)
2489 {
2490 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2491 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2492 int cmp;
2493
2494 /* Stabilize sort. */
2495 if (dra == drb)
2496 return 0;
2497
2498 /* Ordering of DRs according to base. */
2499 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2500 {
2501 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2502 if (cmp != 0)
2503 return cmp;
2504 }
2505
2506 /* And according to DR_OFFSET. */
2507 if (!dr_equal_offsets_p (dra, drb))
2508 {
2509 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2510 if (cmp != 0)
2511 return cmp;
2512 }
2513
2514 /* Put reads before writes. */
2515 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2516 return DR_IS_READ (dra) ? -1 : 1;
2517
2518 /* Then sort after access size. */
2519 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2520 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2521 {
2522 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2523 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2524 if (cmp != 0)
2525 return cmp;
2526 }
2527
2528 /* And after step. */
2529 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2530 {
2531 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2532 if (cmp != 0)
2533 return cmp;
2534 }
2535
2536 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2537 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2538 if (cmp == 0)
2539 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2540 return cmp;
2541 }
2542
2543 /* Function vect_analyze_data_ref_accesses.
2544
2545 Analyze the access pattern of all the data references in the loop.
2546
2547 FORNOW: the only access pattern that is considered vectorizable is a
2548 simple step 1 (consecutive) access.
2549
2550 FORNOW: handle only arrays and pointer accesses. */
2551
2552 bool
2553 vect_analyze_data_ref_accesses (vec_info *vinfo)
2554 {
2555 unsigned int i;
2556 vec<data_reference_p> datarefs = vinfo->datarefs;
2557 struct data_reference *dr;
2558
2559 if (dump_enabled_p ())
2560 dump_printf_loc (MSG_NOTE, vect_location,
2561 "=== vect_analyze_data_ref_accesses ===\n");
2562
2563 if (datarefs.is_empty ())
2564 return true;
2565
2566 /* Sort the array of datarefs to make building the interleaving chains
2567 linear. Don't modify the original vector's order, it is needed for
2568 determining what dependencies are reversed. */
2569 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2570 datarefs_copy.qsort (dr_group_sort_cmp);
2571
2572 /* Build the interleaving chains. */
2573 for (i = 0; i < datarefs_copy.length () - 1;)
2574 {
2575 data_reference_p dra = datarefs_copy[i];
2576 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2577 stmt_vec_info lastinfo = NULL;
2578 for (i = i + 1; i < datarefs_copy.length (); ++i)
2579 {
2580 data_reference_p drb = datarefs_copy[i];
2581 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2582
2583 /* ??? Imperfect sorting (non-compatible types, non-modulo
2584 accesses, same accesses) can lead to a group being artificially
2585 split here as we don't just skip over those. If it really
2586 matters we can push those to a worklist and re-iterate
2587 over them. Then we could just skip ahead to the next DR here. */
2588
2589 /* Check that the data-refs have same first location (except init)
2590 and they are both either store or load (not load and store,
2591 not masked loads or stores). */
2592 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2593 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2594 DR_BASE_ADDRESS (drb), 0)
2595 || !dr_equal_offsets_p (dra, drb)
2596 || !gimple_assign_single_p (DR_STMT (dra))
2597 || !gimple_assign_single_p (DR_STMT (drb)))
2598 break;
2599
2600 /* Check that the data-refs have the same constant size. */
2601 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2602 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2603 if (!tree_fits_uhwi_p (sza)
2604 || !tree_fits_uhwi_p (szb)
2605 || !tree_int_cst_equal (sza, szb))
2606 break;
2607
2608 /* Check that the data-refs have the same step. */
2609 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2610 break;
2611
2612 /* Do not place the same access in the interleaving chain twice. */
2613 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2614 break;
2615
2616 /* Check the types are compatible.
2617 ??? We don't distinguish this during sorting. */
2618 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2619 TREE_TYPE (DR_REF (drb))))
2620 break;
2621
2622 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2623 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2624 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2625 gcc_assert (init_a < init_b);
2626
2627 /* If init_b == init_a + the size of the type * k, we have an
2628 interleaving, and DRA is accessed before DRB. */
2629 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2630 if ((init_b - init_a) % type_size_a != 0)
2631 break;
2632
2633 /* For stores, require that the accesses be adjacent. This splits
2634 groups into chunks we support (we don't support vectorization
2635 of stores with gaps). */
2636 if (!DR_IS_READ (dra)
2637 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2638 (DR_INIT (datarefs_copy[i-1]))
2639 != type_size_a))
2640 break;
2641
2642 /* If the step (when constant and nonzero) is not greater than the
2643 difference between the data-refs' inits, split the group here;
2644 this keeps groups at suitable sizes. */
2645 if (tree_fits_shwi_p (DR_STEP (dra)))
2646 {
2647 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2648 if (step != 0 && step <= (init_b - init_a))
2649 break;
2650 }
2651
2652 if (dump_enabled_p ())
2653 {
2654 dump_printf_loc (MSG_NOTE, vect_location,
2655 "Detected interleaving ");
2656 if (DR_IS_READ (dra))
2657 dump_printf (MSG_NOTE, "load ");
2658 else
2659 dump_printf (MSG_NOTE, "store ");
2660 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2661 dump_printf (MSG_NOTE, " and ");
2662 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2663 dump_printf (MSG_NOTE, "\n");
2664 }
2665
2666 /* Link the found element into the group list. */
2667 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2668 {
2669 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2670 lastinfo = stmtinfo_a;
2671 }
2672 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2673 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2674 lastinfo = stmtinfo_b;
2675 }
2676 }
2677
2678 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2679 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2680 && !vect_analyze_data_ref_access (dr))
2681 {
2682 if (dump_enabled_p ())
2683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2684 "not vectorized: complicated access pattern.\n");
2685
2686 if (is_a <bb_vec_info> (vinfo))
2687 {
2688 /* Mark the statement as not vectorizable. */
2689 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2690 continue;
2691 }
2692 else
2693 {
2694 datarefs_copy.release ();
2695 return false;
2696 }
2697 }
2698
2699 datarefs_copy.release ();
2700 return true;
2701 }
2702
2703
2704 /* Operator == between two dr_with_seg_len objects.
2705
2706 This equality operator is used to make sure two data refs
2707 are the same one so that we will consider combining the
2708 aliasing checks of those two pairs of data dependent data
2709 refs. */
2710
2711 static bool
2712 operator == (const dr_with_seg_len& d1,
2713 const dr_with_seg_len& d2)
2714 {
2715 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2716 DR_BASE_ADDRESS (d2.dr), 0)
2717 && compare_tree (d1.offset, d2.offset) == 0
2718 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2719 }
2720
2721 /* Function comp_dr_with_seg_len_pair.
2722
2723 Comparison function for sorting objects of dr_with_seg_len_pair_t
2724 so that we can combine aliasing checks in one scan. */
2725
2726 static int
2727 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2728 {
2729 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2730 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2731
2732 const dr_with_seg_len &p11 = p1->first,
2733 &p12 = p1->second,
2734 &p21 = p2->first,
2735 &p22 = p2->second;
2736
2737 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2738 if a and c have the same base address and step, and b and d have the same
2739 address and step. Therefore, if either a&c or b&d do not have the same address
2740 and step, we don't care about the order of those two pairs after sorting.
2741 int comp_res;
2742
2743 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2744 DR_BASE_ADDRESS (p21.dr))) != 0)
2745 return comp_res;
2746 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2747 DR_BASE_ADDRESS (p22.dr))) != 0)
2748 return comp_res;
2749 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2750 return comp_res;
2751 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2752 return comp_res;
2753 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2754 return comp_res;
2755 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2756 return comp_res;
2757
2758 return 0;
2759 }
2760
2761 /* Function vect_vfa_segment_size.
2762
2763 Create an expression that computes the size of the segment
2764 that will be accessed for a data reference. The function takes into
2765 account that realignment loads may access one more vector.
2766
2767 Input:
2768 DR: The data reference.
2769 LENGTH_FACTOR: segment length to consider.
2770
2771 Return an expression whose value is the size of the segment which will be
2772 accessed by DR. */
2773
2774 static tree
2775 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2776 {
2777 tree segment_length;
2778
2779 if (integer_zerop (DR_STEP (dr)))
2780 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2781 else
2782 segment_length = size_binop (MULT_EXPR,
2783 fold_convert (sizetype, DR_STEP (dr)),
2784 fold_convert (sizetype, length_factor));
2785
2786 if (vect_supportable_dr_alignment (dr, false)
2787 == dr_explicit_realign_optimized)
2788 {
2789 tree vector_size = TYPE_SIZE_UNIT
2790 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2791
2792 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2793 }
2794 return segment_length;
2795 }
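/* Sketch of the sizes computed above, with assumed values (illustration
   only): a unit-stride "int" access with DR_STEP 4 and LENGTH_FACTOR
   equal to a vectorization factor of 4 gets a segment length of 16
   bytes; if the access uses the dr_explicit_realign_optimized scheme,
   one extra V4SI vector size is added, giving 32 bytes.  */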
2796
2797 /* Function vect_prune_runtime_alias_test_list.
2798
2799 Prune a list of ddrs to be tested at run-time by versioning for alias.
2800 Merge several alias checks into one if possible.
2801 Return FALSE if the resulting list of ddrs is longer than allowed by
2802 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2803
2804 bool
2805 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2806 {
2807 vec<ddr_p> may_alias_ddrs =
2808 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2809 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2810 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2811 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2812 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2813
2814 ddr_p ddr;
2815 unsigned int i;
2816 tree length_factor;
2817
2818 if (dump_enabled_p ())
2819 dump_printf_loc (MSG_NOTE, vect_location,
2820 "=== vect_prune_runtime_alias_test_list ===\n");
2821
2822 if (may_alias_ddrs.is_empty ())
2823 return true;
2824
2825 /* Basically, for each pair of dependent data refs store_ptr_0
2826 and load_ptr_0, we create an expression:
2827
2828 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2829 || ((load_ptr_0 + load_segment_length_0) <= store_ptr_0)
2830
2831 for aliasing checks. However, in some cases we can decrease
2832 the number of checks by combining two checks into one. For
2833 example, suppose we have another pair of data refs store_ptr_0
2834 and load_ptr_1, and if the following condition is satisfied:
2835
2836 load_ptr_0 < load_ptr_1 &&
2837 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2838
2839 (this condition means that, in each iteration of the vectorized loop,
2840 the accessed memory of store_ptr_0 cannot be between the memory
2841 of load_ptr_0 and load_ptr_1.)
2842
2843 we can then use only the following expression to finish the
2844 aliasing checks between store_ptr_0 & load_ptr_0 and
2845 store_ptr_0 & load_ptr_1:
2846
2847 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2848 || ((load_ptr_1 + load_segment_length_1) <= store_ptr_0)
2849
2850 Note that we only consider this merging when load_ptr_0 and load_ptr_1
2851 have the same base address. */
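/* Worked example with assumed numbers (illustration only): if
   store_ptr_0's segment is [S, S + 16), load_ptr_0's is [L, L + 16) and
   load_ptr_1's is [L + 16, L + 32), then
   load_ptr_1 - load_ptr_0 - load_segment_length_0 = 16 - 16 = 0 < 16,
   so the single combined check
   ((store_ptr_0 + 16) <= load_ptr_0) || ((load_ptr_1 + 16) <= store_ptr_0)
   subsumes both of the original checks.  */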
2852
2853 comp_alias_ddrs.create (may_alias_ddrs.length ());
2854
2855 /* First, we collect all data ref pairs for aliasing checks. */
2856 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2857 {
2858 struct data_reference *dr_a, *dr_b;
2859 gimple *dr_group_first_a, *dr_group_first_b;
2860 tree segment_length_a, segment_length_b;
2861 gimple *stmt_a, *stmt_b;
2862
2863 dr_a = DDR_A (ddr);
2864 stmt_a = DR_STMT (DDR_A (ddr));
2865 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2866 if (dr_group_first_a)
2867 {
2868 stmt_a = dr_group_first_a;
2869 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2870 }
2871
2872 dr_b = DDR_B (ddr);
2873 stmt_b = DR_STMT (DDR_B (ddr));
2874 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2875 if (dr_group_first_b)
2876 {
2877 stmt_b = dr_group_first_b;
2878 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2879 }
2880
2881 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2882 length_factor = scalar_loop_iters;
2883 else
2884 length_factor = size_int (vect_factor);
2885 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2886 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2887
2888 dr_with_seg_len_pair_t dr_with_seg_len_pair
2889 (dr_with_seg_len (dr_a, segment_length_a),
2890 dr_with_seg_len (dr_b, segment_length_b));
2891
2892 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2893 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2894
2895 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2896 }
2897
2898 /* Second, we sort the collected data ref pairs so that we can scan
2899 them once to combine all possible aliasing checks. */
2900 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2901
2902 /* Third, we scan the sorted dr pairs and check if we can combine
2903 alias checks of two neighbouring dr pairs. */
2904 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2905 {
2906 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2907 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2908 *dr_b1 = &comp_alias_ddrs[i-1].second,
2909 *dr_a2 = &comp_alias_ddrs[i].first,
2910 *dr_b2 = &comp_alias_ddrs[i].second;
2911
2912 /* Remove duplicate data ref pairs. */
2913 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2914 {
2915 if (dump_enabled_p ())
2916 {
2917 dump_printf_loc (MSG_NOTE, vect_location,
2918 "found equal ranges ");
2919 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2920 DR_REF (dr_a1->dr));
2921 dump_printf (MSG_NOTE, ", ");
2922 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2923 DR_REF (dr_b1->dr));
2924 dump_printf (MSG_NOTE, " and ");
2925 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2926 DR_REF (dr_a2->dr));
2927 dump_printf (MSG_NOTE, ", ");
2928 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2929 DR_REF (dr_b2->dr));
2930 dump_printf (MSG_NOTE, "\n");
2931 }
2932
2933 comp_alias_ddrs.ordered_remove (i--);
2934 continue;
2935 }
2936
2937 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2938 {
2939 /* We consider the case that DR_B1 and DR_B2 are the same memref,
2940 and DR_A1 and DR_A2 are two consecutive memrefs. */
2941 if (*dr_a1 == *dr_a2)
2942 {
2943 std::swap (dr_a1, dr_b1);
2944 std::swap (dr_a2, dr_b2);
2945 }
2946
2947 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2948 DR_BASE_ADDRESS (dr_a2->dr),
2949 0)
2950 || !tree_fits_shwi_p (dr_a1->offset)
2951 || !tree_fits_shwi_p (dr_a2->offset))
2952 continue;
2953
2954 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2955 - tree_to_shwi (dr_a1->offset));
2956
2957
2958 /* Now we check if the following condition is satisfied:
2959
2960 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2961
2962 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2963 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2964 have to make a conservative estimate. We can get the minimum value
2965 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2966 then either of the following two conditions can guarantee the
2967 one above:
2968
2969 1: DIFF <= MIN_SEG_LEN_B
2970 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2971
2972 */
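/* For instance, with assumed values (illustration only): DR_A1->OFFSET
   == 0, DR_A2->OFFSET == 16, SEGMENT_LENGTH_A == 16 and MIN_SEG_LEN_B
   == 16 give DIFF == 16 <= 16, so condition 1 holds and the two ranges
   are merged below into a single check with seg_len 16 + 16 = 32.  */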
2973
2974 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2975 ? tree_to_shwi (dr_b1->seg_len)
2976 : vect_factor);
2977
2978 if (diff <= min_seg_len_b
2979 || (tree_fits_shwi_p (dr_a1->seg_len)
2980 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2981 {
2982 if (dump_enabled_p ())
2983 {
2984 dump_printf_loc (MSG_NOTE, vect_location,
2985 "merging ranges for ");
2986 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2987 DR_REF (dr_a1->dr));
2988 dump_printf (MSG_NOTE, ", ");
2989 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2990 DR_REF (dr_b1->dr));
2991 dump_printf (MSG_NOTE, " and ");
2992 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2993 DR_REF (dr_a2->dr));
2994 dump_printf (MSG_NOTE, ", ");
2995 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2996 DR_REF (dr_b2->dr));
2997 dump_printf (MSG_NOTE, "\n");
2998 }
2999
3000 dr_a1->seg_len = size_binop (PLUS_EXPR,
3001 dr_a2->seg_len, size_int (diff));
3002 comp_alias_ddrs.ordered_remove (i--);
3003 }
3004 }
3005 }
3006
3007 dump_printf_loc (MSG_NOTE, vect_location,
3008 "improved number of alias checks from %d to %d\n",
3009 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3010 if ((int) comp_alias_ddrs.length () >
3011 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3012 return false;
3013
3014 return true;
3015 }
3016
3017 /* Check whether a non-affine read or write in stmt is suitable for gather load
3018 or scatter store and if so, return a builtin decl for that operation. */
3019
3020 tree
3021 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
3022 tree *offp, int *scalep)
3023 {
3024 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3025 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3026 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3027 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3028 tree offtype = NULL_TREE;
3029 tree decl, base, off;
3030 machine_mode pmode;
3031 int punsignedp, pvolatilep;
3032
3033 base = DR_REF (dr);
3034 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3035 see if we can use the def stmt of the address. */
3036 if (is_gimple_call (stmt)
3037 && gimple_call_internal_p (stmt)
3038 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3039 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3040 && TREE_CODE (base) == MEM_REF
3041 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3042 && integer_zerop (TREE_OPERAND (base, 1))
3043 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3044 {
3045 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3046 if (is_gimple_assign (def_stmt)
3047 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3048 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3049 }
3050
3051 /* The gather and scatter builtins need an address of the form
3052 loop_invariant + vector * {1, 2, 4, 8}
3053 or
3054 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3055 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3056 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3057 multiplications and additions in it. To get a vector, we need
3058 a single SSA_NAME that will be defined in the loop and will
3059 contain everything that is not loop invariant and that can be
3060 vectorized. The following code attempts to find such a preexisting
3061 SSA_NAME OFF and put the loop invariants into a tree BASE
3062 that can be gimplified before the loop. */
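/* Hypothetical example (illustration only, not from the original
   source): for a gather-style read a[b[i]] with "int *a" and "int *b",
   the decomposition below would ideally end with BASE holding
   (sizetype) a plus any constant offset, OFF being the SSA_NAME that
   carries b[i] (possibly sign-extended), and SCALE == 4.  */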
3063 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3064 &pmode, &punsignedp, &pvolatilep, false);
3065 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3066
3067 if (TREE_CODE (base) == MEM_REF)
3068 {
3069 if (!integer_zerop (TREE_OPERAND (base, 1)))
3070 {
3071 if (off == NULL_TREE)
3072 {
3073 offset_int moff = mem_ref_offset (base);
3074 off = wide_int_to_tree (sizetype, moff);
3075 }
3076 else
3077 off = size_binop (PLUS_EXPR, off,
3078 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3079 }
3080 base = TREE_OPERAND (base, 0);
3081 }
3082 else
3083 base = build_fold_addr_expr (base);
3084
3085 if (off == NULL_TREE)
3086 off = size_zero_node;
3087
3088 /* If BASE is not loop invariant, then if OFF is 0 we start with just the
3089 constant offset in the loop-invariant BASE and continue with BASE
3090 as OFF; otherwise give up.
3091 We could handle that case by gimplifying the addition of BASE + OFF
3092 into some SSA_NAME and using that as OFF, but for now punt. */
3093 if (!expr_invariant_in_loop_p (loop, base))
3094 {
3095 if (!integer_zerop (off))
3096 return NULL_TREE;
3097 off = base;
3098 base = size_int (pbitpos / BITS_PER_UNIT);
3099 }
3100 /* Otherwise put base + constant offset into the loop invariant BASE
3101 and continue with OFF. */
3102 else
3103 {
3104 base = fold_convert (sizetype, base);
3105 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3106 }
3107
3108 /* OFF at this point may be either a SSA_NAME or some tree expression
3109 from get_inner_reference. Try to peel off loop invariants from it
3110 into BASE as long as possible. */
3111 STRIP_NOPS (off);
3112 while (offtype == NULL_TREE)
3113 {
3114 enum tree_code code;
3115 tree op0, op1, add = NULL_TREE;
3116
3117 if (TREE_CODE (off) == SSA_NAME)
3118 {
3119 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3120
3121 if (expr_invariant_in_loop_p (loop, off))
3122 return NULL_TREE;
3123
3124 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3125 break;
3126
3127 op0 = gimple_assign_rhs1 (def_stmt);
3128 code = gimple_assign_rhs_code (def_stmt);
3129 op1 = gimple_assign_rhs2 (def_stmt);
3130 }
3131 else
3132 {
3133 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3134 return NULL_TREE;
3135 code = TREE_CODE (off);
3136 extract_ops_from_tree (off, &code, &op0, &op1);
3137 }
3138 switch (code)
3139 {
3140 case POINTER_PLUS_EXPR:
3141 case PLUS_EXPR:
3142 if (expr_invariant_in_loop_p (loop, op0))
3143 {
3144 add = op0;
3145 off = op1;
3146 do_add:
3147 add = fold_convert (sizetype, add);
3148 if (scale != 1)
3149 add = size_binop (MULT_EXPR, add, size_int (scale));
3150 base = size_binop (PLUS_EXPR, base, add);
3151 continue;
3152 }
3153 if (expr_invariant_in_loop_p (loop, op1))
3154 {
3155 add = op1;
3156 off = op0;
3157 goto do_add;
3158 }
3159 break;
3160 case MINUS_EXPR:
3161 if (expr_invariant_in_loop_p (loop, op1))
3162 {
3163 add = fold_convert (sizetype, op1);
3164 add = size_binop (MINUS_EXPR, size_zero_node, add);
3165 off = op0;
3166 goto do_add;
3167 }
3168 break;
3169 case MULT_EXPR:
3170 if (scale == 1 && tree_fits_shwi_p (op1))
3171 {
3172 scale = tree_to_shwi (op1);
3173 off = op0;
3174 continue;
3175 }
3176 break;
3177 case SSA_NAME:
3178 off = op0;
3179 continue;
3180 CASE_CONVERT:
3181 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3182 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3183 break;
3184 if (TYPE_PRECISION (TREE_TYPE (op0))
3185 == TYPE_PRECISION (TREE_TYPE (off)))
3186 {
3187 off = op0;
3188 continue;
3189 }
3190 if (TYPE_PRECISION (TREE_TYPE (op0))
3191 < TYPE_PRECISION (TREE_TYPE (off)))
3192 {
3193 off = op0;
3194 offtype = TREE_TYPE (off);
3195 STRIP_NOPS (off);
3196 continue;
3197 }
3198 break;
3199 default:
3200 break;
3201 }
3202 break;
3203 }
3204
3205 /* If at the end OFF still isn't a SSA_NAME or isn't
3206 defined in the loop, punt. */
3207 if (TREE_CODE (off) != SSA_NAME
3208 || expr_invariant_in_loop_p (loop, off))
3209 return NULL_TREE;
3210
3211 if (offtype == NULL_TREE)
3212 offtype = TREE_TYPE (off);
3213
3214 if (DR_IS_READ (dr))
3215 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3216 offtype, scale);
3217 else
3218 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3219 offtype, scale);
3220
3221 if (decl == NULL_TREE)
3222 return NULL_TREE;
3223
3224 if (basep)
3225 *basep = base;
3226 if (offp)
3227 *offp = off;
3228 if (scalep)
3229 *scalep = scale;
3230 return decl;
3231 }
3232
3233 /* Function vect_analyze_data_refs.
3234
3235 Find all the data references in the loop or basic block.
3236
3237 The general structure of the analysis of data refs in the vectorizer is as
3238 follows:
3239 1- vect_analyze_data_refs(loop/bb): call
3240 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3241 in the loop/bb and their dependences.
3242 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3243 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3244 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3245
3246 */
3247
3248 bool
3249 vect_analyze_data_refs (vec_info *vinfo, int *min_vf, unsigned *n_stmts)
3250 {
3251 struct loop *loop = NULL;
3252 basic_block bb = NULL;
3253 unsigned int i;
3254 vec<data_reference_p> datarefs;
3255 struct data_reference *dr;
3256 tree scalar_type;
3257
3258 if (dump_enabled_p ())
3259 dump_printf_loc (MSG_NOTE, vect_location,
3260 "=== vect_analyze_data_refs ===\n");
3261
3262 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3263 {
3264 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3265
3266 loop = LOOP_VINFO_LOOP (loop_vinfo);
3267 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3268 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3269 {
3270 if (dump_enabled_p ())
3271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3272 "not vectorized: loop contains function calls"
3273 " or data references that cannot be analyzed\n");
3274 return false;
3275 }
3276
3277 for (i = 0; i < loop->num_nodes; i++)
3278 {
3279 gimple_stmt_iterator gsi;
3280
3281 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3282 {
3283 gimple *stmt = gsi_stmt (gsi);
3284 if (is_gimple_debug (stmt))
3285 continue;
3286 ++*n_stmts;
3287 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3288 {
3289 if (is_gimple_call (stmt) && loop->safelen)
3290 {
3291 tree fndecl = gimple_call_fndecl (stmt), op;
3292 if (fndecl != NULL_TREE)
3293 {
3294 struct cgraph_node *node = cgraph_node::get (fndecl);
3295 if (node != NULL && node->simd_clones != NULL)
3296 {
3297 unsigned int j, n = gimple_call_num_args (stmt);
3298 for (j = 0; j < n; j++)
3299 {
3300 op = gimple_call_arg (stmt, j);
3301 if (DECL_P (op)
3302 || (REFERENCE_CLASS_P (op)
3303 && get_base_address (op)))
3304 break;
3305 }
3306 op = gimple_call_lhs (stmt);
3307 /* Ignore #pragma omp declare simd functions
3308 if they don't have data references in the
3309 call stmt itself. */
3310 if (j == n
3311 && !(op
3312 && (DECL_P (op)
3313 || (REFERENCE_CLASS_P (op)
3314 && get_base_address (op)))))
3315 continue;
3316 }
3317 }
3318 }
3319 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3320 if (dump_enabled_p ())
3321 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3322 "not vectorized: loop contains function "
3323 "calls or data references that cannot "
3324 "be analyzed\n");
3325 return false;
3326 }
3327 }
3328 }
3329
3330 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3331 }
3332 else
3333 {
3334 bb_vec_info bb_vinfo = as_a <bb_vec_info> (vinfo);
3335 gimple_stmt_iterator gsi;
3336
3337 bb = BB_VINFO_BB (bb_vinfo);
3338 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3339 {
3340 gimple *stmt = gsi_stmt (gsi);
3341 if (is_gimple_debug (stmt))
3342 continue;
3343 ++*n_stmts;
3344 if (!find_data_references_in_stmt (NULL, stmt,
3345 &BB_VINFO_DATAREFS (bb_vinfo)))
3346 {
3347 /* Mark the rest of the basic-block as unvectorizable. */
3348 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3349 {
3350 stmt = gsi_stmt (gsi);
3351 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3352 }
3353 break;
3354 }
3355 }
3356
3357 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3358 }
3359
3360 /* Go through the data-refs, check that the analysis succeeded. Update
3361 pointer from stmt_vec_info struct to DR and vectype. */
3362
3363 FOR_EACH_VEC_ELT (datarefs, i, dr)
3364 {
3365 gimple *stmt;
3366 stmt_vec_info stmt_info;
3367 tree base, offset, init;
3368 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3369 bool simd_lane_access = false;
3370 int vf;
3371
3372 again:
3373 if (!dr || !DR_REF (dr))
3374 {
3375 if (dump_enabled_p ())
3376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3377 "not vectorized: unhandled data-ref\n");
3378 return false;
3379 }
3380
3381 stmt = DR_STMT (dr);
3382 stmt_info = vinfo_for_stmt (stmt);
3383
3384 /* Discard clobbers from the dataref vector. We will remove
3385 clobber stmts during vectorization. */
3386 if (gimple_clobber_p (stmt))
3387 {
3388 free_data_ref (dr);
3389 if (i == datarefs.length () - 1)
3390 {
3391 datarefs.pop ();
3392 break;
3393 }
3394 datarefs.ordered_remove (i);
3395 dr = datarefs[i];
3396 goto again;
3397 }
3398
3399 /* Check that analysis of the data-ref succeeded. */
3400 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3401 || !DR_STEP (dr))
3402 {
3403 bool maybe_gather
3404 = DR_IS_READ (dr)
3405 && !TREE_THIS_VOLATILE (DR_REF (dr))
3406 && targetm.vectorize.builtin_gather != NULL;
3407 bool maybe_scatter
3408 = DR_IS_WRITE (dr)
3409 && !TREE_THIS_VOLATILE (DR_REF (dr))
3410 && targetm.vectorize.builtin_scatter != NULL;
3411 bool maybe_simd_lane_access
3412 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3413
3414 /* If target supports vector gather loads or scatter stores, or if
3415 this might be a SIMD lane access, check whether they can be used. */
3416 if (is_a <loop_vec_info> (vinfo)
3417 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3418 && !nested_in_vect_loop_p (loop, stmt))
3419 {
3420 struct data_reference *newdr
3421 = create_data_ref (NULL, loop_containing_stmt (stmt),
3422 DR_REF (dr), stmt, maybe_scatter ? false : true);
3423 gcc_assert (newdr != NULL && DR_REF (newdr));
3424 if (DR_BASE_ADDRESS (newdr)
3425 && DR_OFFSET (newdr)
3426 && DR_INIT (newdr)
3427 && DR_STEP (newdr)
3428 && integer_zerop (DR_STEP (newdr)))
3429 {
3430 if (maybe_simd_lane_access)
3431 {
3432 tree off = DR_OFFSET (newdr);
3433 STRIP_NOPS (off);
3434 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3435 && TREE_CODE (off) == MULT_EXPR
3436 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3437 {
3438 tree step = TREE_OPERAND (off, 1);
3439 off = TREE_OPERAND (off, 0);
3440 STRIP_NOPS (off);
3441 if (CONVERT_EXPR_P (off)
3442 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3443 0)))
3444 < TYPE_PRECISION (TREE_TYPE (off)))
3445 off = TREE_OPERAND (off, 0);
3446 if (TREE_CODE (off) == SSA_NAME)
3447 {
3448 gimple *def = SSA_NAME_DEF_STMT (off);
3449 tree reft = TREE_TYPE (DR_REF (newdr));
3450 if (is_gimple_call (def)
3451 && gimple_call_internal_p (def)
3452 && (gimple_call_internal_fn (def)
3453 == IFN_GOMP_SIMD_LANE))
3454 {
3455 tree arg = gimple_call_arg (def, 0);
3456 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3457 arg = SSA_NAME_VAR (arg);
3458 if (arg == loop->simduid
3459 /* For now. */
3460 && tree_int_cst_equal
3461 (TYPE_SIZE_UNIT (reft),
3462 step))
3463 {
3464 DR_OFFSET (newdr) = ssize_int (0);
3465 DR_STEP (newdr) = step;
3466 DR_ALIGNED_TO (newdr)
3467 = size_int (BIGGEST_ALIGNMENT);
3468 dr = newdr;
3469 simd_lane_access = true;
3470 }
3471 }
3472 }
3473 }
3474 }
3475 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3476 {
3477 dr = newdr;
3478 if (maybe_gather)
3479 gatherscatter = GATHER;
3480 else
3481 gatherscatter = SCATTER;
3482 }
3483 }
3484 if (gatherscatter == SG_NONE && !simd_lane_access)
3485 free_data_ref (newdr);
3486 }
3487
3488 if (gatherscatter == SG_NONE && !simd_lane_access)
3489 {
3490 if (dump_enabled_p ())
3491 {
3492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3493 "not vectorized: data ref analysis "
3494 "failed ");
3495 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3496 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3497 }
3498
3499 if (is_a <bb_vec_info> (vinfo))
3500 break;
3501
3502 return false;
3503 }
3504 }
3505
3506 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3507 {
3508 if (dump_enabled_p ())
3509 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3510 "not vectorized: base addr of dr is a "
3511 "constant\n");
3512
3513 if (is_a <bb_vec_info> (vinfo))
3514 break;
3515
3516 if (gatherscatter != SG_NONE || simd_lane_access)
3517 free_data_ref (dr);
3518 return false;
3519 }
3520
3521 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3522 {
3523 if (dump_enabled_p ())
3524 {
3525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3526 "not vectorized: volatile type ");
3527 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3528 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3529 }
3530
3531 if (is_a <bb_vec_info> (vinfo))
3532 break;
3533
3534 return false;
3535 }
3536
3537 if (stmt_can_throw_internal (stmt))
3538 {
3539 if (dump_enabled_p ())
3540 {
3541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3542 "not vectorized: statement can throw an "
3543 "exception ");
3544 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3545 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3546 }
3547
3548 if (is_a <bb_vec_info> (vinfo))
3549 break;
3550
3551 if (gatherscatter != SG_NONE || simd_lane_access)
3552 free_data_ref (dr);
3553 return false;
3554 }
3555
3556 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3557 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3558 {
3559 if (dump_enabled_p ())
3560 {
3561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3562 "not vectorized: statement is bitfield "
3563 "access ");
3564 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3565 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3566 }
3567
3568 if (is_a <bb_vec_info> (vinfo))
3569 break;
3570
3571 if (gatherscatter != SG_NONE || simd_lane_access)
3572 free_data_ref (dr);
3573 return false;
3574 }
3575
3576 base = unshare_expr (DR_BASE_ADDRESS (dr));
3577 offset = unshare_expr (DR_OFFSET (dr));
3578 init = unshare_expr (DR_INIT (dr));
3579
3580 if (is_gimple_call (stmt)
3581 && (!gimple_call_internal_p (stmt)
3582 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3583 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3584 {
3585 if (dump_enabled_p ())
3586 {
3587 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3588 "not vectorized: dr in a call ");
3589 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3590 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3591 }
3592
3593 if (is_a <bb_vec_info> (vinfo))
3594 break;
3595
3596 if (gatherscatter != SG_NONE || simd_lane_access)
3597 free_data_ref (dr);
3598 return false;
3599 }
3600
3601 /* Update DR field in stmt_vec_info struct. */
3602
3603 /* If the dataref is in an inner-loop of the loop that is considered for
3604 vectorization, we also want to analyze the access relative to
3605 the outer-loop (DR contains information only relative to the
3606 inner-most enclosing loop). We do that by building a reference to the
3607 first location accessed by the inner-loop, and analyze it relative to
3608 the outer-loop. */
3609 if (loop && nested_in_vect_loop_p (loop, stmt))
3610 {
3611 tree outer_step, outer_base, outer_init;
3612 HOST_WIDE_INT pbitsize, pbitpos;
3613 tree poffset;
3614 machine_mode pmode;
3615 int punsignedp, pvolatilep;
3616 affine_iv base_iv, offset_iv;
3617 tree dinit;
3618
3619 /* Build a reference to the first location accessed by the
3620 inner-loop: *(BASE+INIT). (The first location is actually
3621 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3622 tree inner_base = build_fold_indirect_ref
3623 (fold_build_pointer_plus (base, init));
3624
3625 if (dump_enabled_p ())
3626 {
3627 dump_printf_loc (MSG_NOTE, vect_location,
3628 "analyze in outer-loop: ");
3629 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3630 dump_printf (MSG_NOTE, "\n");
3631 }
3632
3633 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3634 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3635 gcc_assert (outer_base != NULL_TREE);
3636
3637 if (pbitpos % BITS_PER_UNIT != 0)
3638 {
3639 if (dump_enabled_p ())
3640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3641 "failed: bit offset alignment.\n");
3642 return false;
3643 }
3644
3645 outer_base = build_fold_addr_expr (outer_base);
3646 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3647 &base_iv, false))
3648 {
3649 if (dump_enabled_p ())
3650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3651 "failed: evolution of base is not affine.\n");
3652 return false;
3653 }
3654
3655 if (offset)
3656 {
3657 if (poffset)
3658 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3659 poffset);
3660 else
3661 poffset = offset;
3662 }
3663
3664 if (!poffset)
3665 {
3666 offset_iv.base = ssize_int (0);
3667 offset_iv.step = ssize_int (0);
3668 }
3669 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3670 &offset_iv, false))
3671 {
3672 if (dump_enabled_p ())
3673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3674 "evolution of offset is not affine.\n");
3675 return false;
3676 }
3677
3678 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3679 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3680 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3681 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3682 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3683
3684 outer_step = size_binop (PLUS_EXPR,
3685 fold_convert (ssizetype, base_iv.step),
3686 fold_convert (ssizetype, offset_iv.step));
3687
3688 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3689 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3690 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3691 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3692 STMT_VINFO_DR_OFFSET (stmt_info) =
3693 fold_convert (ssizetype, offset_iv.base);
3694 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3695 size_int (highest_pow2_factor (offset_iv.base));
3696
3697 if (dump_enabled_p ())
3698 {
3699 dump_printf_loc (MSG_NOTE, vect_location,
3700 "\touter base_address: ");
3701 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3702 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3703 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3704 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3705 STMT_VINFO_DR_OFFSET (stmt_info));
3706 dump_printf (MSG_NOTE,
3707 "\n\touter constant offset from base address: ");
3708 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3709 STMT_VINFO_DR_INIT (stmt_info));
3710 dump_printf (MSG_NOTE, "\n\touter step: ");
3711 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3712 STMT_VINFO_DR_STEP (stmt_info));
3713 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3714 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3715 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3716 dump_printf (MSG_NOTE, "\n");
3717 }
3718 }
3719
3720 if (STMT_VINFO_DATA_REF (stmt_info))
3721 {
3722 if (dump_enabled_p ())
3723 {
3724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3725 "not vectorized: more than one data ref "
3726 "in stmt: ");
3727 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3728 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3729 }
3730
3731 if (is_a <bb_vec_info> (vinfo))
3732 break;
3733
3734 if (gatherscatter != SG_NONE || simd_lane_access)
3735 free_data_ref (dr);
3736 return false;
3737 }
3738
3739 STMT_VINFO_DATA_REF (stmt_info) = dr;
3740 if (simd_lane_access)
3741 {
3742 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3743 free_data_ref (datarefs[i]);
3744 datarefs[i] = dr;
3745 }
3746
3747 /* Set vectype for STMT. */
3748 scalar_type = TREE_TYPE (DR_REF (dr));
3749 STMT_VINFO_VECTYPE (stmt_info)
3750 = get_vectype_for_scalar_type (scalar_type);
3751 if (!STMT_VINFO_VECTYPE (stmt_info))
3752 {
3753 if (dump_enabled_p ())
3754 {
3755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3756 "not vectorized: no vectype for stmt: ");
3757 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3758 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3759 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3760 scalar_type);
3761 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3762 }
3763
3764 if (is_a <bb_vec_info> (vinfo))
3765 break;
3766
3767 if (gatherscatter != SG_NONE || simd_lane_access)
3768 {
3769 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3770 if (gatherscatter != SG_NONE)
3771 free_data_ref (dr);
3772 }
3773 return false;
3774 }
3775 else
3776 {
3777 if (dump_enabled_p ())
3778 {
3779 dump_printf_loc (MSG_NOTE, vect_location,
3780 "got vectype for stmt: ");
3781 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3782 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3783 STMT_VINFO_VECTYPE (stmt_info));
3784 dump_printf (MSG_NOTE, "\n");
3785 }
3786 }
3787
3788 /* Adjust the minimal vectorization factor according to the
3789 vector type. */
3790 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3791 if (vf > *min_vf)
3792 *min_vf = vf;
3793
3794 if (gatherscatter != SG_NONE)
3795 {
3796 tree off;
3797 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3798 NULL, &off, NULL)
3799 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3800 {
3801 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3802 free_data_ref (dr);
3803 if (dump_enabled_p ())
3804 {
3805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3806 (gatherscatter == GATHER) ?
3807 "not vectorized: not suitable for gather "
3808 "load " :
3809 "not vectorized: not suitable for scatter "
3810 "store ");
3811 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3812 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3813 }
3814 return false;
3815 }
3816
3817 datarefs[i] = dr;
3818 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3819 }
3820
3821 else if (is_a <loop_vec_info> (vinfo)
3822 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3823 {
3824 if (nested_in_vect_loop_p (loop, stmt))
3825 {
3826 if (dump_enabled_p ())
3827 {
3828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3829 "not vectorized: not suitable for strided "
3830 "load ");
3831 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3832 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3833 }
3834 return false;
3835 }
3836 STMT_VINFO_STRIDED_P (stmt_info) = true;
3837 }
3838 }
3839
3840 /* If we stopped analysis at the first dataref we could not analyze
3841 when trying to vectorize a basic-block, mark the rest of the datarefs
3842 as not vectorizable and truncate the vector of datarefs. That
3843 avoids spending useless time in analyzing their dependence. */
3844 if (i != datarefs.length ())
3845 {
3846 gcc_assert (is_a <bb_vec_info> (vinfo));
3847 for (unsigned j = i; j < datarefs.length (); ++j)
3848 {
3849 data_reference_p dr = datarefs[j];
3850 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3851 free_data_ref (dr);
3852 }
3853 datarefs.truncate (i);
3854 }
3855
3856 return true;
3857 }
3858
3859
3860 /* Function vect_get_new_vect_var.
3861
3862 Returns a name for a new variable. The current naming scheme appends the
3863 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3864 the name of vectorizer generated variables, and appends that to NAME if
3865 provided. */
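/* For example (illustrative only), calling this with VAR_KIND ==
   vect_pointer_var and NAME == "a" yields a temporary whose name is based
   on "vectp_a"; with a NULL NAME only the bare prefix ("vect", "stmp" or
   "vectp") is used.  */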
3866
3867 tree
3868 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3869 {
3870 const char *prefix;
3871 tree new_vect_var;
3872
3873 switch (var_kind)
3874 {
3875 case vect_simple_var:
3876 prefix = "vect";
3877 break;
3878 case vect_scalar_var:
3879 prefix = "stmp";
3880 break;
3881 case vect_pointer_var:
3882 prefix = "vectp";
3883 break;
3884 default:
3885 gcc_unreachable ();
3886 }
3887
3888 if (name)
3889 {
3890 char* tmp = concat (prefix, "_", name, NULL);
3891 new_vect_var = create_tmp_reg (type, tmp);
3892 free (tmp);
3893 }
3894 else
3895 new_vect_var = create_tmp_reg (type, prefix);
3896
3897 return new_vect_var;
3898 }
3899
3900 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3901
3902 static void
3903 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3904 stmt_vec_info stmt_info)
3905 {
3906 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3907 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3908 int misalign = DR_MISALIGNMENT (dr);
3909 if (misalign == -1)
3910 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3911 else
3912 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3913 }
3914
3915 /* Function vect_create_addr_base_for_vector_ref.
3916
3917 Create an expression that computes the address of the first memory location
3918 that will be accessed for a data reference.
3919
3920 Input:
3921 STMT: The statement containing the data reference.
3922 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3923 OFFSET: Optional. If supplied, it is added to the initial address.
3924 LOOP: Specify relative to which loop-nest the address should be computed.
3925 For example, when the dataref is in an inner-loop nested in an
3926 outer-loop that is now being vectorized, LOOP can be either the
3927 outer-loop, or the inner-loop. The first memory location accessed
3928 by the following dataref ('in' points to short):
3929
3930 for (i=0; i<N; i++)
3931 for (j=0; j<M; j++)
3932 s += in[i+j]
3933
3934 is as follows:
3935 if LOOP=i_loop: &in (relative to i_loop)
3936 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3937 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3938 initial address. Unlike OFFSET, which is the number of elements to
3939 be added, BYTE_OFFSET is measured in bytes.
3940
3941 Output:
3942 1. Return an SSA_NAME whose value is the address of the memory location of
3943 the first vector of the data reference.
3944 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3945 these statement(s) which define the returned SSA_NAME.
3946
3947 FORNOW: We are only handling array accesses with step 1. */
3948
3949 tree
3950 vect_create_addr_base_for_vector_ref (gimple *stmt,
3951 gimple_seq *new_stmt_list,
3952 tree offset,
3953 struct loop *loop,
3954 tree byte_offset)
3955 {
3956 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3957 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3958 tree data_ref_base;
3959 const char *base_name;
3960 tree addr_base;
3961 tree dest;
3962 gimple_seq seq = NULL;
3963 tree base_offset;
3964 tree init;
3965 tree vect_ptr_type;
3966 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3967 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3968
3969 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3970 {
3971 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3972
3973 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3974
3975 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3976 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3977 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3978 }
3979 else
3980 {
3981 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3982 base_offset = unshare_expr (DR_OFFSET (dr));
3983 init = unshare_expr (DR_INIT (dr));
3984 }
3985
3986 if (loop_vinfo)
3987 base_name = get_name (data_ref_base);
3988 else
3989 {
3990 base_offset = ssize_int (0);
3991 init = ssize_int (0);
3992 base_name = get_name (DR_REF (dr));
3993 }
3994
3995 /* Create base_offset */
3996 base_offset = size_binop (PLUS_EXPR,
3997 fold_convert (sizetype, base_offset),
3998 fold_convert (sizetype, init));
3999
4000 if (offset)
4001 {
4002 offset = fold_build2 (MULT_EXPR, sizetype,
4003 fold_convert (sizetype, offset), step);
4004 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4005 base_offset, offset);
4006 }
4007 if (byte_offset)
4008 {
4009 byte_offset = fold_convert (sizetype, byte_offset);
4010 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4011 base_offset, byte_offset);
4012 }
4013
4014 /* base + base_offset */
4015 if (loop_vinfo)
4016 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4017 else
4018 {
4019 addr_base = build1 (ADDR_EXPR,
4020 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4021 unshare_expr (DR_REF (dr)));
4022 }
4023
4024 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4025 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4026 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4027 gimple_seq_add_seq (new_stmt_list, seq);
4028
4029 if (DR_PTR_INFO (dr)
4030 && TREE_CODE (addr_base) == SSA_NAME
4031 && !SSA_NAME_PTR_INFO (addr_base))
4032 {
4033 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4034 if (offset || byte_offset)
4035 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4036 }
4037
4038 if (dump_enabled_p ())
4039 {
4040 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4041 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4042 dump_printf (MSG_NOTE, "\n");
4043 }
4044
4045 return addr_base;
4046 }
4047
4048
4049 /* Function vect_create_data_ref_ptr.
4050
4051 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4052 location accessed in the loop by STMT, along with the def-use update
4053 chain to appropriately advance the pointer through the loop iterations.
4054 Also set aliasing information for the pointer. This pointer is used by
4055 the callers to this function to create a memory reference expression for
4056 vector load/store access.
4057
4058 Input:
4059 1. STMT: a stmt that references memory. Expected to be of the form
4060 GIMPLE_ASSIGN <name, data-ref> or
4061 GIMPLE_ASSIGN <data-ref, name>.
4062 2. AGGR_TYPE: the type of the reference, which should be either a vector
4063 or an array.
4064 3. AT_LOOP: the loop where the vector memref is to be created.
4065 4. OFFSET (optional): an offset to be added to the initial address accessed
4066 by the data-ref in STMT.
4067 5. BSI: location where the new stmts are to be placed if there is no loop
4068 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4069 pointing to the initial address.
4070 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4071 to the initial address accessed by the data-ref in STMT. This is
4072 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4073 in bytes.
4074
4075 Output:
4076 1. Declare a new ptr to vector_type, and have it point to the base of the
4077 data reference (initial address accessed by the data reference).
4078 For example, for vector of type V8HI, the following code is generated:
4079
4080 v8hi *ap;
4081 ap = (v8hi *)initial_address;
4082
4083 if OFFSET is not supplied:
4084 initial_address = &a[init];
4085 if OFFSET is supplied:
4086 initial_address = &a[init + OFFSET];
4087 if BYTE_OFFSET is supplied:
4088 initial_address = &a[init] + BYTE_OFFSET;
4089
4090 Return the initial_address in INITIAL_ADDRESS.
4091
4092 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4093 update the pointer in each iteration of the loop.
4094
4095 Return the increment stmt that updates the pointer in PTR_INCR.
4096
4097 3. Set INV_P to true if the access pattern of the data reference in the
4098 vectorized loop is invariant. Set it to false otherwise.
4099
4100 4. Return the pointer. */
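/* A minimal usage sketch (illustrative only; the surrounding variables are
   assumed to exist in the caller):

     tree dummy, dataref_ptr;
     gimple *ptr_incr;
     bool inv_p;
     dataref_ptr = vect_create_data_ref_ptr (stmt, vectype, loop, NULL_TREE,
					     &dummy, gsi, &ptr_incr,
					     false, &inv_p);
     ...
     dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
				    NULL_TREE);

   i.e. a caller first obtains the pointer (and its cross-iteration increment
   stmt) and then advances it by one vector size per generated copy via
   bump_vector_ptr.  */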
4101
4102 tree
4103 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4104 tree offset, tree *initial_address,
4105 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4106 bool only_init, bool *inv_p, tree byte_offset)
4107 {
4108 const char *base_name;
4109 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4110 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4111 struct loop *loop = NULL;
4112 bool nested_in_vect_loop = false;
4113 struct loop *containing_loop = NULL;
4114 tree aggr_ptr_type;
4115 tree aggr_ptr;
4116 tree new_temp;
4117 gimple_seq new_stmt_list = NULL;
4118 edge pe = NULL;
4119 basic_block new_bb;
4120 tree aggr_ptr_init;
4121 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4122 tree aptr;
4123 gimple_stmt_iterator incr_gsi;
4124 bool insert_after;
4125 tree indx_before_incr, indx_after_incr;
4126 gimple *incr;
4127 tree step;
4128 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4129
4130 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4131 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4132
4133 if (loop_vinfo)
4134 {
4135 loop = LOOP_VINFO_LOOP (loop_vinfo);
4136 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4137 containing_loop = (gimple_bb (stmt))->loop_father;
4138 pe = loop_preheader_edge (loop);
4139 }
4140 else
4141 {
4142 gcc_assert (bb_vinfo);
4143 only_init = true;
4144 *ptr_incr = NULL;
4145 }
4146
4147 /* Check the step (evolution) of the load in LOOP, and record
4148 whether it's invariant. */
4149 if (nested_in_vect_loop)
4150 step = STMT_VINFO_DR_STEP (stmt_info);
4151 else
4152 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4153
4154 if (integer_zerop (step))
4155 *inv_p = true;
4156 else
4157 *inv_p = false;
4158
4159 /* Create an expression for the first address accessed by this load
4160 in LOOP. */
4161 base_name = get_name (DR_BASE_ADDRESS (dr));
4162
4163 if (dump_enabled_p ())
4164 {
4165 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4166 dump_printf_loc (MSG_NOTE, vect_location,
4167 "create %s-pointer variable to type: ",
4168 get_tree_code_name (TREE_CODE (aggr_type)));
4169 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4170 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4171 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4172 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4173 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4174 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4175 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4176 else
4177 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4178 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4179 dump_printf (MSG_NOTE, "\n");
4180 }
4181
4182 /* (1) Create the new aggregate-pointer variable.
4183 Vector and array types inherit the alias set of their component
4184 type by default so we need to use a ref-all pointer if the data
4185 reference does not conflict with the created aggregated data
4186 reference because it is not addressable. */
4187 bool need_ref_all = false;
4188 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4189 get_alias_set (DR_REF (dr))))
4190 need_ref_all = true;
4191 /* Likewise for any of the data references in the stmt group. */
4192 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4193 {
4194 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4195 do
4196 {
4197 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4198 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4199 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4200 get_alias_set (DR_REF (sdr))))
4201 {
4202 need_ref_all = true;
4203 break;
4204 }
4205 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4206 }
4207 while (orig_stmt);
4208 }
4209 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4210 need_ref_all);
4211 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4212
4213
4214 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4215 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4216 def-use update cycles for the pointer: one relative to the outer-loop
4217 (LOOP), which is what steps (3) and (4) below do. The other is relative
4218 to the inner-loop (which is the inner-most loop containing the dataref),
4219 and this is done by step (5) below.
4220
4221 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4222 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4223 redundant. Steps (3),(4) create the following:
4224
4225 vp0 = &base_addr;
4226 LOOP: vp1 = phi(vp0,vp2)
4227 ...
4228 ...
4229 vp2 = vp1 + step
4230 goto LOOP
4231
4232 If there is an inner-loop nested in loop, then step (5) will also be
4233 applied, and an additional update in the inner-loop will be created:
4234
4235 vp0 = &base_addr;
4236 LOOP: vp1 = phi(vp0,vp2)
4237 ...
4238 inner: vp3 = phi(vp1,vp4)
4239 vp4 = vp3 + inner_step
4240 if () goto inner
4241 ...
4242 vp2 = vp1 + step
4243 if () goto LOOP */
4244
4245 /* (2) Calculate the initial address of the aggregate-pointer, and set
4246 the aggregate-pointer to point to it before the loop. */
4247
4248 /* Create: (&(base[init_val+offset]) + byte_offset) in the loop preheader. */
4249
4250 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4251 offset, loop, byte_offset);
4252 if (new_stmt_list)
4253 {
4254 if (pe)
4255 {
4256 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4257 gcc_assert (!new_bb);
4258 }
4259 else
4260 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4261 }
4262
4263 *initial_address = new_temp;
4264 aggr_ptr_init = new_temp;
4265
4266 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4267 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4268 inner-loop nested in LOOP (during outer-loop vectorization). */
4269
4270 /* No update in loop is required. */
4271 if (only_init && (!loop_vinfo || at_loop == loop))
4272 aptr = aggr_ptr_init;
4273 else
4274 {
4275 /* The step of the aggregate pointer is the type size. */
4276 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4277 /* One exception to the above is when the scalar step of the load in
4278 LOOP is zero. In this case the step here is also zero. */
4279 if (*inv_p)
4280 iv_step = size_zero_node;
4281 else if (tree_int_cst_sgn (step) == -1)
4282 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4283
4284 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4285
4286 create_iv (aggr_ptr_init,
4287 fold_convert (aggr_ptr_type, iv_step),
4288 aggr_ptr, loop, &incr_gsi, insert_after,
4289 &indx_before_incr, &indx_after_incr);
4290 incr = gsi_stmt (incr_gsi);
4291 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4292
4293 /* Copy the points-to information if it exists. */
4294 if (DR_PTR_INFO (dr))
4295 {
4296 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4297 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4298 }
4299 if (ptr_incr)
4300 *ptr_incr = incr;
4301
4302 aptr = indx_before_incr;
4303 }
4304
4305 if (!nested_in_vect_loop || only_init)
4306 return aptr;
4307
4308
4309 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4310 nested in LOOP, if it exists. */
4311
4312 gcc_assert (nested_in_vect_loop);
4313 if (!only_init)
4314 {
4315 standard_iv_increment_position (containing_loop, &incr_gsi,
4316 &insert_after);
4317 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4318 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4319 &indx_after_incr);
4320 incr = gsi_stmt (incr_gsi);
4321 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4322
4323 /* Copy the points-to information if it exists. */
4324 if (DR_PTR_INFO (dr))
4325 {
4326 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4327 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4328 }
4329 if (ptr_incr)
4330 *ptr_incr = incr;
4331
4332 return indx_before_incr;
4333 }
4334 else
4335 gcc_unreachable ();
4336 }
4337
4338
4339 /* Function bump_vector_ptr
4340
4341 Increment a pointer (to a vector type) by vector-size. If requested,
4342 i.e. if PTR-INCR is given, then also connect the new increment stmt
4343 to the existing def-use update-chain of the pointer, by modifying
4344 the PTR_INCR as illustrated below:
4345
4346 The pointer def-use update-chain before this function:
4347 DATAREF_PTR = phi (p_0, p_2)
4348 ....
4349 PTR_INCR: p_2 = DATAREF_PTR + step
4350
4351 The pointer def-use update-chain after this function:
4352 DATAREF_PTR = phi (p_0, p_2)
4353 ....
4354 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4355 ....
4356 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4357
4358 Input:
4359 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4360 in the loop.
4361 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4362 the loop. The increment amount across iterations is expected
4363 to be vector_size.
4364 BSI - location where the new update stmt is to be placed.
4365 STMT - the original scalar memory-access stmt that is being vectorized.
4366 BUMP - optional. The offset by which to bump the pointer. If not given,
4367 the offset is assumed to be vector_size.
4368
4369 Output: Return NEW_DATAREF_PTR as illustrated above.
4370
4371 */
4372
4373 tree
4374 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4375 gimple *stmt, tree bump)
4376 {
4377 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4378 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4379 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4380 tree update = TYPE_SIZE_UNIT (vectype);
4381 gassign *incr_stmt;
4382 ssa_op_iter iter;
4383 use_operand_p use_p;
4384 tree new_dataref_ptr;
4385
4386 if (bump)
4387 update = bump;
4388
4389 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4390 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4391 else
4392 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4393 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4394 dataref_ptr, update);
4395 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4396
4397 /* Copy the points-to information if it exists. */
4398 if (DR_PTR_INFO (dr))
4399 {
4400 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4401 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4402 }
4403
4404 if (!ptr_incr)
4405 return new_dataref_ptr;
4406
4407 /* Update the vector-pointer's cross-iteration increment. */
4408 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4409 {
4410 tree use = USE_FROM_PTR (use_p);
4411
4412 if (use == dataref_ptr)
4413 SET_USE (use_p, new_dataref_ptr);
4414 else
4415 gcc_assert (tree_int_cst_compare (use, update) == 0);
4416 }
4417
4418 return new_dataref_ptr;
4419 }
4420
4421
4422 /* Function vect_create_destination_var.
4423
4424 Create a new temporary of type VECTYPE. */
4425
4426 tree
4427 vect_create_destination_var (tree scalar_dest, tree vectype)
4428 {
4429 tree vec_dest;
4430 const char *name;
4431 char *new_name;
4432 tree type;
4433 enum vect_var_kind kind;
4434
4435 kind = vectype ? vect_simple_var : vect_scalar_var;
4436 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4437
4438 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4439
4440 name = get_name (scalar_dest);
4441 if (name)
4442 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4443 else
4444 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4445 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4446 free (new_name);
4447
4448 return vec_dest;
4449 }
4450
4451 /* Function vect_grouped_store_supported.
4452
4453 Returns TRUE if interleave high and interleave low permutations
4454 are supported, and FALSE otherwise. */
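/* For example, for a vector mode with four elements (nelt == 4) the
   power-of-two path below checks the masks {0, 4, 1, 5} (interleave high)
   and {2, 6, 3, 7} (interleave low).  */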
4455
4456 bool
4457 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4458 {
4459 machine_mode mode = TYPE_MODE (vectype);
4460
4461 /* vect_permute_store_chain requires the group size to be equal to 3 or
4462 be a power of two. */
4463 if (count != 3 && exact_log2 (count) == -1)
4464 {
4465 if (dump_enabled_p ())
4466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4467 "the size of the group of accesses"
4468 " is not a power of 2 or not eqaul to 3\n");
4469 return false;
4470 }
4471
4472 /* Check that the permutation is supported. */
4473 if (VECTOR_MODE_P (mode))
4474 {
4475 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4476 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4477
4478 if (count == 3)
4479 {
4480 unsigned int j0 = 0, j1 = 0, j2 = 0;
4481 unsigned int i, j;
4482
4483 for (j = 0; j < 3; j++)
4484 {
4485 int nelt0 = ((3 - j) * nelt) % 3;
4486 int nelt1 = ((3 - j) * nelt + 1) % 3;
4487 int nelt2 = ((3 - j) * nelt + 2) % 3;
4488 for (i = 0; i < nelt; i++)
4489 {
4490 if (3 * i + nelt0 < nelt)
4491 sel[3 * i + nelt0] = j0++;
4492 if (3 * i + nelt1 < nelt)
4493 sel[3 * i + nelt1] = nelt + j1++;
4494 if (3 * i + nelt2 < nelt)
4495 sel[3 * i + nelt2] = 0;
4496 }
4497 if (!can_vec_perm_p (mode, false, sel))
4498 {
4499 if (dump_enabled_p ())
4500 dump_printf (MSG_MISSED_OPTIMIZATION,
4501 "permutaion op not supported by target.\n");
4502 return false;
4503 }
4504
4505 for (i = 0; i < nelt; i++)
4506 {
4507 if (3 * i + nelt0 < nelt)
4508 sel[3 * i + nelt0] = 3 * i + nelt0;
4509 if (3 * i + nelt1 < nelt)
4510 sel[3 * i + nelt1] = 3 * i + nelt1;
4511 if (3 * i + nelt2 < nelt)
4512 sel[3 * i + nelt2] = nelt + j2++;
4513 }
4514 if (!can_vec_perm_p (mode, false, sel))
4515 {
4516 if (dump_enabled_p ())
4517 dump_printf (MSG_MISSED_OPTIMIZATION,
4518 "permutaion op not supported by target.\n");
4519 return false;
4520 }
4521 }
4522 return true;
4523 }
4524 else
4525 {
4526 /* If length is not equal to 3 then only power of 2 is supported. */
4527 gcc_assert (exact_log2 (count) != -1);
4528
4529 for (i = 0; i < nelt / 2; i++)
4530 {
4531 sel[i * 2] = i;
4532 sel[i * 2 + 1] = i + nelt;
4533 }
4534 if (can_vec_perm_p (mode, false, sel))
4535 {
4536 for (i = 0; i < nelt; i++)
4537 sel[i] += nelt / 2;
4538 if (can_vec_perm_p (mode, false, sel))
4539 return true;
4540 }
4541 }
4542 }
4543
4544 if (dump_enabled_p ())
4545 dump_printf (MSG_MISSED_OPTIMIZATION,
4546 "permutaion op not supported by target.\n");
4547 return false;
4548 }
4549
4550
4551 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4552 type VECTYPE. */
4553
4554 bool
4555 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4556 {
4557 return vect_lanes_optab_supported_p ("vec_store_lanes",
4558 vec_store_lanes_optab,
4559 vectype, count);
4560 }
4561
4562
4563 /* Function vect_permute_store_chain.
4564
4565 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4566 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4567 the data correctly for the stores. Return the final references for stores
4568 in RESULT_CHAIN.
4569
4570 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4571 The input is 4 vectors each containing 8 elements. We assign a number to
4572 each element, the input sequence is:
4573
4574 1st vec: 0 1 2 3 4 5 6 7
4575 2nd vec: 8 9 10 11 12 13 14 15
4576 3rd vec: 16 17 18 19 20 21 22 23
4577 4th vec: 24 25 26 27 28 29 30 31
4578
4579 The output sequence should be:
4580
4581 1st vec: 0 8 16 24 1 9 17 25
4582 2nd vec: 2 10 18 26 3 11 19 27
4583 3rd vec: 4 12 20 28 5 13 21 29
4584 4th vec: 6 14 22 30 7 15 23 31
4585
4586 i.e., we interleave the contents of the four vectors in their order.
4587
4588 We use interleave_high/low instructions to create such output. The input of
4589 each interleave_high/low operation is two vectors:
4590 1st vec 2nd vec
4591 0 1 2 3 4 5 6 7
4592 the even elements of the result vector are obtained left-to-right from the
4593 high/low elements of the first vector. The odd elements of the result are
4594 obtained left-to-right from the high/low elements of the second vector.
4595 The output of interleave_high will be: 0 4 1 5
4596 and of interleave_low: 2 6 3 7
4597
4598
4599 The permutation is done in log LENGTH stages. In each stage interleave_high
4600 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4601 where the first argument is taken from the first half of DR_CHAIN and the
4602 second argument from its second half.
4603 In our example,
4604
4605 I1: interleave_high (1st vec, 3rd vec)
4606 I2: interleave_low (1st vec, 3rd vec)
4607 I3: interleave_high (2nd vec, 4th vec)
4608 I4: interleave_low (2nd vec, 4th vec)
4609
4610 The output for the first stage is:
4611
4612 I1: 0 16 1 17 2 18 3 19
4613 I2: 4 20 5 21 6 22 7 23
4614 I3: 8 24 9 25 10 26 11 27
4615 I4: 12 28 13 29 14 30 15 31
4616
4617 The output of the second stage, i.e. the final result is:
4618
4619 I1: 0 8 16 24 1 9 17 25
4620 I2: 2 10 18 26 3 11 19 27
4621 I3: 4 12 20 28 5 13 21 29
4622 I4: 6 14 22 30 7 15 23 31. */
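/* For LENGTH == 3 the reordering cannot be expressed as interleave
   high/low stages.  Instead, two VEC_PERM_EXPRs are generated per output
   vector: the first merges dr_chain[0] and dr_chain[1], and the second
   blends elements of dr_chain[2] into that intermediate result.  */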
4623
4624 void
4625 vect_permute_store_chain (vec<tree> dr_chain,
4626 unsigned int length,
4627 gimple *stmt,
4628 gimple_stmt_iterator *gsi,
4629 vec<tree> *result_chain)
4630 {
4631 tree vect1, vect2, high, low;
4632 gimple *perm_stmt;
4633 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4634 tree perm_mask_low, perm_mask_high;
4635 tree data_ref;
4636 tree perm3_mask_low, perm3_mask_high;
4637 unsigned int i, n, log_length = exact_log2 (length);
4638 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4639 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4640
4641 result_chain->quick_grow (length);
4642 memcpy (result_chain->address (), dr_chain.address (),
4643 length * sizeof (tree));
4644
4645 if (length == 3)
4646 {
4647 unsigned int j0 = 0, j1 = 0, j2 = 0;
4648
4649 for (j = 0; j < 3; j++)
4650 {
4651 int nelt0 = ((3 - j) * nelt) % 3;
4652 int nelt1 = ((3 - j) * nelt + 1) % 3;
4653 int nelt2 = ((3 - j) * nelt + 2) % 3;
4654
4655 for (i = 0; i < nelt; i++)
4656 {
4657 if (3 * i + nelt0 < nelt)
4658 sel[3 * i + nelt0] = j0++;
4659 if (3 * i + nelt1 < nelt)
4660 sel[3 * i + nelt1] = nelt + j1++;
4661 if (3 * i + nelt2 < nelt)
4662 sel[3 * i + nelt2] = 0;
4663 }
4664 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4665
4666 for (i = 0; i < nelt; i++)
4667 {
4668 if (3 * i + nelt0 < nelt)
4669 sel[3 * i + nelt0] = 3 * i + nelt0;
4670 if (3 * i + nelt1 < nelt)
4671 sel[3 * i + nelt1] = 3 * i + nelt1;
4672 if (3 * i + nelt2 < nelt)
4673 sel[3 * i + nelt2] = nelt + j2++;
4674 }
4675 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4676
4677 vect1 = dr_chain[0];
4678 vect2 = dr_chain[1];
4679
4680 /* Create interleaving stmt:
4681 low = VEC_PERM_EXPR <vect1, vect2,
4682 {j, nelt, *, j + 1, nelt + j + 1, *,
4683 j + 2, nelt + j + 2, *, ...}> */
4684 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4685 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4686 vect2, perm3_mask_low);
4687 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4688
4689 vect1 = data_ref;
4690 vect2 = dr_chain[2];
4691 /* Create interleaving stmt:
4692 low = VEC_PERM_EXPR <vect1, vect2,
4693 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4694 6, 7, nelt + j + 2, ...}> */
4695 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4696 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4697 vect2, perm3_mask_high);
4698 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4699 (*result_chain)[j] = data_ref;
4700 }
4701 }
4702 else
4703 {
4704 /* If length is not equal to 3 then only power of 2 is supported. */
4705 gcc_assert (exact_log2 (length) != -1);
4706
4707 for (i = 0, n = nelt / 2; i < n; i++)
4708 {
4709 sel[i * 2] = i;
4710 sel[i * 2 + 1] = i + nelt;
4711 }
4712 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4713
4714 for (i = 0; i < nelt; i++)
4715 sel[i] += nelt / 2;
4716 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4717
4718 for (i = 0, n = log_length; i < n; i++)
4719 {
4720 for (j = 0; j < length/2; j++)
4721 {
4722 vect1 = dr_chain[j];
4723 vect2 = dr_chain[j+length/2];
4724
4725 /* Create interleaving stmt:
4726 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4727 ...}> */
4728 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4729 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4730 vect2, perm_mask_high);
4731 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4732 (*result_chain)[2*j] = high;
4733
4734 /* Create interleaving stmt:
4735 low = VEC_PERM_EXPR <vect1, vect2,
4736 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4737 ...}> */
4738 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4739 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4740 vect2, perm_mask_low);
4741 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4742 (*result_chain)[2*j+1] = low;
4743 }
4744 memcpy (dr_chain.address (), result_chain->address (),
4745 length * sizeof (tree));
4746 }
4747 }
4748 }
4749
4750 /* Function vect_setup_realignment
4751
4752 This function is called when vectorizing an unaligned load using
4753 the dr_explicit_realign[_optimized] scheme.
4754 This function generates the following code at the loop prolog:
4755
4756 p = initial_addr;
4757 x msq_init = *(floor(p)); # prolog load
4758 realignment_token = call target_builtin;
4759 loop:
4760 x msq = phi (msq_init, ---)
4761
4762 The stmts marked with x are generated only for the case of
4763 dr_explicit_realign_optimized.
4764
4765 The code above sets up a new (vector) pointer, pointing to the first
4766 location accessed by STMT, and a "floor-aligned" load using that pointer.
4767 It also generates code to compute the "realignment-token" (if the relevant
4768 target hook was defined), and creates a phi-node at the loop-header bb
4769 whose arguments are the result of the prolog-load (created by this
4770 function) and the result of a load that takes place in the loop (to be
4771 created by the caller to this function).
4772
4773 For the case of dr_explicit_realign_optimized:
4774 The caller to this function uses the phi-result (msq) to create the
4775 realignment code inside the loop, and sets up the missing phi argument,
4776 as follows:
4777 loop:
4778 msq = phi (msq_init, lsq)
4779 lsq = *(floor(p')); # load in loop
4780 result = realign_load (msq, lsq, realignment_token);
4781
4782 For the case of dr_explicit_realign:
4783 loop:
4784 msq = *(floor(p)); # load in loop
4785 p' = p + (VS-1);
4786 lsq = *(floor(p')); # load in loop
4787 result = realign_load (msq, lsq, realignment_token);
4788
4789 Input:
4790 STMT - (scalar) load stmt to be vectorized. This load accesses
4791 a memory location that may be unaligned.
4792 BSI - place where new code is to be inserted.
4793 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4794 is used.
4795
4796 Output:
4797 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4798 target hook, if defined.
4799 Return value - the result of the loop-header phi node. */
4800
4801 tree
4802 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4803 tree *realignment_token,
4804 enum dr_alignment_support alignment_support_scheme,
4805 tree init_addr,
4806 struct loop **at_loop)
4807 {
4808 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4809 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4810 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4811 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4812 struct loop *loop = NULL;
4813 edge pe = NULL;
4814 tree scalar_dest = gimple_assign_lhs (stmt);
4815 tree vec_dest;
4816 gimple *inc;
4817 tree ptr;
4818 tree data_ref;
4819 basic_block new_bb;
4820 tree msq_init = NULL_TREE;
4821 tree new_temp;
4822 gphi *phi_stmt;
4823 tree msq = NULL_TREE;
4824 gimple_seq stmts = NULL;
4825 bool inv_p;
4826 bool compute_in_loop = false;
4827 bool nested_in_vect_loop = false;
4828 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4829 struct loop *loop_for_initial_load = NULL;
4830
4831 if (loop_vinfo)
4832 {
4833 loop = LOOP_VINFO_LOOP (loop_vinfo);
4834 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4835 }
4836
4837 gcc_assert (alignment_support_scheme == dr_explicit_realign
4838 || alignment_support_scheme == dr_explicit_realign_optimized);
4839
4840 /* We need to generate three things:
4841 1. the misalignment computation
4842 2. the extra vector load (for the optimized realignment scheme).
4843 3. the phi node for the two vectors from which the realignment is
4844 done (for the optimized realignment scheme). */
4845
4846 /* 1. Determine where to generate the misalignment computation.
4847
4848 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4849 calculation will be generated by this function, outside the loop (in the
4850 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4851 caller, inside the loop.
4852
4853 Background: If the misalignment remains fixed throughout the iterations of
4854 the loop, then both realignment schemes are applicable, and also the
4855 misalignment computation can be done outside LOOP. This is because we are
4856 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4857 are a multiple of VS (the Vector Size), and therefore the misalignment in
4858 different vectorized LOOP iterations is always the same.
4859 The problem arises only if the memory access is in an inner-loop nested
4860 inside LOOP, which is now being vectorized using outer-loop vectorization.
4861 This is the only case when the misalignment of the memory access may not
4862 remain fixed throughout the iterations of the inner-loop (as explained in
4863 detail in vect_supportable_dr_alignment). In this case, not only is the
4864 optimized realignment scheme not applicable, but also the misalignment
4865 computation (and generation of the realignment token that is passed to
4866 REALIGN_LOAD) have to be done inside the loop.
4867
4868 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4869 or not, which in turn determines if the misalignment is computed inside
4870 the inner-loop, or outside LOOP. */
4871
4872 if (init_addr != NULL_TREE || !loop_vinfo)
4873 {
4874 compute_in_loop = true;
4875 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4876 }
4877
4878
4879 /* 2. Determine where to generate the extra vector load.
4880
4881 For the optimized realignment scheme, instead of generating two vector
4882 loads in each iteration, we generate a single extra vector load in the
4883 preheader of the loop, and in each iteration reuse the result of the
4884 vector load from the previous iteration. In case the memory access is in
4885 an inner-loop nested inside LOOP, which is now being vectorized using
4886 outer-loop vectorization, we need to determine whether this initial vector
4887 load should be generated at the preheader of the inner-loop, or can be
4888 generated at the preheader of LOOP. If the memory access has no evolution
4889 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4890 to be generated inside LOOP (in the preheader of the inner-loop). */
4891
4892 if (nested_in_vect_loop)
4893 {
4894 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4895 bool invariant_in_outerloop =
4896 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4897 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4898 }
4899 else
4900 loop_for_initial_load = loop;
4901 if (at_loop)
4902 *at_loop = loop_for_initial_load;
4903
4904 if (loop_for_initial_load)
4905 pe = loop_preheader_edge (loop_for_initial_load);
4906
4907 /* 3. For the case of the optimized realignment, create the first vector
4908 load at the loop preheader. */
4909
4910 if (alignment_support_scheme == dr_explicit_realign_optimized)
4911 {
4912 /* Create msq_init = *(floor(p1)) in the loop preheader */
4913 gassign *new_stmt;
4914
4915 gcc_assert (!compute_in_loop);
4916 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4917 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4918 NULL_TREE, &init_addr, NULL, &inc,
4919 true, &inv_p);
4920 if (TREE_CODE (ptr) == SSA_NAME)
4921 new_temp = copy_ssa_name (ptr);
4922 else
4923 new_temp = make_ssa_name (TREE_TYPE (ptr));
4924 new_stmt = gimple_build_assign
4925 (new_temp, BIT_AND_EXPR, ptr,
4926 build_int_cst (TREE_TYPE (ptr),
4927 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4928 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4929 gcc_assert (!new_bb);
4930 data_ref
4931 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4932 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4933 new_stmt = gimple_build_assign (vec_dest, data_ref);
4934 new_temp = make_ssa_name (vec_dest, new_stmt);
4935 gimple_assign_set_lhs (new_stmt, new_temp);
4936 if (pe)
4937 {
4938 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4939 gcc_assert (!new_bb);
4940 }
4941 else
4942 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4943
4944 msq_init = gimple_assign_lhs (new_stmt);
4945 }
4946
4947 /* 4. Create realignment token using a target builtin, if available.
4948 It is done either inside the containing loop, or before LOOP (as
4949 determined above). */
4950
4951 if (targetm.vectorize.builtin_mask_for_load)
4952 {
4953 gcall *new_stmt;
4954 tree builtin_decl;
4955
4956 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4957 if (!init_addr)
4958 {
4959 /* Generate the INIT_ADDR computation outside LOOP. */
4960 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4961 NULL_TREE, loop);
4962 if (loop)
4963 {
4964 pe = loop_preheader_edge (loop);
4965 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4966 gcc_assert (!new_bb);
4967 }
4968 else
4969 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4970 }
4971
4972 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4973 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4974 vec_dest =
4975 vect_create_destination_var (scalar_dest,
4976 gimple_call_return_type (new_stmt));
4977 new_temp = make_ssa_name (vec_dest, new_stmt);
4978 gimple_call_set_lhs (new_stmt, new_temp);
4979
4980 if (compute_in_loop)
4981 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4982 else
4983 {
4984 /* Generate the misalignment computation outside LOOP. */
4985 pe = loop_preheader_edge (loop);
4986 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4987 gcc_assert (!new_bb);
4988 }
4989
4990 *realignment_token = gimple_call_lhs (new_stmt);
4991
4992 /* The result of the CALL_EXPR to this builtin is determined from
4993 the value of the parameter and no global variables are touched
4994 which makes the builtin a "const" function. Requiring the
4995 builtin to have the "const" attribute makes it unnecessary
4996 to call mark_call_clobbered. */
4997 gcc_assert (TREE_READONLY (builtin_decl));
4998 }
4999
5000 if (alignment_support_scheme == dr_explicit_realign)
5001 return msq;
5002
5003 gcc_assert (!compute_in_loop);
5004 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5005
5006
5007 /* 5. Create msq = phi <msq_init, lsq> in loop */
5008
5009 pe = loop_preheader_edge (containing_loop);
5010 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5011 msq = make_ssa_name (vec_dest);
5012 phi_stmt = create_phi_node (msq, containing_loop->header);
5013 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5014
5015 return msq;
5016 }
5017
5018
5019 /* Function vect_grouped_load_supported.
5020
5021 Returns TRUE if even and odd permutations are supported,
5022 and FALSE otherwise. */
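/* For example, for a vector mode with four elements (nelt == 4) the
   power-of-two path below checks the extract-even mask {0, 2, 4, 6} and
   the extract-odd mask {1, 3, 5, 7}.  */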
5023
5024 bool
5025 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5026 {
5027 machine_mode mode = TYPE_MODE (vectype);
5028
5029 /* vect_permute_load_chain requires the group size to be equal to 3 or
5030 be a power of two. */
5031 if (count != 3 && exact_log2 (count) == -1)
5032 {
5033 if (dump_enabled_p ())
5034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5035 "the size of the group of accesses"
5036 " is not a power of 2 or not equal to 3\n");
5037 return false;
5038 }
5039
5040 /* Check that the permutation is supported. */
5041 if (VECTOR_MODE_P (mode))
5042 {
5043 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5044 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5045
5046 if (count == 3)
5047 {
5048 unsigned int k;
5049 for (k = 0; k < 3; k++)
5050 {
5051 for (i = 0; i < nelt; i++)
5052 if (3 * i + k < 2 * nelt)
5053 sel[i] = 3 * i + k;
5054 else
5055 sel[i] = 0;
5056 if (!can_vec_perm_p (mode, false, sel))
5057 {
5058 if (dump_enabled_p ())
5059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5060 "shuffle of 3 loads is not supported by"
5061 " target\n");
5062 return false;
5063 }
5064 for (i = 0, j = 0; i < nelt; i++)
5065 if (3 * i + k < 2 * nelt)
5066 sel[i] = i;
5067 else
5068 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5069 if (!can_vec_perm_p (mode, false, sel))
5070 {
5071 if (dump_enabled_p ())
5072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5073 "shuffle of 3 loads is not supported by"
5074 " target\n");
5075 return false;
5076 }
5077 }
5078 return true;
5079 }
5080 else
5081 {
5082 /* If length is not equal to 3 then only power of 2 is supported. */
5083 gcc_assert (exact_log2 (count) != -1);
5084 for (i = 0; i < nelt; i++)
5085 sel[i] = i * 2;
5086 if (can_vec_perm_p (mode, false, sel))
5087 {
5088 for (i = 0; i < nelt; i++)
5089 sel[i] = i * 2 + 1;
5090 if (can_vec_perm_p (mode, false, sel))
5091 return true;
5092 }
5093 }
5094 }
5095
5096 if (dump_enabled_p ())
5097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5098 "extract even/odd not supported by target\n");
5099 return false;
5100 }
5101
5102 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5103 type VECTYPE. */
5104
5105 bool
5106 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5107 {
5108 return vect_lanes_optab_supported_p ("vec_load_lanes",
5109 vec_load_lanes_optab,
5110 vectype, count);
5111 }
5112
5113 /* Function vect_permute_load_chain.
5114
5115 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5116 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5117 the input data correctly. Return the final references for loads in
5118 RESULT_CHAIN.
5119
5120 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5121 The input is 4 vectors each containing 8 elements. We assign a number to each
5122 element, the input sequence is:
5123
5124 1st vec: 0 1 2 3 4 5 6 7
5125 2nd vec: 8 9 10 11 12 13 14 15
5126 3rd vec: 16 17 18 19 20 21 22 23
5127 4th vec: 24 25 26 27 28 29 30 31
5128
5129 The output sequence should be:
5130
5131 1st vec: 0 4 8 12 16 20 24 28
5132 2nd vec: 1 5 9 13 17 21 25 29
5133 3rd vec: 2 6 10 14 18 22 26 30
5134 4th vec: 3 7 11 15 19 23 27 31
5135
5136 i.e., the first output vector should contain the first elements of each
5137 interleaving group, etc.
5138
5139 We use extract_even/odd instructions to create such output. The input of
5140 each extract_even/odd operation is two vectors
5141 1st vec 2nd vec
5142 0 1 2 3 4 5 6 7
5143
5144 and the output is the vector of extracted even/odd elements. The output of
5145 extract_even will be: 0 2 4 6
5146 and of extract_odd: 1 3 5 7
5147
5148
5149 The permutation is done in log LENGTH stages. In each stage extract_even
5150 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5151 their order. In our example,
5152
5153 E1: extract_even (1st vec, 2nd vec)
5154 E2: extract_odd (1st vec, 2nd vec)
5155 E3: extract_even (3rd vec, 4th vec)
5156 E4: extract_odd (3rd vec, 4th vec)
5157
5158 The output for the first stage will be:
5159
5160 E1: 0 2 4 6 8 10 12 14
5161 E2: 1 3 5 7 9 11 13 15
5162 E3: 16 18 20 22 24 26 28 30
5163 E4: 17 19 21 23 25 27 29 31
5164
5165 In order to proceed and create the correct sequence for the next stage (or
5166 for the correct output, if the second stage is the last one, as in our
5167 example), we first put the output of extract_even operation and then the
5168 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5169 The input for the second stage is:
5170
5171 1st vec (E1): 0 2 4 6 8 10 12 14
5172 2nd vec (E3): 16 18 20 22 24 26 28 30
5173 3rd vec (E2): 1 3 5 7 9 11 13 15
5174 4th vec (E4): 17 19 21 23 25 27 29 31
5175
5176 The output of the second stage:
5177
5178 E1: 0 4 8 12 16 20 24 28
5179 E2: 2 6 10 14 18 22 26 30
5180 E3: 1 5 9 13 17 21 25 29
5181 E4: 3 7 11 15 19 23 27 31
5182
5183 And RESULT_CHAIN after reordering:
5184
5185 1st vec (E1): 0 4 8 12 16 20 24 28
5186 2nd vec (E3): 1 5 9 13 17 21 25 29
5187 3rd vec (E2): 2 6 10 14 18 22 26 30
5188 4th vec (E4): 3 7 11 15 19 23 27 31. */
5189
5190 static void
5191 vect_permute_load_chain (vec<tree> dr_chain,
5192 unsigned int length,
5193 gimple *stmt,
5194 gimple_stmt_iterator *gsi,
5195 vec<tree> *result_chain)
5196 {
5197 tree data_ref, first_vect, second_vect;
5198 tree perm_mask_even, perm_mask_odd;
5199 tree perm3_mask_low, perm3_mask_high;
5200 gimple *perm_stmt;
5201 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5202 unsigned int i, j, log_length = exact_log2 (length);
5203 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5204 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5205
5206 result_chain->quick_grow (length);
5207 memcpy (result_chain->address (), dr_chain.address (),
5208 length * sizeof (tree));
5209
5210 if (length == 3)
5211 {
5212 unsigned int k;
5213
5214 for (k = 0; k < 3; k++)
5215 {
5216 for (i = 0; i < nelt; i++)
5217 if (3 * i + k < 2 * nelt)
5218 sel[i] = 3 * i + k;
5219 else
5220 sel[i] = 0;
5221 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5222
5223 for (i = 0, j = 0; i < nelt; i++)
5224 if (3 * i + k < 2 * nelt)
5225 sel[i] = i;
5226 else
5227 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5228
5229 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5230
5231 first_vect = dr_chain[0];
5232 second_vect = dr_chain[1];
5233
5234 /* Create interleaving stmt (low part of):
5235 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5236 ...}> */
5237 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5238 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5239 second_vect, perm3_mask_low);
5240 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5241
5242 /* Create interleaving stmt (high part of):
5243 high = VEC_PERM_EXPR <first_vect, second_vect, {0, 1, ..., nelt + ...}>,
5244 completing the low result with the remaining K-th field elements taken from the third vector. */
5245 first_vect = data_ref;
5246 second_vect = dr_chain[2];
5247 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5248 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5249 second_vect, perm3_mask_high);
5250 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5251 (*result_chain)[k] = data_ref;
5252 }
5253 }
5254 else
5255 {
5256 /* If length is not equal to 3, then only a power of 2 is supported. */
5257 gcc_assert (exact_log2 (length) != -1);
5258
5259 for (i = 0; i < nelt; ++i)
5260 sel[i] = i * 2;
5261 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5262
5263 for (i = 0; i < nelt; ++i)
5264 sel[i] = i * 2 + 1;
5265 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5266
5267 for (i = 0; i < log_length; i++)
5268 {
5269 for (j = 0; j < length; j += 2)
5270 {
5271 first_vect = dr_chain[j];
5272 second_vect = dr_chain[j+1];
5273
5274 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5275 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5276 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5277 first_vect, second_vect,
5278 perm_mask_even);
5279 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5280 (*result_chain)[j/2] = data_ref;
5281
5282 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5283 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5284 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5285 first_vect, second_vect,
5286 perm_mask_odd);
5287 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5288 (*result_chain)[j/2+length/2] = data_ref;
5289 }
5290 memcpy (dr_chain.address (), result_chain->address (),
5291 length * sizeof (tree));
5292 }
5293 }
5294 }
5295
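/* Illustrative sketch only, guarded out of the build: a plain-C model of the
   two-stage extract_even / extract_odd deinterleave documented above
   vect_permute_load_chain, for LENGTH == 4 and 8 elements per vector.  The
   function name, the int element type and the fixed array sizes are
   assumptions made for this example; this is not vectorizer code.  */
#if 0
#include <string.h>

static void
deinterleave4_sketch (const int chain_in[4][8], int result[4][8])
{
  int chain[4][8];
  memcpy (chain, chain_in, sizeof (chain));

  /* log2 (LENGTH) == 2 stages, as in vect_permute_load_chain.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < 4; j += 2)
        for (int i = 0; i < 8; i++)
          {
            /* extract_even of the pair (chain[j], chain[j + 1]): the even
               elements of their 16-element concatenation.  */
            result[j / 2][i]
              = i < 4 ? chain[j][2 * i] : chain[j + 1][2 * (i - 4)];
            /* extract_odd of the same pair goes to the second half of the
               result chain, mirroring (*result_chain)[j/2 + length/2].  */
            result[j / 2 + 2][i]
              = i < 4 ? chain[j][2 * i + 1] : chain[j + 1][2 * (i - 4) + 1];
          }
      memcpy (chain, result, sizeof (chain));
    }
  /* Feeding vectors numbered 0..31 as in the comment above yields
     {0 4 8 ... 28}, {1 5 9 ... 29}, {2 6 ... 30} and {3 7 ... 31}.  */
}
#endif
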
5296 /* Function vect_shift_permute_load_chain.
5297
5298 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate a
5299 sequence of stmts to reorder the input data accordingly.
5300 Return the final references for loads in RESULT_CHAIN.
5301 Return true if successful, false otherwise.
5302
5303 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5304 The input is 3 vectors each containing 8 elements. We assign a
5305 number to each element; the input sequence is:
5306
5307 1st vec: 0 1 2 3 4 5 6 7
5308 2nd vec: 8 9 10 11 12 13 14 15
5309 3rd vec: 16 17 18 19 20 21 22 23
5310
5311 The output sequence should be:
5312
5313 1st vec: 0 3 6 9 12 15 18 21
5314 2nd vec: 1 4 7 10 13 16 19 22
5315 3rd vec: 2 5 8 11 14 17 20 23
5316
5317 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5318
5319 First we shuffle all 3 vectors to get the correct element order:
5320
5321 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5322 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5323 3rd vec: (16 19 22) (17 20 23) (18 21)
5324
5325 Next we unite and shift the vectors 3 times:
5326
5327 1st step:
5328 shift right by 6 the concatenation of:
5329 "1st vec" and "2nd vec"
5330 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5331 "2nd vec" and "3rd vec"
5332 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5333 "3rd vec" and "1st vec"
5334 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5335 | New vectors |
5336
5337 So that now new vectors are:
5338
5339 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5340 2nd vec: (10 13) (16 19 22) (17 20 23)
5341 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5342
5343 2nd step:
5344 shift right by 5 the concatenation of:
5345 "1st vec" and "3rd vec"
5346 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5347 "2nd vec" and "1st vec"
5348 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5349 "3rd vec" and "2nd vec"
5350 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5351 | New vectors |
5352
5353 So that now new vectors are:
5354
5355 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5356 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5357 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5358
5359 3rd step:
5360 shift right by 5 the concatenation of:
5361 "1st vec" and "1st vec"
5362 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5363 shift right by 3 the concatenation of:
5364 "2nd vec" and "2nd vec"
5365 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5366 | New vectors |
5367
5368 So that now all vectors are READY:
5369 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5370 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5371 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5372
5373 This algorithm is faster than the one in vect_permute_load_chain if:
5374 1. "shift of a concatenation" is faster than a general permutation.
5375 This is usually so.
5376 2. The TARGET machine can't execute vector instructions in parallel.
5377 This is because each step of the algorithm depends on the previous one.
5378 The algorithm in vect_permute_load_chain is much more parallel.
5379
5380 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5381 */
5382
5383 static bool
5384 vect_shift_permute_load_chain (vec<tree> dr_chain,
5385 unsigned int length,
5386 gimple *stmt,
5387 gimple_stmt_iterator *gsi,
5388 vec<tree> *result_chain)
5389 {
5390 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5391 tree perm2_mask1, perm2_mask2, perm3_mask;
5392 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5393 gimple *perm_stmt;
5394
5395 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5396 unsigned int i;
5397 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5398 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5399 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5400 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5401
5402 result_chain->quick_grow (length);
5403 memcpy (result_chain->address (), dr_chain.address (),
5404 length * sizeof (tree));
5405
5406 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5407 {
5408 unsigned int j, log_length = exact_log2 (length);
5409 for (i = 0; i < nelt / 2; ++i)
5410 sel[i] = i * 2;
5411 for (i = 0; i < nelt / 2; ++i)
5412 sel[nelt / 2 + i] = i * 2 + 1;
5413 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5414 {
5415 if (dump_enabled_p ())
5416 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5417 "shuffle of 2 fields structure is not \
5418 supported by target\n");
5419 return false;
5420 }
5421 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5422
5423 for (i = 0; i < nelt / 2; ++i)
5424 sel[i] = i * 2 + 1;
5425 for (i = 0; i < nelt / 2; ++i)
5426 sel[nelt / 2 + i] = i * 2;
5427 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5428 {
5429 if (dump_enabled_p ())
5430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5431 "shuffle of 2 fields structure is not \
5432 supported by target\n");
5433 return false;
5434 }
5435 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5436
5437 /* Generating permutation constant to shift all elements.
5438 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5439 for (i = 0; i < nelt; i++)
5440 sel[i] = nelt / 2 + i;
5441 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5442 {
5443 if (dump_enabled_p ())
5444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5445 "shift permutation is not supported by target\n");
5446 return false;
5447 }
5448 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5449
5450 /* Generating permutation constant to select vector from 2.
5451 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5452 for (i = 0; i < nelt / 2; i++)
5453 sel[i] = i;
5454 for (i = nelt / 2; i < nelt; i++)
5455 sel[i] = nelt + i;
5456 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5457 {
5458 if (dump_enabled_p ())
5459 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5460 "select is not supported by target\n");
5461 return false;
5462 }
5463 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5464
5465 for (i = 0; i < log_length; i++)
5466 {
5467 for (j = 0; j < length; j += 2)
5468 {
5469 first_vect = dr_chain[j];
5470 second_vect = dr_chain[j + 1];
5471
5472 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5473 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5474 first_vect, first_vect,
5475 perm2_mask1);
5476 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5477 vect[0] = data_ref;
5478
5479 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5480 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5481 second_vect, second_vect,
5482 perm2_mask2);
5483 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5484 vect[1] = data_ref;
5485
5486 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5487 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5488 vect[0], vect[1], shift1_mask);
5489 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5490 (*result_chain)[j/2 + length/2] = data_ref;
5491
5492 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5493 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5494 vect[0], vect[1], select_mask);
5495 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5496 (*result_chain)[j/2] = data_ref;
5497 }
5498 memcpy (dr_chain.address (), result_chain->address (),
5499 length * sizeof (tree));
5500 }
5501 return true;
5502 }
5503 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5504 {
5505 unsigned int k = 0, l = 0;
5506
5507 /* Generating permutation constant to get all elements in right order.
5508 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5509 for (i = 0; i < nelt; i++)
5510 {
5511 if (3 * k + (l % 3) >= nelt)
5512 {
5513 k = 0;
5514 l += (3 - (nelt % 3));
5515 }
5516 sel[i] = 3 * k + (l % 3);
5517 k++;
5518 }
5519 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5520 {
5521 if (dump_enabled_p ())
5522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5523 "shuffle of 3 fields structure is not \
5524 supported by target\n");
5525 return false;
5526 }
5527 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5528
5529 /* Generating permutation constant to shift all elements.
5530 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5531 for (i = 0; i < nelt; i++)
5532 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5533 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5534 {
5535 if (dump_enabled_p ())
5536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5537 "shift permutation is not supported by target\n");
5538 return false;
5539 }
5540 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5541
5542 /* Generating permutation constant to shift all elements.
5543 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5544 for (i = 0; i < nelt; i++)
5545 sel[i] = 2 * (nelt / 3) + 1 + i;
5546 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5547 {
5548 if (dump_enabled_p ())
5549 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5550 "shift permutation is not supported by target\n");
5551 return false;
5552 }
5553 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5554
5555 /* Generating permutation constant to shift all elements.
5556 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5557 for (i = 0; i < nelt; i++)
5558 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5559 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5560 {
5561 if (dump_enabled_p ())
5562 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5563 "shift permutation is not supported by target\n");
5564 return false;
5565 }
5566 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5567
5568 /* Generating permutation constant to shift all elements.
5569 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5570 for (i = 0; i < nelt; i++)
5571 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5572 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5573 {
5574 if (dump_enabled_p ())
5575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5576 "shift permutation is not supported by target\n");
5577 return false;
5578 }
5579 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5580
5581 for (k = 0; k < 3; k++)
5582 {
5583 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5584 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5585 dr_chain[k], dr_chain[k],
5586 perm3_mask);
5587 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5588 vect[k] = data_ref;
5589 }
5590
5591 for (k = 0; k < 3; k++)
5592 {
5593 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5594 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5595 vect[k % 3], vect[(k + 1) % 3],
5596 shift1_mask);
5597 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5598 vect_shift[k] = data_ref;
5599 }
5600
5601 for (k = 0; k < 3; k++)
5602 {
5603 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5604 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5605 vect_shift[(4 - k) % 3],
5606 vect_shift[(3 - k) % 3],
5607 shift2_mask);
5608 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5609 vect[k] = data_ref;
5610 }
5611
5612 (*result_chain)[3 - (nelt % 3)] = vect[2];
5613
5614 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5615 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5616 vect[0], shift3_mask);
5617 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5618 (*result_chain)[nelt % 3] = data_ref;
5619
5620 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5621 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5622 vect[1], shift4_mask);
5623 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5624 (*result_chain)[0] = data_ref;
5625 return true;
5626 }
5627 return false;
5628 }
5629
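/* Illustrative sketch only, guarded out of the build: a plain-C model of the
   2-field path of vect_shift_permute_load_chain for NELT == 8 and LENGTH == 2,
   spelling out the masks computed above (perm2_mask1 = {0 2 4 6 1 3 5 7},
   perm2_mask2 = {1 3 5 7 0 2 4 6}, shift1_mask = {4 ... 11} and
   select_mask = {0 1 2 3 12 13 14 15}).  The helper names, the int element
   type and the fixed sizes are assumptions made for this example; this is
   not vectorizer code.  */
#if 0
/* VEC_PERM_EXPR semantics on two 8-element operands: index I selects
   element I of the 16-element concatenation of A and B.  */
static void
vec_perm8_sketch (const int a[8], const int b[8],
                  const unsigned char sel[8], int out[8])
{
  for (int i = 0; i < 8; i++)
    out[i] = sel[i] < 8 ? a[sel[i]] : b[sel[i] - 8];
}

static void
shift_permute2_sketch (const int v0[8], const int v1[8],
                       int even[8], int odd[8])
{
  static const unsigned char perm2_mask1[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
  static const unsigned char perm2_mask2[8] = { 1, 3, 5, 7, 0, 2, 4, 6 };
  static const unsigned char shift1_mask[8] = { 4, 5, 6, 7, 8, 9, 10, 11 };
  static const unsigned char select_mask[8] = { 0, 1, 2, 3, 12, 13, 14, 15 };
  int t0[8], t1[8];

  /* Shuffle each input vector against itself, as the loop above does.  */
  vec_perm8_sketch (v0, v0, perm2_mask1, t0);
  vec_perm8_sketch (v1, v1, perm2_mask2, t1);
  /* The shift of the concatenation yields the odd elements ...  */
  vec_perm8_sketch (t0, t1, shift1_mask, odd);
  /* ... and the select yields the even elements.  */
  vec_perm8_sketch (t0, t1, select_mask, even);
  /* With v0 = {0..7} and v1 = {8..15} this produces
     even = {0 2 4 6 8 10 12 14} and odd = {1 3 5 7 9 11 13 15}.  */
}
#endif
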
5630 /* Function vect_transform_grouped_load.
5631
5632 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5633 to perform their permutation and ascribe the resulting vectorized statements to
5634 the scalar statements.
5635 */
5636
5637 void
5638 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5639 gimple_stmt_iterator *gsi)
5640 {
5641 machine_mode mode;
5642 vec<tree> result_chain = vNULL;
5643
5644 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5645 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5646 vectors that are ready for vector computation. */
5647 result_chain.create (size);
5648
5649 /* If the reassociation width for the vector type is 2 or greater, the target
5650 machine can execute 2 or more vector instructions in parallel; otherwise,
5651 for a non-power-of-two group size, try vect_shift_permute_load_chain first. */
5652 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5653 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5654 || exact_log2 (size) != -1
5655 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5656 gsi, &result_chain))
5657 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5658 vect_record_grouped_load_vectors (stmt, result_chain);
5659 result_chain.release ();
5660 }
5661
5662 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5663 generated as part of the vectorization of STMT. Assign the statement
5664 for each vector to the associated scalar statement. */
5665
5666 void
5667 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5668 {
5669 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5670 gimple *next_stmt, *new_stmt;
5671 unsigned int i, gap_count;
5672 tree tmp_data_ref;
5673
5674 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5675 Since we scan the chain starting from its first node, their order
5676 corresponds to the order of data-refs in RESULT_CHAIN. */
5677 next_stmt = first_stmt;
5678 gap_count = 1;
5679 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5680 {
5681 if (!next_stmt)
5682 break;
5683
5684 /* Skip the gaps. Loads created for the gaps will be removed later by the
5685 dead code elimination pass. No need to check for the first stmt in
5686 the group, since it always exists.
5687 GROUP_GAP is the number of steps in elements from the previous
5688 access (if there is no gap GROUP_GAP is 1). We skip loads that
5689 correspond to the gaps. */
5690 if (next_stmt != first_stmt
5691 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5692 {
5693 gap_count++;
5694 continue;
5695 }
5696
5697 while (next_stmt)
5698 {
5699 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5700 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5701 copies, and we put the new vector statement in the first available
5702 RELATED_STMT. */
5703 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5704 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5705 else
5706 {
5707 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5708 {
5709 gimple *prev_stmt =
5710 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5711 gimple *rel_stmt =
5712 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5713 while (rel_stmt)
5714 {
5715 prev_stmt = rel_stmt;
5716 rel_stmt =
5717 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5718 }
5719
5720 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5721 new_stmt;
5722 }
5723 }
5724
5725 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5726 gap_count = 1;
5727 /* If NEXT_STMT accesses the same DR as the previous statement,
5728 put the same TMP_DATA_REF as its vectorized statement; otherwise
5729 get the next data-ref from RESULT_CHAIN. */
5730 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5731 break;
5732 }
5733 }
5734 }
5735
5736 /* Function vect_can_force_dr_alignment_p.
5737
5738 Returns whether the alignment of a DECL can be forced to be aligned
5739 on an ALIGNMENT-bit boundary. */
5740
5741 bool
5742 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5743 {
5744 if (TREE_CODE (decl) != VAR_DECL)
5745 return false;
5746
5747 if (decl_in_symtab_p (decl)
5748 && !symtab_node::get (decl)->can_increase_alignment_p ())
5749 return false;
5750
5751 if (TREE_STATIC (decl))
5752 return (alignment <= MAX_OFILE_ALIGNMENT);
5753 else
5754 return (alignment <= MAX_STACK_ALIGNMENT);
5755 }
5756
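/* Illustrative examples only, guarded out of the build: the kinds of
   declarations the predicate above typically accepts or rejects.  The names
   are made up for this example.  */
#if 0
static int file_static[256];    /* VAR_DECL defined in this unit and
                                   TREE_STATIC: its alignment can usually be
                                   raised, up to MAX_OFILE_ALIGNMENT.  */
extern int external_array[256]; /* Defined elsewhere: its symtab node
                                   normally refuses can_increase_alignment_p,
                                   so the declared alignment must be taken
                                   as-is.  */
void
force_alignment_examples (void)
{
  int automatic_array[256];     /* Automatic storage: limited by
                                   MAX_STACK_ALIGNMENT.  */
  (void) automatic_array;
  (void) file_static;
  (void) external_array;
}
#endif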
5757
5758 /* Return whether the data reference DR is supported with respect to its
5759 alignment.
5760 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5761 if it is aligned, i.e., check if it is possible to vectorize it with different
5762 alignment. */
5763
5764 enum dr_alignment_support
5765 vect_supportable_dr_alignment (struct data_reference *dr,
5766 bool check_aligned_accesses)
5767 {
5768 gimple *stmt = DR_STMT (dr);
5769 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5770 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5771 machine_mode mode = TYPE_MODE (vectype);
5772 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5773 struct loop *vect_loop = NULL;
5774 bool nested_in_vect_loop = false;
5775
5776 if (aligned_access_p (dr) && !check_aligned_accesses)
5777 return dr_aligned;
5778
5779 /* For now assume all conditional loads/stores support unaligned
5780 access without any special code. */
5781 if (is_gimple_call (stmt)
5782 && gimple_call_internal_p (stmt)
5783 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5784 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5785 return dr_unaligned_supported;
5786
5787 if (loop_vinfo)
5788 {
5789 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5790 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5791 }
5792
5793 /* Possibly unaligned access. */
5794
5795 /* We can choose between using the implicit realignment scheme (generating
5796 a misaligned_move stmt) and the explicit realignment scheme (generating
5797 aligned loads with a REALIGN_LOAD). There are two variants to the
5798 explicit realignment scheme: optimized, and unoptimized.
5799 We can optimize the realignment only if the step between consecutive
5800 vector loads is equal to the vector size. Since the vector memory
5801 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5802 is guaranteed that the misalignment amount remains the same throughout the
5803 execution of the vectorized loop. Therefore, we can create the
5804 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5805 at the loop preheader.
5806
5807 However, in the case of outer-loop vectorization, when vectorizing a
5808 memory access in the inner-loop nested within the LOOP that is now being
5809 vectorized, while it is guaranteed that the misalignment of the
5810 vectorized memory access will remain the same in different outer-loop
5811 iterations, it is *not* guaranteed that it will remain the same throughout
5812 the execution of the inner-loop. This is because the inner-loop advances
5813 with the original scalar step (and not in steps of VS). If the inner-loop
5814 step happens to be a multiple of VS, then the misalignment remains fixed
5815 and we can use the optimized realignment scheme. For example:
5816
5817 for (i=0; i<N; i++)
5818 for (j=0; j<M; j++)
5819 s += a[i+j];
5820
5821 When vectorizing the i-loop in the above example, the step between
5822 consecutive vector loads is 1, and so the misalignment does not remain
5823 fixed across the execution of the inner-loop, and the realignment cannot
5824 be optimized (as illustrated in the following pseudo vectorized loop):
5825
5826 for (i=0; i<N; i+=4)
5827 for (j=0; j<M; j++){
5828 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5829 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5830 // (assuming that we start from an aligned address).
5831 }
5832
5833 We therefore have to use the unoptimized realignment scheme:
5834
5835 for (i=0; i<N; i+=4)
5836 for (j=k; j<M; j+=4)
5837 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5838 // that the misalignment of the initial address is
5839 // 0).
5840
5841 The loop can then be vectorized as follows:
5842
5843 for (k=0; k<4; k++){
5844 rt = get_realignment_token (&vp[k]);
5845 for (i=0; i<N; i+=4){
5846 v1 = vp[i+k];
5847 for (j=k; j<M; j+=4){
5848 v2 = vp[i+j+VS-1];
5849 va = REALIGN_LOAD <v1,v2,rt>;
5850 vs += va;
5851 v1 = v2;
5852 }
5853 }
5854 } */
5855
5856 if (DR_IS_READ (dr))
5857 {
5858 bool is_packed = false;
5859 tree type = (TREE_TYPE (DR_REF (dr)));
5860
5861 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5862 && (!targetm.vectorize.builtin_mask_for_load
5863 || targetm.vectorize.builtin_mask_for_load ()))
5864 {
5865 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5866 if ((nested_in_vect_loop
5867 && (TREE_INT_CST_LOW (DR_STEP (dr))
5868 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5869 || !loop_vinfo)
5870 return dr_explicit_realign;
5871 else
5872 return dr_explicit_realign_optimized;
5873 }
5874 if (!known_alignment_for_access_p (dr))
5875 is_packed = not_size_aligned (DR_REF (dr));
5876
5877 if ((TYPE_USER_ALIGN (type) && !is_packed)
5878 || targetm.vectorize.
5879 support_vector_misalignment (mode, type,
5880 DR_MISALIGNMENT (dr), is_packed))
5881 /* Can't software pipeline the loads, but can at least do them. */
5882 return dr_unaligned_supported;
5883 }
5884 else
5885 {
5886 bool is_packed = false;
5887 tree type = (TREE_TYPE (DR_REF (dr)));
5888
5889 if (!known_alignment_for_access_p (dr))
5890 is_packed = not_size_aligned (DR_REF (dr));
5891
5892 if ((TYPE_USER_ALIGN (type) && !is_packed)
5893 || targetm.vectorize.
5894 support_vector_misalignment (mode, type,
5895 DR_MISALIGNMENT (dr), is_packed))
5896 return dr_unaligned_supported;
5897 }
5898
5899 /* Unsupported. */
5900 return dr_unaligned_unsupported;
5901 }
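
/* Illustrative sketch only, guarded out of the build: a rough scalar model of
   the explicit realignment scheme described in the comment inside
   vect_supportable_dr_alignment.  The real realignment token is produced by
   the target (targetm.vectorize.builtin_mask_for_load) and its encoding is
   target specific; here it is modelled simply as the misalignment counted in
   elements.  The helper names, the int element type and the 4-element vector
   size are assumptions made for this example; this is not vectorizer code.  */
#if 0
/* Model of REALIGN_LOAD <V1, V2, RT>: select VS consecutive elements,
   starting at offset RT, from the concatenation of the two aligned
   loads V1 and V2.  */
static void
realign_load_sketch (const int v1[4], const int v2[4], unsigned int rt,
                     int out[4])
{
  for (unsigned int i = 0; i < 4; i++)
    out[i] = rt + i < 4 ? v1[rt + i] : v2[rt + i - 4];
}

/* Sum N elements of the misaligned sequence starting RT elements after
   ALIGNED_BASE, using only aligned vector loads plus realignment, as in the
   pseudo code of the comment above.  N is assumed to be a multiple of 4 and,
   like the real scheme, this reads one aligned vector beyond the last
   misaligned element, so the caller must provide N + 4 readable elements.  */
static int
sum_with_realign_sketch (const int *aligned_base, unsigned int rt, int n)
{
  int v1[4], v2[4], va[4], sum = 0;

  /* v1 = first aligned vector load, hoisted out of the loop.  */
  for (int i = 0; i < 4; i++)
    v1[i] = aligned_base[i];
  for (int j = 0; j < n; j += 4)
    {
      /* v2 = next aligned vector load (the "vp[i+j+VS-1]" load above).  */
      for (int i = 0; i < 4; i++)
        v2[i] = aligned_base[j + 4 + i];
      realign_load_sketch (v1, v2, rt, va);
      for (int i = 0; i < 4; i++)
        sum += va[i];
      /* v1 = v2, so each aligned vector is loaded only once.  */
      for (int i = 0; i < 4; i++)
        v1[i] = v2[i];
    }
  return sum;
}
#endif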