1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "backend.h"
27 #include "predict.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "rtl.h"
31 #include "ssa.h"
32 #include "alias.h"
33 #include "fold-const.h"
34 #include "stor-layout.h"
35 #include "tm_p.h"
36 #include "target.h"
37 #include "gimple-pretty-print.h"
38 #include "internal-fn.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-chrec.h"
48 #include "tree-scalar-evolution.h"
49 #include "tree-vectorizer.h"
50 #include "diagnostic-core.h"
51 #include "cgraph.h"
52 #include "expr.h"
53 #include "insn-codes.h"
54 #include "optabs-tree.h"
55 #include "builtins.h"
56 #include "params.h"
57
58 /* Return true if load- or store-lanes optab OPTAB is implemented for
59 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
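/* Illustrative example (not target-specific): for a 128-bit vector of
   four ints and COUNT == 2, the target must provide a 256-bit integer
   array mode and a vec_load_lanes / vec_store_lanes pattern converting
   between that array mode and the vector mode, e.g. the ld2/st2 style
   structure loads and stores on some targets. */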
60
61 static bool
62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
63 tree vectype, unsigned HOST_WIDE_INT count)
64 {
65 machine_mode mode, array_mode;
66 bool limit_p;
67
68 mode = TYPE_MODE (vectype);
69 limit_p = !targetm.array_mode_supported_p (mode, count);
70 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
71 MODE_INT, limit_p);
72
73 if (array_mode == BLKmode)
74 {
75 if (dump_enabled_p ())
76 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
77 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
78 GET_MODE_NAME (mode), count);
79 return false;
80 }
81
82 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
83 {
84 if (dump_enabled_p ())
85 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
86 "cannot use %s<%s><%s>\n", name,
87 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
88 return false;
89 }
90
91 if (dump_enabled_p ())
92 dump_printf_loc (MSG_NOTE, vect_location,
93 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
94 GET_MODE_NAME (mode));
95
96 return true;
97 }
98
99
100 /* Return the smallest scalar type used by STMT.
101 This is used to determine the vectype of the stmt. We generally set the
102 vectype according to the type of the result (lhs). For stmts whose
103 result-type is different from the type of the arguments (e.g., demotion,
104 promotion), vectype will be reset appropriately (later). Note that we have
105 to visit the smallest datatype in this function, because that determines the
106 VF. If the smallest datatype in the loop appeared only as the rhs of a
107 promotion operation, we would otherwise miss it.
108 Such a case, where a variable of this datatype does not appear in the lhs
109 anywhere in the loop, can only occur if it's an invariant: e.g.
110 'int_x = (int) short_inv', which we'd expect to have been optimized away by
111 invariant motion. However, we cannot rely on invariant motion to always
112 take invariants out of the loop, and so in the case of promotion we also
113 have to check the rhs.
114 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
115 types. */
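/* Illustrative example: for the promotion 'int_x = (int) short_y' this
   function returns the 2-byte type of short_y with *LHS_SIZE_UNIT == 4
   and *RHS_SIZE_UNIT == 2, so the VF is derived from the smaller type. */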
116
117 tree
118 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
119 HOST_WIDE_INT *rhs_size_unit)
120 {
121 tree scalar_type = gimple_expr_type (stmt);
122 HOST_WIDE_INT lhs, rhs;
123
124 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
125
126 if (is_gimple_assign (stmt)
127 && (gimple_assign_cast_p (stmt)
128 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
129 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
130 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
131 {
132 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
133
134 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
135 if (rhs < lhs)
136 scalar_type = rhs_type;
137 }
138
139 *lhs_size_unit = lhs;
140 *rhs_size_unit = rhs;
141 return scalar_type;
142 }
143
144
145 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
146 tested at run-time. Return TRUE if DDR was successfully inserted.
147 Return false if versioning is not supported. */
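/* Conceptually (illustrative, assuming two unit-stride accesses of SIZE
   bytes over NITERS iterations), a DDR recorded here later becomes a
   versioning check of the form

     if (a + NITERS * SIZE <= b || b + NITERS * SIZE <= a)
       ... vectorized loop ...
     else
       ... scalar loop ... */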
148
149 static bool
150 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
151 {
152 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
153
154 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
155 return false;
156
157 if (dump_enabled_p ())
158 {
159 dump_printf_loc (MSG_NOTE, vect_location,
160 "mark for run-time aliasing test between ");
161 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
162 dump_printf (MSG_NOTE, " and ");
163 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
164 dump_printf (MSG_NOTE, "\n");
165 }
166
167 if (optimize_loop_nest_for_size_p (loop))
168 {
169 if (dump_enabled_p ())
170 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
171 "versioning not supported when optimizing"
172 " for size.\n");
173 return false;
174 }
175
176 /* FORNOW: We don't support versioning with outer-loop vectorization. */
177 if (loop->inner)
178 {
179 if (dump_enabled_p ())
180 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
181 "versioning not yet supported for outer-loops.\n");
182 return false;
183 }
184
185 /* FORNOW: We don't support creating runtime alias tests for non-constant
186 step. */
187 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
188 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
189 {
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
192 "versioning not yet supported for non-constant "
193 "step\n");
194 return false;
195 }
196
197 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
198 return true;
199 }
200
201
202 /* Function vect_analyze_data_ref_dependence.
203
204 Return TRUE if there (might) exist a dependence between a memory-reference
205 DRA and a memory-reference DRB that cannot be handled. Return FALSE if the
206 dependence can be ignored or checked at run time by versioning for alias.
207 Adjust *MAX_VF according to the data dependence. */
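/* Illustrative example: in

     for (i = 0; i < n; i++)
       a[i + 4] = a[i] + 1;

   the read a[i] and the write a[i+4] have dependence distance 4, so
   *MAX_VF is reduced to 4 below (if it was larger); with a VF of at
   most 4 the loads of one vector iteration never overlap the stores
   of that same iteration. */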
208
209 static bool
210 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
211 loop_vec_info loop_vinfo, int *max_vf)
212 {
213 unsigned int i;
214 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
215 struct data_reference *dra = DDR_A (ddr);
216 struct data_reference *drb = DDR_B (ddr);
217 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
218 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
219 lambda_vector dist_v;
220 unsigned int loop_depth;
221
222 /* In loop analysis all data references should be vectorizable. */
223 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
224 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
225 gcc_unreachable ();
226
227 /* Independent data accesses. */
228 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
229 return false;
230
231 if (dra == drb
232 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
233 return false;
234
235 /* Even if we have an anti-dependence then, as the vectorized loop covers at
236 least two scalar iterations, there is always also a true dependence.
237 As the vectorizer does not re-order loads and stores we can ignore
238 the anti-dependence if TBAA can disambiguate both DRs similar to the
239 case with known negative distance anti-dependences (positive
240 distance anti-dependences would violate TBAA constraints). */
241 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
242 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
243 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
244 get_alias_set (DR_REF (drb))))
245 return false;
246
247 /* Unknown data dependence. */
248 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
249 {
250 /* If user asserted safelen consecutive iterations can be
251 executed concurrently, assume independence. */
252 if (loop->safelen >= 2)
253 {
254 if (loop->safelen < *max_vf)
255 *max_vf = loop->safelen;
256 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
257 return false;
258 }
259
260 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
261 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
262 {
263 if (dump_enabled_p ())
264 {
265 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
266 "versioning for alias not supported for: "
267 "can't determine dependence between ");
268 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
269 DR_REF (dra));
270 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
271 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
272 DR_REF (drb));
273 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
274 }
275 return true;
276 }
277
278 if (dump_enabled_p ())
279 {
280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
281 "versioning for alias required: "
282 "can't determine dependence between ");
283 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
284 DR_REF (dra));
285 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
286 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
287 DR_REF (drb));
288 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
289 }
290
291 /* Add to list of ddrs that need to be tested at run-time. */
292 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
293 }
294
295 /* Known data dependence. */
296 if (DDR_NUM_DIST_VECTS (ddr) == 0)
297 {
298 /* If user asserted safelen consecutive iterations can be
299 executed concurrently, assume independence. */
300 if (loop->safelen >= 2)
301 {
302 if (loop->safelen < *max_vf)
303 *max_vf = loop->safelen;
304 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
305 return false;
306 }
307
308 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
309 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
310 {
311 if (dump_enabled_p ())
312 {
313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
314 "versioning for alias not supported for: "
315 "bad dist vector for ");
316 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
317 DR_REF (dra));
318 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
319 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
320 DR_REF (drb));
321 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
322 }
323 return true;
324 }
325
326 if (dump_enabled_p ())
327 {
328 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
329 "versioning for alias required: "
330 "bad dist vector for ");
331 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
332 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
333 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
334 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
335 }
336 /* Add to list of ddrs that need to be tested at run-time. */
337 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
338 }
339
340 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
341 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
342 {
343 int dist = dist_v[loop_depth];
344
345 if (dump_enabled_p ())
346 dump_printf_loc (MSG_NOTE, vect_location,
347 "dependence distance = %d.\n", dist);
348
349 if (dist == 0)
350 {
351 if (dump_enabled_p ())
352 {
353 dump_printf_loc (MSG_NOTE, vect_location,
354 "dependence distance == 0 between ");
355 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
356 dump_printf (MSG_NOTE, " and ");
357 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
358 dump_printf (MSG_NOTE, "\n");
359 }
360
361 /* When we perform grouped accesses and perform implicit CSE
362 by detecting equal accesses and doing disambiguation with
363 runtime alias tests like for
364 .. = a[i];
365 .. = a[i+1];
366 a[i] = ..;
367 a[i+1] = ..;
368 *p = ..;
369 .. = a[i];
370 .. = a[i+1];
371 where we will end up loading { a[i], a[i+1] } once, make
372 sure that inserting group loads before the first load and
373 stores after the last store will do the right thing.
374 Similar for groups like
375 a[i] = ...;
376 ... = a[i];
377 a[i+1] = ...;
378 where loads from the group interleave with the store. */
379 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
380 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
381 {
382 gimple *earlier_stmt;
383 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
384 if (DR_IS_WRITE
385 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
386 {
387 if (dump_enabled_p ())
388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
389 "READ_WRITE dependence in interleaving."
390 "\n");
391 return true;
392 }
393 }
394
395 continue;
396 }
397
398 if (dist > 0 && DDR_REVERSED_P (ddr))
399 {
400 /* If DDR_REVERSED_P the order of the data-refs in DDR was
401 reversed (to make distance vector positive), and the actual
402 distance is negative. */
403 if (dump_enabled_p ())
404 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
405 "dependence distance negative.\n");
406 /* Record a negative dependence distance to later limit the
407 amount of stmt copying / unrolling we can perform.
408 Only need to handle read-after-write dependence. */
409 if (DR_IS_READ (drb)
410 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
411 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
412 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
413 continue;
414 }
415
416 if (abs (dist) >= 2
417 && abs (dist) < *max_vf)
418 {
419 /* The dependence distance requires reduction of the maximal
420 vectorization factor. */
421 *max_vf = abs (dist);
422 if (dump_enabled_p ())
423 dump_printf_loc (MSG_NOTE, vect_location,
424 "adjusting maximal vectorization factor to %i\n",
425 *max_vf);
426 }
427
428 if (abs (dist) >= *max_vf)
429 {
430 /* Dependence distance does not create dependence, as far as
431 vectorization is concerned, in this case. */
432 if (dump_enabled_p ())
433 dump_printf_loc (MSG_NOTE, vect_location,
434 "dependence distance >= VF.\n");
435 continue;
436 }
437
438 if (dump_enabled_p ())
439 {
440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
441 "not vectorized, possible dependence "
442 "between data-refs ");
443 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
444 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
445 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
446 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
447 }
448
449 return true;
450 }
451
452 return false;
453 }
454
455 /* Function vect_analyze_data_ref_dependences.
456
457 Examine all the data references in the loop, and make sure there do not
458 exist any data dependences between them. Set *MAX_VF according to
459 the maximum vectorization factor the data dependences allow. */
460
461 bool
462 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
463 {
464 unsigned int i;
465 struct data_dependence_relation *ddr;
466
467 if (dump_enabled_p ())
468 dump_printf_loc (MSG_NOTE, vect_location,
469 "=== vect_analyze_data_ref_dependences ===\n");
470
471 LOOP_VINFO_DDRS (loop_vinfo)
472 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
473 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
474 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
475 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
476 &LOOP_VINFO_DDRS (loop_vinfo),
477 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
478 return false;
479
480 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
481 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
482 return false;
483
484 return true;
485 }
486
487
488 /* Function vect_slp_analyze_data_ref_dependence.
489
490 Return TRUE if there (might) exist a dependence between a memory-reference
491 DRA and a memory-reference DRB that prevents basic-block (SLP)
492 vectorization. Return FALSE if the two references are independent or the
493 dependence is harmless for SLP. */
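/* Illustrative example: within a basic block the pair

     x = a[0];
     a[0] = y;

   is acceptable because the load comes first, while the reverse order

     a[0] = y;
     x = a[0];

   makes the earlier statement a write and rejects SLP vectorization. */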
494
495 static bool
496 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
497 {
498 struct data_reference *dra = DDR_A (ddr);
499 struct data_reference *drb = DDR_B (ddr);
500
501 /* We need to check dependences of statements marked as unvectorizable
502 as well, they still can prohibit vectorization. */
503
504 /* Independent data accesses. */
505 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
506 return false;
507
508 if (dra == drb)
509 return false;
510
511 /* Read-read is OK. */
512 if (DR_IS_READ (dra) && DR_IS_READ (drb))
513 return false;
514
515 /* If dra and drb are part of the same interleaving chain consider
516 them independent. */
517 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
518 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
519 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
520 return false;
521
522 /* Unknown data dependence. */
523 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
524 {
525 if (dump_enabled_p ())
526 {
527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
528 "can't determine dependence between ");
529 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
530 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
531 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
532 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
533 }
534 }
535 else if (dump_enabled_p ())
536 {
537 dump_printf_loc (MSG_NOTE, vect_location,
538 "determined dependence between ");
539 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
540 dump_printf (MSG_NOTE, " and ");
541 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
542 dump_printf (MSG_NOTE, "\n");
543 }
544
545 /* We do not vectorize basic blocks with write-write dependencies. */
546 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
547 return true;
548
549 /* If we have a read-write dependence, check that the load is before the store.
550 When we vectorize basic blocks, the vector load can only be placed before
551 its corresponding scalar load, and the vector store can only be placed after
552 its corresponding scalar store. So the order of the accesses is preserved
553 if the load comes before the store. */
554 gimple *earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
555 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
556 {
557 /* That only holds for load-store pairs taking part in vectorization. */
558 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
559 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
560 return false;
561 }
562
563 return true;
564 }
565
566
567 /* Function vect_slp_analyze_data_ref_dependences.
568
569 Examine all the data references in the basic-block, and make sure there
570 do not exist any data dependences between them that would prevent SLP
571 vectorization. */
572
573 bool
574 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
575 {
576 struct data_dependence_relation *ddr;
577 unsigned int i;
578
579 if (dump_enabled_p ())
580 dump_printf_loc (MSG_NOTE, vect_location,
581 "=== vect_slp_analyze_data_ref_dependences ===\n");
582
583 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
584 &BB_VINFO_DDRS (bb_vinfo),
585 vNULL, true))
586 return false;
587
588 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
589 if (vect_slp_analyze_data_ref_dependence (ddr))
590 return false;
591
592 return true;
593 }
594
595
596 /* Function vect_compute_data_ref_alignment
597
598 Compute the misalignment of the data reference DR.
599
600 Output:
601 1. If during the misalignment computation it is found that the data reference
602 cannot be vectorized then false is returned.
603 2. DR_MISALIGNMENT (DR) is defined.
604
605 FOR NOW: No analysis is actually performed. Misalignment is calculated
606 only for trivial cases. TODO. */
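/* Illustrative example: given

     int a[256] __attribute__((aligned(16)));

   and an access a[i+1] vectorized with a 16-byte vector type, the base
   is 16-byte aligned and DR_INIT is 4, so DR_MISALIGNMENT is set to 4;
   the corresponding access a[i] would get DR_MISALIGNMENT 0. */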
607
608 static bool
609 vect_compute_data_ref_alignment (struct data_reference *dr)
610 {
611 gimple *stmt = DR_STMT (dr);
612 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
613 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
614 struct loop *loop = NULL;
615 tree ref = DR_REF (dr);
616 tree vectype;
617 tree base, base_addr;
618 tree misalign = NULL_TREE;
619 tree aligned_to;
620 unsigned HOST_WIDE_INT alignment;
621
622 if (dump_enabled_p ())
623 dump_printf_loc (MSG_NOTE, vect_location,
624 "vect_compute_data_ref_alignment:\n");
625
626 if (loop_vinfo)
627 loop = LOOP_VINFO_LOOP (loop_vinfo);
628
629 /* Initialize misalignment to unknown. */
630 SET_DR_MISALIGNMENT (dr, -1);
631
632 /* Strided accesses perform only component accesses, misalignment information
633 is irrelevant for them. */
634 if (STMT_VINFO_STRIDED_P (stmt_info)
635 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
636 return true;
637
638 if (tree_fits_shwi_p (DR_STEP (dr)))
639 misalign = DR_INIT (dr);
640 aligned_to = DR_ALIGNED_TO (dr);
641 base_addr = DR_BASE_ADDRESS (dr);
642 vectype = STMT_VINFO_VECTYPE (stmt_info);
643
644 /* In case the dataref is in an inner-loop of the loop that is being
645 vectorized (LOOP), we use the base and misalignment information
646 relative to the outer-loop (LOOP). This is ok only if the misalignment
647 stays the same throughout the execution of the inner-loop, which is why
648 we have to check that the stride of the dataref in the inner-loop is a
649 multiple of the vector size. */
650 if (loop && nested_in_vect_loop_p (loop, stmt))
651 {
652 tree step = DR_STEP (dr);
653
654 if (tree_fits_shwi_p (step)
655 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
656 {
657 if (dump_enabled_p ())
658 dump_printf_loc (MSG_NOTE, vect_location,
659 "inner step divides the vector-size.\n");
660 misalign = STMT_VINFO_DR_INIT (stmt_info);
661 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
662 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
663 }
664 else
665 {
666 if (dump_enabled_p ())
667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
668 "inner step doesn't divide the vector-size.\n");
669 misalign = NULL_TREE;
670 }
671 }
672
673 /* Similarly we can only use base and misalignment information relative to
674 an innermost loop if the misalignment stays the same throughout the
675 execution of the loop. As above, this is the case if the stride of the
676 dataref times the vectorization factor is a multiple of the vector size. */
677 else
678 {
679 tree step = DR_STEP (dr);
680 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
681
682 if (tree_fits_shwi_p (step)
683 && ((tree_to_shwi (step) * vf)
684 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
685 {
686 if (dump_enabled_p ())
687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
688 "step doesn't divide the vector-size.\n");
689 misalign = NULL_TREE;
690 }
691 }
692
693 /* To look at alignment of the base we have to preserve an inner MEM_REF
694 as that carries alignment information of the actual access. */
695 base = ref;
696 while (handled_component_p (base))
697 base = TREE_OPERAND (base, 0);
698 if (TREE_CODE (base) == MEM_REF)
699 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
700 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
701 unsigned int base_alignment = get_object_alignment (base);
702
703 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
704 DR_VECT_AUX (dr)->base_element_aligned = true;
705
706 alignment = TYPE_ALIGN_UNIT (vectype);
707
708 if ((compare_tree_int (aligned_to, alignment) < 0)
709 || !misalign)
710 {
711 if (dump_enabled_p ())
712 {
713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
714 "Unknown alignment for access: ");
715 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
716 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
717 }
718 return true;
719 }
720
721 if (base_alignment < TYPE_ALIGN (vectype))
722 {
723 /* Strip an inner MEM_REF to a bare decl if possible. */
724 if (TREE_CODE (base) == MEM_REF
725 && integer_zerop (TREE_OPERAND (base, 1))
726 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
727 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
728
729 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
730 {
731 if (dump_enabled_p ())
732 {
733 dump_printf_loc (MSG_NOTE, vect_location,
734 "can't force alignment of ref: ");
735 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
736 dump_printf (MSG_NOTE, "\n");
737 }
738 return true;
739 }
740
741 /* Force the alignment of the decl.
742 NOTE: This is the only change to the code we make during
743 the analysis phase, before deciding to vectorize the loop. */
744 if (dump_enabled_p ())
745 {
746 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
747 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
748 dump_printf (MSG_NOTE, "\n");
749 }
750
751 DR_VECT_AUX (dr)->base_decl = base;
752 DR_VECT_AUX (dr)->base_misaligned = true;
753 DR_VECT_AUX (dr)->base_element_aligned = true;
754 }
755
756 /* If this is a backward running DR then the first access in the larger
757 vectype actually is N-1 elements before the address in the DR.
758 Adjust misalign accordingly. */
759 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
760 {
761 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
762 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
763 otherwise we wouldn't be here. */
764 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
765 /* PLUS because DR_STEP was negative. */
766 misalign = size_binop (PLUS_EXPR, misalign, offset);
767 }
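/* Illustrative example of the adjustment above: for a backward running
   access with 4-byte elements, a four-element vectype and misalign 8,
   the vector access really starts 3 elements earlier, so
   offset = 3 * -4 = -12 and misalign = 8 + -12 = -4; the floor-modulo
   by the 16-byte vector alignment below then records misalignment 12. */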
768
769 SET_DR_MISALIGNMENT (dr,
770 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
771
772 if (dump_enabled_p ())
773 {
774 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
775 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
776 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
777 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
778 }
779
780 return true;
781 }
782
783
784 /* Function vect_compute_data_refs_alignment
785
786 Compute the misalignment of data references in the loop.
787 Return FALSE if a data reference is found that cannot be vectorized. */
788
789 static bool
790 vect_compute_data_refs_alignment (vec_info *vinfo)
791 {
792 vec<data_reference_p> datarefs = vinfo->datarefs;
793 struct data_reference *dr;
794 unsigned int i;
795
796 FOR_EACH_VEC_ELT (datarefs, i, dr)
797 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
798 && !vect_compute_data_ref_alignment (dr))
799 {
800 if (is_a <bb_vec_info> (vinfo))
801 {
802 /* Mark unsupported statement as unvectorizable. */
803 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
804 continue;
805 }
806 else
807 return false;
808 }
809
810 return true;
811 }
812
813
814 /* Function vect_update_misalignment_for_peel
815
816 DR - the data reference whose misalignment is to be adjusted.
817 DR_PEEL - the data reference whose misalignment is being made
818 zero in the vector loop by the peel.
819 NPEEL - the number of iterations in the peel loop if the misalignment
820 of DR_PEEL is known at compile time. */
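/* Illustrative example: if DR_PEEL has known misalignment 8 with 4-byte
   elements and a 16-byte vector, peeling NPEEL == 2 iterations aligns it;
   a second 4-byte data reference DR with misalignment 4 then gets
   misalignment (4 + 2*4) & 15 == 12 in the vector loop. */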
821
822 static void
823 vect_update_misalignment_for_peel (struct data_reference *dr,
824 struct data_reference *dr_peel, int npeel)
825 {
826 unsigned int i;
827 vec<dr_p> same_align_drs;
828 struct data_reference *current_dr;
829 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
830 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
831 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
832 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
833
834 /* For interleaved data accesses the step in the loop must be multiplied by
835 the size of the interleaving group. */
836 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
837 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
838 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
839 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
840
841 /* It can be assumed that the data refs with the same alignment as dr_peel
842 are aligned in the vector loop. */
843 same_align_drs
844 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
845 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
846 {
847 if (current_dr != dr)
848 continue;
849 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
850 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
851 SET_DR_MISALIGNMENT (dr, 0);
852 return;
853 }
854
855 if (known_alignment_for_access_p (dr)
856 && known_alignment_for_access_p (dr_peel))
857 {
858 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
859 int misal = DR_MISALIGNMENT (dr);
860 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
861 misal += negative ? -npeel * dr_size : npeel * dr_size;
862 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
863 SET_DR_MISALIGNMENT (dr, misal);
864 return;
865 }
866
867 if (dump_enabled_p ())
868 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
869 SET_DR_MISALIGNMENT (dr, -1);
870 }
871
872
873 /* Function vect_verify_datarefs_alignment
874
875 Return TRUE if all data references in the loop can be
876 handled with respect to alignment. */
877
878 bool
879 vect_verify_datarefs_alignment (vec_info *vinfo)
880 {
881 vec<data_reference_p> datarefs = vinfo->datarefs;
882 struct data_reference *dr;
883 enum dr_alignment_support supportable_dr_alignment;
884 unsigned int i;
885
886 FOR_EACH_VEC_ELT (datarefs, i, dr)
887 {
888 gimple *stmt = DR_STMT (dr);
889 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
890
891 if (!STMT_VINFO_RELEVANT_P (stmt_info))
892 continue;
893
894 /* For interleaving, only the alignment of the first access matters.
895 Skip statements marked as not vectorizable. */
896 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
897 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
898 || !STMT_VINFO_VECTORIZABLE (stmt_info))
899 continue;
900
901 /* Strided accesses perform only component accesses, alignment is
902 irrelevant for them. */
903 if (STMT_VINFO_STRIDED_P (stmt_info)
904 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
905 continue;
906
907 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
908 if (!supportable_dr_alignment)
909 {
910 if (dump_enabled_p ())
911 {
912 if (DR_IS_READ (dr))
913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
914 "not vectorized: unsupported unaligned load.");
915 else
916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
917 "not vectorized: unsupported unaligned "
918 "store.");
919
920 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
921 DR_REF (dr));
922 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
923 }
924 return false;
925 }
926 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
927 dump_printf_loc (MSG_NOTE, vect_location,
928 "Vectorizing an unaligned access.\n");
929 }
930 return true;
931 }
932
933 /* Given a memory reference EXP return whether its alignment is less
934 than its size. */
935
936 static bool
937 not_size_aligned (tree exp)
938 {
939 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
940 return true;
941
942 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
943 > get_object_alignment (exp));
944 }
945
946 /* Function vector_alignment_reachable_p
947
948 Return true if vector alignment for DR is reachable by peeling
949 a few loop iterations. Return false otherwise. */
950
951 static bool
952 vector_alignment_reachable_p (struct data_reference *dr)
953 {
954 gimple *stmt = DR_STMT (dr);
955 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
956 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
957
958 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
959 {
960 /* For interleaved accesses we peel only if the number of iterations in
961 the prolog loop (VF - misalignment) is a multiple of the
962 number of interleaved accesses. */
963 int elem_size, mis_in_elements;
964 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
965
966 /* FORNOW: handle only known alignment. */
967 if (!known_alignment_for_access_p (dr))
968 return false;
969
970 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
971 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
972
973 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
974 return false;
975 }
976
977 /* If misalignment is known at the compile time then allow peeling
978 only if natural alignment is reachable through peeling. */
979 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
980 {
981 HOST_WIDE_INT elmsize =
982 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
983 if (dump_enabled_p ())
984 {
985 dump_printf_loc (MSG_NOTE, vect_location,
986 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
987 dump_printf (MSG_NOTE,
988 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
989 }
990 if (DR_MISALIGNMENT (dr) % elmsize)
991 {
992 if (dump_enabled_p ())
993 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
994 "data size does not divide the misalignment.\n");
995 return false;
996 }
997 }
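/* Illustrative example of the check above: with 4-byte elements and a
   known misalignment of 6 bytes (modulo a 16-byte vector), each peeled
   iteration changes the misalignment by 4 bytes, cycling it through
   6, 10, 14 and 2 but never 0, so peeling cannot align the access. */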
998
999 if (!known_alignment_for_access_p (dr))
1000 {
1001 tree type = TREE_TYPE (DR_REF (dr));
1002 bool is_packed = not_size_aligned (DR_REF (dr));
1003 if (dump_enabled_p ())
1004 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1005 "Unknown misalignment, is_packed = %d\n",is_packed);
1006 if ((TYPE_USER_ALIGN (type) && !is_packed)
1007 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1008 return true;
1009 else
1010 return false;
1011 }
1012
1013 return true;
1014 }
1015
1016
1017 /* Calculate the cost of the memory access represented by DR. */
1018
1019 static void
1020 vect_get_data_access_cost (struct data_reference *dr,
1021 unsigned int *inside_cost,
1022 unsigned int *outside_cost,
1023 stmt_vector_for_cost *body_cost_vec)
1024 {
1025 gimple *stmt = DR_STMT (dr);
1026 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1027 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1028 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1029 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1030 int ncopies = vf / nunits;
1031
1032 if (DR_IS_READ (dr))
1033 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1034 NULL, body_cost_vec, false);
1035 else
1036 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1037
1038 if (dump_enabled_p ())
1039 dump_printf_loc (MSG_NOTE, vect_location,
1040 "vect_get_data_access_cost: inside_cost = %d, "
1041 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1042 }
1043
1044
1045 typedef struct _vect_peel_info
1046 {
1047 int npeel;
1048 struct data_reference *dr;
1049 unsigned int count;
1050 } *vect_peel_info;
1051
1052 typedef struct _vect_peel_extended_info
1053 {
1054 struct _vect_peel_info peel_info;
1055 unsigned int inside_cost;
1056 unsigned int outside_cost;
1057 stmt_vector_for_cost body_cost_vec;
1058 } *vect_peel_extended_info;
1059
1060
1061 /* Peeling hashtable helpers. */
1062
1063 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1064 {
1065 static inline hashval_t hash (const _vect_peel_info *);
1066 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1067 };
1068
1069 inline hashval_t
1070 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1071 {
1072 return (hashval_t) peel_info->npeel;
1073 }
1074
1075 inline bool
1076 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1077 {
1078 return (a->npeel == b->npeel);
1079 }
1080
1081
1082 /* Insert DR into peeling hash table with NPEEL as key. */
1083
1084 static void
1085 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1086 loop_vec_info loop_vinfo, struct data_reference *dr,
1087 int npeel)
1088 {
1089 struct _vect_peel_info elem, *slot;
1090 _vect_peel_info **new_slot;
1091 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1092
1093 elem.npeel = npeel;
1094 slot = peeling_htab->find (&elem);
1095 if (slot)
1096 slot->count++;
1097 else
1098 {
1099 slot = XNEW (struct _vect_peel_info);
1100 slot->npeel = npeel;
1101 slot->dr = dr;
1102 slot->count = 1;
1103 new_slot = peeling_htab->find_slot (slot, INSERT);
1104 *new_slot = slot;
1105 }
1106
1107 if (!supportable_dr_alignment
1108 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1109 slot->count += VECT_MAX_COST;
1110 }
1111
1112
1113 /* Traverse peeling hash table to find peeling option that aligns maximum
1114 number of data accesses. */
1115
1116 int
1117 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1118 _vect_peel_extended_info *max)
1119 {
1120 vect_peel_info elem = *slot;
1121
1122 if (elem->count > max->peel_info.count
1123 || (elem->count == max->peel_info.count
1124 && max->peel_info.npeel > elem->npeel))
1125 {
1126 max->peel_info.npeel = elem->npeel;
1127 max->peel_info.count = elem->count;
1128 max->peel_info.dr = elem->dr;
1129 }
1130
1131 return 1;
1132 }
1133
1134
1135 /* Traverse peeling hash table and calculate cost for each peeling option.
1136 Find the one with the lowest cost. */
1137
1138 int
1139 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1140 _vect_peel_extended_info *min)
1141 {
1142 vect_peel_info elem = *slot;
1143 int save_misalignment, dummy;
1144 unsigned int inside_cost = 0, outside_cost = 0, i;
1145 gimple *stmt = DR_STMT (elem->dr);
1146 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1147 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1148 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1149 struct data_reference *dr;
1150 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1151
1152 prologue_cost_vec.create (2);
1153 body_cost_vec.create (2);
1154 epilogue_cost_vec.create (2);
1155
1156 FOR_EACH_VEC_ELT (datarefs, i, dr)
1157 {
1158 stmt = DR_STMT (dr);
1159 stmt_info = vinfo_for_stmt (stmt);
1160 /* For interleaving, only the alignment of the first access
1161 matters. */
1162 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1163 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1164 continue;
1165
1166 save_misalignment = DR_MISALIGNMENT (dr);
1167 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1168 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1169 &body_cost_vec);
1170 SET_DR_MISALIGNMENT (dr, save_misalignment);
1171 }
1172
1173 outside_cost += vect_get_known_peeling_cost
1174 (loop_vinfo, elem->npeel, &dummy,
1175 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1176 &prologue_cost_vec, &epilogue_cost_vec);
1177
1178 /* Prologue and epilogue costs are added to the target model later.
1179 These costs depend only on the scalar iteration cost, the
1180 number of peeling iterations finally chosen, and the number of
1181 misaligned statements. So discard the information found here. */
1182 prologue_cost_vec.release ();
1183 epilogue_cost_vec.release ();
1184
1185 if (inside_cost < min->inside_cost
1186 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1187 {
1188 min->inside_cost = inside_cost;
1189 min->outside_cost = outside_cost;
1190 min->body_cost_vec.release ();
1191 min->body_cost_vec = body_cost_vec;
1192 min->peel_info.dr = elem->dr;
1193 min->peel_info.npeel = elem->npeel;
1194 }
1195 else
1196 body_cost_vec.release ();
1197
1198 return 1;
1199 }
1200
1201
1202 /* Choose best peeling option by traversing peeling hash table and either
1203 choosing an option with the lowest cost (if cost model is enabled) or the
1204 option that aligns as many accesses as possible. */
1205
1206 static struct data_reference *
1207 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1208 loop_vec_info loop_vinfo,
1209 unsigned int *npeel,
1210 stmt_vector_for_cost *body_cost_vec)
1211 {
1212 struct _vect_peel_extended_info res;
1213
1214 res.peel_info.dr = NULL;
1215 res.body_cost_vec = stmt_vector_for_cost ();
1216
1217 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1218 {
1219 res.inside_cost = INT_MAX;
1220 res.outside_cost = INT_MAX;
1221 peeling_htab->traverse <_vect_peel_extended_info *,
1222 vect_peeling_hash_get_lowest_cost> (&res);
1223 }
1224 else
1225 {
1226 res.peel_info.count = 0;
1227 peeling_htab->traverse <_vect_peel_extended_info *,
1228 vect_peeling_hash_get_most_frequent> (&res);
1229 }
1230
1231 *npeel = res.peel_info.npeel;
1232 *body_cost_vec = res.body_cost_vec;
1233 return res.peel_info.dr;
1234 }
1235
1236
1237 /* Function vect_enhance_data_refs_alignment
1238
1239 This pass will use loop versioning and loop peeling in order to enhance
1240 the alignment of data references in the loop.
1241
1242 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1243 original loop is to be vectorized. Any other loops that are created by
1244 the transformations performed in this pass are not supposed to be
1245 vectorized. This restriction will be relaxed.
1246
1247 This pass will require a cost model to guide it whether to apply peeling
1248 or versioning or a combination of the two. For example, the scheme that
1249 Intel uses when given a loop with several memory accesses is as follows:
1250 choose one memory access ('p') whose alignment you want to force by doing
1251 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1252 other accesses are not necessarily aligned, or (2) use loop versioning to
1253 generate one loop in which all accesses are aligned, and another loop in
1254 which only 'p' is necessarily aligned.
1255
1256 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1257 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1258 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1259
1260 Devising a cost model is the most critical aspect of this work. It will
1261 guide us on which access to peel for, whether to use loop versioning, how
1262 many versions to create, etc. The cost model will probably consist of
1263 generic considerations as well as target specific considerations (on
1264 powerpc for example, misaligned stores are more painful than misaligned
1265 loads).
1266
1267 Here are the general steps involved in alignment enhancements:
1268
1269 -- original loop, before alignment analysis:
1270 for (i=0; i<N; i++){
1271 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1272 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1273 }
1274
1275 -- After vect_compute_data_refs_alignment:
1276 for (i=0; i<N; i++){
1277 x = q[i]; # DR_MISALIGNMENT(q) = 3
1278 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1279 }
1280
1281 -- Possibility 1: we do loop versioning:
1282 if (p is aligned) {
1283 for (i=0; i<N; i++){ # loop 1A
1284 x = q[i]; # DR_MISALIGNMENT(q) = 3
1285 p[i] = y; # DR_MISALIGNMENT(p) = 0
1286 }
1287 }
1288 else {
1289 for (i=0; i<N; i++){ # loop 1B
1290 x = q[i]; # DR_MISALIGNMENT(q) = 3
1291 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1292 }
1293 }
1294
1295 -- Possibility 2: we do loop peeling:
1296 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1297 x = q[i];
1298 p[i] = y;
1299 }
1300 for (i = 3; i < N; i++){ # loop 2A
1301 x = q[i]; # DR_MISALIGNMENT(q) = 0
1302 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1303 }
1304
1305 -- Possibility 3: combination of loop peeling and versioning:
1306 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1307 x = q[i];
1308 p[i] = y;
1309 }
1310 if (p is aligned) {
1311 for (i = 3; i<N; i++){ # loop 3A
1312 x = q[i]; # DR_MISALIGNMENT(q) = 0
1313 p[i] = y; # DR_MISALIGNMENT(p) = 0
1314 }
1315 }
1316 else {
1317 for (i = 3; i<N; i++){ # loop 3B
1318 x = q[i]; # DR_MISALIGNMENT(q) = 0
1319 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1320 }
1321 }
1322
1323 These loops are later passed to loop_transform to be vectorized. The
1324 vectorizer will use the alignment information to guide the transformation
1325 (whether to generate regular loads/stores, or with special handling for
1326 misalignment). */
1327
1328 bool
1329 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1330 {
1331 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1332 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1333 enum dr_alignment_support supportable_dr_alignment;
1334 struct data_reference *dr0 = NULL, *first_store = NULL;
1335 struct data_reference *dr;
1336 unsigned int i, j;
1337 bool do_peeling = false;
1338 bool do_versioning = false;
1339 bool stat;
1340 gimple *stmt;
1341 stmt_vec_info stmt_info;
1342 unsigned int npeel = 0;
1343 bool all_misalignments_unknown = true;
1344 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1345 unsigned possible_npeel_number = 1;
1346 tree vectype;
1347 unsigned int nelements, mis, same_align_drs_max = 0;
1348 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1349 hash_table<peel_info_hasher> peeling_htab (1);
1350
1351 if (dump_enabled_p ())
1352 dump_printf_loc (MSG_NOTE, vect_location,
1353 "=== vect_enhance_data_refs_alignment ===\n");
1354
1355 /* Reset data so we can safely be called multiple times. */
1356 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1357 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1358
1359 /* While cost model enhancements are expected in the future, the high level
1360 view of the code at this time is as follows:
1361
1362 A) If there is a misaligned access then see if peeling to align
1363 this access can make all data references satisfy
1364 vect_supportable_dr_alignment. If so, update data structures
1365 as needed and return true.
1366
1367 B) If peeling wasn't possible and there is a data reference with an
1368 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1369 then see if loop versioning checks can be used to make all data
1370 references satisfy vect_supportable_dr_alignment. If so, update
1371 data structures as needed and return true.
1372
1373 C) If neither peeling nor versioning were successful then return false if
1374 any data reference does not satisfy vect_supportable_dr_alignment.
1375
1376 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1377
1378 Note, Possibility 3 above (which is peeling and versioning together) is not
1379 being done at this time. */
1380
1381 /* (1) Peeling to force alignment. */
1382
1383 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1384 Considerations:
1385 + How many accesses will become aligned due to the peeling
1386 - How many accesses will become unaligned due to the peeling,
1387 and the cost of misaligned accesses.
1388 - The cost of peeling (the extra runtime checks, the increase
1389 in code size). */
1390
1391 FOR_EACH_VEC_ELT (datarefs, i, dr)
1392 {
1393 stmt = DR_STMT (dr);
1394 stmt_info = vinfo_for_stmt (stmt);
1395
1396 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1397 continue;
1398
1399 /* For interleaving, only the alignment of the first access
1400 matters. */
1401 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1402 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1403 continue;
1404
1405 /* For invariant accesses there is nothing to enhance. */
1406 if (integer_zerop (DR_STEP (dr)))
1407 continue;
1408
1409 /* Strided accesses perform only component accesses, alignment is
1410 irrelevant for them. */
1411 if (STMT_VINFO_STRIDED_P (stmt_info)
1412 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1413 continue;
1414
1415 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1416 do_peeling = vector_alignment_reachable_p (dr);
1417 if (do_peeling)
1418 {
1419 if (known_alignment_for_access_p (dr))
1420 {
1421 unsigned int npeel_tmp;
1422 bool negative = tree_int_cst_compare (DR_STEP (dr),
1423 size_zero_node) < 0;
1424
1425 /* Save info about DR in the hash table. */
1426 vectype = STMT_VINFO_VECTYPE (stmt_info);
1427 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1428 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1429 TREE_TYPE (DR_REF (dr))));
1430 npeel_tmp = (negative
1431 ? (mis - nelements) : (nelements - mis))
1432 & (nelements - 1);
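/* Illustrative example: for a forward access with 4-byte elements in a
   four-element vector, a misalignment of 8 bytes gives mis == 2 and
   npeel_tmp == (4 - 2) & 3 == 2, i.e. peeling two scalar iterations
   aligns this access. */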
1433
1434 /* For multiple types, it is possible that the bigger type access
1435 will have more than one peeling option. E.g., a loop with two
1436 types: one of size (vector size / 4), and the other one of
1437 size (vector size / 8). The vectorization factor will be 8. If both
1438 accesses are misaligned by 3, the first one needs one scalar
1439 iteration to be aligned, and the second one needs 5. But the
1440 first one will also be aligned by peeling 5 scalar
1441 iterations, and in that case both accesses will be aligned.
1442 Hence, in addition to the immediate peeling amount, we also want
1443 to try to add a full vector size, as long as we don't exceed the
1444 vectorization factor.
1445 We do this automatically for the cost model, since we calculate the
1446 cost for every peeling option. */
1447 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1448 {
1449 if (STMT_SLP_TYPE (stmt_info))
1450 possible_npeel_number
1451 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1452 else
1453 possible_npeel_number = vf / nelements;
1454 }
1455
1456 /* Handle the aligned case. We may decide to align some other
1457 access, making DR unaligned. */
1458 if (DR_MISALIGNMENT (dr) == 0)
1459 {
1460 npeel_tmp = 0;
1461 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1462 possible_npeel_number++;
1463 }
1464
1465 for (j = 0; j < possible_npeel_number; j++)
1466 {
1467 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1468 dr, npeel_tmp);
1469 npeel_tmp += nelements;
1470 }
1471
1472 all_misalignments_unknown = false;
1473 /* Data-ref that was chosen for the case that all the
1474 misalignments are unknown is not relevant anymore, since we
1475 have a data-ref with known alignment. */
1476 dr0 = NULL;
1477 }
1478 else
1479 {
1480 /* If we don't know any misalignment values, we prefer
1481 peeling for the data-ref that has the maximum number of data-refs
1482 with the same alignment, unless the target prefers to align
1483 stores over loads. */
1484 if (all_misalignments_unknown)
1485 {
1486 unsigned same_align_drs
1487 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1488 if (!dr0
1489 || same_align_drs_max < same_align_drs)
1490 {
1491 same_align_drs_max = same_align_drs;
1492 dr0 = dr;
1493 }
1494 /* For data-refs with the same number of related
1495 accesses prefer the one where the misalign
1496 computation will be invariant in the outermost loop. */
1497 else if (same_align_drs_max == same_align_drs)
1498 {
1499 struct loop *ivloop0, *ivloop;
1500 ivloop0 = outermost_invariant_loop_for_expr
1501 (loop, DR_BASE_ADDRESS (dr0));
1502 ivloop = outermost_invariant_loop_for_expr
1503 (loop, DR_BASE_ADDRESS (dr));
1504 if ((ivloop && !ivloop0)
1505 || (ivloop && ivloop0
1506 && flow_loop_nested_p (ivloop, ivloop0)))
1507 dr0 = dr;
1508 }
1509
1510 if (!first_store && DR_IS_WRITE (dr))
1511 first_store = dr;
1512 }
1513
1514 /* If there are both known and unknown misaligned accesses in the
1515 loop, we choose peeling amount according to the known
1516 accesses. */
1517 if (!supportable_dr_alignment)
1518 {
1519 dr0 = dr;
1520 if (!first_store && DR_IS_WRITE (dr))
1521 first_store = dr;
1522 }
1523 }
1524 }
1525 else
1526 {
1527 if (!aligned_access_p (dr))
1528 {
1529 if (dump_enabled_p ())
1530 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1531 "vector alignment may not be reachable\n");
1532 break;
1533 }
1534 }
1535 }
1536
1537 /* Check if we can possibly peel the loop. */
1538 if (!vect_can_advance_ivs_p (loop_vinfo)
1539 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1540 || loop->inner)
1541 do_peeling = false;
1542
1543 if (do_peeling
1544 && all_misalignments_unknown
1545 && vect_supportable_dr_alignment (dr0, false))
1546 {
1547 /* Check if the target prefers aligning stores over loads, i.e., if
1548 misaligned stores are more expensive than misaligned loads (taking
1549 drs with same alignment into account). */
1550 if (first_store && DR_IS_READ (dr0))
1551 {
1552 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1553 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1554 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1555 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1556 stmt_vector_for_cost dummy;
1557 dummy.create (2);
1558
1559 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1560 &dummy);
1561 vect_get_data_access_cost (first_store, &store_inside_cost,
1562 &store_outside_cost, &dummy);
1563
1564 dummy.release ();
1565
1566 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1567 aligning the load DR0). */
1568 load_inside_penalty = store_inside_cost;
1569 load_outside_penalty = store_outside_cost;
1570 for (i = 0;
1571 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1572 DR_STMT (first_store))).iterate (i, &dr);
1573 i++)
1574 if (DR_IS_READ (dr))
1575 {
1576 load_inside_penalty += load_inside_cost;
1577 load_outside_penalty += load_outside_cost;
1578 }
1579 else
1580 {
1581 load_inside_penalty += store_inside_cost;
1582 load_outside_penalty += store_outside_cost;
1583 }
1584
1585 /* Calculate the penalty for leaving DR0 unaligned (by
1586 aligning the FIRST_STORE). */
1587 store_inside_penalty = load_inside_cost;
1588 store_outside_penalty = load_outside_cost;
1589 for (i = 0;
1590 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1591 DR_STMT (dr0))).iterate (i, &dr);
1592 i++)
1593 if (DR_IS_READ (dr))
1594 {
1595 store_inside_penalty += load_inside_cost;
1596 store_outside_penalty += load_outside_cost;
1597 }
1598 else
1599 {
1600 store_inside_penalty += store_inside_cost;
1601 store_outside_penalty += store_outside_cost;
1602 }
1603
1604 if (load_inside_penalty > store_inside_penalty
1605 || (load_inside_penalty == store_inside_penalty
1606 && load_outside_penalty > store_outside_penalty))
1607 dr0 = first_store;
1608 }
1609
1610 /* In case there are only loads with different unknown misalignments, use
1611 peeling only if it may help to align other accesses in the loop or
1612 if it may help to improve load bandwidth when we'd end up using
1613 unaligned loads. */
1614 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1615 if (!first_store
1616 && !STMT_VINFO_SAME_ALIGN_REFS (
1617 vinfo_for_stmt (DR_STMT (dr0))).length ()
1618 && (vect_supportable_dr_alignment (dr0, false)
1619 != dr_unaligned_supported
1620 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1621 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1622 do_peeling = false;
1623 }
1624
1625 if (do_peeling && !dr0)
1626 {
1627 /* Peeling is possible, but there is no data access that is not supported
1628 unless aligned. So we try to choose the best possible peeling. */
1629
1630 /* We should get here only if there are drs with known misalignment. */
1631 gcc_assert (!all_misalignments_unknown);
1632
1633 /* Choose the best peeling from the hash table. */
1634 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1635 loop_vinfo, &npeel,
1636 &body_cost_vec);
1637 if (!dr0 || !npeel)
1638 do_peeling = false;
1639 }
1640
1641 if (do_peeling)
1642 {
1643 stmt = DR_STMT (dr0);
1644 stmt_info = vinfo_for_stmt (stmt);
1645 vectype = STMT_VINFO_VECTYPE (stmt_info);
1646 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1647
1648 if (known_alignment_for_access_p (dr0))
1649 {
1650 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1651 size_zero_node) < 0;
1652 if (!npeel)
1653 {
1654 /* Since it's known at compile time, compute the number of
1655 iterations in the peeled loop (the peeling factor) for use in
1656 updating DR_MISALIGNMENT values. The peeling factor is the
1657 vectorization factor minus the misalignment as an element
1658 count. */
1659 mis = DR_MISALIGNMENT (dr0);
1660 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1661 npeel = ((negative ? mis - nelements : nelements - mis)
1662 & (nelements - 1));
1663 }
1664
1665 /* For interleaved data access every iteration accesses all the
1666 members of the group, therefore we divide the number of iterations
1667 by the group size. */
1668 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1669 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1670 npeel /= GROUP_SIZE (stmt_info);
1671
1672 if (dump_enabled_p ())
1673 dump_printf_loc (MSG_NOTE, vect_location,
1674 "Try peeling by %d\n", npeel);
1675 }
1676
1677 /* Ensure that all data refs can be vectorized after the peel. */
1678 FOR_EACH_VEC_ELT (datarefs, i, dr)
1679 {
1680 int save_misalignment;
1681
1682 if (dr == dr0)
1683 continue;
1684
1685 stmt = DR_STMT (dr);
1686 stmt_info = vinfo_for_stmt (stmt);
1687 /* For interleaving, only the alignment of the first access
1688 matters. */
1689 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1690 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1691 continue;
1692
1693 /* Strided accesses perform only component accesses, alignment is
1694 irrelevant for them. */
1695 if (STMT_VINFO_STRIDED_P (stmt_info)
1696 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1697 continue;
1698
1699 save_misalignment = DR_MISALIGNMENT (dr);
1700 vect_update_misalignment_for_peel (dr, dr0, npeel);
1701 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1702 SET_DR_MISALIGNMENT (dr, save_misalignment);
1703
1704 if (!supportable_dr_alignment)
1705 {
1706 do_peeling = false;
1707 break;
1708 }
1709 }
1710
1711 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1712 {
1713 stat = vect_verify_datarefs_alignment (loop_vinfo);
1714 if (!stat)
1715 do_peeling = false;
1716 else
1717 {
1718 body_cost_vec.release ();
1719 return stat;
1720 }
1721 }
1722
1723 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1724 if (do_peeling)
1725 {
1726 unsigned max_allowed_peel
1727 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1728 if (max_allowed_peel != (unsigned)-1)
1729 {
1730 unsigned max_peel = npeel;
1731 if (max_peel == 0)
1732 {
1733 gimple *dr_stmt = DR_STMT (dr0);
1734 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1735 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1736 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1737 }
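/* E.g. (added, illustrative) for a V4SI vectype this assumes the worst
   case of TYPE_VECTOR_SUBPARTS - 1 = 3 peel iterations, since the exact
   peel count is only known at run time here.  */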
1738 if (max_peel > max_allowed_peel)
1739 {
1740 do_peeling = false;
1741 if (dump_enabled_p ())
1742 dump_printf_loc (MSG_NOTE, vect_location,
1743 "Disable peeling, max peels reached: %d\n", max_peel);
1744 }
1745 }
1746 }
1747
1748 /* Cost model #2 - if peeling may result in a remaining loop not
1749 iterating enough to be vectorized then do not peel. */
1750 if (do_peeling
1751 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1752 {
1753 unsigned max_peel
1754 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1755 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1756 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1757 do_peeling = false;
1758 }
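/* Worked example (added, illustrative): with a vectorization factor of 4
   and an unknown peel amount (npeel == 0), max_peel = 3, so at least
   4 + 3 = 7 known iterations are required; with fewer, peeling is
   rejected because the remaining loop could not be vectorized.  */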
1759
1760 if (do_peeling)
1761 {
1762 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1763 If the misalignment of DR_i is identical to that of dr0 then set
1764 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1765 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1766 by the peeling factor times the element size of DR_i (MOD the
1767 vectorization factor times the size). Otherwise, the
1768 misalignment of DR_i must be set to unknown. */
1769 FOR_EACH_VEC_ELT (datarefs, i, dr)
1770 if (dr != dr0)
1771 vect_update_misalignment_for_peel (dr, dr0, npeel);
1772
1773 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1774 if (npeel)
1775 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1776 else
1777 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1778 = DR_MISALIGNMENT (dr0);
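/* Added note: when the misalignment of DR0 is unknown at compile time,
   DR_MISALIGNMENT (dr0) is negative here; the prologue generation code is
   then expected to compute the actual peel count at run time.  */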
1779 SET_DR_MISALIGNMENT (dr0, 0);
1780 if (dump_enabled_p ())
1781 {
1782 dump_printf_loc (MSG_NOTE, vect_location,
1783 "Alignment of access forced using peeling.\n");
1784 dump_printf_loc (MSG_NOTE, vect_location,
1785 "Peeling for alignment will be applied.\n");
1786 }
1787 /* The inside-loop cost will be accounted for in vectorizable_load
1788 and vectorizable_store correctly with adjusted alignments.
1789 Drop the body_cost_vec on the floor here. */
1790 body_cost_vec.release ();
1791
1792 stat = vect_verify_datarefs_alignment (loop_vinfo);
1793 gcc_assert (stat);
1794 return stat;
1795 }
1796 }
1797
1798 body_cost_vec.release ();
1799
1800 /* (2) Versioning to force alignment. */
1801
1802 /* Try versioning if:
1803 1) the loop is optimized for speed, and
1804 2) there is at least one unsupported misaligned data ref with an unknown
1805 misalignment, and
1806 3) all misaligned data refs with a known misalignment are supported, and
1807 4) the number of runtime alignment checks is within reason. */
1808
1809 do_versioning =
1810 optimize_loop_nest_for_speed_p (loop)
1811 && (!loop->inner); /* FORNOW */
1812
1813 if (do_versioning)
1814 {
1815 FOR_EACH_VEC_ELT (datarefs, i, dr)
1816 {
1817 stmt = DR_STMT (dr);
1818 stmt_info = vinfo_for_stmt (stmt);
1819
1820 /* For interleaving, only the alignment of the first access
1821 matters. */
1822 if (aligned_access_p (dr)
1823 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1824 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1825 continue;
1826
1827 if (STMT_VINFO_STRIDED_P (stmt_info))
1828 {
1829 /* Strided loads perform only component accesses, alignment is
1830 irrelevant for them. */
1831 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1832 continue;
1833 do_versioning = false;
1834 break;
1835 }
1836
1837 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1838
1839 if (!supportable_dr_alignment)
1840 {
1841 gimple *stmt;
1842 int mask;
1843 tree vectype;
1844
1845 if (known_alignment_for_access_p (dr)
1846 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1847 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1848 {
1849 do_versioning = false;
1850 break;
1851 }
1852
1853 stmt = DR_STMT (dr);
1854 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1855 gcc_assert (vectype);
1856
1857 /* The rightmost bits of an aligned address must be zeros.
1858 Construct the mask needed for this test. For example,
1859 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1860 mask must be 15 = 0xf. */
1861 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
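/* Added sketch (not the exact generated code): the versioned loop is
   guarded by a run-time test of roughly the form

     if ((((uintptr_t) addr_1 | ... | (uintptr_t) addr_n) & mask) == 0)
       ... aligned (vectorized) loop version ...
     else
       ... original scalar loop ...

   where addr_1 .. addr_n are the addresses of the statements pushed onto
   LOOP_VINFO_MAY_MISALIGN_STMTS below.  */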
1862
1863 /* FORNOW: use the same mask to test all potentially unaligned
1864 references in the loop. The vectorizer currently supports
1865 a single vector size, see the reference to
1866 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1867 vectorization factor is computed. */
1868 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1869 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1870 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1871 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1872 DR_STMT (dr));
1873 }
1874 }
1875
1876 /* Versioning requires at least one misaligned data reference. */
1877 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1878 do_versioning = false;
1879 else if (!do_versioning)
1880 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1881 }
1882
1883 if (do_versioning)
1884 {
1885 vec<gimple *> may_misalign_stmts
1886 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1887 gimple *stmt;
1888
1889 /* It can now be assumed that the data references in the statements
1890 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1891 of the loop being vectorized. */
1892 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1893 {
1894 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1895 dr = STMT_VINFO_DATA_REF (stmt_info);
1896 SET_DR_MISALIGNMENT (dr, 0);
1897 if (dump_enabled_p ())
1898 dump_printf_loc (MSG_NOTE, vect_location,
1899 "Alignment of access forced using versioning.\n");
1900 }
1901
1902 if (dump_enabled_p ())
1903 dump_printf_loc (MSG_NOTE, vect_location,
1904 "Versioning for alignment will be applied.\n");
1905
1906 /* Peeling and versioning can't be done together at this time. */
1907 gcc_assert (! (do_peeling && do_versioning));
1908
1909 stat = vect_verify_datarefs_alignment (loop_vinfo);
1910 gcc_assert (stat);
1911 return stat;
1912 }
1913
1914 /* This point is reached if neither peeling nor versioning is being done. */
1915 gcc_assert (! (do_peeling || do_versioning));
1916
1917 stat = vect_verify_datarefs_alignment (loop_vinfo);
1918 return stat;
1919 }
1920
1921
1922 /* Function vect_find_same_alignment_drs.
1923
1924 Update group and alignment relations according to the chosen
1925 vectorization factor. */
1926
1927 static void
1928 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1929 loop_vec_info loop_vinfo)
1930 {
1931 unsigned int i;
1932 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1933 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1934 struct data_reference *dra = DDR_A (ddr);
1935 struct data_reference *drb = DDR_B (ddr);
1936 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1937 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1938 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1939 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1940 lambda_vector dist_v;
1941 unsigned int loop_depth;
1942
1943 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1944 return;
1945
1946 if (dra == drb)
1947 return;
1948
1949 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1950 return;
1951
1952 /* Loop-based vectorization and known data dependence. */
1953 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1954 return;
1955
1956 /* Data-dependence analysis reports a distance vector of zero
1957 for data-references that overlap only in the first iteration
1958 but have different sign step (see PR45764).
1959 So as a sanity check require equal DR_STEP. */
1960 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1961 return;
1962
1963 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1964 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1965 {
1966 int dist = dist_v[loop_depth];
1967
1968 if (dump_enabled_p ())
1969 dump_printf_loc (MSG_NOTE, vect_location,
1970 "dependence distance = %d.\n", dist);
1971
1972 /* Same loop iteration. */
1973 if (dist == 0
1974 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1975 {
1976 /* Two references with distance zero have the same alignment. */
1977 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1978 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1979 if (dump_enabled_p ())
1980 {
1981 dump_printf_loc (MSG_NOTE, vect_location,
1982 "accesses have the same alignment.\n");
1983 dump_printf (MSG_NOTE,
1984 "dependence distance modulo vf == 0 between ");
1985 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1986 dump_printf (MSG_NOTE, " and ");
1987 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1988 dump_printf (MSG_NOTE, "\n");
1989 }
1990 }
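/* Worked example (added, illustrative): with equal-sized 4-byte elements
   and a vectorization factor of 4, a dependence distance of 4 iterations
   means DRB's address is exactly one full vector (16 bytes) ahead of
   DRA's, so both references share the same misalignment.  */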
1991 }
1992 }
1993
1994
1995 /* Function vect_analyze_data_refs_alignment
1996
1997 Analyze the alignment of the data-references in the loop.
1998 Return FALSE if a data reference is found that cannot be vectorized. */
1999
2000 bool
2001 vect_analyze_data_refs_alignment (vec_info *vinfo)
2002 {
2003 if (dump_enabled_p ())
2004 dump_printf_loc (MSG_NOTE, vect_location,
2005 "=== vect_analyze_data_refs_alignment ===\n");
2006
2007 /* Mark groups of data references with same alignment using
2008 data dependence information. */
2009 if (is_a <loop_vec_info> (vinfo))
2010 {
2011 vec<ddr_p> ddrs = vinfo->ddrs;
2012 struct data_dependence_relation *ddr;
2013 unsigned int i;
2014
2015 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2016 vect_find_same_alignment_drs (ddr, as_a <loop_vec_info> (vinfo));
2017 }
2018
2019 if (!vect_compute_data_refs_alignment (vinfo))
2020 {
2021 if (dump_enabled_p ())
2022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2023 "not vectorized: can't calculate alignment "
2024 "for data ref.\n");
2025 return false;
2026 }
2027
2028 return true;
2029 }
2030
2031
2032 /* Analyze groups of accesses: check that DR belongs to a group of
2033 accesses of legal size, step, etc. Detect gaps, single element
2034 interleaving, and other special cases. Set grouped access info.
2035 Collect groups of strided stores for further use in SLP analysis.
2036 Worker for vect_analyze_group_access. */
2037
2038 static bool
2039 vect_analyze_group_access_1 (struct data_reference *dr)
2040 {
2041 tree step = DR_STEP (dr);
2042 tree scalar_type = TREE_TYPE (DR_REF (dr));
2043 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2044 gimple *stmt = DR_STMT (dr);
2045 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2046 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2047 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2048 HOST_WIDE_INT dr_step = -1;
2049 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2050 bool slp_impossible = false;
2051 struct loop *loop = NULL;
2052
2053 if (loop_vinfo)
2054 loop = LOOP_VINFO_LOOP (loop_vinfo);
2055
2056 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2057 size of the interleaving group (including gaps). */
2058 if (tree_fits_shwi_p (step))
2059 {
2060 dr_step = tree_to_shwi (step);
2061 groupsize = absu_hwi (dr_step) / type_size;
2062 }
2063 else
2064 groupsize = 0;
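/* Worked example (added, illustrative): a load of a[4*i] over 4-byte ints
   has DR_STEP 16 bytes, so groupsize = 16 / 4 = 4, i.e. each iteration
   steps over a group of four elements of which this DR touches a subset.  */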
2065
2066 /* A non-consecutive access is possible only if it is part of interleaving. */
2067 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2068 {
2069 /* Check if this DR is part of an interleaving group, and is a single
2070 element of the group that is accessed in the loop. */
2071
2072 /* Gaps are supported only for loads. STEP must be a multiple of the type
2073 size. The size of the group must be a power of 2. */
2074 if (DR_IS_READ (dr)
2075 && (dr_step % type_size) == 0
2076 && groupsize > 0
2077 && exact_log2 (groupsize) != -1)
2078 {
2079 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2080 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2081 if (dump_enabled_p ())
2082 {
2083 dump_printf_loc (MSG_NOTE, vect_location,
2084 "Detected single element interleaving ");
2085 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2086 dump_printf (MSG_NOTE, " step ");
2087 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2088 dump_printf (MSG_NOTE, "\n");
2089 }
2090
2091 if (loop_vinfo)
2092 {
2093 if (dump_enabled_p ())
2094 dump_printf_loc (MSG_NOTE, vect_location,
2095 "Data access with gaps requires scalar "
2096 "epilogue loop\n");
2097 if (loop->inner)
2098 {
2099 if (dump_enabled_p ())
2100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2101 "Peeling for outer loop is not"
2102 " supported\n");
2103 return false;
2104 }
2105
2106 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2107 }
2108
2109 return true;
2110 }
2111
2112 if (dump_enabled_p ())
2113 {
2114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2115 "not consecutive access ");
2116 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2117 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2118 }
2119
2120 if (bb_vinfo)
2121 {
2122 /* Mark the statement as unvectorizable. */
2123 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2124 return true;
2125 }
2126
2127 return false;
2128 }
2129
2130 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2131 {
2132 /* First stmt in the interleaving chain. Check the chain. */
2133 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2134 struct data_reference *data_ref = dr;
2135 unsigned int count = 1;
2136 tree prev_init = DR_INIT (data_ref);
2137 gimple *prev = stmt;
2138 HOST_WIDE_INT diff, gaps = 0;
2139
2140 while (next)
2141 {
2142 /* Skip same data-refs. In case two or more stmts share a
2143 data-ref (supported only for loads), we vectorize only the first
2144 stmt, and the rest get their vectorized loads from the first
2145 one. */
2146 if (!tree_int_cst_compare (DR_INIT (data_ref),
2147 DR_INIT (STMT_VINFO_DATA_REF (
2148 vinfo_for_stmt (next)))))
2149 {
2150 if (DR_IS_WRITE (data_ref))
2151 {
2152 if (dump_enabled_p ())
2153 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2154 "Two store stmts share the same dr.\n");
2155 return false;
2156 }
2157
2158 if (dump_enabled_p ())
2159 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2160 "Two or more load stmts share the same dr.\n");
2161
2162 /* For load use the same data-ref load. */
2163 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2164
2165 prev = next;
2166 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2167 continue;
2168 }
2169
2170 prev = next;
2171 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2172
2173 /* All group members have the same STEP by construction. */
2174 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2175
2176 /* Check that the distance between two accesses is equal to the type
2177 size. Otherwise, we have gaps. */
2178 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2179 - TREE_INT_CST_LOW (prev_init)) / type_size;
2180 if (diff != 1)
2181 {
2182 /* FORNOW: SLP of accesses with gaps is not supported. */
2183 slp_impossible = true;
2184 if (DR_IS_WRITE (data_ref))
2185 {
2186 if (dump_enabled_p ())
2187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2188 "interleaved store with gaps\n");
2189 return false;
2190 }
2191
2192 gaps += diff - 1;
2193 }
2194
2195 last_accessed_element += diff;
2196
2197 /* Store the gap from the previous member of the group. If there is no
2198 gap in the access, GROUP_GAP is always 1. */
2199 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2200
2201 prev_init = DR_INIT (data_ref);
2202 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2203 /* Count the number of data-refs in the chain. */
2204 count++;
2205 }
2206
2207 if (groupsize == 0)
2208 groupsize = count + gaps;
2209
2210 if (groupsize > UINT_MAX)
2211 {
2212 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2214 "group is too large\n");
2215 return false;
2216 }
2217
2218 /* Check that the size of the interleaving is equal to count for stores,
2219 i.e., that there are no gaps. */
2220 if (groupsize != count
2221 && !DR_IS_READ (dr))
2222 {
2223 if (dump_enabled_p ())
2224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2225 "interleaved store with gaps\n");
2226 return false;
2227 }
2228
2229 /* If there is a gap after the last load in the group it is the
2230 difference between the groupsize and the last accessed
2231 element.
2232 When there is no gap, this difference should be 0. */
2233 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
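/* Worked example (added, illustrative): if the chain accesses elements 0
   and 1 of a group of size 4 (a step of four elements), then
   LAST_ACCESSED_ELEMENT is 2 and the gap recorded for the first stmt is
   4 - 2 = 2 elements.  */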
2234
2235 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2236 if (dump_enabled_p ())
2237 {
2238 dump_printf_loc (MSG_NOTE, vect_location,
2239 "Detected interleaving ");
2240 if (DR_IS_READ (dr))
2241 dump_printf (MSG_NOTE, "load ");
2242 else
2243 dump_printf (MSG_NOTE, "store ");
2244 dump_printf (MSG_NOTE, "of size %u starting with ",
2245 (unsigned)groupsize);
2246 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2247 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2248 dump_printf_loc (MSG_NOTE, vect_location,
2249 "There is a gap of %u elements after the group\n",
2250 GROUP_GAP (vinfo_for_stmt (stmt)));
2251 }
2252
2253 /* SLP: create an SLP data structure for every interleaving group of
2254 stores for further analysis in vect_analyze_slp. */
2255 if (DR_IS_WRITE (dr) && !slp_impossible)
2256 {
2257 if (loop_vinfo)
2258 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2259 if (bb_vinfo)
2260 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2261 }
2262
2263 /* If there is a gap at the end of the group or the group size cannot
2264 be made a multiple of the vector element count then we access excess
2265 elements in the last iteration and thus need to peel that off. */
2266 if (loop_vinfo
2267 && (groupsize - last_accessed_element > 0
2268 || exact_log2 (groupsize) == -1))
2269
2270 {
2271 if (dump_enabled_p ())
2272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2273 "Data access with gaps requires scalar "
2274 "epilogue loop\n");
2275 if (loop->inner)
2276 {
2277 if (dump_enabled_p ())
2278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2279 "Peeling for outer loop is not supported\n");
2280 return false;
2281 }
2282
2283 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2284 }
2285 }
2286
2287 return true;
2288 }
2289
2290 /* Analyze groups of accesses: check that DR belongs to a group of
2291 accesses of legal size, step, etc. Detect gaps, single element
2292 interleaving, and other special cases. Set grouped access info.
2293 Collect groups of strided stores for further use in SLP analysis. */
2294
2295 static bool
2296 vect_analyze_group_access (struct data_reference *dr)
2297 {
2298 if (!vect_analyze_group_access_1 (dr))
2299 {
2300 /* Dissolve the group if present. */
2301 gimple *next;
2302 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2303 while (stmt)
2304 {
2305 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2306 next = GROUP_NEXT_ELEMENT (vinfo);
2307 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2308 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2309 stmt = next;
2310 }
2311 return false;
2312 }
2313 return true;
2314 }
2315
2316 /* Analyze the access pattern of the data-reference DR.
2317 In case of non-consecutive accesses call vect_analyze_group_access() to
2318 analyze groups of accesses. */
2319
2320 static bool
2321 vect_analyze_data_ref_access (struct data_reference *dr)
2322 {
2323 tree step = DR_STEP (dr);
2324 tree scalar_type = TREE_TYPE (DR_REF (dr));
2325 gimple *stmt = DR_STMT (dr);
2326 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2327 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2328 struct loop *loop = NULL;
2329
2330 if (loop_vinfo)
2331 loop = LOOP_VINFO_LOOP (loop_vinfo);
2332
2333 if (loop_vinfo && !step)
2334 {
2335 if (dump_enabled_p ())
2336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2337 "bad data-ref access in loop\n");
2338 return false;
2339 }
2340
2341 /* Allow loads with zero step in inner-loop vectorization. */
2342 if (loop_vinfo && integer_zerop (step))
2343 {
2344 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2345 if (!nested_in_vect_loop_p (loop, stmt))
2346 return DR_IS_READ (dr);
2347 /* Allow references with zero step only for outer loops marked
2348 with pragma omp simd - the pragma guarantees absence of
2349 loop-carried dependencies between inner loop iterations. */
2350 if (!loop->force_vectorize)
2351 {
2352 if (dump_enabled_p ())
2353 dump_printf_loc (MSG_NOTE, vect_location,
2354 "zero step in inner loop of nest\n");
2355 return false;
2356 }
2357 }
2358
2359 if (loop && nested_in_vect_loop_p (loop, stmt))
2360 {
2361 /* Interleaved accesses are not yet supported within outer-loop
2362 vectorization for references in the inner-loop. */
2363 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2364
2365 /* For the rest of the analysis we use the outer-loop step. */
2366 step = STMT_VINFO_DR_STEP (stmt_info);
2367 if (integer_zerop (step))
2368 {
2369 if (dump_enabled_p ())
2370 dump_printf_loc (MSG_NOTE, vect_location,
2371 "zero step in outer loop.\n");
2372 return DR_IS_READ (dr);
2373 }
2374 }
2375
2376 /* Consecutive? */
2377 if (TREE_CODE (step) == INTEGER_CST)
2378 {
2379 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2380 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2381 || (dr_step < 0
2382 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2383 {
2384 /* Mark that it is not interleaving. */
2385 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2386 return true;
2387 }
2388 }
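/* Worked example (added, illustrative): for 4-byte ints a step of 4 (or
   -4 for a reversed access) equals the element size, so the access is
   consecutive and no interleaving group is needed.  */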
2389
2390 if (loop && nested_in_vect_loop_p (loop, stmt))
2391 {
2392 if (dump_enabled_p ())
2393 dump_printf_loc (MSG_NOTE, vect_location,
2394 "grouped access in outer loop.\n");
2395 return false;
2396 }
2397
2398
2399 /* Assume this is a DR handled by non-constant strided load case. */
2400 if (TREE_CODE (step) != INTEGER_CST)
2401 return (STMT_VINFO_STRIDED_P (stmt_info)
2402 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2403 || vect_analyze_group_access (dr)));
2404
2405 /* Not consecutive access - check if it's a part of interleaving group. */
2406 return vect_analyze_group_access (dr);
2407 }
2408
2409
2410
2411 /* A helper function used in the comparator function to sort data
2412 references. T1 and T2 are two trees to be compared.
2413 The function returns -1, 0, or 1. */
2414
2415 static int
2416 compare_tree (tree t1, tree t2)
2417 {
2418 int i, cmp;
2419 enum tree_code code;
2420 char tclass;
2421
2422 if (t1 == t2)
2423 return 0;
2424 if (t1 == NULL)
2425 return -1;
2426 if (t2 == NULL)
2427 return 1;
2428
2429
2430 if (TREE_CODE (t1) != TREE_CODE (t2))
2431 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2432
2433 code = TREE_CODE (t1);
2434 switch (code)
2435 {
2436 /* For const values, we can just use hash values for comparisons. */
2437 case INTEGER_CST:
2438 case REAL_CST:
2439 case FIXED_CST:
2440 case STRING_CST:
2441 case COMPLEX_CST:
2442 case VECTOR_CST:
2443 {
2444 hashval_t h1 = iterative_hash_expr (t1, 0);
2445 hashval_t h2 = iterative_hash_expr (t2, 0);
2446 if (h1 != h2)
2447 return h1 < h2 ? -1 : 1;
2448 break;
2449 }
2450
2451 case SSA_NAME:
2452 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2453 if (cmp != 0)
2454 return cmp;
2455
2456 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2457 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2458 break;
2459
2460 default:
2461 tclass = TREE_CODE_CLASS (code);
2462
2463 /* For var-decl, we could compare their UIDs. */
2464 if (tclass == tcc_declaration)
2465 {
2466 if (DECL_UID (t1) != DECL_UID (t2))
2467 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2468 break;
2469 }
2470
2471 /* For expressions with operands, compare their operands recursively. */
2472 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2473 {
2474 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2475 if (cmp != 0)
2476 return cmp;
2477 }
2478 }
2479
2480 return 0;
2481 }
2482
2483
2484 /* Compare two data-references DRA and DRB to order them into chunks
2485 suitable for grouping into interleaving chains. */
2486
2487 static int
2488 dr_group_sort_cmp (const void *dra_, const void *drb_)
2489 {
2490 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2491 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2492 int cmp;
2493
2494 /* Stabilize sort. */
2495 if (dra == drb)
2496 return 0;
2497
2498 /* Ordering of DRs according to base. */
2499 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2500 {
2501 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2502 if (cmp != 0)
2503 return cmp;
2504 }
2505
2506 /* And according to DR_OFFSET. */
2507 if (!dr_equal_offsets_p (dra, drb))
2508 {
2509 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2510 if (cmp != 0)
2511 return cmp;
2512 }
2513
2514 /* Put reads before writes. */
2515 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2516 return DR_IS_READ (dra) ? -1 : 1;
2517
2518 /* Then sort after access size. */
2519 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2520 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2521 {
2522 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2523 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2524 if (cmp != 0)
2525 return cmp;
2526 }
2527
2528 /* And after step. */
2529 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2530 {
2531 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2532 if (cmp != 0)
2533 return cmp;
2534 }
2535
2536 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2537 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2538 if (cmp == 0)
2539 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2540 return cmp;
2541 }
2542
2543 /* Function vect_analyze_data_ref_accesses.
2544
2545 Analyze the access pattern of all the data references in the loop.
2546
2547 FORNOW: the only access pattern that is considered vectorizable is a
2548 simple step 1 (consecutive) access.
2549
2550 FORNOW: handle only arrays and pointer accesses. */
2551
2552 bool
2553 vect_analyze_data_ref_accesses (vec_info *vinfo)
2554 {
2555 unsigned int i;
2556 vec<data_reference_p> datarefs = vinfo->datarefs;
2557 struct data_reference *dr;
2558
2559 if (dump_enabled_p ())
2560 dump_printf_loc (MSG_NOTE, vect_location,
2561 "=== vect_analyze_data_ref_accesses ===\n");
2562
2563 if (datarefs.is_empty ())
2564 return true;
2565
2566 /* Sort the array of datarefs to make building the interleaving chains
2567 linear. Don't modify the original vector's order; it is needed for
2568 determining what dependencies are reversed. */
2569 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2570 datarefs_copy.qsort (dr_group_sort_cmp);
2571
2572 /* Build the interleaving chains. */
2573 for (i = 0; i < datarefs_copy.length () - 1;)
2574 {
2575 data_reference_p dra = datarefs_copy[i];
2576 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2577 stmt_vec_info lastinfo = NULL;
2578 for (i = i + 1; i < datarefs_copy.length (); ++i)
2579 {
2580 data_reference_p drb = datarefs_copy[i];
2581 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2582
2583 /* ??? Imperfect sorting (non-compatible types, non-modulo
2584 accesses, same accesses) can lead to a group being artificially
2585 split here as we don't just skip over those. If it really
2586 matters we can push those to a worklist and re-iterate
2587 over them. Then we can just skip ahead to the next DR here. */
2588
2589 /* Check that the data-refs have same first location (except init)
2590 and they are both either store or load (not load and store,
2591 not masked loads or stores). */
2592 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2593 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2594 DR_BASE_ADDRESS (drb), 0)
2595 || !dr_equal_offsets_p (dra, drb)
2596 || !gimple_assign_single_p (DR_STMT (dra))
2597 || !gimple_assign_single_p (DR_STMT (drb)))
2598 break;
2599
2600 /* Check that the data-refs have the same constant size. */
2601 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2602 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2603 if (!tree_fits_uhwi_p (sza)
2604 || !tree_fits_uhwi_p (szb)
2605 || !tree_int_cst_equal (sza, szb))
2606 break;
2607
2608 /* Check that the data-refs have the same step. */
2609 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2610 break;
2611
2612 /* Do not place the same access in the interleaving chain twice. */
2613 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2614 break;
2615
2616 /* Check the types are compatible.
2617 ??? We don't distinguish this during sorting. */
2618 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2619 TREE_TYPE (DR_REF (drb))))
2620 break;
2621
2622 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb); since equal inits were rejected above, the inequality is strict. */
2623 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2624 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2625 gcc_assert (init_a < init_b);
2626
2627 /* If init_b == init_a + the size of the type * k, we have an
2628 interleaving, and DRA is accessed before DRB. */
2629 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2630 if ((init_b - init_a) % type_size_a != 0)
2631 break;
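/* Worked example (added, illustrative): for 4-byte ints, accesses a[i]
   and a[i + 2] have init_a = 0 and init_b = 8; (8 - 0) % 4 == 0, so they
   can belong to the same interleaving chain, with a one-element gap
   between them.  */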
2632
2633 /* If we have a store, require that the accesses be adjacent. This
2634 splits groups into chunks we support (we don't support vectorization
2635 of stores with gaps). */
2636 if (!DR_IS_READ (dra)
2637 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2638 (DR_INIT (datarefs_copy[i-1]))
2639 != type_size_a))
2640 break;
2641
2642 /* If the step (when constant and non-zero) is not greater than the
2643 difference between the data-refs' inits, the refs are a full step or
2644 more apart; split the group here to keep group sizes suitable. */
2645 if (tree_fits_shwi_p (DR_STEP (dra)))
2646 {
2647 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2648 if (step != 0 && step <= (init_b - init_a))
2649 break;
2650 }
2651
2652 if (dump_enabled_p ())
2653 {
2654 dump_printf_loc (MSG_NOTE, vect_location,
2655 "Detected interleaving ");
2656 if (DR_IS_READ (dra))
2657 dump_printf (MSG_NOTE, "load ");
2658 else
2659 dump_printf (MSG_NOTE, "store ");
2660 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2661 dump_printf (MSG_NOTE, " and ");
2662 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2663 dump_printf (MSG_NOTE, "\n");
2664 }
2665
2666 /* Link the found element into the group list. */
2667 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2668 {
2669 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2670 lastinfo = stmtinfo_a;
2671 }
2672 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2673 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2674 lastinfo = stmtinfo_b;
2675 }
2676 }
2677
2678 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2679 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2680 && !vect_analyze_data_ref_access (dr))
2681 {
2682 if (dump_enabled_p ())
2683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2684 "not vectorized: complicated access pattern.\n");
2685
2686 if (is_a <bb_vec_info> (vinfo))
2687 {
2688 /* Mark the statement as not vectorizable. */
2689 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2690 continue;
2691 }
2692 else
2693 {
2694 datarefs_copy.release ();
2695 return false;
2696 }
2697 }
2698
2699 datarefs_copy.release ();
2700 return true;
2701 }
2702
2703
2704 /* Operator == between two dr_with_seg_len objects.
2705
2706 This equality operator is used to make sure two data refs
2707 are the same so that we will consider combining the
2708 aliasing checks of those two pairs of data-dependent data
2709 refs. */
2710
2711 static bool
2712 operator == (const dr_with_seg_len& d1,
2713 const dr_with_seg_len& d2)
2714 {
2715 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2716 DR_BASE_ADDRESS (d2.dr), 0)
2717 && compare_tree (d1.offset, d2.offset) == 0
2718 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2719 }
2720
2721 /* Function comp_dr_with_seg_len_pair.
2722
2723 Comparison function for sorting objects of dr_with_seg_len_pair_t
2724 so that we can combine aliasing checks in one scan. */
2725
2726 static int
2727 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2728 {
2729 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2730 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2731
2732 const dr_with_seg_len &p11 = p1->first,
2733 &p12 = p1->second,
2734 &p21 = p2->first,
2735 &p22 = p2->second;
2736
2737 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2738 if a and c have the same base address and step, and b and d have the same
2739 address and step. Therefore, if either a&c or b&d don't have the same address
2740 and step, we don't care about the order of those two pairs after sorting. */
2741 int comp_res;
2742
2743 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2744 DR_BASE_ADDRESS (p21.dr))) != 0)
2745 return comp_res;
2746 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2747 DR_BASE_ADDRESS (p22.dr))) != 0)
2748 return comp_res;
2749 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2750 return comp_res;
2751 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2752 return comp_res;
2753 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2754 return comp_res;
2755 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2756 return comp_res;
2757
2758 return 0;
2759 }
2760
2761 /* Function vect_vfa_segment_size.
2762
2763 Create an expression that computes the size of the segment
2764 that will be accessed for a data reference. The function takes into
2765 account that realignment loads may access one more vector.
2766
2767 Input:
2768 DR: The data reference.
2769 LENGTH_FACTOR: segment length to consider.
2770
2771 Return an expression whose value is the size of segment which will be
2772 accessed by DR. */
2773
2774 static tree
2775 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2776 {
2777 tree segment_length;
2778
2779 if (integer_zerop (DR_STEP (dr)))
2780 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2781 else
2782 segment_length = size_binop (MULT_EXPR,
2783 fold_convert (sizetype, DR_STEP (dr)),
2784 fold_convert (sizetype, length_factor));
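/* Worked example (added, illustrative): for a unit-stride access to
   4-byte elements with LENGTH_FACTOR equal to a vectorization factor of
   4, the segment length is 4 * 4 = 16 bytes per vector iteration.  */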
2785
2786 if (vect_supportable_dr_alignment (dr, false)
2787 == dr_explicit_realign_optimized)
2788 {
2789 tree vector_size = TYPE_SIZE_UNIT
2790 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2791
2792 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2793 }
2794 return segment_length;
2795 }
2796
2797 /* Function vect_prune_runtime_alias_test_list.
2798
2799 Prune a list of ddrs to be tested at run-time by versioning for alias.
2800 Merge several alias checks into one if possible.
2801 Return FALSE if the resulting list of ddrs is longer than allowed by
2802 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2803
2804 bool
2805 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2806 {
2807 vec<ddr_p> may_alias_ddrs =
2808 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2809 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2810 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2811 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2812 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2813
2814 ddr_p ddr;
2815 unsigned int i;
2816 tree length_factor;
2817
2818 if (dump_enabled_p ())
2819 dump_printf_loc (MSG_NOTE, vect_location,
2820 "=== vect_prune_runtime_alias_test_list ===\n");
2821
2822 if (may_alias_ddrs.is_empty ())
2823 return true;
2824
2825 /* Basically, for each pair of dependent data refs store_ptr_0
2826 and load_ptr_0, we create an expression:
2827
2828 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2829 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2830
2831 for aliasing checks. However, in some cases we can decrease
2832 the number of checks by combining two checks into one. For
2833 example, suppose we have another pair of data refs store_ptr_0
2834 and load_ptr_1, and if the following condition is satisfied:
2835
2836 load_ptr_0 < load_ptr_1 &&
2837 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2838
2839 (this condition means that, in each iteration of the vectorized loop,
2840 the accessed memory of store_ptr_0 cannot be between the memory
2841 of load_ptr_0 and load_ptr_1.)
2842
2843 we then can use only the following expression to finish the
2844 aliasing checks between store_ptr_0 & load_ptr_0 and
2845 store_ptr_0 & load_ptr_1:
2846
2847 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2848 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2849
2850 Note that we only consider the case where load_ptr_0 and load_ptr_1 have the
2851 same base address. */
2852
2853 comp_alias_ddrs.create (may_alias_ddrs.length ());
2854
2855 /* First, we collect all data ref pairs for aliasing checks. */
2856 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2857 {
2858 struct data_reference *dr_a, *dr_b;
2859 gimple *dr_group_first_a, *dr_group_first_b;
2860 tree segment_length_a, segment_length_b;
2861 gimple *stmt_a, *stmt_b;
2862
2863 dr_a = DDR_A (ddr);
2864 stmt_a = DR_STMT (DDR_A (ddr));
2865 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2866 if (dr_group_first_a)
2867 {
2868 stmt_a = dr_group_first_a;
2869 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2870 }
2871
2872 dr_b = DDR_B (ddr);
2873 stmt_b = DR_STMT (DDR_B (ddr));
2874 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2875 if (dr_group_first_b)
2876 {
2877 stmt_b = dr_group_first_b;
2878 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2879 }
2880
2881 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2882 length_factor = scalar_loop_iters;
2883 else
2884 length_factor = size_int (vect_factor);
2885 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2886 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2887
2888 dr_with_seg_len_pair_t dr_with_seg_len_pair
2889 (dr_with_seg_len (dr_a, segment_length_a),
2890 dr_with_seg_len (dr_b, segment_length_b));
2891
2892 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2893 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2894
2895 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2896 }
2897
2898 /* Second, we sort the collected data ref pairs so that we can scan
2899 them once to combine all possible aliasing checks. */
2900 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2901
2902 /* Third, we scan the sorted dr pairs and check if we can combine
2903 alias checks of two neighbouring dr pairs. */
2904 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2905 {
2906 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2907 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2908 *dr_b1 = &comp_alias_ddrs[i-1].second,
2909 *dr_a2 = &comp_alias_ddrs[i].first,
2910 *dr_b2 = &comp_alias_ddrs[i].second;
2911
2912 /* Remove duplicate data ref pairs. */
2913 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2914 {
2915 if (dump_enabled_p ())
2916 {
2917 dump_printf_loc (MSG_NOTE, vect_location,
2918 "found equal ranges ");
2919 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2920 DR_REF (dr_a1->dr));
2921 dump_printf (MSG_NOTE, ", ");
2922 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2923 DR_REF (dr_b1->dr));
2924 dump_printf (MSG_NOTE, " and ");
2925 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2926 DR_REF (dr_a2->dr));
2927 dump_printf (MSG_NOTE, ", ");
2928 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2929 DR_REF (dr_b2->dr));
2930 dump_printf (MSG_NOTE, "\n");
2931 }
2932
2933 comp_alias_ddrs.ordered_remove (i--);
2934 continue;
2935 }
2936
2937 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2938 {
2939 /* We consider the case that DR_B1 and DR_B2 are the same memrefs,
2940 and DR_A1 and DR_A2 are two consecutive memrefs. */
2941 if (*dr_a1 == *dr_a2)
2942 {
2943 std::swap (dr_a1, dr_b1);
2944 std::swap (dr_a2, dr_b2);
2945 }
2946
2947 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2948 DR_BASE_ADDRESS (dr_a2->dr),
2949 0)
2950 || !tree_fits_shwi_p (dr_a1->offset)
2951 || !tree_fits_shwi_p (dr_a2->offset))
2952 continue;
2953
2954 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2955 - tree_to_shwi (dr_a1->offset));
2956
2957
2958 /* Now we check if the following condition is satisfied:
2959
2960 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2961
2962 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2963 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2964 have to make a best estimate. We can get the minimum value
2965 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2966 then either of the following two conditions can guarantee the
2967 one above:
2968
2969 1: DIFF <= MIN_SEG_LEN_B
2970 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2971
2972 */
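/* Worked example (added, hypothetical numbers): with segment lengths of
   16 bytes, dr_a1->offset = 0 and dr_a2->offset = 16 give
   DIFF = 16 <= MIN_SEG_LEN_B = 16, so the two checks are merged below
   and dr_a1's segment grows to 16 + 16 = 32 bytes.  */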
2973
2974 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2975 ? tree_to_shwi (dr_b1->seg_len)
2976 : vect_factor);
2977
2978 if (diff <= min_seg_len_b
2979 || (tree_fits_shwi_p (dr_a1->seg_len)
2980 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2981 {
2982 if (dump_enabled_p ())
2983 {
2984 dump_printf_loc (MSG_NOTE, vect_location,
2985 "merging ranges for ");
2986 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2987 DR_REF (dr_a1->dr));
2988 dump_printf (MSG_NOTE, ", ");
2989 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2990 DR_REF (dr_b1->dr));
2991 dump_printf (MSG_NOTE, " and ");
2992 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2993 DR_REF (dr_a2->dr));
2994 dump_printf (MSG_NOTE, ", ");
2995 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2996 DR_REF (dr_b2->dr));
2997 dump_printf (MSG_NOTE, "\n");
2998 }
2999
3000 dr_a1->seg_len = size_binop (PLUS_EXPR,
3001 dr_a2->seg_len, size_int (diff));
3002 comp_alias_ddrs.ordered_remove (i--);
3003 }
3004 }
3005 }
3006
3007 dump_printf_loc (MSG_NOTE, vect_location,
3008 "improved number of alias checks from %d to %d\n",
3009 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3010 if ((int) comp_alias_ddrs.length () >
3011 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3012 return false;
3013
3014 return true;
3015 }
3016
3017 /* Check whether a non-affine read or write in stmt is suitable for gather load
3018 or scatter store and if so, return a builtin decl for that operation. */
3019
3020 tree
3021 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
3022 tree *offp, int *scalep)
3023 {
3024 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3025 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3026 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3027 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3028 tree offtype = NULL_TREE;
3029 tree decl, base, off;
3030 machine_mode pmode;
3031 int punsignedp, pvolatilep;
3032
3033 base = DR_REF (dr);
3034 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3035 see if we can use the def stmt of the address. */
3036 if (is_gimple_call (stmt)
3037 && gimple_call_internal_p (stmt)
3038 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3039 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3040 && TREE_CODE (base) == MEM_REF
3041 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3042 && integer_zerop (TREE_OPERAND (base, 1))
3043 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3044 {
3045 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3046 if (is_gimple_assign (def_stmt)
3047 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3048 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3049 }
3050
3051 /* The gather and scatter builtins need an address of the form
3052 loop_invariant + vector * {1, 2, 4, 8}
3053 or
3054 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3055 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3056 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3057 multiplications and additions in it. To get a vector, we need
3058 a single SSA_NAME that will be defined in the loop and will
3059 contain everything that is not loop invariant and that can be
3060 vectorized. The following code attempts to find such a preexisting
3061 SSA_NAME OFF and put the loop invariants into a tree BASE
3062 that can be gimplified before the loop. */
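/* Added example (illustrative): for a gather-like load p[idx[i]] from an
   array of 4-byte elements, the code below is expected to end up with
   BASE = p (plus any constant offset), OFF = the SSA_NAME holding idx[i]
   (possibly after sign extension), and SCALE = 4, matching the
   loop_invariant + vector * {1, 2, 4, 8} form described above.  */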
3063 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3064 &pmode, &punsignedp, &pvolatilep, false);
3065 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3066
3067 if (TREE_CODE (base) == MEM_REF)
3068 {
3069 if (!integer_zerop (TREE_OPERAND (base, 1)))
3070 {
3071 if (off == NULL_TREE)
3072 {
3073 offset_int moff = mem_ref_offset (base);
3074 off = wide_int_to_tree (sizetype, moff);
3075 }
3076 else
3077 off = size_binop (PLUS_EXPR, off,
3078 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3079 }
3080 base = TREE_OPERAND (base, 0);
3081 }
3082 else
3083 base = build_fold_addr_expr (base);
3084
3085 if (off == NULL_TREE)
3086 off = size_zero_node;
3087
3088 /* If base is not loop invariant, then either off must be 0, in which case
3089 we start with just the constant offset in the loop-invariant BASE and
3090 continue with base as OFF, or we have to give up.
3091 We could handle the latter case by gimplifying the addition of base + off
3092 into some SSA_NAME and using that as off, but for now punt. */
3093 if (!expr_invariant_in_loop_p (loop, base))
3094 {
3095 if (!integer_zerop (off))
3096 return NULL_TREE;
3097 off = base;
3098 base = size_int (pbitpos / BITS_PER_UNIT);
3099 }
3100 /* Otherwise put base + constant offset into the loop invariant BASE
3101 and continue with OFF. */
3102 else
3103 {
3104 base = fold_convert (sizetype, base);
3105 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3106 }
3107
3108 /* OFF at this point may be either a SSA_NAME or some tree expression
3109 from get_inner_reference. Try to peel off loop invariants from it
3110 into BASE as long as possible. */
3111 STRIP_NOPS (off);
3112 while (offtype == NULL_TREE)
3113 {
3114 enum tree_code code;
3115 tree op0, op1, add = NULL_TREE;
3116
3117 if (TREE_CODE (off) == SSA_NAME)
3118 {
3119 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3120
3121 if (expr_invariant_in_loop_p (loop, off))
3122 return NULL_TREE;
3123
3124 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3125 break;
3126
3127 op0 = gimple_assign_rhs1 (def_stmt);
3128 code = gimple_assign_rhs_code (def_stmt);
3129 op1 = gimple_assign_rhs2 (def_stmt);
3130 }
3131 else
3132 {
3133 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3134 return NULL_TREE;
3135 code = TREE_CODE (off);
3136 extract_ops_from_tree (off, &code, &op0, &op1);
3137 }
3138 switch (code)
3139 {
3140 case POINTER_PLUS_EXPR:
3141 case PLUS_EXPR:
3142 if (expr_invariant_in_loop_p (loop, op0))
3143 {
3144 add = op0;
3145 off = op1;
3146 do_add:
3147 add = fold_convert (sizetype, add);
3148 if (scale != 1)
3149 add = size_binop (MULT_EXPR, add, size_int (scale));
3150 base = size_binop (PLUS_EXPR, base, add);
3151 continue;
3152 }
3153 if (expr_invariant_in_loop_p (loop, op1))
3154 {
3155 add = op1;
3156 off = op0;
3157 goto do_add;
3158 }
3159 break;
3160 case MINUS_EXPR:
3161 if (expr_invariant_in_loop_p (loop, op1))
3162 {
3163 add = fold_convert (sizetype, op1);
3164 add = size_binop (MINUS_EXPR, size_zero_node, add);
3165 off = op0;
3166 goto do_add;
3167 }
3168 break;
3169 case MULT_EXPR:
3170 if (scale == 1 && tree_fits_shwi_p (op1))
3171 {
3172 scale = tree_to_shwi (op1);
3173 off = op0;
3174 continue;
3175 }
3176 break;
3177 case SSA_NAME:
3178 off = op0;
3179 continue;
3180 CASE_CONVERT:
3181 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3182 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3183 break;
3184 if (TYPE_PRECISION (TREE_TYPE (op0))
3185 == TYPE_PRECISION (TREE_TYPE (off)))
3186 {
3187 off = op0;
3188 continue;
3189 }
3190 if (TYPE_PRECISION (TREE_TYPE (op0))
3191 < TYPE_PRECISION (TREE_TYPE (off)))
3192 {
3193 off = op0;
3194 offtype = TREE_TYPE (off);
3195 STRIP_NOPS (off);
3196 continue;
3197 }
3198 break;
3199 default:
3200 break;
3201 }
3202 break;
3203 }
3204
3205 /* If at the end OFF still isn't a SSA_NAME or isn't
3206 defined in the loop, punt. */
3207 if (TREE_CODE (off) != SSA_NAME
3208 || expr_invariant_in_loop_p (loop, off))
3209 return NULL_TREE;
3210
3211 if (offtype == NULL_TREE)
3212 offtype = TREE_TYPE (off);
3213
3214 if (DR_IS_READ (dr))
3215 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3216 offtype, scale);
3217 else
3218 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3219 offtype, scale);
3220
3221 if (decl == NULL_TREE)
3222 return NULL_TREE;
3223
3224 if (basep)
3225 *basep = base;
3226 if (offp)
3227 *offp = off;
3228 if (scalep)
3229 *scalep = scale;
3230 return decl;
3231 }
3232
3233 /* Function vect_analyze_data_refs.
3234
3235 Find all the data references in the loop or basic block.
3236
3237 The general structure of the analysis of data refs in the vectorizer is as
3238 follows:
3239 1- vect_analyze_data_refs(loop/bb): call
3240 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3241 in the loop/bb and their dependences.
3242 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3243 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3244 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3245
3246 */
3247
3248 bool
3249 vect_analyze_data_refs (vec_info *vinfo, int *min_vf, unsigned *n_stmts)
3250 {
3251 struct loop *loop = NULL;
3252 basic_block bb = NULL;
3253 unsigned int i;
3254 vec<data_reference_p> datarefs;
3255 struct data_reference *dr;
3256 tree scalar_type;
3257
3258 if (dump_enabled_p ())
3259 dump_printf_loc (MSG_NOTE, vect_location,
3260 "=== vect_analyze_data_refs ===\n");
3261
3262 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3263 {
3264 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3265
3266 loop = LOOP_VINFO_LOOP (loop_vinfo);
3267 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3268 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3269 {
3270 if (dump_enabled_p ())
3271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3272 "not vectorized: loop contains function calls"
3273 " or data references that cannot be analyzed\n");
3274 return false;
3275 }
3276
3277 for (i = 0; i < loop->num_nodes; i++)
3278 {
3279 gimple_stmt_iterator gsi;
3280
3281 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3282 {
3283 gimple *stmt = gsi_stmt (gsi);
3284 if (is_gimple_debug (stmt))
3285 continue;
3286 ++*n_stmts;
3287 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3288 {
3289 if (is_gimple_call (stmt) && loop->safelen)
3290 {
3291 tree fndecl = gimple_call_fndecl (stmt), op;
3292 if (fndecl != NULL_TREE)
3293 {
3294 struct cgraph_node *node = cgraph_node::get (fndecl);
3295 if (node != NULL && node->simd_clones != NULL)
3296 {
3297 unsigned int j, n = gimple_call_num_args (stmt);
3298 for (j = 0; j < n; j++)
3299 {
3300 op = gimple_call_arg (stmt, j);
3301 if (DECL_P (op)
3302 || (REFERENCE_CLASS_P (op)
3303 && get_base_address (op)))
3304 break;
3305 }
3306 op = gimple_call_lhs (stmt);
3307 /* Ignore #pragma omp declare simd functions
3308 if they don't have data references in the
3309 call stmt itself. */
3310 if (j == n
3311 && !(op
3312 && (DECL_P (op)
3313 || (REFERENCE_CLASS_P (op)
3314 && get_base_address (op)))))
3315 continue;
3316 }
3317 }
3318 }
3319 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3320 if (dump_enabled_p ())
3321 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3322 "not vectorized: loop contains function "
3323 "calls or data references that cannot "
3324 "be analyzed\n");
3325 return false;
3326 }
3327 }
3328 }
3329
3330 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3331 }
3332 else
3333 {
3334 bb_vec_info bb_vinfo = as_a <bb_vec_info> (vinfo);
3335 gimple_stmt_iterator gsi;
3336
3337 bb = BB_VINFO_BB (bb_vinfo);
3338 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3339 {
3340 gimple *stmt = gsi_stmt (gsi);
3341 if (is_gimple_debug (stmt))
3342 continue;
3343 ++*n_stmts;
3344 if (!find_data_references_in_stmt (NULL, stmt,
3345 &BB_VINFO_DATAREFS (bb_vinfo)))
3346 {
3347 /* Mark the rest of the basic-block as unvectorizable. */
3348 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3349 {
3350 stmt = gsi_stmt (gsi);
3351 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3352 }
3353 break;
3354 }
3355 }
3356
3357 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3358 }
3359
3360 /* Go through the data-refs, check that the analysis succeeded. Update
3361 pointer from stmt_vec_info struct to DR and vectype. */
3362
3363 FOR_EACH_VEC_ELT (datarefs, i, dr)
3364 {
3365 gimple *stmt;
3366 stmt_vec_info stmt_info;
3367 tree base, offset, init;
3368 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3369 bool simd_lane_access = false;
3370 int vf;
3371
3372 again:
3373 if (!dr || !DR_REF (dr))
3374 {
3375 if (dump_enabled_p ())
3376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3377 "not vectorized: unhandled data-ref\n");
3378 return false;
3379 }
3380
3381 stmt = DR_STMT (dr);
3382 stmt_info = vinfo_for_stmt (stmt);
3383
3384 /* Discard clobbers from the dataref vector. We will remove
3385 clobber stmts during vectorization. */
3386 if (gimple_clobber_p (stmt))
3387 {
3388 free_data_ref (dr);
3389 if (i == datarefs.length () - 1)
3390 {
3391 datarefs.pop ();
3392 break;
3393 }
3394 datarefs.ordered_remove (i);
3395 dr = datarefs[i];
3396 goto again;
3397 }
3398
3399 /* Check that analysis of the data-ref succeeded. */
3400 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3401 || !DR_STEP (dr))
3402 {
3403 bool maybe_gather
3404 = DR_IS_READ (dr)
3405 && !TREE_THIS_VOLATILE (DR_REF (dr))
3406 && targetm.vectorize.builtin_gather != NULL;
3407 bool maybe_scatter
3408 = DR_IS_WRITE (dr)
3409 && !TREE_THIS_VOLATILE (DR_REF (dr))
3410 && targetm.vectorize.builtin_scatter != NULL;
3411 bool maybe_simd_lane_access
3412 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3413
3414 /* If the target supports vector gather loads or scatter stores, or if
3415 this might be a SIMD lane access, see whether they can be used. */
3416 if (is_a <loop_vec_info> (vinfo)
3417 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3418 && !nested_in_vect_loop_p (loop, stmt))
3419 {
3420 struct data_reference *newdr
3421 = create_data_ref (NULL, loop_containing_stmt (stmt),
3422 DR_REF (dr), stmt, maybe_scatter ? false : true);
3423 gcc_assert (newdr != NULL && DR_REF (newdr));
3424 if (DR_BASE_ADDRESS (newdr)
3425 && DR_OFFSET (newdr)
3426 && DR_INIT (newdr)
3427 && DR_STEP (newdr)
3428 && integer_zerop (DR_STEP (newdr)))
3429 {
3430 if (maybe_simd_lane_access)
3431 {
3432 tree off = DR_OFFSET (newdr);
3433 STRIP_NOPS (off);
3434 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3435 && TREE_CODE (off) == MULT_EXPR
3436 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3437 {
3438 tree step = TREE_OPERAND (off, 1);
3439 off = TREE_OPERAND (off, 0);
3440 STRIP_NOPS (off);
3441 if (CONVERT_EXPR_P (off)
3442 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3443 0)))
3444 < TYPE_PRECISION (TREE_TYPE (off)))
3445 off = TREE_OPERAND (off, 0);
3446 if (TREE_CODE (off) == SSA_NAME)
3447 {
3448 gimple *def = SSA_NAME_DEF_STMT (off);
3449 tree reft = TREE_TYPE (DR_REF (newdr));
3450 if (is_gimple_call (def)
3451 && gimple_call_internal_p (def)
3452 && (gimple_call_internal_fn (def)
3453 == IFN_GOMP_SIMD_LANE))
3454 {
3455 tree arg = gimple_call_arg (def, 0);
3456 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3457 arg = SSA_NAME_VAR (arg);
3458 if (arg == loop->simduid
3459 /* For now. */
3460 && tree_int_cst_equal
3461 (TYPE_SIZE_UNIT (reft),
3462 step))
3463 {
3464 DR_OFFSET (newdr) = ssize_int (0);
3465 DR_STEP (newdr) = step;
3466 DR_ALIGNED_TO (newdr)
3467 = size_int (BIGGEST_ALIGNMENT);
3468 dr = newdr;
3469 simd_lane_access = true;
3470 }
3471 }
3472 }
3473 }
3474 }
3475 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3476 {
3477 dr = newdr;
3478 if (maybe_gather)
3479 gatherscatter = GATHER;
3480 else
3481 gatherscatter = SCATTER;
3482 }
3483 }
3484 if (gatherscatter == SG_NONE && !simd_lane_access)
3485 free_data_ref (newdr);
3486 }
3487
3488 if (gatherscatter == SG_NONE && !simd_lane_access)
3489 {
3490 if (dump_enabled_p ())
3491 {
3492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3493 "not vectorized: data ref analysis "
3494 "failed ");
3495 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3496 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3497 }
3498
3499 if (is_a <bb_vec_info> (vinfo))
3500 break;
3501
3502 return false;
3503 }
3504 }
3505
3506 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3507 {
3508 if (dump_enabled_p ())
3509 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3510 "not vectorized: base addr of dr is a "
3511 "constant\n");
3512
3513 if (is_a <bb_vec_info> (vinfo))
3514 break;
3515
3516 if (gatherscatter != SG_NONE || simd_lane_access)
3517 free_data_ref (dr);
3518 return false;
3519 }
3520
3521 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3522 {
3523 if (dump_enabled_p ())
3524 {
3525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3526 "not vectorized: volatile type ");
3527 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3528 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3529 }
3530
3531 if (is_a <bb_vec_info> (vinfo))
3532 break;
3533
3534 return false;
3535 }
3536
3537 if (stmt_can_throw_internal (stmt))
3538 {
3539 if (dump_enabled_p ())
3540 {
3541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3542 "not vectorized: statement can throw an "
3543 "exception ");
3544 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3545 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3546 }
3547
3548 if (is_a <bb_vec_info> (vinfo))
3549 break;
3550
3551 if (gatherscatter != SG_NONE || simd_lane_access)
3552 free_data_ref (dr);
3553 return false;
3554 }
3555
3556 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3557 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3558 {
3559 if (dump_enabled_p ())
3560 {
3561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3562 "not vectorized: statement is bitfield "
3563 "access ");
3564 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3565 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3566 }
3567
3568 if (is_a <bb_vec_info> (vinfo))
3569 break;
3570
3571 if (gatherscatter != SG_NONE || simd_lane_access)
3572 free_data_ref (dr);
3573 return false;
3574 }
3575
3576 base = unshare_expr (DR_BASE_ADDRESS (dr));
3577 offset = unshare_expr (DR_OFFSET (dr));
3578 init = unshare_expr (DR_INIT (dr));
3579
3580 if (is_gimple_call (stmt)
3581 && (!gimple_call_internal_p (stmt)
3582 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3583 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3584 {
3585 if (dump_enabled_p ())
3586 {
3587 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3588 "not vectorized: dr in a call ");
3589 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3590 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3591 }
3592
3593 if (is_a <bb_vec_info> (vinfo))
3594 break;
3595
3596 if (gatherscatter != SG_NONE || simd_lane_access)
3597 free_data_ref (dr);
3598 return false;
3599 }
3600
3601 /* Update DR field in stmt_vec_info struct. */
3602
3603 /* If the dataref is in an inner-loop of the loop that is considered
3604 for vectorization, we also want to analyze the access relative to
3605 the outer-loop (DR contains information only relative to the
3606 inner-most enclosing loop). We do that by building a reference to the
3607 first location accessed by the inner-loop, and analyze it relative to
3608 the outer-loop. */
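
/* As a concrete illustration (using the short array 'in' from the example
   in the comment of vect_create_addr_base_for_vector_ref below):

     for (i = 0; i < N; i++)       <-- loop being vectorized
       for (j = 0; j < M; j++)     <-- inner-loop containing the dataref
         s += in[i+j];

   DR describes in[i+j] relative to the j-loop (step 2 bytes).  The first
   location the inner-loop touches is *(BASE+INIT), i.e. in[i]; analyzed
   relative to the i-loop that reference advances by 2 bytes per outer
   iteration, and it is this outer-loop view that is recorded in the
   STMT_VINFO_DR_* fields below.  */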
3609 if (loop && nested_in_vect_loop_p (loop, stmt))
3610 {
3611 tree outer_step, outer_base, outer_init;
3612 HOST_WIDE_INT pbitsize, pbitpos;
3613 tree poffset;
3614 machine_mode pmode;
3615 int punsignedp, pvolatilep;
3616 affine_iv base_iv, offset_iv;
3617 tree dinit;
3618
3619 /* Build a reference to the first location accessed by the
3620 inner-loop: *(BASE+INIT). (The first location is actually
3621 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3622 tree inner_base = build_fold_indirect_ref
3623 (fold_build_pointer_plus (base, init));
3624
3625 if (dump_enabled_p ())
3626 {
3627 dump_printf_loc (MSG_NOTE, vect_location,
3628 "analyze in outer-loop: ");
3629 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3630 dump_printf (MSG_NOTE, "\n");
3631 }
3632
3633 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3634 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3635 gcc_assert (outer_base != NULL_TREE);
3636
3637 if (pbitpos % BITS_PER_UNIT != 0)
3638 {
3639 if (dump_enabled_p ())
3640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3641 "failed: bit offset alignment.\n");
3642 return false;
3643 }
3644
3645 outer_base = build_fold_addr_expr (outer_base);
3646 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3647 &base_iv, false))
3648 {
3649 if (dump_enabled_p ())
3650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3651 "failed: evolution of base is not affine.\n");
3652 return false;
3653 }
3654
3655 if (offset)
3656 {
3657 if (poffset)
3658 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3659 poffset);
3660 else
3661 poffset = offset;
3662 }
3663
3664 if (!poffset)
3665 {
3666 offset_iv.base = ssize_int (0);
3667 offset_iv.step = ssize_int (0);
3668 }
3669 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3670 &offset_iv, false))
3671 {
3672 if (dump_enabled_p ())
3673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3674 "evolution of offset is not affine.\n");
3675 return false;
3676 }
3677
3678 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3679 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3680 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3681 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3682 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3683
3684 outer_step = size_binop (PLUS_EXPR,
3685 fold_convert (ssizetype, base_iv.step),
3686 fold_convert (ssizetype, offset_iv.step));
3687
3688 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3689 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3690 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3691 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3692 STMT_VINFO_DR_OFFSET (stmt_info) =
3693 fold_convert (ssizetype, offset_iv.base);
3694 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3695 size_int (highest_pow2_factor (offset_iv.base));
3696
3697 if (dump_enabled_p ())
3698 {
3699 dump_printf_loc (MSG_NOTE, vect_location,
3700 "\touter base_address: ");
3701 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3702 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3703 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3704 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3705 STMT_VINFO_DR_OFFSET (stmt_info));
3706 dump_printf (MSG_NOTE,
3707 "\n\touter constant offset from base address: ");
3708 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3709 STMT_VINFO_DR_INIT (stmt_info));
3710 dump_printf (MSG_NOTE, "\n\touter step: ");
3711 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3712 STMT_VINFO_DR_STEP (stmt_info));
3713 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3714 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3715 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3716 dump_printf (MSG_NOTE, "\n");
3717 }
3718 }
3719
3720 if (STMT_VINFO_DATA_REF (stmt_info))
3721 {
3722 if (dump_enabled_p ())
3723 {
3724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3725 "not vectorized: more than one data ref "
3726 "in stmt: ");
3727 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3728 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3729 }
3730
3731 if (is_a <bb_vec_info> (vinfo))
3732 break;
3733
3734 if (gatherscatter != SG_NONE || simd_lane_access)
3735 free_data_ref (dr);
3736 return false;
3737 }
3738
3739 STMT_VINFO_DATA_REF (stmt_info) = dr;
3740 if (simd_lane_access)
3741 {
3742 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3743 free_data_ref (datarefs[i]);
3744 datarefs[i] = dr;
3745 }
3746
3747 /* Set vectype for STMT. */
3748 scalar_type = TREE_TYPE (DR_REF (dr));
3749 STMT_VINFO_VECTYPE (stmt_info)
3750 = get_vectype_for_scalar_type (scalar_type);
3751 if (!STMT_VINFO_VECTYPE (stmt_info))
3752 {
3753 if (dump_enabled_p ())
3754 {
3755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3756 "not vectorized: no vectype for stmt: ");
3757 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3758 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3759 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3760 scalar_type);
3761 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3762 }
3763
3764 if (is_a <bb_vec_info> (vinfo))
3765 break;
3766
3767 if (gatherscatter != SG_NONE || simd_lane_access)
3768 {
3769 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3770 if (gatherscatter != SG_NONE)
3771 free_data_ref (dr);
3772 }
3773 return false;
3774 }
3775 else
3776 {
3777 if (dump_enabled_p ())
3778 {
3779 dump_printf_loc (MSG_NOTE, vect_location,
3780 "got vectype for stmt: ");
3781 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3782 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3783 STMT_VINFO_VECTYPE (stmt_info));
3784 dump_printf (MSG_NOTE, "\n");
3785 }
3786 }
3787
3788 /* Adjust the minimal vectorization factor according to the
3789 vector type. */
3790 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3791 if (vf > *min_vf)
3792 *min_vf = vf;
3793
3794 if (gatherscatter != SG_NONE)
3795 {
3796 tree off;
3797 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3798 NULL, &off, NULL)
3799 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3800 {
3801 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3802 free_data_ref (dr);
3803 if (dump_enabled_p ())
3804 {
3805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3806 (gatherscatter == GATHER) ?
3807 "not vectorized: not suitable for gather "
3808 "load " :
3809 "not vectorized: not suitable for scatter "
3810 "store ");
3811 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3812 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3813 }
3814 return false;
3815 }
3816
3817 datarefs[i] = dr;
3818 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3819 }
3820
3821 else if (is_a <loop_vec_info> (vinfo)
3822 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3823 {
3824 if (nested_in_vect_loop_p (loop, stmt))
3825 {
3826 if (dump_enabled_p ())
3827 {
3828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3829 "not vectorized: not suitable for strided "
3830 "load ");
3831 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3832 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3833 }
3834 return false;
3835 }
3836 STMT_VINFO_STRIDED_P (stmt_info) = true;
3837 }
3838 }
3839
3840 /* If we stopped analysis at the first dataref we could not analyze
3841 when trying to vectorize a basic-block, mark the rest of the datarefs
3842 as not vectorizable and truncate the vector of datarefs. That
3843 avoids spending useless time in analyzing their dependence. */
3844 if (i != datarefs.length ())
3845 {
3846 gcc_assert (is_a <bb_vec_info> (vinfo));
3847 for (unsigned j = i; j < datarefs.length (); ++j)
3848 {
3849 data_reference_p dr = datarefs[j];
3850 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3851 free_data_ref (dr);
3852 }
3853 datarefs.truncate (i);
3854 }
3855
3856 return true;
3857 }
3858
3859
3860 /* Function vect_get_new_vect_var.
3861
3862 Returns a name for a new variable. The current naming scheme prepends
3863 the prefix "vect_", "stmp_" or "vectp_" (depending on the value of
3864 VAR_KIND) to the name of the new vectorizer-generated variable, using
3865 NAME as the suffix if provided. */
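
/* For instance (an illustrative sketch, not an actual call site in this
   file):

     tree p = vect_get_new_vect_var (ptr_type, vect_pointer_var, "in");

   creates a temporary register whose name is based on "vectp_in"; when
   NAME is NULL only the bare prefix ("vect", "stmp" or "vectp") is
   used.  */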
3866
3867 tree
3868 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3869 {
3870 const char *prefix;
3871 tree new_vect_var;
3872
3873 switch (var_kind)
3874 {
3875 case vect_simple_var:
3876 prefix = "vect";
3877 break;
3878 case vect_scalar_var:
3879 prefix = "stmp";
3880 break;
3881 case vect_pointer_var:
3882 prefix = "vectp";
3883 break;
3884 default:
3885 gcc_unreachable ();
3886 }
3887
3888 if (name)
3889 {
3890 char* tmp = concat (prefix, "_", name, NULL);
3891 new_vect_var = create_tmp_reg (type, tmp);
3892 free (tmp);
3893 }
3894 else
3895 new_vect_var = create_tmp_reg (type, prefix);
3896
3897 return new_vect_var;
3898 }
3899
3900 /* Like vect_get_new_vect_var but return an SSA name. */
3901
3902 tree
3903 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3904 {
3905 const char *prefix;
3906 tree new_vect_var;
3907
3908 switch (var_kind)
3909 {
3910 case vect_simple_var:
3911 prefix = "vect";
3912 break;
3913 case vect_scalar_var:
3914 prefix = "stmp";
3915 break;
3916 case vect_pointer_var:
3917 prefix = "vectp";
3918 break;
3919 default:
3920 gcc_unreachable ();
3921 }
3922
3923 if (name)
3924 {
3925 char* tmp = concat (prefix, "_", name, NULL);
3926 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3927 free (tmp);
3928 }
3929 else
3930 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3931
3932 return new_vect_var;
3933 }
3934
3935 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
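
/* For example (with illustrative numbers): for a 16-byte vector type and
   DR_MISALIGNMENT (dr) == 4, NAME is recorded as pointing 4 bytes past a
   16-byte boundary; a DR_MISALIGNMENT of -1 means the misalignment is
   unknown, and the alignment info on NAME is cleared instead.  */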
3936
3937 static void
3938 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3939 stmt_vec_info stmt_info)
3940 {
3941 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3942 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3943 int misalign = DR_MISALIGNMENT (dr);
3944 if (misalign == -1)
3945 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3946 else
3947 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3948 }
3949
3950 /* Function vect_create_addr_base_for_vector_ref.
3951
3952 Create an expression that computes the address of the first memory location
3953 that will be accessed for a data reference.
3954
3955 Input:
3956 STMT: The statement containing the data reference.
3957 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3958 OFFSET: Optional. If supplied, it is added to the initial address.
3959 LOOP: Specify relative to which loop-nest the address should be computed.
3960 For example, when the dataref is in an inner-loop nested in an
3961 outer-loop that is now being vectorized, LOOP can be either the
3962 outer-loop, or the inner-loop. The first memory location accessed
3963 by the following dataref ('in' points to short):
3964
3965 for (i=0; i<N; i++)
3966 for (j=0; j<M; j++)
3967 s += in[i+j]
3968
3969 is as follows:
3970 if LOOP=i_loop: &in (relative to i_loop)
3971 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3972 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3973 initial address. Unlike OFFSET, which is a number of elements to
3974 be added, BYTE_OFFSET is measured in bytes.
3975
3976 Output:
3977 1. Return an SSA_NAME whose value is the address of the memory location of
3978 the first vector of the data reference.
3979 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3980 these statement(s) which define the returned SSA_NAME.
3981
3982 FORNOW: We are only handling array accesses with step 1. */
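
/* A small worked example of the two offset kinds (illustrative only): for
   the short access above (element size 2 bytes), OFFSET == 4 advances the
   returned address by 4 * 2 == 8 bytes, whereas BYTE_OFFSET == 4 advances
   it by exactly 4 bytes.  */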
3983
3984 tree
3985 vect_create_addr_base_for_vector_ref (gimple *stmt,
3986 gimple_seq *new_stmt_list,
3987 tree offset,
3988 struct loop *loop,
3989 tree byte_offset)
3990 {
3991 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3992 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3993 tree data_ref_base;
3994 const char *base_name;
3995 tree addr_base;
3996 tree dest;
3997 gimple_seq seq = NULL;
3998 tree base_offset;
3999 tree init;
4000 tree vect_ptr_type;
4001 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4002 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4003
4004 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4005 {
4006 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4007
4008 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4009
4010 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4011 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4012 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4013 }
4014 else
4015 {
4016 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4017 base_offset = unshare_expr (DR_OFFSET (dr));
4018 init = unshare_expr (DR_INIT (dr));
4019 }
4020
4021 if (loop_vinfo)
4022 base_name = get_name (data_ref_base);
4023 else
4024 {
4025 base_offset = ssize_int (0);
4026 init = ssize_int (0);
4027 base_name = get_name (DR_REF (dr));
4028 }
4029
4030 /* Create base_offset */
4031 base_offset = size_binop (PLUS_EXPR,
4032 fold_convert (sizetype, base_offset),
4033 fold_convert (sizetype, init));
4034
4035 if (offset)
4036 {
4037 offset = fold_build2 (MULT_EXPR, sizetype,
4038 fold_convert (sizetype, offset), step);
4039 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4040 base_offset, offset);
4041 }
4042 if (byte_offset)
4043 {
4044 byte_offset = fold_convert (sizetype, byte_offset);
4045 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4046 base_offset, byte_offset);
4047 }
4048
4049 /* base + base_offset */
4050 if (loop_vinfo)
4051 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4052 else
4053 {
4054 addr_base = build1 (ADDR_EXPR,
4055 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4056 unshare_expr (DR_REF (dr)));
4057 }
4058
4059 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4060 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4061 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4062 gimple_seq_add_seq (new_stmt_list, seq);
4063
4064 if (DR_PTR_INFO (dr)
4065 && TREE_CODE (addr_base) == SSA_NAME
4066 && !SSA_NAME_PTR_INFO (addr_base))
4067 {
4068 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4069 if (offset || byte_offset)
4070 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4071 }
4072
4073 if (dump_enabled_p ())
4074 {
4075 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4076 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4077 dump_printf (MSG_NOTE, "\n");
4078 }
4079
4080 return addr_base;
4081 }
4082
4083
4084 /* Function vect_create_data_ref_ptr.
4085
4086 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4087 location accessed in the loop by STMT, along with the def-use update
4088 chain to appropriately advance the pointer through the loop iterations.
4089 Also set aliasing information for the pointer. This pointer is used by
4090 the callers to this function to create a memory reference expression for
4091 vector load/store access.
4092
4093 Input:
4094 1. STMT: a stmt that references memory. Expected to be of the form
4095 GIMPLE_ASSIGN <name, data-ref> or
4096 GIMPLE_ASSIGN <data-ref, name>.
4097 2. AGGR_TYPE: the type of the reference, which should be either a vector
4098 or an array.
4099 3. AT_LOOP: the loop where the vector memref is to be created.
4100 4. OFFSET (optional): an offset to be added to the initial address accessed
4101 by the data-ref in STMT.
4102 5. BSI: location where the new stmts are to be placed if there is no loop
4103 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4104 pointing to the initial address.
4105 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4106 to the initial address accessed by the data-ref in STMT. This is
4107 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4108 in bytes.
4109
4110 Output:
4111 1. Declare a new ptr to vector_type, and have it point to the base of the
4112 data reference (initial address accessed by the data reference).
4113 For example, for vector of type V8HI, the following code is generated:
4114
4115 v8hi *ap;
4116 ap = (v8hi *)initial_address;
4117
4118 if OFFSET is not supplied:
4119 initial_address = &a[init];
4120 if OFFSET is supplied:
4121 initial_address = &a[init + OFFSET];
4122 if BYTE_OFFSET is supplied:
4123 initial_address = &a[init] + BYTE_OFFSET;
4124
4125 Return the initial_address in INITIAL_ADDRESS.
4126
4127 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4128 update the pointer in each iteration of the loop.
4129
4130 Return the increment stmt that updates the pointer in PTR_INCR.
4131
4132 3. Set INV_P to true if the access pattern of the data reference in the
4133 vectorized loop is invariant. Set it to false otherwise.
4134
4135 4. Return the pointer. */
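
/* A sketch of how a caller typically drives this function together with
   bump_vector_ptr below (the argument values are placeholders, not taken
   from a real call site):

     tree init_addr, dataref_ptr;
     gimple *ptr_incr;
     bool inv_p;

     dataref_ptr = vect_create_data_ref_ptr (stmt, vectype, loop, NULL_TREE,
                                             &init_addr, gsi, &ptr_incr,
                                             false, &inv_p, NULL_TREE);
     ... emit the vector load/store through dataref_ptr ...
     dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
                                    NULL_TREE);

   where passing a NULL_TREE bump makes bump_vector_ptr advance the pointer
   by the vector size.  */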
4136
4137 tree
4138 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4139 tree offset, tree *initial_address,
4140 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4141 bool only_init, bool *inv_p, tree byte_offset)
4142 {
4143 const char *base_name;
4144 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4145 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4146 struct loop *loop = NULL;
4147 bool nested_in_vect_loop = false;
4148 struct loop *containing_loop = NULL;
4149 tree aggr_ptr_type;
4150 tree aggr_ptr;
4151 tree new_temp;
4152 gimple_seq new_stmt_list = NULL;
4153 edge pe = NULL;
4154 basic_block new_bb;
4155 tree aggr_ptr_init;
4156 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4157 tree aptr;
4158 gimple_stmt_iterator incr_gsi;
4159 bool insert_after;
4160 tree indx_before_incr, indx_after_incr;
4161 gimple *incr;
4162 tree step;
4163 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4164
4165 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4166 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4167
4168 if (loop_vinfo)
4169 {
4170 loop = LOOP_VINFO_LOOP (loop_vinfo);
4171 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4172 containing_loop = (gimple_bb (stmt))->loop_father;
4173 pe = loop_preheader_edge (loop);
4174 }
4175 else
4176 {
4177 gcc_assert (bb_vinfo);
4178 only_init = true;
4179 *ptr_incr = NULL;
4180 }
4181
4182 /* Check the step (evolution) of the load in LOOP, and record
4183 whether it's invariant. */
4184 if (nested_in_vect_loop)
4185 step = STMT_VINFO_DR_STEP (stmt_info);
4186 else
4187 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4188
4189 if (integer_zerop (step))
4190 *inv_p = true;
4191 else
4192 *inv_p = false;
4193
4194 /* Create an expression for the first address accessed by this load
4195 in LOOP. */
4196 base_name = get_name (DR_BASE_ADDRESS (dr));
4197
4198 if (dump_enabled_p ())
4199 {
4200 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4201 dump_printf_loc (MSG_NOTE, vect_location,
4202 "create %s-pointer variable to type: ",
4203 get_tree_code_name (TREE_CODE (aggr_type)));
4204 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4205 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4206 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4207 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4208 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4209 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4210 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4211 else
4212 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4213 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4214 dump_printf (MSG_NOTE, "\n");
4215 }
4216
4217 /* (1) Create the new aggregate-pointer variable.
4218 Vector and array types inherit the alias set of their component
4219 type by default so we need to use a ref-all pointer if the data
4220 reference does not conflict with the created aggregated data
4221 reference because it is not addressable. */
4222 bool need_ref_all = false;
4223 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4224 get_alias_set (DR_REF (dr))))
4225 need_ref_all = true;
4226 /* Likewise for any of the data references in the stmt group. */
4227 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4228 {
4229 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4230 do
4231 {
4232 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4233 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4234 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4235 get_alias_set (DR_REF (sdr))))
4236 {
4237 need_ref_all = true;
4238 break;
4239 }
4240 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4241 }
4242 while (orig_stmt);
4243 }
4244 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4245 need_ref_all);
4246 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4247
4248
4249 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4250 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4251 def-use update cycles for the pointer: one relative to the outer-loop
4252 (LOOP), which is what steps (2) and (3) below do. The other is relative
4253 to the inner-loop (which is the inner-most loop containing the dataref),
4254 and this is done by step (4) below.
4255
4256 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4257 inner-most loop, and so steps (2),(3) work the same, and step (4) is
4258 redundant. Steps (2),(3) create the following:
4259
4260 vp0 = &base_addr;
4261 LOOP: vp1 = phi(vp0,vp2)
4262 ...
4263 ...
4264 vp2 = vp1 + step
4265 goto LOOP
4266
4267 If there is an inner-loop nested in loop, then step (4) will also be
4268 applied, and an additional update in the inner-loop will be created:
4269
4270 vp0 = &base_addr;
4271 LOOP: vp1 = phi(vp0,vp2)
4272 ...
4273 inner: vp3 = phi(vp1,vp4)
4274 vp4 = vp3 + inner_step
4275 if () goto inner
4276 ...
4277 vp2 = vp1 + step
4278 if () goto LOOP */
4279
4280 /* (2) Calculate the initial address of the aggregate-pointer, and set
4281 the aggregate-pointer to point to it before the loop. */
4282
4283 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4284
4285 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4286 offset, loop, byte_offset);
4287 if (new_stmt_list)
4288 {
4289 if (pe)
4290 {
4291 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4292 gcc_assert (!new_bb);
4293 }
4294 else
4295 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4296 }
4297
4298 *initial_address = new_temp;
4299 aggr_ptr_init = new_temp;
4300
4301 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4302 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4303 inner-loop nested in LOOP (during outer-loop vectorization). */
4304
4305 /* No update in loop is required. */
4306 if (only_init && (!loop_vinfo || at_loop == loop))
4307 aptr = aggr_ptr_init;
4308 else
4309 {
4310 /* The step of the aggregate pointer is the type size. */
4311 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4312 /* One exception to the above is when the scalar step of the load in
4313 LOOP is zero. In this case the step here is also zero. */
4314 if (*inv_p)
4315 iv_step = size_zero_node;
4316 else if (tree_int_cst_sgn (step) == -1)
4317 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4318
4319 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4320
4321 create_iv (aggr_ptr_init,
4322 fold_convert (aggr_ptr_type, iv_step),
4323 aggr_ptr, loop, &incr_gsi, insert_after,
4324 &indx_before_incr, &indx_after_incr);
4325 incr = gsi_stmt (incr_gsi);
4326 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4327
4328 /* Copy the points-to information if it exists. */
4329 if (DR_PTR_INFO (dr))
4330 {
4331 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4332 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4333 }
4334 if (ptr_incr)
4335 *ptr_incr = incr;
4336
4337 aptr = indx_before_incr;
4338 }
4339
4340 if (!nested_in_vect_loop || only_init)
4341 return aptr;
4342
4343
4344 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4345 nested in LOOP, if exists. */
4346
4347 gcc_assert (nested_in_vect_loop);
4348 if (!only_init)
4349 {
4350 standard_iv_increment_position (containing_loop, &incr_gsi,
4351 &insert_after);
4352 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4353 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4354 &indx_after_incr);
4355 incr = gsi_stmt (incr_gsi);
4356 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4357
4358 /* Copy the points-to information if it exists. */
4359 if (DR_PTR_INFO (dr))
4360 {
4361 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4362 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4363 }
4364 if (ptr_incr)
4365 *ptr_incr = incr;
4366
4367 return indx_before_incr;
4368 }
4369 else
4370 gcc_unreachable ();
4371 }
4372
4373
4374 /* Function bump_vector_ptr
4375
4376 Increment a pointer (to a vector type) by vector-size. If requested,
4377 i.e. if PTR-INCR is given, then also connect the new increment stmt
4378 to the existing def-use update-chain of the pointer, by modifying
4379 the PTR_INCR as illustrated below:
4380
4381 The pointer def-use update-chain before this function:
4382 DATAREF_PTR = phi (p_0, p_2)
4383 ....
4384 PTR_INCR: p_2 = DATAREF_PTR + step
4385
4386 The pointer def-use update-chain after this function:
4387 DATAREF_PTR = phi (p_0, p_2)
4388 ....
4389 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4390 ....
4391 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4392
4393 Input:
4394 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4395 in the loop.
4396 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4397 the loop. The increment amount across iterations is expected
4398 to be vector_size.
4399 BSI - location where the new update stmt is to be placed.
4400 STMT - the original scalar memory-access stmt that is being vectorized.
4401 BUMP - optional. The offset by which to bump the pointer. If not given,
4402 the offset is assumed to be vector_size.
4403
4404 Output: Return NEW_DATAREF_PTR as illustrated above.
4405
4406 */
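
/* For example (illustrative): with VECTYPE V8HI (16 bytes) and no BUMP
   given, the statement generated below is

     NEW_DATAREF_PTR = DATAREF_PTR + 16;

   i.e. the pointer is advanced by TYPE_SIZE_UNIT (vectype).  */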
4407
4408 tree
4409 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4410 gimple *stmt, tree bump)
4411 {
4412 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4413 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4414 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4415 tree update = TYPE_SIZE_UNIT (vectype);
4416 gassign *incr_stmt;
4417 ssa_op_iter iter;
4418 use_operand_p use_p;
4419 tree new_dataref_ptr;
4420
4421 if (bump)
4422 update = bump;
4423
4424 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4425 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4426 else
4427 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4428 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4429 dataref_ptr, update);
4430 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4431
4432 /* Copy the points-to information if it exists. */
4433 if (DR_PTR_INFO (dr))
4434 {
4435 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4436 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4437 }
4438
4439 if (!ptr_incr)
4440 return new_dataref_ptr;
4441
4442 /* Update the vector-pointer's cross-iteration increment. */
4443 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4444 {
4445 tree use = USE_FROM_PTR (use_p);
4446
4447 if (use == dataref_ptr)
4448 SET_USE (use_p, new_dataref_ptr);
4449 else
4450 gcc_assert (tree_int_cst_compare (use, update) == 0);
4451 }
4452
4453 return new_dataref_ptr;
4454 }
4455
4456
4457 /* Function vect_create_destination_var.
4458
4459 Create a new temporary of type VECTYPE. */
4460
4461 tree
4462 vect_create_destination_var (tree scalar_dest, tree vectype)
4463 {
4464 tree vec_dest;
4465 const char *name;
4466 char *new_name;
4467 tree type;
4468 enum vect_var_kind kind;
4469
4470 kind = vectype ? vect_simple_var : vect_scalar_var;
4471 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4472
4473 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4474
4475 name = get_name (scalar_dest);
4476 if (name)
4477 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4478 else
4479 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4480 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4481 free (new_name);
4482
4483 return vec_dest;
4484 }
4485
4486 /* Function vect_grouped_store_supported.
4487
4488 Returns TRUE if interleave high and interleave low permutations
4489 are supported, and FALSE otherwise. */
4490
4491 bool
4492 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4493 {
4494 machine_mode mode = TYPE_MODE (vectype);
4495
4496 /* vect_permute_store_chain requires the group size to be equal to 3 or
4497 be a power of two. */
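
  /* exact_log2 returns -1 when COUNT is not a power of two, so the test
     below only rejects group sizes that are neither 3 nor a power of two
     (e.g. 5, 6 or 7).  */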
4498 if (count != 3 && exact_log2 (count) == -1)
4499 {
4500 if (dump_enabled_p ())
4501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4502 "the size of the group of accesses"
4503 " is not a power of 2 or not eqaul to 3\n");
4504 return false;
4505 }
4506
4507 /* Check that the permutation is supported. */
4508 if (VECTOR_MODE_P (mode))
4509 {
4510 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4511 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4512
4513 if (count == 3)
4514 {
4515 unsigned int j0 = 0, j1 = 0, j2 = 0;
4516 unsigned int i, j;
4517
4518 for (j = 0; j < 3; j++)
4519 {
4520 int nelt0 = ((3 - j) * nelt) % 3;
4521 int nelt1 = ((3 - j) * nelt + 1) % 3;
4522 int nelt2 = ((3 - j) * nelt + 2) % 3;
4523 for (i = 0; i < nelt; i++)
4524 {
4525 if (3 * i + nelt0 < nelt)
4526 sel[3 * i + nelt0] = j0++;
4527 if (3 * i + nelt1 < nelt)
4528 sel[3 * i + nelt1] = nelt + j1++;
4529 if (3 * i + nelt2 < nelt)
4530 sel[3 * i + nelt2] = 0;
4531 }
4532 if (!can_vec_perm_p (mode, false, sel))
4533 {
4534 if (dump_enabled_p ())
4535 dump_printf (MSG_MISSED_OPTIMIZATION,
4536 "permutaion op not supported by target.\n");
4537 return false;
4538 }
4539
4540 for (i = 0; i < nelt; i++)
4541 {
4542 if (3 * i + nelt0 < nelt)
4543 sel[3 * i + nelt0] = 3 * i + nelt0;
4544 if (3 * i + nelt1 < nelt)
4545 sel[3 * i + nelt1] = 3 * i + nelt1;
4546 if (3 * i + nelt2 < nelt)
4547 sel[3 * i + nelt2] = nelt + j2++;
4548 }
4549 if (!can_vec_perm_p (mode, false, sel))
4550 {
4551 if (dump_enabled_p ())
4552 dump_printf (MSG_MISSED_OPTIMIZATION,
4553 "permutaion op not supported by target.\n");
4554 return false;
4555 }
4556 }
4557 return true;
4558 }
4559 else
4560 {
4561 /* If length is not equal to 3 then only power of 2 is supported. */
4562 gcc_assert (exact_log2 (count) != -1);
4563
4564 for (i = 0; i < nelt / 2; i++)
4565 {
4566 sel[i * 2] = i;
4567 sel[i * 2 + 1] = i + nelt;
4568 }
4569 if (can_vec_perm_p (mode, false, sel))
4570 {
4571 for (i = 0; i < nelt; i++)
4572 sel[i] += nelt / 2;
4573 if (can_vec_perm_p (mode, false, sel))
4574 return true;
4575 }
4576 }
4577 }
4578
4579 if (dump_enabled_p ())
4580 dump_printf (MSG_MISSED_OPTIMIZATION,
4581 "permutaion op not supported by target.\n");
4582 return false;
4583 }
4584
4585
4586 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4587 type VECTYPE. */
4588
4589 bool
4590 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4591 {
4592 return vect_lanes_optab_supported_p ("vec_store_lanes",
4593 vec_store_lanes_optab,
4594 vectype, count);
4595 }
4596
4597
4598 /* Function vect_permute_store_chain.
4599
4600 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4601 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4602 the data correctly for the stores. Return the final references for stores
4603 in RESULT_CHAIN.
4604
4605 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4606 The input is 4 vectors each containing 8 elements. We assign a number to
4607 each element, the input sequence is:
4608
4609 1st vec: 0 1 2 3 4 5 6 7
4610 2nd vec: 8 9 10 11 12 13 14 15
4611 3rd vec: 16 17 18 19 20 21 22 23
4612 4th vec: 24 25 26 27 28 29 30 31
4613
4614 The output sequence should be:
4615
4616 1st vec: 0 8 16 24 1 9 17 25
4617 2nd vec: 2 10 18 26 3 11 19 27
4618 3rd vec: 4 12 20 28 5 13 21 29
4619 4th vec: 6 14 22 30 7 15 23 31
4620
4621 i.e., we interleave the contents of the four vectors in their order.
4622
4623 We use interleave_high/low instructions to create such output. The input of
4624 each interleave_high/low operation is two vectors:
4625 1st vec 2nd vec
4626 0 1 2 3 4 5 6 7
4627 the even elements of the result vector are obtained left-to-right from the
4628 high/low elements of the first vector. The odd elements of the result are
4629 obtained left-to-right from the high/low elements of the second vector.
4630 The output of interleave_high will be: 0 4 1 5
4631 and of interleave_low: 2 6 3 7
4632
4633
4634 The permutation is done in log LENGTH stages. In each stage interleave_high
4635 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4636 where the first argument is taken from the first half of DR_CHAIN and the
4637 second argument from its second half.
4638 In our example,
4639
4640 I1: interleave_high (1st vec, 3rd vec)
4641 I2: interleave_low (1st vec, 3rd vec)
4642 I3: interleave_high (2nd vec, 4th vec)
4643 I4: interleave_low (2nd vec, 4th vec)
4644
4645 The output for the first stage is:
4646
4647 I1: 0 16 1 17 2 18 3 19
4648 I2: 4 20 5 21 6 22 7 23
4649 I3: 8 24 9 25 10 26 11 27
4650 I4: 12 28 13 29 14 30 15 31
4651
4652 The output of the second stage, i.e. the final result is:
4653
4654 I1: 0 8 16 24 1 9 17 25
4655 I2: 2 10 18 26 3 11 19 27
4656 I3: 4 12 20 28 5 13 21 29
4657 I4: 6 14 22 30 7 15 23 31. */
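
/* An illustrative model (not part of GCC) of what a single VEC_PERM_EXPR
   used by the interleaving scheme above computes: element I of the result
   is A[SEL[I]] when SEL[I] < NELT and B[SEL[I] - NELT] otherwise.  With
   NELT == 4, A == {0,1,2,3}, B == {4,5,6,7} and the interleave-high mask
   {0, 4, 1, 5} built in the function below, OUT becomes {0,4,1,5}; the
   interleave-low mask {2, 6, 3, 7} yields {2,6,3,7}, matching the example
   in the comment above.  */

static void
model_vec_perm (const int *a, const int *b, const unsigned char *sel,
                unsigned int nelt, int *out)
{
  /* Select from the virtual concatenation A | B according to SEL.  */
  for (unsigned int i = 0; i < nelt; i++)
    out[i] = sel[i] < nelt ? a[sel[i]] : b[sel[i] - nelt];
}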
4658
4659 void
4660 vect_permute_store_chain (vec<tree> dr_chain,
4661 unsigned int length,
4662 gimple *stmt,
4663 gimple_stmt_iterator *gsi,
4664 vec<tree> *result_chain)
4665 {
4666 tree vect1, vect2, high, low;
4667 gimple *perm_stmt;
4668 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4669 tree perm_mask_low, perm_mask_high;
4670 tree data_ref;
4671 tree perm3_mask_low, perm3_mask_high;
4672 unsigned int i, n, log_length = exact_log2 (length);
4673 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4674 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4675
4676 result_chain->quick_grow (length);
4677 memcpy (result_chain->address (), dr_chain.address (),
4678 length * sizeof (tree));
4679
4680 if (length == 3)
4681 {
4682 unsigned int j0 = 0, j1 = 0, j2 = 0;
4683
4684 for (j = 0; j < 3; j++)
4685 {
4686 int nelt0 = ((3 - j) * nelt) % 3;
4687 int nelt1 = ((3 - j) * nelt + 1) % 3;
4688 int nelt2 = ((3 - j) * nelt + 2) % 3;
4689
4690 for (i = 0; i < nelt; i++)
4691 {
4692 if (3 * i + nelt0 < nelt)
4693 sel[3 * i + nelt0] = j0++;
4694 if (3 * i + nelt1 < nelt)
4695 sel[3 * i + nelt1] = nelt + j1++;
4696 if (3 * i + nelt2 < nelt)
4697 sel[3 * i + nelt2] = 0;
4698 }
4699 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4700
4701 for (i = 0; i < nelt; i++)
4702 {
4703 if (3 * i + nelt0 < nelt)
4704 sel[3 * i + nelt0] = 3 * i + nelt0;
4705 if (3 * i + nelt1 < nelt)
4706 sel[3 * i + nelt1] = 3 * i + nelt1;
4707 if (3 * i + nelt2 < nelt)
4708 sel[3 * i + nelt2] = nelt + j2++;
4709 }
4710 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4711
4712 vect1 = dr_chain[0];
4713 vect2 = dr_chain[1];
4714
4715 /* Create interleaving stmt:
4716 low = VEC_PERM_EXPR <vect1, vect2,
4717 {j, nelt, *, j + 1, nelt + j + 1, *,
4718 j + 2, nelt + j + 2, *, ...}> */
4719 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4720 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4721 vect2, perm3_mask_low);
4722 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4723
4724 vect1 = data_ref;
4725 vect2 = dr_chain[2];
4726 /* Create interleaving stmt:
4727 low = VEC_PERM_EXPR <vect1, vect2,
4728 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4729 6, 7, nelt + j + 2, ...}> */
4730 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4731 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4732 vect2, perm3_mask_high);
4733 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4734 (*result_chain)[j] = data_ref;
4735 }
4736 }
4737 else
4738 {
4739 /* If length is not equal to 3 then only power of 2 is supported. */
4740 gcc_assert (exact_log2 (length) != -1);
4741
4742 for (i = 0, n = nelt / 2; i < n; i++)
4743 {
4744 sel[i * 2] = i;
4745 sel[i * 2 + 1] = i + nelt;
4746 }
4747 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4748
4749 for (i = 0; i < nelt; i++)
4750 sel[i] += nelt / 2;
4751 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4752
4753 for (i = 0, n = log_length; i < n; i++)
4754 {
4755 for (j = 0; j < length/2; j++)
4756 {
4757 vect1 = dr_chain[j];
4758 vect2 = dr_chain[j+length/2];
4759
4760 /* Create interleaving stmt:
4761 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4762 ...}> */
4763 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4764 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4765 vect2, perm_mask_high);
4766 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4767 (*result_chain)[2*j] = high;
4768
4769 /* Create interleaving stmt:
4770 low = VEC_PERM_EXPR <vect1, vect2,
4771 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4772 ...}> */
4773 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4774 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4775 vect2, perm_mask_low);
4776 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4777 (*result_chain)[2*j+1] = low;
4778 }
4779 memcpy (dr_chain.address (), result_chain->address (),
4780 length * sizeof (tree));
4781 }
4782 }
4783 }
4784
4785 /* Function vect_setup_realignment
4786
4787 This function is called when vectorizing an unaligned load using
4788 the dr_explicit_realign[_optimized] scheme.
4789 This function generates the following code at the loop prolog:
4790
4791 p = initial_addr;
4792 x msq_init = *(floor(p)); # prolog load
4793 realignment_token = call target_builtin;
4794 loop:
4795 x msq = phi (msq_init, ---)
4796
4797 The stmts marked with x are generated only for the case of
4798 dr_explicit_realign_optimized.
4799
4800 The code above sets up a new (vector) pointer, pointing to the first
4801 location accessed by STMT, and a "floor-aligned" load using that pointer.
4802 It also generates code to compute the "realignment-token" (if the relevant
4803 target hook was defined), and creates a phi-node at the loop-header bb
4804 whose arguments are the result of the prolog-load (created by this
4805 function) and the result of a load that takes place in the loop (to be
4806 created by the caller to this function).
4807
4808 For the case of dr_explicit_realign_optimized:
4809 The caller to this function uses the phi-result (msq) to create the
4810 realignment code inside the loop, and sets up the missing phi argument,
4811 as follows:
4812 loop:
4813 msq = phi (msq_init, lsq)
4814 lsq = *(floor(p')); # load in loop
4815 result = realign_load (msq, lsq, realignment_token);
4816
4817 For the case of dr_explicit_realign:
4818 loop:
4819 msq = *(floor(p)); # load in loop
4820 p' = p + (VS-1);
4821 lsq = *(floor(p')); # load in loop
4822 result = realign_load (msq, lsq, realignment_token);
4823
4824 Input:
4825 STMT - (scalar) load stmt to be vectorized. This load accesses
4826 a memory location that may be unaligned.
4827 BSI - place where new code is to be inserted.
4828 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4829 is used.
4830
4831 Output:
4832 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4833 target hook, if defined.
4834 Return value - the result of the loop-header phi node. */
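
/* An illustrative sketch (not part of GCC) of the "floor (p)" notation
   used above: the address is rounded down to the vector alignment, which
   the code below materializes as a BIT_AND_EXPR with the negated
   TYPE_ALIGN_UNIT of the vector type.  For a 16-byte vector type:  */

static inline unsigned long
model_floor_address (unsigned long p)
{
  return p & ~(unsigned long) 15;	/* Same as p & -16.  */
}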
4835
4836 tree
4837 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4838 tree *realignment_token,
4839 enum dr_alignment_support alignment_support_scheme,
4840 tree init_addr,
4841 struct loop **at_loop)
4842 {
4843 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4844 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4845 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4846 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4847 struct loop *loop = NULL;
4848 edge pe = NULL;
4849 tree scalar_dest = gimple_assign_lhs (stmt);
4850 tree vec_dest;
4851 gimple *inc;
4852 tree ptr;
4853 tree data_ref;
4854 basic_block new_bb;
4855 tree msq_init = NULL_TREE;
4856 tree new_temp;
4857 gphi *phi_stmt;
4858 tree msq = NULL_TREE;
4859 gimple_seq stmts = NULL;
4860 bool inv_p;
4861 bool compute_in_loop = false;
4862 bool nested_in_vect_loop = false;
4863 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4864 struct loop *loop_for_initial_load = NULL;
4865
4866 if (loop_vinfo)
4867 {
4868 loop = LOOP_VINFO_LOOP (loop_vinfo);
4869 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4870 }
4871
4872 gcc_assert (alignment_support_scheme == dr_explicit_realign
4873 || alignment_support_scheme == dr_explicit_realign_optimized);
4874
4875 /* We need to generate three things:
4876 1. the misalignment computation
4877 2. the extra vector load (for the optimized realignment scheme).
4878 3. the phi node for the two vectors from which the realignment is
4879 done (for the optimized realignment scheme). */
4880
4881 /* 1. Determine where to generate the misalignment computation.
4882
4883 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4884 calculation will be generated by this function, outside the loop (in the
4885 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4886 caller, inside the loop.
4887
4888 Background: If the misalignment remains fixed throughout the iterations of
4889 the loop, then both realignment schemes are applicable, and also the
4890 misalignment computation can be done outside LOOP. This is because we are
4891 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4892 are a multiple of VS (the Vector Size), and therefore the misalignment in
4893 different vectorized LOOP iterations is always the same.
4894 The problem arises only if the memory access is in an inner-loop nested
4895 inside LOOP, which is now being vectorized using outer-loop vectorization.
4896 This is the only case when the misalignment of the memory access may not
4897 remain fixed throughout the iterations of the inner-loop (as explained in
4898 detail in vect_supportable_dr_alignment). In this case, not only is the
4899 optimized realignment scheme not applicable, but also the misalignment
4900 computation (and generation of the realignment token that is passed to
4901 REALIGN_LOAD) have to be done inside the loop.
4902
4903 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4904 or not, which in turn determines if the misalignment is computed inside
4905 the inner-loop, or outside LOOP. */
4906
4907 if (init_addr != NULL_TREE || !loop_vinfo)
4908 {
4909 compute_in_loop = true;
4910 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4911 }
4912
4913
4914 /* 2. Determine where to generate the extra vector load.
4915
4916 For the optimized realignment scheme, instead of generating two vector
4917 loads in each iteration, we generate a single extra vector load in the
4918 preheader of the loop, and in each iteration reuse the result of the
4919 vector load from the previous iteration. In case the memory access is in
4920 an inner-loop nested inside LOOP, which is now being vectorized using
4921 outer-loop vectorization, we need to determine whether this initial vector
4922 load should be generated at the preheader of the inner-loop, or can be
4923 generated at the preheader of LOOP. If the memory access has no evolution
4924 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4925 to be generated inside LOOP (in the preheader of the inner-loop). */
4926
4927 if (nested_in_vect_loop)
4928 {
4929 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4930 bool invariant_in_outerloop =
4931 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4932 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4933 }
4934 else
4935 loop_for_initial_load = loop;
4936 if (at_loop)
4937 *at_loop = loop_for_initial_load;
4938
4939 if (loop_for_initial_load)
4940 pe = loop_preheader_edge (loop_for_initial_load);
4941
4942 /* 3. For the case of the optimized realignment, create the first vector
4943 load at the loop preheader. */
4944
4945 if (alignment_support_scheme == dr_explicit_realign_optimized)
4946 {
4947 /* Create msq_init = *(floor(p1)) in the loop preheader */
4948 gassign *new_stmt;
4949
4950 gcc_assert (!compute_in_loop);
4951 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4952 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4953 NULL_TREE, &init_addr, NULL, &inc,
4954 true, &inv_p);
4955 if (TREE_CODE (ptr) == SSA_NAME)
4956 new_temp = copy_ssa_name (ptr);
4957 else
4958 new_temp = make_ssa_name (TREE_TYPE (ptr));
4959 new_stmt = gimple_build_assign
4960 (new_temp, BIT_AND_EXPR, ptr,
4961 build_int_cst (TREE_TYPE (ptr),
4962 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4963 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4964 gcc_assert (!new_bb);
4965 data_ref
4966 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4967 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4968 new_stmt = gimple_build_assign (vec_dest, data_ref);
4969 new_temp = make_ssa_name (vec_dest, new_stmt);
4970 gimple_assign_set_lhs (new_stmt, new_temp);
4971 if (pe)
4972 {
4973 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4974 gcc_assert (!new_bb);
4975 }
4976 else
4977 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4978
4979 msq_init = gimple_assign_lhs (new_stmt);
4980 }
4981
4982 /* 4. Create realignment token using a target builtin, if available.
4983 It is done either inside the containing loop, or before LOOP (as
4984 determined above). */
4985
4986 if (targetm.vectorize.builtin_mask_for_load)
4987 {
4988 gcall *new_stmt;
4989 tree builtin_decl;
4990
4991 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4992 if (!init_addr)
4993 {
4994 /* Generate the INIT_ADDR computation outside LOOP. */
4995 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4996 NULL_TREE, loop);
4997 if (loop)
4998 {
4999 pe = loop_preheader_edge (loop);
5000 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5001 gcc_assert (!new_bb);
5002 }
5003 else
5004 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5005 }
5006
5007 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5008 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5009 vec_dest =
5010 vect_create_destination_var (scalar_dest,
5011 gimple_call_return_type (new_stmt));
5012 new_temp = make_ssa_name (vec_dest, new_stmt);
5013 gimple_call_set_lhs (new_stmt, new_temp);
5014
5015 if (compute_in_loop)
5016 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5017 else
5018 {
5019 /* Generate the misalignment computation outside LOOP. */
5020 pe = loop_preheader_edge (loop);
5021 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5022 gcc_assert (!new_bb);
5023 }
5024
5025 *realignment_token = gimple_call_lhs (new_stmt);
5026
5027 /* The result of the CALL_EXPR to this builtin is determined from
5028 the value of the parameter and no global variables are touched
5029 which makes the builtin a "const" function. Requiring the
5030 builtin to have the "const" attribute makes it unnecessary
5031 to call mark_call_clobbered. */
5032 gcc_assert (TREE_READONLY (builtin_decl));
5033 }
5034
5035 if (alignment_support_scheme == dr_explicit_realign)
5036 return msq;
5037
5038 gcc_assert (!compute_in_loop);
5039 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5040
5041
5042 /* 5. Create msq = phi <msq_init, lsq> in loop */
5043
5044 pe = loop_preheader_edge (containing_loop);
5045 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5046 msq = make_ssa_name (vec_dest);
5047 phi_stmt = create_phi_node (msq, containing_loop->header);
5048 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5049
5050 return msq;
5051 }
5052
5053
5054 /* Function vect_grouped_load_supported.
5055
5056 Returns TRUE if even and odd permutations are supported,
5057 and FALSE otherwise. */
5058
5059 bool
5060 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5061 {
5062 machine_mode mode = TYPE_MODE (vectype);
5063
5064 /* vect_permute_load_chain requires the group size to be equal to 3 or
5065 be a power of two. */
5066 if (count != 3 && exact_log2 (count) == -1)
5067 {
5068 if (dump_enabled_p ())
5069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5070 "the size of the group of accesses"
5071 " is not a power of 2 or not equal to 3\n");
5072 return false;
5073 }
5074
5075 /* Check that the permutation is supported. */
5076 if (VECTOR_MODE_P (mode))
5077 {
5078 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5079 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5080
5081 if (count == 3)
5082 {
5083 unsigned int k;
5084 for (k = 0; k < 3; k++)
5085 {
5086 for (i = 0; i < nelt; i++)
5087 if (3 * i + k < 2 * nelt)
5088 sel[i] = 3 * i + k;
5089 else
5090 sel[i] = 0;
5091 if (!can_vec_perm_p (mode, false, sel))
5092 {
5093 if (dump_enabled_p ())
5094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5095 "shuffle of 3 loads is not supported by"
5096 " target\n");
5097 return false;
5098 }
5099 for (i = 0, j = 0; i < nelt; i++)
5100 if (3 * i + k < 2 * nelt)
5101 sel[i] = i;
5102 else
5103 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5104 if (!can_vec_perm_p (mode, false, sel))
5105 {
5106 if (dump_enabled_p ())
5107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5108 "shuffle of 3 loads is not supported by"
5109 " target\n");
5110 return false;
5111 }
5112 }
5113 return true;
5114 }
5115 else
5116 {
5117 /* If length is not equal to 3 then only power of 2 is supported. */
5118 gcc_assert (exact_log2 (count) != -1);
5119 for (i = 0; i < nelt; i++)
5120 sel[i] = i * 2;
5121 if (can_vec_perm_p (mode, false, sel))
5122 {
5123 for (i = 0; i < nelt; i++)
5124 sel[i] = i * 2 + 1;
5125 if (can_vec_perm_p (mode, false, sel))
5126 return true;
5127 }
5128 }
5129 }
5130
5131 if (dump_enabled_p ())
5132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5133 "extract even/odd not supported by target\n");
5134 return false;
5135 }
5136
5137 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5138 type VECTYPE. */
5139
5140 bool
5141 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5142 {
5143 return vect_lanes_optab_supported_p ("vec_load_lanes",
5144 vec_load_lanes_optab,
5145 vectype, count);
5146 }
5147
5148 /* Function vect_permute_load_chain.
5149
5150 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5151 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5152 the input data correctly. Return the final references for loads in
5153 RESULT_CHAIN.
5154
5155 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5156 The input is 4 vectors each containing 8 elements. We assign a number to each
5157 element, the input sequence is:
5158
5159 1st vec: 0 1 2 3 4 5 6 7
5160 2nd vec: 8 9 10 11 12 13 14 15
5161 3rd vec: 16 17 18 19 20 21 22 23
5162 4th vec: 24 25 26 27 28 29 30 31
5163
5164 The output sequence should be:
5165
5166 1st vec: 0 4 8 12 16 20 24 28
5167 2nd vec: 1 5 9 13 17 21 25 29
5168 3rd vec: 2 6 10 14 18 22 26 30
5169 4th vec: 3 7 11 15 19 23 27 31
5170
5171 i.e., the first output vector should contain the first elements of each
5172 interleaving group, etc.
5173
5174 We use extract_even/odd instructions to create such output. The input of
5175 each extract_even/odd operation is two vectors
5176 1st vec 2nd vec
5177 0 1 2 3 4 5 6 7
5178
5179 and the output is the vector of extracted even/odd elements. The output of
5180 extract_even will be: 0 2 4 6
5181 and of extract_odd: 1 3 5 7
5182
5183
5184 The permutation is done in log LENGTH stages. In each stage extract_even
5185 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5186 their order. In our example,
5187
5188 E1: extract_even (1st vec, 2nd vec)
5189 E2: extract_odd (1st vec, 2nd vec)
5190 E3: extract_even (3rd vec, 4th vec)
5191 E4: extract_odd (3rd vec, 4th vec)
5192
5193 The output for the first stage will be:
5194
5195 E1: 0 2 4 6 8 10 12 14
5196 E2: 1 3 5 7 9 11 13 15
5197 E3: 16 18 20 22 24 26 28 30
5198 E4: 17 19 21 23 25 27 29 31
5199
5200 In order to proceed and create the correct sequence for the next stage (or
5201 for the correct output, if the second stage is the last one, as in our
5202 example), we first put the output of extract_even operation and then the
5203 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5204 The input for the second stage is:
5205
5206 1st vec (E1): 0 2 4 6 8 10 12 14
5207 2nd vec (E3): 16 18 20 22 24 26 28 30
5208 3rd vec (E2): 1 3 5 7 9 11 13 15
5209 4th vec (E4): 17 19 21 23 25 27 29 31
5210
5211 The output of the second stage:
5212
5213 E1: 0 4 8 12 16 20 24 28
5214 E2: 2 6 10 14 18 22 26 30
5215 E3: 1 5 9 13 17 21 25 29
5216 E4: 3 7 11 15 19 23 27 31
5217
5218 And RESULT_CHAIN after reordering:
5219
5220 1st vec (E1): 0 4 8 12 16 20 24 28
5221 2nd vec (E3): 1 5 9 13 17 21 25 29
5222 3rd vec (E2): 2 6 10 14 18 22 26 30
5223 4th vec (E4): 3 7 11 15 19 23 27 31. */
5224
5225 static void
5226 vect_permute_load_chain (vec<tree> dr_chain,
5227 unsigned int length,
5228 gimple *stmt,
5229 gimple_stmt_iterator *gsi,
5230 vec<tree> *result_chain)
5231 {
5232 tree data_ref, first_vect, second_vect;
5233 tree perm_mask_even, perm_mask_odd;
5234 tree perm3_mask_low, perm3_mask_high;
5235 gimple *perm_stmt;
5236 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5237 unsigned int i, j, log_length = exact_log2 (length);
5238 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5239 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5240
5241 result_chain->quick_grow (length);
5242 memcpy (result_chain->address (), dr_chain.address (),
5243 length * sizeof (tree));
5244
5245 if (length == 3)
5246 {
5247 unsigned int k;
5248
5249 for (k = 0; k < 3; k++)
5250 {
5251 for (i = 0; i < nelt; i++)
5252 if (3 * i + k < 2 * nelt)
5253 sel[i] = 3 * i + k;
5254 else
5255 sel[i] = 0;
5256 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5257
5258 for (i = 0, j = 0; i < nelt; i++)
5259 if (3 * i + k < 2 * nelt)
5260 sel[i] = i;
5261 else
5262 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5263
5264 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5265
5266 first_vect = dr_chain[0];
5267 second_vect = dr_chain[1];
5268
5269 /* Create interleaving stmt (low part of):
5270 	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5271 ...}> */
5272 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5273 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5274 second_vect, perm3_mask_low);
5275 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5276
5277 	  /* Create interleaving stmt (high part of):
5278 	     high = VEC_PERM_EXPR <low, third_vect, {0, 1, ..., nelt + ...}>,
5279 	     i.e. keep LOW's gathered elements and fill the rest from DR_CHAIN[2]. */
5280 first_vect = data_ref;
5281 second_vect = dr_chain[2];
5282 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5283 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5284 second_vect, perm3_mask_high);
5285 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5286 (*result_chain)[k] = data_ref;
5287 }
5288 }
5289 else
5290 {
5291       /* If length is not equal to 3, then only a power of 2 is supported. */
5292 gcc_assert (exact_log2 (length) != -1);
5293
5294 for (i = 0; i < nelt; ++i)
5295 sel[i] = i * 2;
5296 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5297
5298 for (i = 0; i < nelt; ++i)
5299 sel[i] = i * 2 + 1;
5300 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5301
5302 for (i = 0; i < log_length; i++)
5303 {
5304 for (j = 0; j < length; j += 2)
5305 {
5306 first_vect = dr_chain[j];
5307 second_vect = dr_chain[j+1];
5308
5309 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5310 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5311 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5312 first_vect, second_vect,
5313 perm_mask_even);
5314 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5315 (*result_chain)[j/2] = data_ref;
5316
5317 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5318 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5319 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5320 first_vect, second_vect,
5321 perm_mask_odd);
5322 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5323 (*result_chain)[j/2+length/2] = data_ref;
5324 }
5325 memcpy (dr_chain.address (), result_chain->address (),
5326 length * sizeof (tree));
5327 }
5328 }
5329 }
5330
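/* Editorial illustration, not part of the vectorizer: a standalone sketch,
   kept under "#if 0", that replays the extract-even/odd permutation above on
   plain arrays for LENGTH == 4 and NELT == 8 and reproduces the worked
   example from the comment (0 4 8 12 ..., 1 5 9 13 ..., and so on).  The
   vec_perm helper is an assumed scalar model of VEC_PERM_EXPR; all names are
   invented for this illustration, which can be compiled on its own.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8
#define LENGTH 4

/* Scalar model of VEC_PERM_EXPR: indices < NELT select from V0,
   indices >= NELT select from V1.  */
static void
vec_perm (const int *v0, const int *v1, const unsigned char *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? v0[sel[i]] : v1[sel[i] - NELT];
}

int
main (void)
{
  int dr_chain[LENGTH][NELT], result[LENGTH][NELT];
  unsigned char even[NELT], odd[NELT];

  /* Number the input elements 0..31 as in the comment above.  */
  for (unsigned j = 0; j < LENGTH; j++)
    for (unsigned i = 0; i < NELT; i++)
      dr_chain[j][i] = j * NELT + i;

  for (unsigned i = 0; i < NELT; i++)
    {
      even[i] = 2 * i;
      odd[i] = 2 * i + 1;
    }

  /* log2 (LENGTH) == 2 stages of extract-even / extract-odd.  */
  for (unsigned stage = 0; stage < 2; stage++)
    {
      for (unsigned j = 0; j < LENGTH; j += 2)
	{
	  vec_perm (dr_chain[j], dr_chain[j + 1], even, result[j / 2]);
	  vec_perm (dr_chain[j], dr_chain[j + 1], odd,
		    result[j / 2 + LENGTH / 2]);
	}
      memcpy (dr_chain, result, sizeof (dr_chain));
    }

  /* Expect 0 4 8 ... / 1 5 9 ... / 2 6 10 ... / 3 7 11 ...  */
  for (unsigned j = 0; j < LENGTH; j++)
    {
      for (unsigned i = 0; i < NELT; i++)
	printf ("%2d ", result[j][i]);
      printf ("\n");
    }
  return 0;
}
#endif
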
5331 /* Function vect_shift_permute_load_chain.
5332
5333 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5334    a sequence of stmts to reorder the input data accordingly.
5335    Return the final references for loads in RESULT_CHAIN.
5336    Return true if successful, false otherwise.
5337
5338 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5339 The input is 3 vectors each containing 8 elements. We assign a
5340    number to each element; the input sequence is:
5341
5342 1st vec: 0 1 2 3 4 5 6 7
5343 2nd vec: 8 9 10 11 12 13 14 15
5344 3rd vec: 16 17 18 19 20 21 22 23
5345
5346 The output sequence should be:
5347
5348 1st vec: 0 3 6 9 12 15 18 21
5349 2nd vec: 1 4 7 10 13 16 19 22
5350 3rd vec: 2 5 8 11 14 17 20 23
5351
5352 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5353
5354    First we shuffle all 3 vectors to get the correct element order:
5355
5356 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5357 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5358 3rd vec: (16 19 22) (17 20 23) (18 21)
5359
5360    Next we concatenate and shift the vectors 3 times:
5361
5362 1st step:
5363 shift right by 6 the concatenation of:
5364 "1st vec" and "2nd vec"
5365 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5366 "2nd vec" and "3rd vec"
5367 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5368 "3rd vec" and "1st vec"
5369 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5370 | New vectors |
5371
5372    So the new vectors are now:
5373
5374 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5375 2nd vec: (10 13) (16 19 22) (17 20 23)
5376 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5377
5378 2nd step:
5379 shift right by 5 the concatenation of:
5380 "1st vec" and "3rd vec"
5381 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5382 "2nd vec" and "1st vec"
5383 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5384 "3rd vec" and "2nd vec"
5385 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5386 | New vectors |
5387
5388    So the new vectors are now:
5389
5390 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5391 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5392 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5393
5394 3rd step:
5395 shift right by 5 the concatenation of:
5396 "1st vec" and "1st vec"
5397 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5398 shift right by 3 the concatenation of:
5399 "2nd vec" and "2nd vec"
5400 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5401 | New vectors |
5402
5403    So now all vectors are READY:
5404 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5405 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5406 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5407
5408    This algorithm is faster than the one in vect_permute_load_chain if:
5409    1. "shift of a concatenation" is faster than general permutation.
5410       This is usually so.
5411    2. The TARGET machine can't execute vector instructions in parallel.
5412       This is because each step of the algorithm depends on the previous one.
5413       The algorithm in vect_permute_load_chain is much more parallel.
5414
5415 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5416 */
5417
5418 static bool
5419 vect_shift_permute_load_chain (vec<tree> dr_chain,
5420 unsigned int length,
5421 gimple *stmt,
5422 gimple_stmt_iterator *gsi,
5423 vec<tree> *result_chain)
5424 {
5425 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5426 tree perm2_mask1, perm2_mask2, perm3_mask;
5427 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5428 gimple *perm_stmt;
5429
5430 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5431 unsigned int i;
5432 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5433 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5434 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5435 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5436
5437 result_chain->quick_grow (length);
5438 memcpy (result_chain->address (), dr_chain.address (),
5439 length * sizeof (tree));
5440
5441 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5442 {
5443 unsigned int j, log_length = exact_log2 (length);
5444 for (i = 0; i < nelt / 2; ++i)
5445 sel[i] = i * 2;
5446 for (i = 0; i < nelt / 2; ++i)
5447 sel[nelt / 2 + i] = i * 2 + 1;
5448 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5449 {
5450 if (dump_enabled_p ())
5451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5452 			     "shuffle of 2 fields structure is not "
5453 			     "supported by target\n");
5454 return false;
5455 }
5456 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5457
5458 for (i = 0; i < nelt / 2; ++i)
5459 sel[i] = i * 2 + 1;
5460 for (i = 0; i < nelt / 2; ++i)
5461 sel[nelt / 2 + i] = i * 2;
5462 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5463 {
5464 if (dump_enabled_p ())
5465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5466 			     "shuffle of 2 fields structure is not "
5467 			     "supported by target\n");
5468 return false;
5469 }
5470 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5471
5472 /* Generating permutation constant to shift all elements.
5473 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5474 for (i = 0; i < nelt; i++)
5475 sel[i] = nelt / 2 + i;
5476 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5477 {
5478 if (dump_enabled_p ())
5479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5480 "shift permutation is not supported by target\n");
5481 return false;
5482 }
5483 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5484
5485 /* Generating permutation constant to select vector from 2.
5486 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5487 for (i = 0; i < nelt / 2; i++)
5488 sel[i] = i;
5489 for (i = nelt / 2; i < nelt; i++)
5490 sel[i] = nelt + i;
5491 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5492 {
5493 if (dump_enabled_p ())
5494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5495 "select is not supported by target\n");
5496 return false;
5497 }
5498 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5499
5500 for (i = 0; i < log_length; i++)
5501 {
5502 for (j = 0; j < length; j += 2)
5503 {
5504 first_vect = dr_chain[j];
5505 second_vect = dr_chain[j + 1];
5506
5507 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5508 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5509 first_vect, first_vect,
5510 perm2_mask1);
5511 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5512 vect[0] = data_ref;
5513
5514 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5515 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5516 second_vect, second_vect,
5517 perm2_mask2);
5518 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5519 vect[1] = data_ref;
5520
5521 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5522 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5523 vect[0], vect[1], shift1_mask);
5524 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5525 (*result_chain)[j/2 + length/2] = data_ref;
5526
5527 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5528 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5529 vect[0], vect[1], select_mask);
5530 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5531 (*result_chain)[j/2] = data_ref;
5532 }
5533 memcpy (dr_chain.address (), result_chain->address (),
5534 length * sizeof (tree));
5535 }
5536 return true;
5537 }
5538 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5539 {
5540 unsigned int k = 0, l = 0;
5541
5542       /* Generating permutation constant to get all elements in the right order.
5543 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5544 for (i = 0; i < nelt; i++)
5545 {
5546 if (3 * k + (l % 3) >= nelt)
5547 {
5548 k = 0;
5549 l += (3 - (nelt % 3));
5550 }
5551 sel[i] = 3 * k + (l % 3);
5552 k++;
5553 }
5554 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5555 {
5556 if (dump_enabled_p ())
5557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5558 			     "shuffle of 3 fields structure is not "
5559 			     "supported by target\n");
5560 return false;
5561 }
5562 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5563
5564 /* Generating permutation constant to shift all elements.
5565 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5566 for (i = 0; i < nelt; i++)
5567 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5568 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5569 {
5570 if (dump_enabled_p ())
5571 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5572 "shift permutation is not supported by target\n");
5573 return false;
5574 }
5575 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5576
5577 /* Generating permutation constant to shift all elements.
5578 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5579 for (i = 0; i < nelt; i++)
5580 sel[i] = 2 * (nelt / 3) + 1 + i;
5581 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5582 {
5583 if (dump_enabled_p ())
5584 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5585 "shift permutation is not supported by target\n");
5586 return false;
5587 }
5588 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5589
5590 /* Generating permutation constant to shift all elements.
5591 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5592 for (i = 0; i < nelt; i++)
5593 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5594 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5595 {
5596 if (dump_enabled_p ())
5597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5598 "shift permutation is not supported by target\n");
5599 return false;
5600 }
5601 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5602
5603 /* Generating permutation constant to shift all elements.
5604 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5605 for (i = 0; i < nelt; i++)
5606 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5607 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5608 {
5609 if (dump_enabled_p ())
5610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5611 "shift permutation is not supported by target\n");
5612 return false;
5613 }
5614 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5615
5616 for (k = 0; k < 3; k++)
5617 {
5618 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5619 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5620 dr_chain[k], dr_chain[k],
5621 perm3_mask);
5622 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5623 vect[k] = data_ref;
5624 }
5625
5626 for (k = 0; k < 3; k++)
5627 {
5628 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5629 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5630 vect[k % 3], vect[(k + 1) % 3],
5631 shift1_mask);
5632 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5633 vect_shift[k] = data_ref;
5634 }
5635
5636 for (k = 0; k < 3; k++)
5637 {
5638 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5639 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5640 vect_shift[(4 - k) % 3],
5641 vect_shift[(3 - k) % 3],
5642 shift2_mask);
5643 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5644 vect[k] = data_ref;
5645 }
5646
5647 (*result_chain)[3 - (nelt % 3)] = vect[2];
5648
5649 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5650 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5651 vect[0], shift3_mask);
5652 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5653 (*result_chain)[nelt % 3] = data_ref;
5654
5655 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5656 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5657 vect[1], shift4_mask);
5658 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5659 (*result_chain)[0] = data_ref;
5660 return true;
5661 }
5662 return false;
5663 }
5664
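/* Editorial illustration, not part of the vectorizer: a standalone sketch,
   kept under "#if 0", that replays the LENGTH == 3 path of
   vect_shift_permute_load_chain on plain arrays with NELT == 8.  It builds
   the shuffle mask {0 3 6 1 4 7 2 5} and the four shift masks with the same
   formulas as above and reproduces the documented output (0 3 6 9 ...,
   1 4 7 10 ..., 2 5 8 11 ...).  The helper names are invented here.  */
#if 0
#include <stdio.h>

#define NELT 8

/* Scalar model of VEC_PERM_EXPR on two NELT-element operands.  */
static void
vec_perm (const int *v0, const int *v1, const unsigned char *sel, int *out)
{
  for (unsigned i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? v0[sel[i]] : v1[sel[i] - NELT];
}

int
main (void)
{
  int dr[3][NELT], vect[3][NELT], shift[3][NELT], res[3][NELT];
  unsigned char perm3[NELT], s1[NELT], s2[NELT], s3[NELT], s4[NELT];
  unsigned i, k, l;

  for (k = 0; k < 3; k++)
    for (i = 0; i < NELT; i++)
      dr[k][i] = k * NELT + i;		/* elements 0..23 */

  /* perm3: {0 3 6 1 4 7 2 5}, as in the comment above.  */
  for (i = 0, k = 0, l = 0; i < NELT; i++)
    {
      if (3 * k + (l % 3) >= NELT)
	{
	  k = 0;
	  l += 3 - (NELT % 3);
	}
      perm3[i] = 3 * k + (l % 3);
      k++;
    }

  /* Shift masks for NELT == 8: {6..13}, {5..12}, {3..10}, {5..12}.  */
  for (i = 0; i < NELT; i++)
    {
      s1[i] = 2 * (NELT / 3) + (NELT % 3) + i;
      s2[i] = 2 * (NELT / 3) + 1 + i;
      s3[i] = NELT / 3 + (NELT % 3) / 2 + i;
      s4[i] = 2 * (NELT / 3) + (NELT % 3) / 2 + i;
    }

  for (k = 0; k < 3; k++)		/* shuffle each input vector */
    vec_perm (dr[k], dr[k], perm3, vect[k]);
  for (k = 0; k < 3; k++)		/* first round of shifts */
    vec_perm (vect[k % 3], vect[(k + 1) % 3], s1, shift[k]);
  for (k = 0; k < 3; k++)		/* second round of shifts */
    vec_perm (shift[(4 - k) % 3], shift[(3 - k) % 3], s2, vect[k]);

  for (i = 0; i < NELT; i++)
    res[3 - (NELT % 3)][i] = vect[2][i];
  vec_perm (vect[0], vect[0], s3, res[NELT % 3]);
  vec_perm (vect[1], vect[1], s4, res[0]);

  /* Expect 0 3 6 9 ... / 1 4 7 10 ... / 2 5 8 11 ...  */
  for (k = 0; k < 3; k++)
    {
      for (i = 0; i < NELT; i++)
	printf ("%2d ", res[k][i]);
      printf ("\n");
    }
  return 0;
}
#endif
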
5665 /* Function vect_transform_grouped_load.
5666
5667 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5668    to perform their permutation and attach the resulting vectorized statements to
5669 the scalar statements.
5670 */
5671
5672 void
5673 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5674 gimple_stmt_iterator *gsi)
5675 {
5676 machine_mode mode;
5677 vec<tree> result_chain = vNULL;
5678
5679 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5680 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5681 vectors, that are ready for vector computation. */
5682 result_chain.create (size);
5683
5684   /* If the reassociation width for the vector type is 2 or greater, the target
5685      machine can execute 2 or more vector instructions in parallel.  Otherwise
5686      try to get the chain for the load group using vect_shift_permute_load_chain. */
5687 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5688 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5689 || exact_log2 (size) != -1
5690 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5691 gsi, &result_chain))
5692 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5693 vect_record_grouped_load_vectors (stmt, result_chain);
5694 result_chain.release ();
5695 }
5696
5697 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5698 generated as part of the vectorization of STMT. Assign the statement
5699 for each vector to the associated scalar statement. */
5700
5701 void
5702 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5703 {
5704 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5705 gimple *next_stmt, *new_stmt;
5706 unsigned int i, gap_count;
5707 tree tmp_data_ref;
5708
5709 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5710      Since we scan the chain starting from its first node, their order
5711      corresponds to the order of data-refs in RESULT_CHAIN. */
5712 next_stmt = first_stmt;
5713 gap_count = 1;
5714 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5715 {
5716 if (!next_stmt)
5717 break;
5718
5719       /* Skip the gaps.  Loads created for the gaps will be removed by the
5720 	 dead code elimination pass later.  No need to check for the first stmt in
5721 the group, since it always exists.
5722 GROUP_GAP is the number of steps in elements from the previous
5723 access (if there is no gap GROUP_GAP is 1). We skip loads that
5724 correspond to the gaps. */
5725 if (next_stmt != first_stmt
5726 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5727 {
5728 gap_count++;
5729 continue;
5730 }
5731
5732 while (next_stmt)
5733 {
5734 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5735 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5736 copies, and we put the new vector statement in the first available
5737 RELATED_STMT. */
5738 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5739 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5740 else
5741 {
5742 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5743 {
5744 gimple *prev_stmt =
5745 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5746 gimple *rel_stmt =
5747 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5748 while (rel_stmt)
5749 {
5750 prev_stmt = rel_stmt;
5751 rel_stmt =
5752 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5753 }
5754
5755 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5756 new_stmt;
5757 }
5758 }
5759
5760 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5761 gap_count = 1;
5762 /* If NEXT_STMT accesses the same DR as the previous statement,
5763 put the same TMP_DATA_REF as its vectorized statement; otherwise
5764 get the next data-ref from RESULT_CHAIN. */
5765 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5766 break;
5767 }
5768 }
5769 }
5770
5771 /* Function vect_can_force_dr_alignment_p.
5772
5773    Return whether the alignment of DECL can be forced to be at least
5774    ALIGNMENT bits, i.e. aligned on an ALIGNMENT-bit boundary. */
5775
5776 bool
5777 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5778 {
5779 if (TREE_CODE (decl) != VAR_DECL)
5780 return false;
5781
5782 if (decl_in_symtab_p (decl)
5783 && !symtab_node::get (decl)->can_increase_alignment_p ())
5784 return false;
5785
5786 if (TREE_STATIC (decl))
5787 return (alignment <= MAX_OFILE_ALIGNMENT);
5788 else
5789 return (alignment <= MAX_STACK_ALIGNMENT);
5790 }
5791
5792
5793 /* Return whether the data reference DR is supported with respect to its
5794 alignment.
5795    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5796    if it is aligned, i.e., check if it is possible to vectorize it with different
5797 alignment. */
5798
5799 enum dr_alignment_support
5800 vect_supportable_dr_alignment (struct data_reference *dr,
5801 bool check_aligned_accesses)
5802 {
5803 gimple *stmt = DR_STMT (dr);
5804 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5805 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5806 machine_mode mode = TYPE_MODE (vectype);
5807 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5808 struct loop *vect_loop = NULL;
5809 bool nested_in_vect_loop = false;
5810
5811 if (aligned_access_p (dr) && !check_aligned_accesses)
5812 return dr_aligned;
5813
5814 /* For now assume all conditional loads/stores support unaligned
5815 access without any special code. */
5816 if (is_gimple_call (stmt)
5817 && gimple_call_internal_p (stmt)
5818 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5819 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5820 return dr_unaligned_supported;
5821
5822 if (loop_vinfo)
5823 {
5824 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5825 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5826 }
5827
5828 /* Possibly unaligned access. */
5829
5830 /* We can choose between using the implicit realignment scheme (generating
5831 a misaligned_move stmt) and the explicit realignment scheme (generating
5832 aligned loads with a REALIGN_LOAD). There are two variants to the
5833 explicit realignment scheme: optimized, and unoptimized.
5834 We can optimize the realignment only if the step between consecutive
5835 vector loads is equal to the vector size. Since the vector memory
5836 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5837 is guaranteed that the misalignment amount remains the same throughout the
5838 execution of the vectorized loop. Therefore, we can create the
5839 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5840 at the loop preheader.
5841
5842 However, in the case of outer-loop vectorization, when vectorizing a
5843 memory access in the inner-loop nested within the LOOP that is now being
5844 vectorized, while it is guaranteed that the misalignment of the
5845 vectorized memory access will remain the same in different outer-loop
5846    iterations, it is *not* guaranteed that it will remain the same throughout
5847 the execution of the inner-loop. This is because the inner-loop advances
5848 with the original scalar step (and not in steps of VS). If the inner-loop
5849 step happens to be a multiple of VS, then the misalignment remains fixed
5850 and we can use the optimized realignment scheme. For example:
5851
5852 for (i=0; i<N; i++)
5853 for (j=0; j<M; j++)
5854 s += a[i+j];
5855
5856 When vectorizing the i-loop in the above example, the step between
5857 consecutive vector loads is 1, and so the misalignment does not remain
5858 fixed across the execution of the inner-loop, and the realignment cannot
5859 be optimized (as illustrated in the following pseudo vectorized loop):
5860
5861 for (i=0; i<N; i+=4)
5862 for (j=0; j<M; j++){
5863 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5864 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5865 // (assuming that we start from an aligned address).
5866 }
5867
5868 We therefore have to use the unoptimized realignment scheme:
5869
5870 for (i=0; i<N; i+=4)
5871 for (j=k; j<M; j+=4)
5872 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5873 // that the misalignment of the initial address is
5874 // 0).
5875
5876 The loop can then be vectorized as follows:
5877
5878 for (k=0; k<4; k++){
5879 rt = get_realignment_token (&vp[k]);
5880 for (i=0; i<N; i+=4){
5881 v1 = vp[i+k];
5882 for (j=k; j<M; j+=4){
5883 v2 = vp[i+j+VS-1];
5884 va = REALIGN_LOAD <v1,v2,rt>;
5885 vs += va;
5886 v1 = v2;
5887 }
5888 }
5889 } */
5890
5891 if (DR_IS_READ (dr))
5892 {
5893 bool is_packed = false;
5894 tree type = (TREE_TYPE (DR_REF (dr)));
5895
5896 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5897 && (!targetm.vectorize.builtin_mask_for_load
5898 || targetm.vectorize.builtin_mask_for_load ()))
5899 {
5900 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5901 if ((nested_in_vect_loop
5902 && (TREE_INT_CST_LOW (DR_STEP (dr))
5903 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5904 || !loop_vinfo)
5905 return dr_explicit_realign;
5906 else
5907 return dr_explicit_realign_optimized;
5908 }
5909 if (!known_alignment_for_access_p (dr))
5910 is_packed = not_size_aligned (DR_REF (dr));
5911
5912 if ((TYPE_USER_ALIGN (type) && !is_packed)
5913 || targetm.vectorize.
5914 support_vector_misalignment (mode, type,
5915 DR_MISALIGNMENT (dr), is_packed))
5916 /* Can't software pipeline the loads, but can at least do them. */
5917 return dr_unaligned_supported;
5918 }
5919 else
5920 {
5921 bool is_packed = false;
5922 tree type = (TREE_TYPE (DR_REF (dr)));
5923
5924 if (!known_alignment_for_access_p (dr))
5925 is_packed = not_size_aligned (DR_REF (dr));
5926
5927 if ((TYPE_USER_ALIGN (type) && !is_packed)
5928 || targetm.vectorize.
5929 support_vector_misalignment (mode, type,
5930 DR_MISALIGNMENT (dr), is_packed))
5931 return dr_unaligned_supported;
5932 }
5933
5934 /* Unsupported. */
5935 return dr_unaligned_unsupported;
5936 }
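
/* Editorial illustration, not part of the vectorizer: a standalone sketch,
   kept under "#if 0", of the misalignment argument in the comment inside
   vect_supportable_dr_alignment.  For an assumed vector size of VS == 4
   elements it prints the misalignment of &vp[i+j] (in elements, modulo VS):
   it cycles 0,1,2,3,... when the inner loop advances with the scalar step,
   but stays equal to K in the decomposed loop, which is why only the latter
   can hoist the realignment token.  N and M are arbitrary small bounds.  */
#if 0
#include <stdio.h>

#define VS 4	/* vector size in elements */
#define N 8
#define M 6

int
main (void)
{
  /* Inner loop advancing with the scalar step: the misalignment of
     &vp[i+j] changes on every j iteration, so the realignment token
     cannot be computed once outside the loop.  */
  printf ("scalar-step inner loop:\n");
  for (int i = 0; i < N; i += VS)
    {
      for (int j = 0; j < M; j++)
	printf ("%d ", (i + j) % VS);
      printf ("\n");
    }

  /* Decomposed loop: for a fixed K the inner loop advances in steps of
     VS, so the misalignment stays equal to K throughout and the
     realignment token can be computed once per K.  */
  printf ("decomposed loop:\n");
  for (int k = 0; k < VS; k++)
    {
      printf ("k=%d: ", k);
      for (int i = 0; i < N; i += VS)
	for (int j = k; j < M; j += VS)
	  printf ("%d ", (i + j) % VS);
      printf ("\n");
    }
  return 0;
}
#endif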