gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2015 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "input.h"
  28 #include "alias.h"
  29 #include "symtab.h"
  30 #include "tree.h"
  31 #include "fold-const.h"
  32 #include "stor-layout.h"
  33 #include "tm_p.h"
  34 #include "target.h"
  35 #include "predict.h"
  36 #include "hard-reg-set.h"
  37 #include "function.h"
  38 #include "dominance.h"
  39 #include "cfg.h"
  40 #include "basic-block.h"
  41 #include "gimple-pretty-print.h"
  42 #include "tree-ssa-alias.h"
  43 #include "internal-fn.h"
  44 #include "tree-eh.h"
  45 #include "gimple-expr.h"
  46 #include "is-a.h"
  47 #include "gimple.h"
  48 #include "gimplify.h"
  49 #include "gimple-iterator.h"
  50 #include "gimplify-me.h"
  51 #include "gimple-ssa.h"
  52 #include "tree-phinodes.h"
  53 #include "ssa-iterators.h"
  54 #include "stringpool.h"
  55 #include "tree-ssanames.h"
  56 #include "tree-ssa-loop-ivopts.h"
  57 #include "tree-ssa-loop-manip.h"
  58 #include "tree-ssa-loop.h"
  59 #include "cfgloop.h"
  60 #include "tree-chrec.h"
  61 #include "tree-scalar-evolution.h"
  62 #include "tree-vectorizer.h"
  63 #include "diagnostic-core.h"
  64 #include "plugin-api.h"
  65 #include "ipa-ref.h"
  66 #include "cgraph.h"
  67 /* Need to include rtl.h, expr.h, etc. for optabs.  */
  68 #include "rtl.h"
  69 #include "flags.h"
  70 #include "insn-config.h"
  71 #include "expmed.h"
  72 #include "dojump.h"
  73 #include "explow.h"
  74 #include "calls.h"
  75 #include "emit-rtl.h"
  76 #include "varasm.h"
  77 #include "stmt.h"
  78 #include "expr.h"
  79 #include "insn-codes.h"
  80 #include "optabs.h"
  81 #include "builtins.h"
  82
  83 /* Check whether the load- or store-lanes optab OPTAB can be used for
  84    COUNT vectors of type VECTYPE and return true if so.  NAME is the
  85    name of OPTAB; it is only used in dump messages.  */
  86
  87 static bool
  88 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  89                               tree vectype, unsigned HOST_WIDE_INT count)
  90 {
  91   machine_mode vmode = TYPE_MODE (vectype);
  92   /* Ask for an integer mode spanning COUNT vectors.  LIMITED is set when
  93      the target does not report support for such an array of vectors.  */
  94   bool limited = !targetm.array_mode_supported_p (vmode, count);
  95   machine_mode amode
  96     = mode_for_size (count * GET_MODE_BITSIZE (vmode), MODE_INT, limited);
  97
  98   if (amode == BLKmode)
  99     {
 100       if (dump_enabled_p ())
 101         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 102                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
 103                          GET_MODE_NAME (vmode), count);
 104       return false;
 105     }
 106
 107   bool usable
 108     = convert_optab_handler (optab, amode, vmode) != CODE_FOR_nothing;
 109
 110   if (dump_enabled_p ())
 111     {
 112       if (usable)
 113         dump_printf_loc (MSG_NOTE, vect_location, "can use %s<%s><%s>\n",
 114                          name, GET_MODE_NAME (amode), GET_MODE_NAME (vmode));
 115       else
 116         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 117                          "cannot use %s<%s><%s>\n", name,
 118                          GET_MODE_NAME (amode), GET_MODE_NAME (vmode));
 119     }
 120
 121   return usable;
 122 }
 123
 124
 125 /* Return the smallest scalar type that STMT operates on.
 126    The vectype of a stmt is generally derived from the type of its result
 127    (lhs); stmts whose result type differs from that of their arguments
 128    (e.g., demotion, promotion) get it reset later.  The smallest datatype
 129    still has to be visited here, because it determines the VF.  A datatype
 130    that appears only as the rhs of a promotion must belong to an invariant,
 131    e.g. 'int_x = (int) short_inv'; invariant motion cannot be relied upon
 132    to remove it, so for casts, widening mult/shift and FLOAT_EXPR the rhs
 133    is examined too.  *LHS_SIZE_UNIT and *RHS_SIZE_UNIT are set to the sizes
 134    of the result type and of the rhs type (the result size if unexamined).  */
 135
 136 tree
 137 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
 138                                HOST_WIDE_INT *rhs_size_unit)
 139 {
 140   tree smallest = gimple_expr_type (stmt);
 141   HOST_WIDE_INT result_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (smallest));
 142   HOST_WIDE_INT operand_size = result_size;
 143
 144   if (is_gimple_assign (stmt))
 145     {
 146       bool examine_rhs = gimple_assign_cast_p (stmt);
 147       if (!examine_rhs)
 148         {
 149           enum tree_code code = gimple_assign_rhs_code (stmt);
 150           examine_rhs = (code == WIDEN_MULT_EXPR
 151                          || code == WIDEN_LSHIFT_EXPR
 152                          || code == FLOAT_EXPR);
 153         }
 154
 155       if (examine_rhs)
 156         {
 157           tree operand_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 158           operand_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (operand_type));
 159           if (operand_size < result_size)
 160             smallest = operand_type;
 161         }
 162     }
 163
 164   *lhs_size_unit = result_size;
 165   *rhs_size_unit = operand_size;
 166   return smallest;
 167 }
 168
 169
 170 /* Insert DDR into LOOP_VINFO's list of ddrs that may alias and therefore
 171    need to be tested at run-time.  Return true if DDR was inserted and
 172    false if versioning for alias cannot be used: the limit on alias checks
 173    is zero, the loop nest is optimized for size, the loop has an inner
 174    loop, or one of the two data references has a non-constant step.  A
 175    reason other than the zero limit is reported in the dump.  */
 176
 177 static bool
 178 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 179 {
 180   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 181   const char *failure = NULL;
 182
 183   /* With the maximum number of alias checks set to zero no run-time
 184      test may be generated at all.  */
 185   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 186     return false;
 187
 188   if (dump_enabled_p ())
 189     {
 190       dump_printf_loc (MSG_NOTE, vect_location,
 191                        "mark for run-time aliasing test between ");
 192       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
 193       dump_printf (MSG_NOTE,  " and ");
 194       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
 195       dump_printf (MSG_NOTE, "\n");
 196     }
 197
 198   /* Determine the first reason, if any, for which versioning cannot be
 199      used; FAILURE then holds the message to dump.  */
 200   if (optimize_loop_nest_for_size_p (loop))
 201     failure = ("versioning not supported when optimizing"
 202                " for size.\n");
 203   else if (loop->inner)
 204     /* FORNOW: We don't support versioning with outer-loop vectorization.  */
 205     failure = "versioning not yet supported for outer-loops.\n";
 206   else if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
 207            || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
 208     /* FORNOW: We don't support creating runtime alias tests for
 209        non-constant step.  */
 210     failure = ("versioning not yet supported for non-constant "
 211                "step\n");
 212
 213   if (failure)
 214     {
 215       if (dump_enabled_p ())
 216         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 217                          "%s", failure);
 218       return false;
 219     }
 220
 221   /* Nothing prevents versioning, so record DDR.  */
 222   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 223   return true;
 224 }
 225
 226
 227 /* Function vect_analyze_data_ref_dependence.
 228
 229    Return TRUE if there (might) exist a dependence between a memory-reference
 230    DRA and a memory-reference DRB.  When versioning for alias may check a
 231    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 232    the data dependence.

        DRA and DRB are the two data references of DDR; LOOP_VINFO describes
        the loop being analyzed.  As side effects the function may clear
        LOOP_VINFO_NO_DATA_DEPENDENCIES (when it relies on the loop's safelen),
        queue DDR for a run-time alias test, and record a negative dependence
        distance in STMT_VINFO_MIN_NEG_DIST of DRB's stmt.  */
 233
 234 static bool
 235 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 236                                   loop_vec_info loop_vinfo, int *max_vf)
 237 {
 238   unsigned int i;
 239   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 240   struct data_reference *dra = DDR_A (ddr);
 241   struct data_reference *drb = DDR_B (ddr);
 242   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 243   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 244   lambda_vector dist_v;
 245   unsigned int loop_depth;
 246
 247   /* In loop analysis all data references should be vectorizable.  */
 248   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 249       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 250     gcc_unreachable ();
 251
 252   /* Independent data accesses.  */
 253   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 254     return false;
 255
       /* A reference paired with itself and read-read pairs are harmless.  */
 256   if (dra == drb
 257       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 258     return false;
 259
 260   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 261      least two scalar iterations, there is always also a true dependence.
 262      As the vectorizer does not re-order loads and stores we can ignore
 263      the anti-dependence if TBAA can disambiguate both DRs similar to the
 264      case with known negative distance anti-dependences (positive
 265      distance anti-dependences would violate TBAA constraints).  */
 266   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 267        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 268       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 269                                  get_alias_set (DR_REF (drb))))
 270     return false;
 271
 272   /* Unknown data dependence.  */
 273   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 274     {
 275       /* If user asserted safelen consecutive iterations can be
 276          executed concurrently, assume independence.  */
 277       if (loop->safelen >= 2)
 278         {
 279           if (loop->safelen < *max_vf)
 280             *max_vf = loop->safelen;
 281           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 282           return false;
 283         }
 284
 285       if (STMT_VINFO_GATHER_P (stmtinfo_a)
 286           || STMT_VINFO_GATHER_P (stmtinfo_b))
 287         {
 288           if (dump_enabled_p ())
 289             {
 290               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 291                                "versioning for alias not supported for: "
 292                                "can't determine dependence between ");
 293               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 294                                  DR_REF (dra));
 295               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 296               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 297                                  DR_REF (drb));
 298               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 299             }
 300           return true;
 301         }
 302
 303       if (dump_enabled_p ())
 304         {
 305           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 306                            "versioning for alias required: "
 307                            "can't determine dependence between ");
 308           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 309                              DR_REF (dra));
 310           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 311           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 312                              DR_REF (drb));
 313           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 314         }
 315
 316       /* Add to list of ddrs that need to be tested at run-time.  */
 317       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 318     }
 319
 320   /* Known data dependence.  */
 321   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 322     {
 323       /* If user asserted safelen consecutive iterations can be
 324          executed concurrently, assume independence.  */
 325       if (loop->safelen >= 2)
 326         {
 327           if (loop->safelen < *max_vf)
 328             *max_vf = loop->safelen;
 329           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 330           return false;
 331         }
 332
 333       if (STMT_VINFO_GATHER_P (stmtinfo_a)
 334           || STMT_VINFO_GATHER_P (stmtinfo_b))
 335         {
 336           if (dump_enabled_p ())
 337             {
 338               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 339                                "versioning for alias not supported for: "
 340                                "bad dist vector for ");
 341               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 342                                  DR_REF (dra));
 343               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 344               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 345                                  DR_REF (drb));
 346               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 347             }
 348           return true;
 349         }
 350
 351       if (dump_enabled_p ())
 352         {
 353           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 354                            "versioning for alias required: "
 355                            "bad dist vector for ");
 356           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 357           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 358           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 359           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 360         }
 361       /* Add to list of ddrs that need to be tested at run-time.  */
 362       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 363     }
 364
       /* Check each distance vector at the depth of the loop being
          vectorized.  */
 365   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 366   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 367     {
 368       int dist = dist_v[loop_depth];
 369
 370       if (dump_enabled_p ())
 371         dump_printf_loc (MSG_NOTE, vect_location,
 372                          "dependence distance  = %d.\n", dist);
 373
 374       if (dist == 0)
 375         {
 376           if (dump_enabled_p ())
 377             {
                   /* The whole message, including the terminating newline,
                      is emitted under one dump kind.  */
 378               dump_printf_loc (MSG_NOTE, vect_location,
 379                                "dependence distance == 0 between ");
 380               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 381               dump_printf (MSG_NOTE, " and ");
 382               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 383               dump_printf (MSG_NOTE, "\n");
 384             }
 385
 386           /* When we perform grouped accesses and perform implicit CSE
 387              by detecting equal accesses and doing disambiguation with
 388              runtime alias tests like for
 389                 .. = a[i];
 390                 .. = a[i+1];
 391                 a[i] = ..;
 392                 a[i+1] = ..;
 393                 *p = ..;
 394                 .. = a[i];
 395                 .. = a[i+1];
 396              where we will end up loading { a[i], a[i+1] } once, make
 397              sure that inserting group loads before the first load and
 398              stores after the last store will do the right thing.
 399              Similar for groups like
 400                 a[i] = ...;
 401                 ... = a[i];
 402                 a[i+1] = ...;
 403              where loads from the group interleave with the store.  */
 404           if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 405               || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 406             {
 407               gimple earlier_stmt;
 408               earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 409               if (DR_IS_WRITE
 410                     (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 411                 {
 412                   if (dump_enabled_p ())
 413                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 414                                      "READ_WRITE dependence in interleaving."
 415                                      "\n");
 416                   return true;
 417                 }
 418             }
 419
 420           continue;
 421         }
 422
 423       if (dist > 0 && DDR_REVERSED_P (ddr))
 424         {
 425           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 426              reversed (to make distance vector positive), and the actual
 427              distance is negative.  */
 428           if (dump_enabled_p ())
 429             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 430                              "dependence distance negative.\n");
 431           /* Record a negative dependence distance to later limit the
 432              amount of stmt copying / unrolling we can perform.
 433              Only need to handle read-after-write dependence.  */
 434           if (DR_IS_READ (drb)
 435               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 436                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 437             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 438           continue;
 439         }
 440
 441       if (abs (dist) >= 2
 442           && abs (dist) < *max_vf)
 443         {
 444           /* The dependence distance requires reduction of the maximal
 445              vectorization factor.  */
 446           *max_vf = abs (dist);
 447           if (dump_enabled_p ())
 448             dump_printf_loc (MSG_NOTE, vect_location,
 449                              "adjusting maximal vectorization factor to %i\n",
 450                              *max_vf);
 451         }
 452
 453       if (abs (dist) >= *max_vf)
 454         {
 455           /* Dependence distance does not create dependence, as far as
 456              vectorization is concerned, in this case.  */
 457           if (dump_enabled_p ())
 458             dump_printf_loc (MSG_NOTE, vect_location,
 459                              "dependence distance >= VF.\n");
 460           continue;
 461         }
 462
 463       if (dump_enabled_p ())
 464         {
               /* Keep the operands and the newline under the same dump kind
                  as the start of the message.  */
 465           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 466                        "not vectorized, possible dependence "
 467                        "between data-refs ");
 468           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 469           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 470           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 471           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 472         }
 473
 474       return true;
 475     }
 476
 477   return false;
 478 }
 479
 480 /* Function vect_analyze_data_ref_dependences.
 481
 482    Compute the dependences between all data references of the loop
 483    described by LOOP_VINFO and return true if none of them prevents
 484    vectorization.  *MAX_VF may be lowered to what the dependences allow.  */
 485
 486 bool
 487 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
 488 {
 489   struct data_dependence_relation *ddr;
 490   unsigned int ix;
 491   if (dump_enabled_p ())
 492     dump_printf_loc (MSG_NOTE, vect_location,
 493                      "=== vect_analyze_data_ref_dependences ===\n");
 494
 495   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 496   bool ok = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 497                                      &LOOP_VINFO_DDRS (loop_vinfo),
 498                                      LOOP_VINFO_LOOP_NEST (loop_vinfo), true);
 499   if (ok)
 500     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), ix, ddr)
 501       if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 502         {
 503           ok = false;
 504           break;
 505         }
 506   return ok;
 507 }
 508
 509
 510 /* Function vect_slp_analyze_data_ref_dependence.
 511
 512    Return TRUE if there (might) exist a dependence between the
 513    memory-references DRA and DRB of DDR that prevents vectorization of
 514    the basic block.  Return FALSE if the two accesses are independent or
 515    if their order is known to be preserved when the basic block is
 516    vectorized.  */
 517
 518 static bool
 519 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 520 {
 521   struct data_reference *dra = DDR_A (ddr);
 522   struct data_reference *drb = DDR_B (ddr);
 523
 524   /* Statements marked as unvectorizable are checked here as well, since
 525      they still can prohibit vectorization.  */
 526
 527   /* Independent data accesses, a reference paired with itself and
 528      read-read pairs are all OK.  */
 529   if (DDR_ARE_DEPENDENT (ddr) == chrec_known
 530       || dra == drb
 531       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 532     return false;
 533
 534   /* Two members of the same interleaving chain are considered
 535      independent.  */
 536   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 537       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 538           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 539     return false;
 540
 541   /* Report whether the dependence is unknown or was determined; either
 542      way the checks below still apply.  */
 543   if (dump_enabled_p ())
 544     {
 545       if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 546         {
 547           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 548                            "can't determine dependence between ");
 549           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 550                              DR_REF (dra));
 551           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 552           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 553                              DR_REF (drb));
 554           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 555         }
 556       else
 557         {
 558           dump_printf_loc (MSG_NOTE, vect_location,
 559                            "determined dependence between ");
 560           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 561           dump_printf (MSG_NOTE, " and ");
 562           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 563           dump_printf (MSG_NOTE,  "\n");
 564         }
 565     }
 566
 567   /* We do not vectorize basic blocks with write-write dependencies.  */
 568   if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
 569     return true;
 570
 571   /* What remains is a read-write dependence.  When basic blocks are
 572      vectorized, a vector load can only be placed before the corresponding
 573      scalar load and a vector store only after its corresponding scalar
 574      store, so the order of the accesses is preserved if the load comes
 575      before the store.  */
 576   gimple first = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 577   if (!DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (first))))
 578     return true;
 579
 580   /* That only holds for load-store pairs taking part in vectorization.  */
 581   if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
 582       && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
 583     return false;
 584
 585   return true;
 586 }
 587
 588
 589 /* Function vect_slp_analyze_data_ref_dependences.
 590
 591    Examine all the data references in the basic block described by
 592    BB_VINFO, and return true if their dependences could be computed and
 593    none of them prevents vectorizing the block.  */
 594
 595 bool
 596 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
 597 {
 598   struct data_dependence_relation *ddr;
 599   unsigned int ix;
 600
 601   if (dump_enabled_p ())
 602     dump_printf_loc (MSG_NOTE, vect_location,
 603                      "=== vect_slp_analyze_data_ref_dependences ===\n");
 604
 605   bool ok = compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
 606                                      &BB_VINFO_DDRS (bb_vinfo), vNULL, true);
 607   if (ok)
 608     FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), ix, ddr)
 609       if (vect_slp_analyze_data_ref_dependence (ddr))
 610         {
 611           ok = false;
 612           break;
 613         }
 614   return ok;
 615 }
 616
 617
 618 /* Function vect_compute_data_ref_alignment
 619
 620    Compute the misalignment of the data reference DR.
 621
 622    Output:
 623    1. If during the misalignment computation it is found that the data reference
 624       cannot be vectorized then false is returned.
 625    2. DR_MISALIGNMENT (DR) is defined.
 626
 627    FOR NOW: No analysis is actually performed. Misalignment is calculated
 628    only for trivial cases. TODO.

        NOTE: every return statement of the function as written returns true.
        When the misalignment cannot be determined, DR_MISALIGNMENT (DR) is
        left at -1 (unknown).  If the base is not sufficiently aligned but its
        alignment can be forced, the base is recorded in DR's aux data and
        flagged as misaligned.  */
 629
 630 static bool
 631 vect_compute_data_ref_alignment (struct data_reference *dr)
 632 {
 633   gimple stmt = DR_STMT (dr);
 634   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 635   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 636   struct loop *loop = NULL;
 637   tree ref = DR_REF (dr);
 638   tree vectype;
 639   tree base, base_addr;
 640   bool base_aligned;
 641   tree misalign = NULL_TREE;
 642   tree aligned_to;
 643   unsigned HOST_WIDE_INT alignment;
 644
 645   if (dump_enabled_p ())
 646     dump_printf_loc (MSG_NOTE, vect_location,
 647                      "vect_compute_data_ref_alignment:\n");
 648
       /* LOOP stays NULL when the stmt has no loop_vec_info; the code below
          treats that case as basic-block vectorization.  */
 649   if (loop_vinfo)
 650     loop = LOOP_VINFO_LOOP (loop_vinfo);
 651
 652   /* Initialize misalignment to unknown.  */
 653   SET_DR_MISALIGNMENT (dr, -1);
 654
 655   /* Strided accesses perform only component accesses, misalignment information
 656      is irrelevant for them.  */
 657   if (STMT_VINFO_STRIDED_P (stmt_info)
 658       && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
 659     return true;
 660
       /* MISALIGN starts out as DR_INIT only when the step fits a signed
          HOST_WIDE_INT; otherwise it stays NULL_TREE and the alignment is
          reported as unknown further down.  */
 661   if (tree_fits_shwi_p (DR_STEP (dr)))
 662     misalign = DR_INIT (dr);
 663   aligned_to = DR_ALIGNED_TO (dr);
 664   base_addr = DR_BASE_ADDRESS (dr);
 665   vectype = STMT_VINFO_VECTYPE (stmt_info);
 666
 667   /* In case the dataref is in an inner-loop of the loop that is being
 668      vectorized (LOOP), we use the base and misalignment information
 669      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 670      stays the same throughout the execution of the inner-loop, which is why
 671      we have to check that the stride of the dataref in the inner-loop evenly
 672      divides by the vector size.  */
 673   if (loop && nested_in_vect_loop_p (loop, stmt))
 674     {
 675       tree step = DR_STEP (dr);
 676
 677       if (tree_fits_shwi_p (step)
 678           && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
 679         {
 680           if (dump_enabled_p ())
 681             dump_printf_loc (MSG_NOTE, vect_location,
 682                              "inner step divides the vector-size.\n");
               /* Switch to the init, alignment and base address stored in
                  the stmt info instead of those of DR itself.  */
 683           misalign = STMT_VINFO_DR_INIT (stmt_info);
 684           aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
 685           base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
 686         }
 687       else
 688         {
 689           if (dump_enabled_p ())
 690             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 691                              "inner step doesn't divide the vector-size.\n");
 692           misalign = NULL_TREE;
 693         }
 694     }
 695
 696   /* Similarly, if we're doing basic-block vectorization, we can only use
 697      base and misalignment information relative to an innermost loop if the
 698      misalignment stays the same throughout the execution of the loop.
 699      As above, this is the case if the stride of the dataref evenly divides
 700      by the vector size.  */
 701   if (!loop)
 702     {
 703       tree step = DR_STEP (dr);
 704
 705       if (tree_fits_shwi_p (step)
 706           && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
 707         {
 708           if (dump_enabled_p ())
 709             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 710                              "SLP: step doesn't divide the vector-size.\n");
 711           misalign = NULL_TREE;
 712         }
 713     }
 714
 715   alignment = TYPE_ALIGN_UNIT (vectype);
 716
       /* Without a usable MISALIGN, or when ALIGNED_TO is smaller than the
          alignment of VECTYPE, the misalignment is left unknown.  */
 717   if ((compare_tree_int (aligned_to, alignment) < 0)
 718       || !misalign)
 719     {
 720       if (dump_enabled_p ())
 721         {
 722           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 723                            "Unknown alignment for access: ");
 724           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 725           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 726         }
 727       return true;
 728     }
 729
 730   /* To look at alignment of the base we have to preserve an inner MEM_REF
 731      as that carries alignment information of the actual access.  */
 732   base = ref;
 733   while (handled_component_p (base))
 734     base = TREE_OPERAND (base, 0);
 735   if (TREE_CODE (base) == MEM_REF)
 736     base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
 737                    build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
 738
 739   if (get_object_alignment (base) >= TYPE_ALIGN (vectype))
 740     base_aligned = true;
 741   else
 742     base_aligned = false;
 743
 744   if (!base_aligned)
 745     {
 746       /* Strip an inner MEM_REF to a bare decl if possible.  */
 747       if (TREE_CODE (base) == MEM_REF
 748           && integer_zerop (TREE_OPERAND (base, 1))
 749           && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
 750         base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
 751
 752       if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
 753         {
 754           if (dump_enabled_p ())
 755             {
 756               dump_printf_loc (MSG_NOTE, vect_location,
 757                                "can't force alignment of ref: ");
 758               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 759               dump_printf (MSG_NOTE, "\n");
 760             }
 761           return true;
 762         }
 763
 764       /* Force the alignment of the decl.
 765          NOTE: This is the only change to the code we make during
 766          the analysis phase, before deciding to vectorize the loop.  */
 767       if (dump_enabled_p ())
 768         {
 769           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 770           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 771           dump_printf (MSG_NOTE, "\n");
 772         }
 773
           /* NOTE(review): the statements below only record BASE and set the
              base_misaligned flag in DR's aux data; presumably the alignment
              itself is raised later by whoever consumes these fields --
              confirm against the users of dataref_aux.  */
 774       ((dataref_aux *)dr->aux)->base_decl = base;
 775       ((dataref_aux *)dr->aux)->base_misaligned = true;
 776     }
 777
 778   /* If this is a backward running DR then first access in the larger
 779      vectype actually is N-1 elements before the address in the DR.
 780      Adjust misalign accordingly.  */
 781   if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
 782     {
 783       tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
 784       /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
 785          otherwise we wouldn't be here.  */
 786       offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
 787       /* PLUS because DR_STEP was negative.  */
 788       misalign = size_binop (PLUS_EXPR, misalign, offset);
 789     }
 790
       /* Record MISALIGN reduced modulo ALIGNMENT, using a signed modulo
          with floor rounding.  */
 791   SET_DR_MISALIGNMENT (dr,
 792                        wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
 793
 794   if (dump_enabled_p ())
 795     {
 796       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 797                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
 798       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 799       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 800     }
 801
 802   return true;
 803 }
 804
 805
 806 /* Function vect_compute_data_refs_alignment
 807
 808    Compute the misalignment of data references in the loop.
 809    Return FALSE if a data reference is found that cannot be vectorized.  */
 810
 811 static bool
 812 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
 813                                   bb_vec_info bb_vinfo)
 814 {
 815   vec<data_reference_p> datarefs;
 816   struct data_reference *dr;
 817   unsigned int i;
 818
 819   if (loop_vinfo)
 820     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
 821   else
 822     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
 823
 824   FOR_EACH_VEC_ELT (datarefs, i, dr)
 825     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
 826         && !vect_compute_data_ref_alignment (dr))
 827       {
 828         if (bb_vinfo)
 829           {
 830             /* Mark unsupported statement as unvectorizable.  */
 831             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
 832             continue;
 833           }
 834         else
 835           return false;
 836       }
 837
 838   return true;
 839 }
 840
 841
 842 /* Function vect_update_misalignment_for_peel
 843
 844    DR - the data reference whose misalignment is to be adjusted.
 845    DR_PEEL - the data reference whose misalignment is being made
 846              zero in the vector loop by the peel.
 847    NPEEL - the number of iterations in the peel loop if the misalignment
 848            of DR_PEEL is known at compile time.  */
 849
 850 static void
 851 vect_update_misalignment_for_peel (struct data_reference *dr,
 852                                    struct data_reference *dr_peel, int npeel)
 853 {
 854   unsigned int i;
 855   vec<dr_p> same_align_drs;
 856   struct data_reference *current_dr;
 857   int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
 858   int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
 859   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
 860   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
 861
 862  /* For interleaved data accesses the step in the loop must be multiplied by
 863      the size of the interleaving group.  */
 864   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 865     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
 866   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
 867     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
 868
 869   /* It can be assumed that the data refs with the same alignment as dr_peel
 870      are aligned in the vector loop.  */
 871   same_align_drs
 872     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
 873   FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
 874     {
 875       if (current_dr != dr)
 876         continue;
 877       gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
 878                   DR_MISALIGNMENT (dr_peel) / dr_peel_size);
 879       SET_DR_MISALIGNMENT (dr, 0);
 880       return;
 881     }
 882
 883   if (known_alignment_for_access_p (dr)
 884       && known_alignment_for_access_p (dr_peel))
 885     {
 886       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
 887       int misal = DR_MISALIGNMENT (dr);
 888       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 889       misal += negative ? -npeel * dr_size : npeel * dr_size;
 890       misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
 891       SET_DR_MISALIGNMENT (dr, misal);
 892       return;
 893     }
 894
 895   if (dump_enabled_p ())
 896     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
 897   SET_DR_MISALIGNMENT (dr, -1);
 898 }
 899
 900
 901 /* Function vect_verify_datarefs_alignment
 902
 903    Return TRUE if all data references in the loop can be
 904    handled with respect to alignment.  */
 905
 906 bool
 907 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
 908 {
 909   vec<data_reference_p> datarefs;
 910   struct data_reference *dr;
 911   enum dr_alignment_support supportable_dr_alignment;
 912   unsigned int i;
 913
 914   if (loop_vinfo)
 915     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
 916   else
 917     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
 918
 919   FOR_EACH_VEC_ELT (datarefs, i, dr)
 920     {
 921       gimple stmt = DR_STMT (dr);
 922       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 923
 924       if (!STMT_VINFO_RELEVANT_P (stmt_info))
 925         continue;
 926
 927       /* For interleaving, only the alignment of the first access matters.
 928          Skip statements marked as not vectorizable.  */
 929       if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
 930            && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
 931           || !STMT_VINFO_VECTORIZABLE (stmt_info))
 932         continue;
 933
 934       /* Strided accesses perform only component accesses, alignment is
 935          irrelevant for them.  */
 936       if (STMT_VINFO_STRIDED_P (stmt_info)
 937           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
 938         continue;
 939
 940       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
 941       if (!supportable_dr_alignment)
 942         {
 943           if (dump_enabled_p ())
 944             {
 945               if (DR_IS_READ (dr))
 946                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 947                                  "not vectorized: unsupported unaligned load.");
 948               else
 949                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 950                                  "not vectorized: unsupported unaligned "
 951                                  "store.");
 952
 953               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 954                                  DR_REF (dr));
 955               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 956             }
 957           return false;
 958         }
 959       if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
 960         dump_printf_loc (MSG_NOTE, vect_location,
 961                          "Vectorizing an unaligned access.\n");
 962     }
 963   return true;
 964 }
 965
 966 /* Given an memory reference EXP return whether its alignment is less
 967    than its size.  */
 968
 969 static bool
 970 not_size_aligned (tree exp)
 971 {
 972   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
 973     return true;
 974
 975   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
 976           > get_object_alignment (exp));
 977 }
 978
 979 /* Function vector_alignment_reachable_p
 980
 981    Return true if vector alignment for DR is reachable by peeling
 982    a few loop iterations.  Return false otherwise.  */
 983
 984 static bool
 985 vector_alignment_reachable_p (struct data_reference *dr)
 986 {
 987   gimple stmt = DR_STMT (dr);
 988   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 989   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 990
 991   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 992     {
 993       /* For interleaved access we peel only if number of iterations in
 994          the prolog loop ({VF - misalignment}), is a multiple of the
 995          number of the interleaved accesses.  */
 996       int elem_size, mis_in_elements;
 997       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
 998
 999       /* FORNOW: handle only known alignment.  */
1000       if (!known_alignment_for_access_p (dr))
1001         return false;
1002
1003       elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1004       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1005
1006       if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1007         return false;
1008     }
1009
1010   /* If misalignment is known at the compile time then allow peeling
1011      only if natural alignment is reachable through peeling.  */
1012   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1013     {
1014       HOST_WIDE_INT elmsize =
1015                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1016       if (dump_enabled_p ())
1017         {
1018           dump_printf_loc (MSG_NOTE, vect_location,
1019                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1020           dump_printf (MSG_NOTE,
1021                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1022         }
1023       if (DR_MISALIGNMENT (dr) % elmsize)
1024         {
1025           if (dump_enabled_p ())
1026             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1027                              "data size does not divide the misalignment.\n");
1028           return false;
1029         }
1030     }
1031
1032   if (!known_alignment_for_access_p (dr))
1033     {
1034       tree type = TREE_TYPE (DR_REF (dr));
1035       bool is_packed = not_size_aligned (DR_REF (dr));
1036       if (dump_enabled_p ())
1037         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1038                          "Unknown misalignment, is_packed = %d\n",is_packed);
1039       if ((TYPE_USER_ALIGN (type) && !is_packed)
1040           || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1041         return true;
1042       else
1043         return false;
1044     }
1045
1046   return true;
1047 }
1048
1049
1050 /* Calculate the cost of the memory access represented by DR.  */
1051
1052 static void
1053 vect_get_data_access_cost (struct data_reference *dr,
1054                            unsigned int *inside_cost,
1055                            unsigned int *outside_cost,
1056                            stmt_vector_for_cost *body_cost_vec)
1057 {
1058   gimple stmt = DR_STMT (dr);
1059   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1060   int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1061   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1062   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1063   int ncopies = vf / nunits;
1064
1065   if (DR_IS_READ (dr))
1066     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1067                         NULL, body_cost_vec, false);
1068   else
1069     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1070
1071   if (dump_enabled_p ())
1072     dump_printf_loc (MSG_NOTE, vect_location,
1073                      "vect_get_data_access_cost: inside_cost = %d, "
1074                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1075 }
1076
1077
1078 /* Insert DR into peeling hash table with NPEEL as key.  */
1079
1080 static void
1081 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1082                           int npeel)
1083 {
1084   struct _vect_peel_info elem, *slot;
1085   _vect_peel_info **new_slot;
1086   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1087
1088   elem.npeel = npeel;
1089   slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1090   if (slot)
1091     slot->count++;
1092   else
1093     {
1094       slot = XNEW (struct _vect_peel_info);
1095       slot->npeel = npeel;
1096       slot->dr = dr;
1097       slot->count = 1;
1098       new_slot
1099         = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1100       *new_slot = slot;
1101     }
1102
1103   if (!supportable_dr_alignment
1104       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1105     slot->count += VECT_MAX_COST;
1106 }
1107
1108
1109 /* Traverse peeling hash table to find peeling option that aligns maximum
1110    number of data accesses.  */
1111
1112 int
1113 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1114                                      _vect_peel_extended_info *max)
1115 {
1116   vect_peel_info elem = *slot;
1117
1118   if (elem->count > max->peel_info.count
1119       || (elem->count == max->peel_info.count
1120           && max->peel_info.npeel > elem->npeel))
1121     {
1122       max->peel_info.npeel = elem->npeel;
1123       max->peel_info.count = elem->count;
1124       max->peel_info.dr = elem->dr;
1125     }
1126
1127   return 1;
1128 }
1129
1130
1131 /* Traverse peeling hash table and calculate cost for each peeling option.
1132    Find the one with the lowest cost.  */
1133
1134 int
1135 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1136                                    _vect_peel_extended_info *min)
1137 {
1138   vect_peel_info elem = *slot;
1139   int save_misalignment, dummy;
1140   unsigned int inside_cost = 0, outside_cost = 0, i;
1141   gimple stmt = DR_STMT (elem->dr);
1142   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1143   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1144   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1145   struct data_reference *dr;
1146   stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1147
1148   prologue_cost_vec.create (2);
1149   body_cost_vec.create (2);
1150   epilogue_cost_vec.create (2);
1151
1152   FOR_EACH_VEC_ELT (datarefs, i, dr)
1153     {
1154       stmt = DR_STMT (dr);
1155       stmt_info = vinfo_for_stmt (stmt);
1156       /* For interleaving, only the alignment of the first access
1157          matters.  */
1158       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1159           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1160         continue;
1161
1162       save_misalignment = DR_MISALIGNMENT (dr);
1163       vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1164       vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1165                                  &body_cost_vec);
1166       SET_DR_MISALIGNMENT (dr, save_misalignment);
1167     }
1168
1169   auto_vec<stmt_info_for_cost> scalar_cost_vec;
1170   vect_get_single_scalar_iteration_cost (loop_vinfo, &scalar_cost_vec);
1171   outside_cost += vect_get_known_peeling_cost
1172     (loop_vinfo, elem->npeel, &dummy,
1173      &scalar_cost_vec, &prologue_cost_vec, &epilogue_cost_vec);
1174
1175   /* Prologue and epilogue costs are added to the target model later.
1176      These costs depend only on the scalar iteration cost, the
1177      number of peeling iterations finally chosen, and the number of
1178      misaligned statements.  So discard the information found here.  */
1179   prologue_cost_vec.release ();
1180   epilogue_cost_vec.release ();
1181
1182   if (inside_cost < min->inside_cost
1183       || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1184     {
1185       min->inside_cost = inside_cost;
1186       min->outside_cost = outside_cost;
1187       min->body_cost_vec.release ();
1188       min->body_cost_vec = body_cost_vec;
1189       min->peel_info.dr = elem->dr;
1190       min->peel_info.npeel = elem->npeel;
1191     }
1192   else
1193     body_cost_vec.release ();
1194
1195   return 1;
1196 }
1197
1198
1199 /* Choose best peeling option by traversing peeling hash table and either
1200    choosing an option with the lowest cost (if cost model is enabled) or the
1201    option that aligns as many accesses as possible.  */
1202
1203 static struct data_reference *
1204 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1205                                        unsigned int *npeel,
1206                                        stmt_vector_for_cost *body_cost_vec)
1207 {
1208    struct _vect_peel_extended_info res;
1209
1210    res.peel_info.dr = NULL;
1211    res.body_cost_vec = stmt_vector_for_cost ();
1212
1213    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1214      {
1215        res.inside_cost = INT_MAX;
1216        res.outside_cost = INT_MAX;
1217        LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1218            ->traverse <_vect_peel_extended_info *,
1219                        vect_peeling_hash_get_lowest_cost> (&res);
1220      }
1221    else
1222      {
1223        res.peel_info.count = 0;
1224        LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1225            ->traverse <_vect_peel_extended_info *,
1226                        vect_peeling_hash_get_most_frequent> (&res);
1227      }
1228
1229    *npeel = res.peel_info.npeel;
1230    *body_cost_vec = res.body_cost_vec;
1231    return res.peel_info.dr;
1232 }
1233
1234
1235 /* Function vect_enhance_data_refs_alignment
1236
1237    This pass will use loop versioning and loop peeling in order to enhance
1238    the alignment of data references in the loop.
1239
1240    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1241    original loop is to be vectorized.  Any other loops that are created by
1242    the transformations performed in this pass - are not supposed to be
1243    vectorized.  This restriction will be relaxed.
1244
1245    This pass will require a cost model to guide it whether to apply peeling
1246    or versioning or a combination of the two.  For example, the scheme that
1247    Intel uses when given a loop with several memory accesses, is as follows:
1248    choose one memory access ('p') whose alignment you want to force by doing
1249    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1250    other accesses are not necessarily aligned, or (2) use loop versioning to
1251    generate one loop in which all accesses are aligned, and another loop in
1252    which only 'p' is necessarily aligned.
1253
1254    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1255    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1256    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1257
1258    Devising a cost model is the most critical aspect of this work.  It will
1259    guide us on which access to peel for, whether to use loop versioning, how
1260    many versions to create, etc.  The cost model will probably consist of
1261    generic considerations as well as target specific considerations (on
1262    powerpc for example, misaligned stores are more painful than misaligned
1263    loads).
1264
1265    Here are the general steps involved in alignment enhancements:
1266
1267      -- original loop, before alignment analysis:
1268         for (i=0; i<N; i++){
1269           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1270           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1271         }
1272
1273      -- After vect_compute_data_refs_alignment:
1274         for (i=0; i<N; i++){
1275           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1276           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1277         }
1278
1279      -- Possibility 1: we do loop versioning:
1280      if (p is aligned) {
1281         for (i=0; i<N; i++){    # loop 1A
1282           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1283           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1284         }
1285      }
1286      else {
1287         for (i=0; i<N; i++){    # loop 1B
1288           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1289           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1290         }
1291      }
1292
1293      -- Possibility 2: we do loop peeling:
1294      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1295         x = q[i];
1296         p[i] = y;
1297      }
1298      for (i = 3; i < N; i++){   # loop 2A
1299         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1300         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1301      }
1302
1303      -- Possibility 3: combination of loop peeling and versioning:
1304      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1305         x = q[i];
1306         p[i] = y;
1307      }
1308      if (p is aligned) {
1309         for (i = 3; i<N; i++){  # loop 3A
1310           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1311           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1312         }
1313      }
1314      else {
1315         for (i = 3; i<N; i++){  # loop 3B
1316           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1317           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1318         }
1319      }
1320
1321      These loops are later passed to loop_transform to be vectorized.  The
1322      vectorizer will use the alignment information to guide the transformation
1323      (whether to generate regular loads/stores, or with special handling for
1324      misalignment).  */
1325
1326 bool
1327 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1328 {
1329   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1330   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1331   enum dr_alignment_support supportable_dr_alignment;
1332   struct data_reference *dr0 = NULL, *first_store = NULL;
1333   struct data_reference *dr;
1334   unsigned int i, j;
1335   bool do_peeling = false;
1336   bool do_versioning = false;
1337   bool stat;
1338   gimple stmt;
1339   stmt_vec_info stmt_info;
1340   unsigned int npeel = 0;
1341   bool all_misalignments_unknown = true;
1342   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1343   unsigned possible_npeel_number = 1;
1344   tree vectype;
1345   unsigned int nelements, mis, same_align_drs_max = 0;
1346   stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1347
1348   if (dump_enabled_p ())
1349     dump_printf_loc (MSG_NOTE, vect_location,
1350                      "=== vect_enhance_data_refs_alignment ===\n");
1351
1352   /* While cost model enhancements are expected in the future, the high level
1353      view of the code at this time is as follows:
1354
1355      A) If there is a misaligned access then see if peeling to align
1356         this access can make all data references satisfy
1357         vect_supportable_dr_alignment.  If so, update data structures
1358         as needed and return true.
1359
1360      B) If peeling wasn't possible and there is a data reference with an
1361         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1362         then see if loop versioning checks can be used to make all data
1363         references satisfy vect_supportable_dr_alignment.  If so, update
1364         data structures as needed and return true.
1365
1366      C) If neither peeling nor versioning were successful then return false if
1367         any data reference does not satisfy vect_supportable_dr_alignment.
1368
1369      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1370
1371      Note, Possibility 3 above (which is peeling and versioning together) is not
1372      being done at this time.  */
1373
1374   /* (1) Peeling to force alignment.  */
1375
1376   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1377      Considerations:
1378      + How many accesses will become aligned due to the peeling
1379      - How many accesses will become unaligned due to the peeling,
1380        and the cost of misaligned accesses.
1381      - The cost of peeling (the extra runtime checks, the increase
1382        in code size).  */
1383
1384   FOR_EACH_VEC_ELT (datarefs, i, dr)
1385     {
1386       stmt = DR_STMT (dr);
1387       stmt_info = vinfo_for_stmt (stmt);
1388
1389       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1390         continue;
1391
1392       /* For interleaving, only the alignment of the first access
1393          matters.  */
1394       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1395           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1396         continue;
1397
1398       /* For invariant accesses there is nothing to enhance.  */
1399       if (integer_zerop (DR_STEP (dr)))
1400         continue;
1401
1402       /* Strided accesses perform only component accesses, alignment is
1403          irrelevant for them.  */
1404       if (STMT_VINFO_STRIDED_P (stmt_info)
1405           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1406         continue;
1407
1408       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1409       do_peeling = vector_alignment_reachable_p (dr);
1410       if (do_peeling)
1411         {
1412           if (known_alignment_for_access_p (dr))
1413             {
1414               unsigned int npeel_tmp;
1415               bool negative = tree_int_cst_compare (DR_STEP (dr),
1416                                                     size_zero_node) < 0;
1417
1418               /* Save info about DR in the hash table.  */
1419               if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1420                 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1421                   = new hash_table<peel_info_hasher> (1);
1422
1423               vectype = STMT_VINFO_VECTYPE (stmt_info);
1424               nelements = TYPE_VECTOR_SUBPARTS (vectype);
1425               mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1426                                                 TREE_TYPE (DR_REF (dr))));
1427               npeel_tmp = (negative
1428                            ? (mis - nelements) : (nelements - mis))
1429                   & (nelements - 1);
1430
1431               /* For multiple types, it is possible that the bigger type access
1432                  will have more than one peeling option.  E.g., a loop with two
1433                  types: one of size (vector size / 4), and the other one of
1434                  size (vector size / 8).  Vectorization factor will be 8.  If both
1435                  accesses are misaligned by 3, the first one needs one scalar
1436                  iteration to be aligned, and the second one needs 5.  But the
1437                  first one will be aligned also by peeling 5 scalar
1438                  iterations, and in that case both accesses will be aligned.
1439                  Hence, except for the immediate peeling amount, we also want
1440                  to try to add full vector size, while we don't exceed
1441                  vectorization factor.
1442                  We do this automatically for cost model, since we calculate cost
1443                  for every peeling option.  */
1444               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1445                 possible_npeel_number = vf /nelements;
1446
1447               /* Handle the aligned case. We may decide to align some other
1448                  access, making DR unaligned.  */
1449               if (DR_MISALIGNMENT (dr) == 0)
1450                 {
1451                   npeel_tmp = 0;
1452                   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1453                     possible_npeel_number++;
1454                 }
1455
1456               for (j = 0; j < possible_npeel_number; j++)
1457                 {
1458                   gcc_assert (npeel_tmp <= vf);
1459                   vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1460                   npeel_tmp += nelements;
1461                 }
1462
1463               all_misalignments_unknown = false;
1464               /* Data-ref that was chosen for the case that all the
1465                  misalignments are unknown is not relevant anymore, since we
1466                  have a data-ref with known alignment.  */
1467               dr0 = NULL;
1468             }
1469           else
1470             {
1471               /* If we don't know any misalignment values, we prefer
1472                  peeling for data-ref that has the maximum number of data-refs
1473                  with the same alignment, unless the target prefers to align
1474                  stores over load.  */
1475               if (all_misalignments_unknown)
1476                 {
1477                   unsigned same_align_drs
1478                     = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1479                   if (!dr0
1480                       || same_align_drs_max < same_align_drs)
1481                     {
1482                       same_align_drs_max = same_align_drs;
1483                       dr0 = dr;
1484                     }
1485                   /* For data-refs with the same number of related
1486                      accesses prefer the one where the misalign
1487                      computation will be invariant in the outermost loop.  */
1488                   else if (same_align_drs_max == same_align_drs)
1489                     {
1490                       struct loop *ivloop0, *ivloop;
1491                       ivloop0 = outermost_invariant_loop_for_expr
1492                           (loop, DR_BASE_ADDRESS (dr0));
1493                       ivloop = outermost_invariant_loop_for_expr
1494                           (loop, DR_BASE_ADDRESS (dr));
1495                       if ((ivloop && !ivloop0)
1496                           || (ivloop && ivloop0
1497                               && flow_loop_nested_p (ivloop, ivloop0)))
1498                         dr0 = dr;
1499                     }
1500
1501                   if (!first_store && DR_IS_WRITE (dr))
1502                     first_store = dr;
1503                 }
1504
1505               /* If there are both known and unknown misaligned accesses in the
1506                  loop, we choose peeling amount according to the known
1507                  accesses.  */
1508               if (!supportable_dr_alignment)
1509                 {
1510                   dr0 = dr;
1511                   if (!first_store && DR_IS_WRITE (dr))
1512                     first_store = dr;
1513                 }
1514             }
1515         }
1516       else
1517         {
1518           if (!aligned_access_p (dr))
1519             {
1520               if (dump_enabled_p ())
1521                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1522                                  "vector alignment may not be reachable\n");
1523               break;
1524             }
1525         }
1526     }
1527
1528   /* Check if we can possibly peel the loop.  */
1529   if (!vect_can_advance_ivs_p (loop_vinfo)
1530       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1531     do_peeling = false;
1532
1533   if (do_peeling
1534       && all_misalignments_unknown
1535       && vect_supportable_dr_alignment (dr0, false))
1536     {
1537       /* Check if the target requires to prefer stores over loads, i.e., if
1538          misaligned stores are more expensive than misaligned loads (taking
1539          drs with same alignment into account).  */
1540       if (first_store && DR_IS_READ (dr0))
1541         {
1542           unsigned int load_inside_cost = 0, load_outside_cost = 0;
1543           unsigned int store_inside_cost = 0, store_outside_cost = 0;
1544           unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1545           unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1546           stmt_vector_for_cost dummy;
1547           dummy.create (2);
1548
1549           vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1550                                      &dummy);
1551           vect_get_data_access_cost (first_store, &store_inside_cost,
1552                                      &store_outside_cost, &dummy);
1553
1554           dummy.release ();
1555
1556           /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1557              aligning the load DR0).  */
1558           load_inside_penalty = store_inside_cost;
1559           load_outside_penalty = store_outside_cost;
1560           for (i = 0;
1561                STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1562                           DR_STMT (first_store))).iterate (i, &dr);
1563                i++)
1564             if (DR_IS_READ (dr))
1565               {
1566                 load_inside_penalty += load_inside_cost;
1567                 load_outside_penalty += load_outside_cost;
1568               }
1569             else
1570               {
1571                 load_inside_penalty += store_inside_cost;
1572                 load_outside_penalty += store_outside_cost;
1573               }
1574
1575           /* Calculate the penalty for leaving DR0 unaligned (by
1576              aligning the FIRST_STORE).  */
1577           store_inside_penalty = load_inside_cost;
1578           store_outside_penalty = load_outside_cost;
1579           for (i = 0;
1580                STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1581                       DR_STMT (dr0))).iterate (i, &dr);
1582                i++)
1583             if (DR_IS_READ (dr))
1584               {
1585                 store_inside_penalty += load_inside_cost;
1586                 store_outside_penalty += load_outside_cost;
1587               }
1588             else
1589               {
1590                 store_inside_penalty += store_inside_cost;
1591                 store_outside_penalty += store_outside_cost;
1592               }
1593
1594           if (load_inside_penalty > store_inside_penalty
1595               || (load_inside_penalty == store_inside_penalty
1596                   && load_outside_penalty > store_outside_penalty))
1597             dr0 = first_store;
1598         }
1599
1600       /* In case there are only loads with different unknown misalignments, use
1601          peeling only if it may help to align other accesses in the loop or
1602          if it may help improving load bandwidth when we'd end up using
1603          unaligned loads.  */
1604       tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1605       if (!first_store
1606           && !STMT_VINFO_SAME_ALIGN_REFS (
1607                   vinfo_for_stmt (DR_STMT (dr0))).length ()
1608           && (vect_supportable_dr_alignment (dr0, false)
1609               != dr_unaligned_supported
1610               || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1611                   == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1612         do_peeling = false;
1613     }
1614
1615   if (do_peeling && !dr0)
1616     {
1617       /* Peeling is possible, but there is no data access that is not supported
1618          unless aligned. So we try to choose the best possible peeling.  */
1619
1620       /* We should get here only if there are drs with known misalignment.  */
1621       gcc_assert (!all_misalignments_unknown);
1622
1623       /* Choose the best peeling from the hash table.  */
1624       dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1625                                                    &body_cost_vec);
1626       if (!dr0 || !npeel)
1627         do_peeling = false;
1628     }
1629
1630   if (do_peeling)
1631     {
1632       stmt = DR_STMT (dr0);
1633       stmt_info = vinfo_for_stmt (stmt);
1634       vectype = STMT_VINFO_VECTYPE (stmt_info);
1635       nelements = TYPE_VECTOR_SUBPARTS (vectype);
1636
1637       if (known_alignment_for_access_p (dr0))
1638         {
1639           bool negative = tree_int_cst_compare (DR_STEP (dr0),
1640                                                 size_zero_node) < 0;
1641           if (!npeel)
1642             {
1643               /* Since it's known at compile time, compute the number of
1644                  iterations in the peeled loop (the peeling factor) for use in
1645                  updating DR_MISALIGNMENT values.  The peeling factor is the
1646                  vectorization factor minus the misalignment as an element
1647                  count.  */
1648               mis = DR_MISALIGNMENT (dr0);
1649               mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1650               npeel = ((negative ? mis - nelements : nelements - mis)
1651                        & (nelements - 1));
1652             }
1653
1654           /* For interleaved data access every iteration accesses all the
1655              members of the group, therefore we divide the number of iterations
1656              by the group size.  */
1657           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1658           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1659             npeel /= GROUP_SIZE (stmt_info);
1660
1661           if (dump_enabled_p ())
1662             dump_printf_loc (MSG_NOTE, vect_location,
1663                              "Try peeling by %d\n", npeel);
1664         }
1665
1666       /* Ensure that all data refs can be vectorized after the peel.  */
1667       FOR_EACH_VEC_ELT (datarefs, i, dr)
1668         {
1669           int save_misalignment;
1670
1671           if (dr == dr0)
1672             continue;
1673
1674           stmt = DR_STMT (dr);
1675           stmt_info = vinfo_for_stmt (stmt);
1676           /* For interleaving, only the alignment of the first access
1677             matters.  */
1678           if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1679               && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1680             continue;
1681
1682           /* Strided accesses perform only component accesses, alignment is
1683              irrelevant for them.  */
1684           if (STMT_VINFO_STRIDED_P (stmt_info)
1685               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1686             continue;
1687
1688           save_misalignment = DR_MISALIGNMENT (dr);
1689           vect_update_misalignment_for_peel (dr, dr0, npeel);
1690           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1691           SET_DR_MISALIGNMENT (dr, save_misalignment);
1692
1693           if (!supportable_dr_alignment)
1694             {
1695               do_peeling = false;
1696               break;
1697             }
1698         }
1699
1700       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1701         {
1702           stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1703           if (!stat)
1704             do_peeling = false;
1705           else
1706             {
1707               body_cost_vec.release ();
1708               return stat;
1709             }
1710         }
1711
1712       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
1713       if (do_peeling)
1714         {
1715           unsigned max_allowed_peel
1716             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1717           if (max_allowed_peel != (unsigned)-1)
1718             {
1719               unsigned max_peel = npeel;
1720               if (max_peel == 0)
1721                 {
1722                   gimple dr_stmt = DR_STMT (dr0);
1723                   stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1724                   tree vtype = STMT_VINFO_VECTYPE (vinfo);
1725                   max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1726                 }
1727               if (max_peel > max_allowed_peel)
1728                 {
1729                   do_peeling = false;
1730                   if (dump_enabled_p ())
1731                     dump_printf_loc (MSG_NOTE, vect_location,
1732                         "Disable peeling, max peels reached: %d\n", max_peel);
1733                 }
1734             }
1735         }
1736
1737       /* Cost model #2 - if peeling may result in a remaining loop not
1738          iterating enough to be vectorized then do not peel.  */
1739       if (do_peeling
1740           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1741         {
1742           unsigned max_peel
1743             = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1744           if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1745               < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1746             do_peeling = false;
1747         }
1748
1749       if (do_peeling)
1750         {
1751           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1752              If the misalignment of DR_i is identical to that of dr0 then set
1753              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
1754              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1755              by the peeling factor times the element size of DR_i (MOD the
1756              vectorization factor times the size).  Otherwise, the
1757              misalignment of DR_i must be set to unknown.  */
1758           FOR_EACH_VEC_ELT (datarefs, i, dr)
1759             if (dr != dr0)
1760               vect_update_misalignment_for_peel (dr, dr0, npeel);
1761
1762           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1763           if (npeel)
1764             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1765           else
1766             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1767               = DR_MISALIGNMENT (dr0);
1768           SET_DR_MISALIGNMENT (dr0, 0);
1769           if (dump_enabled_p ())
1770             {
1771               dump_printf_loc (MSG_NOTE, vect_location,
1772                                "Alignment of access forced using peeling.\n");
1773               dump_printf_loc (MSG_NOTE, vect_location,
1774                                "Peeling for alignment will be applied.\n");
1775             }
1776           /* The inside-loop cost will be accounted for in vectorizable_load
1777              and vectorizable_store correctly with adjusted alignments.
1778              Drop the body_cost_vec on the floor here.  */
1779           body_cost_vec.release ();
1780
1781           stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1782           gcc_assert (stat);
1783           return stat;
1784         }
1785     }
1786
1787   body_cost_vec.release ();
1788
1789   /* (2) Versioning to force alignment.  */
1790
1791   /* Try versioning if:
1792      1) optimize loop for speed
1793      2) there is at least one unsupported misaligned data ref with an unknown
1794         misalignment, and
1795      3) all misaligned data refs with a known misalignment are supported, and
1796      4) the number of runtime alignment checks is within reason.  */
1797
1798   do_versioning =
1799         optimize_loop_nest_for_speed_p (loop)
1800         && (!loop->inner); /* FORNOW */
1801
1802   if (do_versioning)
1803     {
1804       FOR_EACH_VEC_ELT (datarefs, i, dr)
1805         {
1806           stmt = DR_STMT (dr);
1807           stmt_info = vinfo_for_stmt (stmt);
1808
1809           /* For interleaving, only the alignment of the first access
1810              matters.  */
1811           if (aligned_access_p (dr)
1812               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1813                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1814             continue;
1815
1816           if (STMT_VINFO_STRIDED_P (stmt_info))
1817             {
1818               /* Strided loads perform only component accesses, alignment is
1819                  irrelevant for them.  */
1820               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1821                 continue;
1822               do_versioning = false;
1823               break;
1824             }
1825
1826           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1827
1828           if (!supportable_dr_alignment)
1829             {
1830               gimple stmt;
1831               int mask;
1832               tree vectype;
1833
1834               if (known_alignment_for_access_p (dr)
1835                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1836                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1837                 {
1838                   do_versioning = false;
1839                   break;
1840                 }
1841
1842               stmt = DR_STMT (dr);
1843               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1844               gcc_assert (vectype);
1845
1846               /* The rightmost bits of an aligned address must be zeros.
1847                  Construct the mask needed for this test.  For example,
1848                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1849                  mask must be 15 = 0xf. */
1850               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1851
1852               /* FORNOW: use the same mask to test all potentially unaligned
1853                  references in the loop.  The vectorizer currently supports
1854                  a single vector size, see the reference to
1855                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1856                  vectorization factor is computed.  */
1857               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1858                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1859               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1860               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1861                       DR_STMT (dr));
1862             }
1863         }
1864
1865       /* Versioning requires at least one misaligned data reference.  */
1866       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1867         do_versioning = false;
1868       else if (!do_versioning)
1869         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1870     }
1871
1872   if (do_versioning)
1873     {
1874       vec<gimple> may_misalign_stmts
1875         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1876       gimple stmt;
1877
1878       /* It can now be assumed that the data references in the statements
1879          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1880          of the loop being vectorized.  */
1881       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1882         {
1883           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1884           dr = STMT_VINFO_DATA_REF (stmt_info);
1885           SET_DR_MISALIGNMENT (dr, 0);
1886           if (dump_enabled_p ())
1887             dump_printf_loc (MSG_NOTE, vect_location,
1888                              "Alignment of access forced using versioning.\n");
1889         }
1890
1891       if (dump_enabled_p ())
1892         dump_printf_loc (MSG_NOTE, vect_location,
1893                          "Versioning for alignment will be applied.\n");
1894
1895       /* Peeling and versioning can't be done together at this time.  */
1896       gcc_assert (! (do_peeling && do_versioning));
1897
1898       stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1899       gcc_assert (stat);
1900       return stat;
1901     }
1902
1903   /* This point is reached if neither peeling nor versioning is being done.  */
1904   gcc_assert (! (do_peeling || do_versioning));
1905
1906   stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1907   return stat;
1908 }
1909
1910
1911 /* Function vect_find_same_alignment_drs.
1912
1913    Update group and alignment relations according to the chosen
1914    vectorization factor.  */
1915
1916 static void
1917 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1918                               loop_vec_info loop_vinfo)
1919 {
1920   unsigned int i;
1921   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1922   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1923   struct data_reference *dra = DDR_A (ddr);
1924   struct data_reference *drb = DDR_B (ddr);
1925   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1926   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1927   int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1928   int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1929   lambda_vector dist_v;
1930   unsigned int loop_depth;
1931
1932   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1933     return;
1934
1935   if (dra == drb)
1936     return;
1937
1938   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1939     return;
1940
1941   /* Loop-based vectorization and known data dependence.  */
1942   if (DDR_NUM_DIST_VECTS (ddr) == 0)
1943     return;
1944
1945   /* Data-dependence analysis reports a distance vector of zero
1946      for data-references that overlap only in the first iteration
1947      but have different sign step (see PR45764).
1948      So as a sanity check require equal DR_STEP.  */
1949   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1950     return;
1951
1952   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1953   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1954     {
1955       int dist = dist_v[loop_depth];
1956
1957       if (dump_enabled_p ())
1958         dump_printf_loc (MSG_NOTE, vect_location,
1959                          "dependence distance  = %d.\n", dist);
1960
1961       /* Same loop iteration.  */
1962       if (dist == 0
1963           || (dist % vectorization_factor == 0 && dra_size == drb_size))
1964         {
1965           /* Two references with distance zero have the same alignment.  */
1966           STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1967           STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1968           if (dump_enabled_p ())
1969             {
1970               dump_printf_loc (MSG_NOTE, vect_location,
1971                                "accesses have the same alignment.\n");
1972               dump_printf (MSG_NOTE,
1973                            "dependence distance modulo vf == 0 between ");
1974               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1975               dump_printf (MSG_NOTE,  " and ");
1976               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1977               dump_printf (MSG_NOTE, "\n");
1978             }
1979         }
1980     }
1981 }
1982
1983
1984 /* Function vect_analyze_data_refs_alignment
1985
1986    Analyze the alignment of the data-references in the loop.
1987    Return FALSE if a data reference is found that cannot be vectorized.  */
1988
1989 bool
1990 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1991                                   bb_vec_info bb_vinfo)
1992 {
1993   if (dump_enabled_p ())
1994     dump_printf_loc (MSG_NOTE, vect_location,
1995                      "=== vect_analyze_data_refs_alignment ===\n");
1996
1997   /* Mark groups of data references with same alignment using
1998      data dependence information.  */
1999   if (loop_vinfo)
2000     {
2001       vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2002       struct data_dependence_relation *ddr;
2003       unsigned int i;
2004
2005       FOR_EACH_VEC_ELT (ddrs, i, ddr)
2006         vect_find_same_alignment_drs (ddr, loop_vinfo);
2007     }
2008
2009   if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2010     {
2011       if (dump_enabled_p ())
2012         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2013                          "not vectorized: can't calculate alignment "
2014                          "for data ref.\n");
2015       return false;
2016     }
2017
2018   return true;
2019 }
2020
2021
2022 /* Analyze groups of accesses: check that DR belongs to a group of
2023    accesses of legal size, step, etc.  Detect gaps, single element
2024    interleaving, and other special cases. Set grouped access info.
2025    Collect groups of strided stores for further use in SLP analysis.  */
2026
2027 static bool
2028 vect_analyze_group_access (struct data_reference *dr)
2029 {
2030   tree step = DR_STEP (dr);
2031   tree scalar_type = TREE_TYPE (DR_REF (dr));
2032   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2033   gimple stmt = DR_STMT (dr);
2034   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2035   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2036   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2037   HOST_WIDE_INT dr_step = -1;
2038   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2039   bool slp_impossible = false;
2040   struct loop *loop = NULL;
2041
2042   if (loop_vinfo)
2043     loop = LOOP_VINFO_LOOP (loop_vinfo);
2044
2045   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2046      size of the interleaving group (including gaps).  */
2047   if (tree_fits_shwi_p (step))
2048     {
2049       dr_step = tree_to_shwi (step);
2050       groupsize = absu_hwi (dr_step) / type_size;
2051     }
2052   else
2053     groupsize = 0;
2054
2055   /* Not consecutive access is possible only if it is a part of interleaving.  */
2056   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2057     {
2058       /* Check if it this DR is a part of interleaving, and is a single
2059          element of the group that is accessed in the loop.  */
2060
2061       /* Gaps are supported only for loads. STEP must be a multiple of the type
2062          size.  The size of the group must be a power of 2.  */
2063       if (DR_IS_READ (dr)
2064           && (dr_step % type_size) == 0
2065           && groupsize > 0
2066           && exact_log2 (groupsize) != -1)
2067         {
2068           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2069           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2070           if (dump_enabled_p ())
2071             {
2072               dump_printf_loc (MSG_NOTE, vect_location,
2073                                "Detected single element interleaving ");
2074               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2075               dump_printf (MSG_NOTE, " step ");
2076               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2077               dump_printf (MSG_NOTE, "\n");
2078             }
2079
2080           if (loop_vinfo)
2081             {
2082               if (dump_enabled_p ())
2083                 dump_printf_loc (MSG_NOTE, vect_location,
2084                                  "Data access with gaps requires scalar "
2085                                  "epilogue loop\n");
2086               if (loop->inner)
2087                 {
2088                   if (dump_enabled_p ())
2089                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2090                                      "Peeling for outer loop is not"
2091                                      " supported\n");
2092                   return false;
2093                 }
2094
2095               LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2096             }
2097
2098           return true;
2099         }
2100
2101       if (dump_enabled_p ())
2102         {
2103           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2104                            "not consecutive access ");
2105           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2106           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2107         }
2108
2109       if (bb_vinfo)
2110         {
2111           /* Mark the statement as unvectorizable.  */
2112           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2113           return true;
2114         }
2115
2116       return false;
2117     }
2118
2119   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2120     {
2121       /* First stmt in the interleaving chain. Check the chain.  */
2122       gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2123       struct data_reference *data_ref = dr;
2124       unsigned int count = 1;
2125       tree prev_init = DR_INIT (data_ref);
2126       gimple prev = stmt;
2127       HOST_WIDE_INT diff, gaps = 0;
2128
2129       while (next)
2130         {
2131           /* Skip same data-refs.  In case that two or more stmts share
2132              data-ref (supported only for loads), we vectorize only the first
2133              stmt, and the rest get their vectorized loads from the first
2134              one.  */
2135           if (!tree_int_cst_compare (DR_INIT (data_ref),
2136                                      DR_INIT (STMT_VINFO_DATA_REF (
2137                                                    vinfo_for_stmt (next)))))
2138             {
2139               if (DR_IS_WRITE (data_ref))
2140                 {
2141                   if (dump_enabled_p ())
2142                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2143                                      "Two store stmts share the same dr.\n");
2144                   return false;
2145                 }
2146
2147               /* For load use the same data-ref load.  */
2148               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2149
2150               prev = next;
2151               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2152               continue;
2153             }
2154
2155           prev = next;
2156           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2157
2158           /* All group members have the same STEP by construction.  */
2159           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2160
2161           /* Check that the distance between two accesses is equal to the type
2162              size. Otherwise, we have gaps.  */
2163           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2164                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2165           if (diff != 1)
2166             {
2167               /* FORNOW: SLP of accesses with gaps is not supported.  */
2168               slp_impossible = true;
2169               if (DR_IS_WRITE (data_ref))
2170                 {
2171                   if (dump_enabled_p ())
2172                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2173                                      "interleaved store with gaps\n");
2174                   return false;
2175                 }
2176
2177               gaps += diff - 1;
2178             }
2179
2180           last_accessed_element += diff;
2181
2182           /* Store the gap from the previous member of the group. If there is no
2183              gap in the access, GROUP_GAP is always 1.  */
2184           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2185
2186           prev_init = DR_INIT (data_ref);
2187           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2188           /* Count the number of data-refs in the chain.  */
2189           count++;
2190         }
2191
2192       if (groupsize == 0)
2193         groupsize = count + gaps;
2194
2195       /* Check that the size of the interleaving is equal to count for stores,
2196          i.e., that there are no gaps.  */
2197       if (groupsize != count
2198           && !DR_IS_READ (dr))
2199         {
2200           if (dump_enabled_p ())
2201             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2202                              "interleaved store with gaps\n");
2203           return false;
2204         }
2205
2206       /* If there is a gap after the last load in the group it is the
2207          difference between the groupsize and the last accessed
2208          element.
2209          When there is no gap, this difference should be 0.  */
2210       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2211
2212       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2213       if (dump_enabled_p ())
2214         {
2215           dump_printf_loc (MSG_NOTE, vect_location,
2216                            "Detected interleaving of size %d starting with ",
2217                            (int)groupsize);
2218           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2219           if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2220             dump_printf_loc (MSG_NOTE, vect_location,
2221                              "There is a gap of %d elements after the group\n",
2222                              (int)GROUP_GAP (vinfo_for_stmt (stmt)));
2223         }
2224
2225       /* SLP: create an SLP data structure for every interleaving group of
2226          stores for further analysis in vect_analyse_slp.  */
2227       if (DR_IS_WRITE (dr) && !slp_impossible)
2228         {
2229           if (loop_vinfo)
2230             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2231           if (bb_vinfo)
2232             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2233         }
2234
2235       /* There is a gap in the end of the group.  */
2236       if (groupsize - last_accessed_element > 0 && loop_vinfo)
2237         {
2238           if (dump_enabled_p ())
2239             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2240                              "Data access with gaps requires scalar "
2241                              "epilogue loop\n");
2242           if (loop->inner)
2243             {
2244               if (dump_enabled_p ())
2245                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2246                                  "Peeling for outer loop is not supported\n");
2247               return false;
2248             }
2249
2250           LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2251         }
2252     }
2253
2254   return true;
2255 }
2256
2257
2258 /* Analyze the access pattern of the data-reference DR.
2259    In case of non-consecutive accesses call vect_analyze_group_access() to
2260    analyze groups of accesses.  */
2261
2262 static bool
2263 vect_analyze_data_ref_access (struct data_reference *dr)
2264 {
2265   tree step = DR_STEP (dr);
2266   tree scalar_type = TREE_TYPE (DR_REF (dr));
2267   gimple stmt = DR_STMT (dr);
2268   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2269   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2270   struct loop *loop = NULL;
2271
2272   if (loop_vinfo)
2273     loop = LOOP_VINFO_LOOP (loop_vinfo);
2274
2275   if (loop_vinfo && !step)
2276     {
2277       if (dump_enabled_p ())
2278         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2279                          "bad data-ref access in loop\n");
2280       return false;
2281     }
2282
2283   /* Allow loads with zero step in inner-loop vectorization.  */
2284   if (loop_vinfo && integer_zerop (step))
2285     {
2286       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2287       if (!nested_in_vect_loop_p (loop, stmt))
2288         return DR_IS_READ (dr);
2289       /* Allow references with zero step for outer loops marked
2290          with pragma omp simd only - it guarantees absence of
2291          loop-carried dependencies between inner loop iterations.  */
2292       if (!loop->force_vectorize)
2293         {
2294           if (dump_enabled_p ())
2295             dump_printf_loc (MSG_NOTE, vect_location,
2296                              "zero step in inner loop of nest\n");
2297           return false;
2298         }
2299     }
2300
2301   if (loop && nested_in_vect_loop_p (loop, stmt))
2302     {
2303       /* Interleaved accesses are not yet supported within outer-loop
2304         vectorization for references in the inner-loop.  */
2305       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2306
2307       /* For the rest of the analysis we use the outer-loop step.  */
2308       step = STMT_VINFO_DR_STEP (stmt_info);
2309       if (integer_zerop (step))
2310         {
2311           if (dump_enabled_p ())
2312             dump_printf_loc (MSG_NOTE, vect_location,
2313                              "zero step in outer loop.\n");
2314           if (DR_IS_READ (dr))
2315             return true;
2316           else
2317             return false;
2318         }
2319     }
2320
2321   /* Consecutive?  */
2322   if (TREE_CODE (step) == INTEGER_CST)
2323     {
2324       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2325       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2326           || (dr_step < 0
2327               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2328         {
2329           /* Mark that it is not interleaving.  */
2330           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2331           return true;
2332         }
2333     }
2334
2335   if (loop && nested_in_vect_loop_p (loop, stmt))
2336     {
2337       if (dump_enabled_p ())
2338         dump_printf_loc (MSG_NOTE, vect_location,
2339                          "grouped access in outer loop.\n");
2340       return false;
2341     }
2342
2343
2344   /* Assume this is a DR handled by non-constant strided load case.  */
2345   if (TREE_CODE (step) != INTEGER_CST)
2346     return (STMT_VINFO_STRIDED_P (stmt_info)
2347             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2348                 || vect_analyze_group_access (dr)));
2349
2350   /* Not consecutive access - check if it's a part of interleaving group.  */
2351   return vect_analyze_group_access (dr);
2352 }
2353
2354
2355
2356 /*  A helper function used in the comparator function to sort data
2357     references.  T1 and T2 are two data references to be compared.
2358     The function returns -1, 0, or 1.  */
2359
2360 static int
2361 compare_tree (tree t1, tree t2)
2362 {
2363   int i, cmp;
2364   enum tree_code code;
2365   char tclass;
2366
2367   if (t1 == t2)
2368     return 0;
2369   if (t1 == NULL)
2370     return -1;
2371   if (t2 == NULL)
2372     return 1;
2373
2374
2375   if (TREE_CODE (t1) != TREE_CODE (t2))
2376     return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2377
2378   code = TREE_CODE (t1);
2379   switch (code)
2380     {
2381     /* For const values, we can just use hash values for comparisons.  */
2382     case INTEGER_CST:
2383     case REAL_CST:
2384     case FIXED_CST:
2385     case STRING_CST:
2386     case COMPLEX_CST:
2387     case VECTOR_CST:
2388       {
2389         hashval_t h1 = iterative_hash_expr (t1, 0);
2390         hashval_t h2 = iterative_hash_expr (t2, 0);
2391         if (h1 != h2)
2392           return h1 < h2 ? -1 : 1;
2393         break;
2394       }
2395
2396     case SSA_NAME:
2397       cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2398       if (cmp != 0)
2399         return cmp;
2400
2401       if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2402         return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2403       break;
2404
2405     default:
2406       tclass = TREE_CODE_CLASS (code);
2407
2408       /* For var-decl, we could compare their UIDs.  */
2409       if (tclass == tcc_declaration)
2410         {
2411           if (DECL_UID (t1) != DECL_UID (t2))
2412             return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2413           break;
2414         }
2415
2416       /* For expressions with operands, compare their operands recursively.  */
2417       for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2418         {
2419           cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2420           if (cmp != 0)
2421             return cmp;
2422         }
2423     }
2424
2425   return 0;
2426 }
2427
2428
2429 /* Compare two data-references DRA and DRB to group them into chunks
2430    suitable for grouping.  */
2431
2432 static int
2433 dr_group_sort_cmp (const void *dra_, const void *drb_)
2434 {
2435   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2436   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2437   int cmp;
2438
2439   /* Stabilize sort.  */
2440   if (dra == drb)
2441     return 0;
2442
2443   /* Ordering of DRs according to base.  */
2444   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2445     {
2446       cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2447       if (cmp != 0)
2448         return cmp;
2449     }
2450
2451   /* And according to DR_OFFSET.  */
2452   if (!dr_equal_offsets_p (dra, drb))
2453     {
2454       cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2455       if (cmp != 0)
2456         return cmp;
2457     }
2458
2459   /* Put reads before writes.  */
2460   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2461     return DR_IS_READ (dra) ? -1 : 1;
2462
2463   /* Then sort after access size.  */
2464   if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2465                         TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2466     {
2467       cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2468                           TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2469       if (cmp != 0)
2470         return cmp;
2471     }
2472
2473   /* And after step.  */
2474   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2475     {
2476       cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2477       if (cmp != 0)
2478         return cmp;
2479     }
2480
2481   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2482   cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2483   if (cmp == 0)
2484     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2485   return cmp;
2486 }
2487
2488 /* Function vect_analyze_data_ref_accesses.
2489
2490    Analyze the access pattern of all the data references in the loop.
2491
2492    FORNOW: the only access pattern that is considered vectorizable is a
2493            simple step 1 (consecutive) access.
2494
2495    FORNOW: handle only arrays and pointer accesses.  */
2496
2497 bool
2498 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2499 {
2500   unsigned int i;
2501   vec<data_reference_p> datarefs;
2502   struct data_reference *dr;
2503
2504   if (dump_enabled_p ())
2505     dump_printf_loc (MSG_NOTE, vect_location,
2506                      "=== vect_analyze_data_ref_accesses ===\n");
2507
2508   if (loop_vinfo)
2509     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2510   else
2511     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2512
2513   if (datarefs.is_empty ())
2514     return true;
2515
2516   /* Sort the array of datarefs to make building the interleaving chains
2517      linear.  Don't modify the original vector's order, it is needed for
2518      determining what dependencies are reversed.  */
2519   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2520   datarefs_copy.qsort (dr_group_sort_cmp);
2521
2522   /* Build the interleaving chains.  */
2523   for (i = 0; i < datarefs_copy.length () - 1;)
2524     {
2525       data_reference_p dra = datarefs_copy[i];
2526       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2527       stmt_vec_info lastinfo = NULL;
2528       for (i = i + 1; i < datarefs_copy.length (); ++i)
2529         {
2530           data_reference_p drb = datarefs_copy[i];
2531           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2532
2533           /* ???  Imperfect sorting (non-compatible types, non-modulo
2534              accesses, same accesses) can lead to a group to be artificially
2535              split here as we don't just skip over those.  If it really
2536              matters we can push those to a worklist and re-iterate
2537              over them.  The we can just skip ahead to the next DR here.  */
2538
2539           /* Check that the data-refs have same first location (except init)
2540              and they are both either store or load (not load and store,
2541              not masked loads or stores).  */
2542           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2543               || !operand_equal_p (DR_BASE_ADDRESS (dra),
2544                                    DR_BASE_ADDRESS (drb), 0)
2545               || !dr_equal_offsets_p (dra, drb)
2546               || !gimple_assign_single_p (DR_STMT (dra))
2547               || !gimple_assign_single_p (DR_STMT (drb)))
2548             break;
2549
2550           /* Check that the data-refs have the same constant size.  */
2551           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2552           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2553           if (!tree_fits_uhwi_p (sza)
2554               || !tree_fits_uhwi_p (szb)
2555               || !tree_int_cst_equal (sza, szb))
2556             break;
2557
2558           /* Check that the data-refs have the same step.  */
2559           if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2560             break;
2561
2562           /* Do not place the same access in the interleaving chain twice.  */
2563           if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2564             break;
2565
2566           /* Check the types are compatible.
2567              ???  We don't distinguish this during sorting.  */
2568           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2569                                    TREE_TYPE (DR_REF (drb))))
2570             break;
2571
2572           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2573           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2574           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2575           gcc_assert (init_a < init_b);
2576
2577           /* If init_b == init_a + the size of the type * k, we have an
2578              interleaving, and DRA is accessed before DRB.  */
2579           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2580           if ((init_b - init_a) % type_size_a != 0)
2581             break;
2582
2583           /* If we have a store, the accesses are adjacent.  This splits
2584              groups into chunks we support (we don't support vectorization
2585              of stores with gaps).  */
2586           if (!DR_IS_READ (dra)
2587               && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2588                                              (DR_INIT (datarefs_copy[i-1]))
2589                   != type_size_a))
2590             break;
2591
2592           /* If the step (if not zero or non-constant) is greater than the
2593              difference between data-refs' inits this splits groups into
2594              suitable sizes.  */
2595           if (tree_fits_shwi_p (DR_STEP (dra)))
2596             {
2597               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2598               if (step != 0 && step <= (init_b - init_a))
2599                 break;
2600             }
2601
2602           if (dump_enabled_p ())
2603             {
2604               dump_printf_loc (MSG_NOTE, vect_location,
2605                                "Detected interleaving ");
2606               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2607               dump_printf (MSG_NOTE,  " and ");
2608               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2609               dump_printf (MSG_NOTE, "\n");
2610             }
2611
2612           /* Link the found element into the group list.  */
2613           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2614             {
2615               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2616               lastinfo = stmtinfo_a;
2617             }
2618           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2619           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2620           lastinfo = stmtinfo_b;
2621         }
2622     }
2623
2624   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2625     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2626         && !vect_analyze_data_ref_access (dr))
2627       {
2628         if (dump_enabled_p ())
2629           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2630                            "not vectorized: complicated access pattern.\n");
2631
2632         if (bb_vinfo)
2633           {
2634             /* Mark the statement as not vectorizable.  */
2635             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2636             continue;
2637           }
2638         else
2639           {
2640             datarefs_copy.release ();
2641             return false;
2642           }
2643       }
2644
2645   datarefs_copy.release ();
2646   return true;
2647 }
2648
2649
2650 /* Operator == between two dr_with_seg_len objects.
2651
2652    This equality operator is used to make sure two data refs
2653    are the same one so that we will consider to combine the
2654    aliasing checks of those two pairs of data dependent data
2655    refs.  */
2656
2657 static bool
2658 operator == (const dr_with_seg_len& d1,
2659              const dr_with_seg_len& d2)
2660 {
2661   return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2662                           DR_BASE_ADDRESS (d2.dr), 0)
2663            && compare_tree (d1.offset, d2.offset) == 0
2664            && compare_tree (d1.seg_len, d2.seg_len) == 0;
2665 }
2666
2667 /* Function comp_dr_with_seg_len_pair.
2668
2669    Comparison function for sorting objects of dr_with_seg_len_pair_t
2670    so that we can combine aliasing checks in one scan.  */
2671
2672 static int
2673 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2674 {
2675   const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2676   const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2677
2678   const dr_with_seg_len &p11 = p1->first,
2679                         &p12 = p1->second,
2680                         &p21 = p2->first,
2681                         &p22 = p2->second;
2682
2683   /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2684      if a and c have the same basic address snd step, and b and d have the same
2685      address and step.  Therefore, if any a&c or b&d don't have the same address
2686      and step, we don't care the order of those two pairs after sorting.  */
2687   int comp_res;
2688
2689   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2690                                 DR_BASE_ADDRESS (p21.dr))) != 0)
2691     return comp_res;
2692   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2693                                 DR_BASE_ADDRESS (p22.dr))) != 0)
2694     return comp_res;
2695   if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2696     return comp_res;
2697   if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2698     return comp_res;
2699   if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2700     return comp_res;
2701   if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2702     return comp_res;
2703
2704   return 0;
2705 }
2706
2707 /* Function vect_vfa_segment_size.
2708
2709    Create an expression that computes the size of segment
2710    that will be accessed for a data reference.  The functions takes into
2711    account that realignment loads may access one more vector.
2712
2713    Input:
2714      DR: The data reference.
2715      LENGTH_FACTOR: segment length to consider.
2716
2717    Return an expression whose value is the size of segment which will be
2718    accessed by DR.  */
2719
2720 static tree
2721 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2722 {
2723   tree segment_length;
2724
2725   if (integer_zerop (DR_STEP (dr)))
2726     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2727   else
2728     segment_length = size_binop (MULT_EXPR,
2729                                  fold_convert (sizetype, DR_STEP (dr)),
2730                                  fold_convert (sizetype, length_factor));
2731
2732   if (vect_supportable_dr_alignment (dr, false)
2733         == dr_explicit_realign_optimized)
2734     {
2735       tree vector_size = TYPE_SIZE_UNIT
2736                           (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2737
2738       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2739     }
2740   return segment_length;
2741 }
2742
2743 /* Function vect_prune_runtime_alias_test_list.
2744
2745    Prune a list of ddrs to be tested at run-time by versioning for alias.
2746    Merge several alias checks into one if possible.
2747    Return FALSE if resulting list of ddrs is longer then allowed by
2748    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
2749
2750 bool
2751 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2752 {
2753   vec<ddr_p> may_alias_ddrs =
2754     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2755   vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2756     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2757   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2758   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2759
2760   ddr_p ddr;
2761   unsigned int i;
2762   tree length_factor;
2763
2764   if (dump_enabled_p ())
2765     dump_printf_loc (MSG_NOTE, vect_location,
2766                      "=== vect_prune_runtime_alias_test_list ===\n");
2767
2768   if (may_alias_ddrs.is_empty ())
2769     return true;
2770
2771   /* Basically, for each pair of dependent data refs store_ptr_0
2772      and load_ptr_0, we create an expression:
2773
2774      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2775      || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2776
2777      for aliasing checks.  However, in some cases we can decrease
2778      the number of checks by combining two checks into one.  For
2779      example, suppose we have another pair of data refs store_ptr_0
2780      and load_ptr_1, and if the following condition is satisfied:
2781
2782      load_ptr_0 < load_ptr_1  &&
2783      load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2784
2785      (this condition means, in each iteration of vectorized loop,
2786      the accessed memory of store_ptr_0 cannot be between the memory
2787      of load_ptr_0 and load_ptr_1.)
2788
2789      we then can use only the following expression to finish the
2790      alising checks between store_ptr_0 & load_ptr_0 and
2791      store_ptr_0 & load_ptr_1:
2792
2793      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2794      || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2795
2796      Note that we only consider that load_ptr_0 and load_ptr_1 have the
2797      same basic address.  */
2798
2799   comp_alias_ddrs.create (may_alias_ddrs.length ());
2800
2801   /* First, we collect all data ref pairs for aliasing checks.  */
2802   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2803     {
2804       struct data_reference *dr_a, *dr_b;
2805       gimple dr_group_first_a, dr_group_first_b;
2806       tree segment_length_a, segment_length_b;
2807       gimple stmt_a, stmt_b;
2808
2809       dr_a = DDR_A (ddr);
2810       stmt_a = DR_STMT (DDR_A (ddr));
2811       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2812       if (dr_group_first_a)
2813         {
2814           stmt_a = dr_group_first_a;
2815           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2816         }
2817
2818       dr_b = DDR_B (ddr);
2819       stmt_b = DR_STMT (DDR_B (ddr));
2820       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2821       if (dr_group_first_b)
2822         {
2823           stmt_b = dr_group_first_b;
2824           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2825         }
2826
2827       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2828         length_factor = scalar_loop_iters;
2829       else
2830         length_factor = size_int (vect_factor);
2831       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2832       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2833
2834       dr_with_seg_len_pair_t dr_with_seg_len_pair
2835           (dr_with_seg_len (dr_a, segment_length_a),
2836            dr_with_seg_len (dr_b, segment_length_b));
2837
2838       if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2839         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2840
2841       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2842     }
2843
2844   /* Second, we sort the collected data ref pairs so that we can scan
2845      them once to combine all possible aliasing checks.  */
2846   comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2847
2848   /* Third, we scan the sorted dr pairs and check if we can combine
2849      alias checks of two neighbouring dr pairs.  */
2850   for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2851     {
2852       /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2).  */
2853       dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2854                       *dr_b1 = &comp_alias_ddrs[i-1].second,
2855                       *dr_a2 = &comp_alias_ddrs[i].first,
2856                       *dr_b2 = &comp_alias_ddrs[i].second;
2857
2858       /* Remove duplicate data ref pairs.  */
2859       if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2860         {
2861           if (dump_enabled_p ())
2862             {
2863               dump_printf_loc (MSG_NOTE, vect_location,
2864                                "found equal ranges ");
2865               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2866                                  DR_REF (dr_a1->dr));
2867               dump_printf (MSG_NOTE,  ", ");
2868               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2869                                  DR_REF (dr_b1->dr));
2870               dump_printf (MSG_NOTE,  " and ");
2871               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2872                                  DR_REF (dr_a2->dr));
2873               dump_printf (MSG_NOTE,  ", ");
2874               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2875                                  DR_REF (dr_b2->dr));
2876               dump_printf (MSG_NOTE, "\n");
2877             }
2878
2879           comp_alias_ddrs.ordered_remove (i--);
2880           continue;
2881         }
2882
2883       if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2884         {
2885           /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2886              and DR_A1 and DR_A2 are two consecutive memrefs.  */
2887           if (*dr_a1 == *dr_a2)
2888             {
2889               std::swap (dr_a1, dr_b1);
2890               std::swap (dr_a2, dr_b2);
2891             }
2892
2893           if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2894                                 DR_BASE_ADDRESS (dr_a2->dr),
2895                                 0)
2896               || !tree_fits_shwi_p (dr_a1->offset)
2897               || !tree_fits_shwi_p (dr_a2->offset))
2898             continue;
2899
2900           HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2901                                 - tree_to_shwi (dr_a1->offset));
2902
2903
2904           /* Now we check if the following condition is satisfied:
2905
2906              DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2907
2908              where DIFF = DR_A2->OFFSET - DR_A1->OFFSET.  However,
2909              SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2910              have to make a best estimation.  We can get the minimum value
2911              of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2912              then either of the following two conditions can guarantee the
2913              one above:
2914
2915              1: DIFF <= MIN_SEG_LEN_B
2916              2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2917
2918              */
2919
2920           HOST_WIDE_INT  min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2921                                           ? tree_to_shwi (dr_b1->seg_len)
2922                                           : vect_factor);
2923
2924           if (diff <= min_seg_len_b
2925               || (tree_fits_shwi_p (dr_a1->seg_len)
2926                   && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2927             {
2928               if (dump_enabled_p ())
2929                 {
2930                   dump_printf_loc (MSG_NOTE, vect_location,
2931                                    "merging ranges for ");
2932                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2933                                      DR_REF (dr_a1->dr));
2934                   dump_printf (MSG_NOTE,  ", ");
2935                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2936                                      DR_REF (dr_b1->dr));
2937                   dump_printf (MSG_NOTE,  " and ");
2938                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2939                                      DR_REF (dr_a2->dr));
2940                   dump_printf (MSG_NOTE,  ", ");
2941                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2942                                      DR_REF (dr_b2->dr));
2943                   dump_printf (MSG_NOTE, "\n");
2944                 }
2945
2946               dr_a1->seg_len = size_binop (PLUS_EXPR,
2947                                            dr_a2->seg_len, size_int (diff));
2948               comp_alias_ddrs.ordered_remove (i--);
2949             }
2950         }
2951     }
2952
2953   dump_printf_loc (MSG_NOTE, vect_location,
2954                    "improved number of alias checks from %d to %d\n",
2955                    may_alias_ddrs.length (), comp_alias_ddrs.length ());
2956   if ((int) comp_alias_ddrs.length () >
2957       PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2958     return false;
2959
2960   return true;
2961 }
2962
/* Check whether a non-affine read in STMT is suitable for gather load
   and if so, return a builtin decl for that operation.

   The address of the data reference of STMT is decomposed into a part
   BASE that is invariant in the loop of LOOP_VINFO, a single SSA_NAME OFF
   and a constant SCALE.  The decl is obtained from the target hook
   targetm.vectorize.builtin_gather for the statement's vector type, the
   type of the offset and SCALE.

   On success the decomposition is stored through BASEP, OFFP and SCALEP,
   each of which may be NULL if the caller is not interested in it.
   Return NULL_TREE if the address cannot be decomposed this way or the
   target provides no builtin.  */

tree
vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
                   tree *offp, int *scalep)
{
  HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
  /* OFFTYPE stays NULL_TREE until a widening conversion of the offset is
     found; setting it also terminates the peeling loop below.  */
  tree offtype = NULL_TREE;
  tree decl, base, off;
  machine_mode pmode;
  int punsignedp, pvolatilep;

  base = DR_REF (dr);
  /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
     see if we can use the def stmt of the address.  */
  if (is_gimple_call (stmt)
      && gimple_call_internal_p (stmt)
      && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
          || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
      && TREE_CODE (base) == MEM_REF
      && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
      && integer_zerop (TREE_OPERAND (base, 1))
      && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
    {
      gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
      /* If the address is computed as &REF, continue with REF itself.  */
      if (is_gimple_assign (def_stmt)
          && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
        base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
    }

  /* The gather builtins need address of the form
     loop_invariant + vector * {1, 2, 4, 8}
     or
     loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
     Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
     of loop invariants/SSA_NAMEs defined in the loop, with casts,
     multiplications and additions in it.  To get a vector, we need
     a single SSA_NAME that will be defined in the loop and will
     contain everything that is not loop invariant and that can be
     vectorized.  The following code attempts to find such a preexisting
     SSA_NAME OFF and put the loop invariants into a tree BASE
     that can be gimplified before the loop.  */
  base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
                              &pmode, &punsignedp, &pvolatilep, false);
  gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);

  if (TREE_CODE (base) == MEM_REF)
    {
      /* Fold a non-zero MEM_REF offset into OFF and continue with the
         pointer operand as BASE.  */
      if (!integer_zerop (TREE_OPERAND (base, 1)))
        {
          if (off == NULL_TREE)
            {
              offset_int moff = mem_ref_offset (base);
              off = wide_int_to_tree (sizetype, moff);
            }
          else
            off = size_binop (PLUS_EXPR, off,
                              fold_convert (sizetype, TREE_OPERAND (base, 1)));
        }
      base = TREE_OPERAND (base, 0);
    }
  else
    /* Anything else is an object; use its address.  */
    base = build_fold_addr_expr (base);

  if (off == NULL_TREE)
    off = size_zero_node;

  /* If base is not loop invariant, either off is 0, then we start with just
     the constant offset in the loop invariant BASE and continue with base
     as OFF, otherwise give up.
     We could handle that case by gimplifying the addition of base + off
     into some SSA_NAME and use that as off, but for now punt.  */
  if (!expr_invariant_in_loop_p (loop, base))
    {
      if (!integer_zerop (off))
        return NULL_TREE;
      off = base;
      base = size_int (pbitpos / BITS_PER_UNIT);
    }
  /* Otherwise put base + constant offset into the loop invariant BASE
     and continue with OFF.  */
  else
    {
      base = fold_convert (sizetype, base);
      base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
    }

  /* OFF at this point may be either a SSA_NAME or some tree expression
     from get_inner_reference.  Try to peel off loop invariants from it
     into BASE as long as possible.  Each `continue' below restarts the
     loop with a simplified OFF; falling out of the switch ends the
     peeling.  */
  STRIP_NOPS (off);
  while (offtype == NULL_TREE)
    {
      enum tree_code code;
      tree op0, op1, add = NULL_TREE;

      if (TREE_CODE (off) == SSA_NAME)
        {
          gimple def_stmt = SSA_NAME_DEF_STMT (off);

          /* A loop-invariant OFF leaves nothing to use as the vector
             offset.  */
          if (expr_invariant_in_loop_p (loop, off))
            return NULL_TREE;

          /* Only assignments can be looked through.  */
          if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
            break;

          op0 = gimple_assign_rhs1 (def_stmt);
          code = gimple_assign_rhs_code (def_stmt);
          op1 = gimple_assign_rhs2 (def_stmt);
        }
      else
        {
          if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
            return NULL_TREE;
          code = TREE_CODE (off);
          extract_ops_from_tree (off, &code, &op0, &op1);
        }
      switch (code)
        {
        case POINTER_PLUS_EXPR:
        case PLUS_EXPR:
          if (expr_invariant_in_loop_p (loop, op0))
            {
              add = op0;
              off = op1;
            /* Shared tail, also entered by goto from the cases below:
               move the invariant ADD, multiplied by the SCALE found so
               far, into BASE.  */
            do_add:
              add = fold_convert (sizetype, add);
              if (scale != 1)
                add = size_binop (MULT_EXPR, add, size_int (scale));
              base = size_binop (PLUS_EXPR, base, add);
              continue;
            }
          if (expr_invariant_in_loop_p (loop, op1))
            {
              add = op1;
              off = op0;
              goto do_add;
            }
          break;
        case MINUS_EXPR:
          /* OP0 - invariant is handled as OP0 + (0 - invariant).  */
          if (expr_invariant_in_loop_p (loop, op1))
            {
              add = fold_convert (sizetype, op1);
              add = size_binop (MINUS_EXPR, size_zero_node, add);
              off = op0;
              goto do_add;
            }
          break;
        case MULT_EXPR:
          /* Only a single constant multiplication is peeled off.  */
          if (scale == 1 && tree_fits_shwi_p (op1))
            {
              scale = tree_to_shwi (op1);
              off = op0;
              continue;
            }
          break;
        case SSA_NAME:
          /* A plain copy; look through it.  */
          off = op0;
          continue;
        CASE_CONVERT:
          if (!POINTER_TYPE_P (TREE_TYPE (op0))
              && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
            break;
          /* A conversion that keeps the precision is looked through.  */
          if (TYPE_PRECISION (TREE_TYPE (op0))
              == TYPE_PRECISION (TREE_TYPE (off)))
            {
              off = op0;
              continue;
            }
          /* For a widening conversion continue with the narrower operand
             and record its type; setting OFFTYPE ends the loop.  */
          if (TYPE_PRECISION (TREE_TYPE (op0))
              < TYPE_PRECISION (TREE_TYPE (off)))
            {
              off = op0;
              offtype = TREE_TYPE (off);
              STRIP_NOPS (off);
              continue;
            }
          break;
        default:
          break;
        }
      break;
    }

  /* If at the end OFF still isn't a SSA_NAME or isn't
     defined in the loop, punt.  */
  if (TREE_CODE (off) != SSA_NAME
      || expr_invariant_in_loop_p (loop, off))
    return NULL_TREE;

  if (offtype == NULL_TREE)
    offtype = TREE_TYPE (off);

  /* Ask the target for a gather builtin matching this vector type, offset
     type and scale.  */
  decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
                                           offtype, scale);
  if (decl == NULL_TREE)
    return NULL_TREE;

  if (basep)
    *basep = base;
  if (offp)
    *offp = off;
  /* Note that SCALE is a HOST_WIDE_INT and is stored into an int here.  */
  if (scalep)
    *scalep = scale;
  return decl;
}
3173
3174 /* Function vect_analyze_data_refs.
3175
3176    Find all the data references in the loop (LOOP_VINFO) or basic block
3177    (BB_VINFO) and attach each one, with a vector type, to its statement.
3178    Count the non-debug statements scanned in *N_STMTS and raise *MIN_VF to
3179    the largest vector element count seen.  Return false when analysis has
3180    to give up; for a basic block most failures instead stop at the
3181    offending data-ref, drop it and all later data-refs, and return true.
3182
3183    The general structure of the analysis of data refs in the vectorizer:
3184    1- vect_analyze_data_refs(loop/bb): find and analyze all data-refs.
3185    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3186    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3187    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.  */
3188
3189 bool
3190 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3191                         bb_vec_info bb_vinfo,
3192                         int *min_vf, unsigned *n_stmts)
3193 {
3194   struct loop *loop = NULL;
3195   basic_block bb = NULL;
3196   unsigned int i;
3197   vec<data_reference_p> datarefs;
3198   struct data_reference *dr;
3199   tree scalar_type;
3200
3201   if (dump_enabled_p ())
3202     dump_printf_loc (MSG_NOTE, vect_location,
3203                      "=== vect_analyze_data_refs ===\n");
3204
3205   if (loop_vinfo)
3206     {
3207       basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3208
3209       loop = LOOP_VINFO_LOOP (loop_vinfo);
3210       datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3211       if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3212         {
3213           if (dump_enabled_p ())
3214             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3215                              "not vectorized: loop contains function calls"
3216                              " or data references that cannot be analyzed\n");
3217           return false;
3218         }
3219
3220       for (i = 0; i < loop->num_nodes; i++)
3221         {
3222           gimple_stmt_iterator gsi;
3223
3224           for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3225             {
3226               gimple stmt = gsi_stmt (gsi);
3227               if (is_gimple_debug (stmt))
3228                 continue;
3229               ++*n_stmts;
3230               if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3231                 {
3232                   if (is_gimple_call (stmt) && loop->safelen)
3233                     {
3234                       tree fndecl = gimple_call_fndecl (stmt), op;
3235                       if (fndecl != NULL_TREE)
3236                         {
3237                           struct cgraph_node *node = cgraph_node::get (fndecl);
3238                           if (node != NULL && node->simd_clones != NULL)
3239                             {
3240                               unsigned int j, n = gimple_call_num_args (stmt);
3241                               for (j = 0; j < n; j++)
3242                                 {
3243                                   op = gimple_call_arg (stmt, j);
3244                                   if (DECL_P (op)
3245                                       || (REFERENCE_CLASS_P (op)
3246                                           && get_base_address (op)))
3247                                     break;
3248                                 }
3249                               op = gimple_call_lhs (stmt);
3250                               /* Ignore #pragma omp declare simd functions
3251                                  if they don't have data references in the
3252                                  call stmt itself.  */
3253                               if (j == n
3254                                   && !(op
3255                                        && (DECL_P (op)
3256                                            || (REFERENCE_CLASS_P (op)
3257                                                && get_base_address (op)))))
3258                                 continue;
3259                             }
3260                         }
3261                     }
3262                   LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3263                   if (dump_enabled_p ())
3264                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3265                                      "not vectorized: loop contains function "
3266                                      "calls or data references that cannot "
3267                                      "be analyzed\n");
3268                   return false;
3269                 }
3270             }
3271         }
3272
3273       LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3274     }
3275   else
3276     {
3277       gimple_stmt_iterator gsi;
3278
3279       bb = BB_VINFO_BB (bb_vinfo);
3280       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3281         {
3282           gimple stmt = gsi_stmt (gsi);
3283           if (is_gimple_debug (stmt))
3284             continue;
3285           ++*n_stmts;
3286           if (!find_data_references_in_stmt (NULL, stmt,
3287                                              &BB_VINFO_DATAREFS (bb_vinfo)))
3288             {
3289               /* Mark the rest of the basic-block as unvectorizable.  */
3290               for (; !gsi_end_p (gsi); gsi_next (&gsi))
3291                 {
3292                   stmt = gsi_stmt (gsi);
3293                   STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3294                 }
3295               break;
3296             }
3297         }
3298
3299       datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3300     }
3301
3302   /* Go through the data-refs, check that the analysis succeeded.  Update
3303      pointer from stmt_vec_info struct to DR and vectype.  */
3304
3305   FOR_EACH_VEC_ELT (datarefs, i, dr)
3306     {
3307       gimple stmt;
3308       stmt_vec_info stmt_info;
3309       tree base, offset, init;
3310       bool gather = false;
3311       bool simd_lane_access = false;
3312       int vf;
3313
3314 again:
3315       if (!dr || !DR_REF (dr))
3316         {
3317           if (dump_enabled_p ())
3318             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3319                              "not vectorized: unhandled data-ref\n");
3320           return false;
3321         }
3322
3323       stmt = DR_STMT (dr);
3324       stmt_info = vinfo_for_stmt (stmt);
3325
3326       /* Discard clobbers from the dataref vector.  We will remove
3327          clobber stmts during vectorization.  */
3328       if (gimple_clobber_p (stmt))
3329         {
3330           free_data_ref (dr);
3331           if (i == datarefs.length () - 1)
3332             {
3333               datarefs.pop ();
3334               break;
3335             }
3336           datarefs.ordered_remove (i);
3337           dr = datarefs[i];
3338           goto again;  /* Slot I now holds the next dataref.  */
3339         }
3340
3341       /* Check that analysis of the data-ref succeeded.  */
3342       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3343           || !DR_STEP (dr))
3344         {
3345           bool maybe_gather
3346             = DR_IS_READ (dr)
3347               && !TREE_THIS_VOLATILE (DR_REF (dr))
3348               && targetm.vectorize.builtin_gather != NULL;
3349           bool maybe_simd_lane_access
3350             = loop_vinfo && loop->simduid;
3351
3352           /* If target supports vector gather loads, or if this might be
3353              a SIMD lane access, see if they can't be used.  */
3354           if (loop_vinfo
3355               && (maybe_gather || maybe_simd_lane_access)
3356               && !nested_in_vect_loop_p (loop, stmt))
3357             {
3358               struct data_reference *newdr
3359                 = create_data_ref (NULL, loop_containing_stmt (stmt),
3360                                    DR_REF (dr), stmt, true);
3361               gcc_assert (newdr != NULL && DR_REF (newdr));
3362               if (DR_BASE_ADDRESS (newdr)
3363                   && DR_OFFSET (newdr)
3364                   && DR_INIT (newdr)
3365                   && DR_STEP (newdr)
3366                   && integer_zerop (DR_STEP (newdr)))
3367                 {
3368                   if (maybe_simd_lane_access)
3369                     {
3370                       tree off = DR_OFFSET (newdr);
3371                       STRIP_NOPS (off);
3372                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3373                           && TREE_CODE (off) == MULT_EXPR
3374                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3375                         {
3376                           tree step = TREE_OPERAND (off, 1);
3377                           off = TREE_OPERAND (off, 0);
3378                           STRIP_NOPS (off);
3379                           if (CONVERT_EXPR_P (off)
3380                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3381                                                                           0)))
3382                                  < TYPE_PRECISION (TREE_TYPE (off)))
3383                             off = TREE_OPERAND (off, 0);
3384                           if (TREE_CODE (off) == SSA_NAME)
3385                             {
3386                               gimple def = SSA_NAME_DEF_STMT (off);
3387                               tree reft = TREE_TYPE (DR_REF (newdr));
3388                               if (is_gimple_call (def)
3389                                   && gimple_call_internal_p (def)
3390                                   && (gimple_call_internal_fn (def)
3391                                       == IFN_GOMP_SIMD_LANE))
3392                                 {
3393                                   tree arg = gimple_call_arg (def, 0);
3394                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
3395                                   arg = SSA_NAME_VAR (arg);
3396                                   if (arg == loop->simduid
3397                                       /* For now.  */
3398                                       && tree_int_cst_equal
3399                                            (TYPE_SIZE_UNIT (reft),
3400                                             step))
3401                                     {
3402                                       DR_OFFSET (newdr) = ssize_int (0);
3403                                       DR_STEP (newdr) = step;
3404                                       DR_ALIGNED_TO (newdr)
3405                                         = size_int (BIGGEST_ALIGNMENT);
3406                                       dr = newdr;
3407                                       simd_lane_access = true;
3408                                     }
3409                                 }
3410                             }
3411                         }
3412                     }
3413                   if (!simd_lane_access && maybe_gather)
3414                     {
3415                       dr = newdr;
3416                       gather = true;
3417                     }
3418                 }
3419               if (!gather && !simd_lane_access)
3420                 free_data_ref (newdr);
3421             }
3422
3423           if (!gather && !simd_lane_access)
3424             {
3425               if (dump_enabled_p ())
3426                 {
3427                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3428                                    "not vectorized: data ref analysis "
3429                                    "failed ");
3430                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3431                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3432                 }
3433
3434               if (bb_vinfo)
3435                 break;
3436
3437               return false;
3438             }
3439         }
3440
3441       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3442         {
3443           if (dump_enabled_p ())
3444             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3445                              "not vectorized: base addr of dr is a "
3446                              "constant\n");
3447
3448           if (bb_vinfo)
3449             break;
3450
3451           if (gather || simd_lane_access)
3452             free_data_ref (dr);
3453           return false;
3454         }
3455
3456       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3457         {
3458           if (dump_enabled_p ())
3459             {
3460               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3461                                "not vectorized: volatile type ");
3462               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3463               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3464             }
3465           if (bb_vinfo)
3466             break;
3467           if (gather || simd_lane_access)
3468             free_data_ref (dr);  /* DR is NEWDR here; nothing else owns it.  */
3469           return false;
3470         }
3471
3472       if (stmt_can_throw_internal (stmt))
3473         {
3474           if (dump_enabled_p ())
3475             {
3476               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3477                                "not vectorized: statement can throw an "
3478                                "exception ");
3479               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3480               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3481             }
3482
3483           if (bb_vinfo)
3484             break;
3485
3486           if (gather || simd_lane_access)
3487             free_data_ref (dr);
3488           return false;
3489         }
3490
3491       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3492           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3493         {
3494           if (dump_enabled_p ())
3495             {
3496               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3497                                "not vectorized: statement is bitfield "
3498                                "access ");
3499               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3500               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3501             }
3502
3503           if (bb_vinfo)
3504             break;
3505
3506           if (gather || simd_lane_access)
3507             free_data_ref (dr);
3508           return false;
3509         }
3510
3511       base = unshare_expr (DR_BASE_ADDRESS (dr));
3512       offset = unshare_expr (DR_OFFSET (dr));
3513       init = unshare_expr (DR_INIT (dr));
3514
3515       if (is_gimple_call (stmt)
3516           && (!gimple_call_internal_p (stmt)
3517               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3518                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3519         {
3520           if (dump_enabled_p ())
3521             {
3522               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3523                                "not vectorized: dr in a call ");
3524               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3525               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3526             }
3527
3528           if (bb_vinfo)
3529             break;
3530
3531           if (gather || simd_lane_access)
3532             free_data_ref (dr);
3533           return false;
3534         }
3535
3536       /* Update DR field in stmt_vec_info struct.  */
3537
3538       /* If the dataref is in an inner-loop of the loop that is considered for
3539          vectorization, we also want to analyze the access relative to
3540          the outer-loop (DR contains information only relative to the
3541          inner-most enclosing loop).  We do that by building a reference to the
3542          first location accessed by the inner-loop, and analyze it relative to
3543          the outer-loop.  */
3544       if (loop && nested_in_vect_loop_p (loop, stmt))
3545         {
3546           tree outer_step, outer_base, outer_init;
3547           HOST_WIDE_INT pbitsize, pbitpos;
3548           tree poffset;
3549           machine_mode pmode;
3550           int punsignedp, pvolatilep;
3551           affine_iv base_iv, offset_iv;
3552           tree dinit;
3553
3554           /* Build a reference to the first location accessed by the
3555              inner-loop: *(BASE+INIT).  (The first location is actually
3556              BASE+INIT+OFFSET, but we add OFFSET separately later).  */
3557           tree inner_base = build_fold_indirect_ref
3558                                 (fold_build_pointer_plus (base, init));
3559
3560           if (dump_enabled_p ())
3561             {
3562               dump_printf_loc (MSG_NOTE, vect_location,
3563                                "analyze in outer-loop: ");
3564               dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3565               dump_printf (MSG_NOTE, "\n");
3566             }
3567
3568           outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3569                           &poffset, &pmode, &punsignedp, &pvolatilep, false);
3570           gcc_assert (outer_base != NULL_TREE);
3571
3572           if (pbitpos % BITS_PER_UNIT != 0)
3573             {
3574               if (dump_enabled_p ())
3575                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3576                                  "failed: bit offset alignment.\n");
3577               return false;
3578             }
3579
3580           outer_base = build_fold_addr_expr (outer_base);
3581           if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3582                           &base_iv, false))
3583             {
3584               if (dump_enabled_p ())
3585                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3586                                  "failed: evolution of base is not affine.\n");
3587               return false;
3588             }
3589
3590           if (offset)
3591             {
3592               if (poffset)
3593                 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3594                                        poffset);
3595               else
3596                 poffset = offset;
3597             }
3598
3599           if (!poffset)
3600             {
3601               offset_iv.base = ssize_int (0);
3602               offset_iv.step = ssize_int (0);
3603             }
3604           else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3605                                &offset_iv, false))
3606             {
3607               if (dump_enabled_p ())
3608                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3609                                  "evolution of offset is not affine.\n");
3610               return false;
3611             }
3612
3613           outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3614           split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3615           outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3616           split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3617           outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3618
3619           outer_step = size_binop (PLUS_EXPR,
3620                                 fold_convert (ssizetype, base_iv.step),
3621                                 fold_convert (ssizetype, offset_iv.step));
3622
3623           STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3624           /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3625           STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3626           STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3627           STMT_VINFO_DR_OFFSET (stmt_info) =
3628                                 fold_convert (ssizetype, offset_iv.base);
3629           STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3630                                 size_int (highest_pow2_factor (offset_iv.base));
3631
3632           if (dump_enabled_p ())
3633             {
3634               dump_printf_loc (MSG_NOTE, vect_location,
3635                                "\touter base_address: ");
3636               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3637                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3638               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3639               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3640                                  STMT_VINFO_DR_OFFSET (stmt_info));
3641               dump_printf (MSG_NOTE,
3642                            "\n\touter constant offset from base address: ");
3643               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3644                                  STMT_VINFO_DR_INIT (stmt_info));
3645               dump_printf (MSG_NOTE, "\n\touter step: ");
3646               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3647                                  STMT_VINFO_DR_STEP (stmt_info));
3648               dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3649               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3650                                  STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3651               dump_printf (MSG_NOTE, "\n");
3652             }
3653         }
3654
3655       if (STMT_VINFO_DATA_REF (stmt_info))
3656         {
3657           if (dump_enabled_p ())
3658             {
3659               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3660                                "not vectorized: more than one data ref "
3661                                "in stmt: ");
3662               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3663               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3664             }
3665
3666           if (bb_vinfo)
3667             break;
3668
3669           if (gather || simd_lane_access)
3670             free_data_ref (dr);
3671           return false;
3672         }
3673
3674       STMT_VINFO_DATA_REF (stmt_info) = dr;
3675       if (simd_lane_access)
3676         {
3677           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3678           free_data_ref (datarefs[i]);  /* Superseded by NEWDR (now DR).  */
3679           datarefs[i] = dr;
3680         }
3681
3682       /* Set vectype for STMT.  */
3683       scalar_type = TREE_TYPE (DR_REF (dr));
3684       STMT_VINFO_VECTYPE (stmt_info)
3685         = get_vectype_for_scalar_type (scalar_type);
3686       if (!STMT_VINFO_VECTYPE (stmt_info))
3687         {
3688           if (dump_enabled_p ())
3689             {
3690               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3691                                "not vectorized: no vectype for stmt: ");
3692               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3693               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3694               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3695                                  scalar_type);
3696               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3697             }
3698
3699           if (bb_vinfo)
3700             break;
3701
3702           if (gather || simd_lane_access)
3703             {
3704               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3705               if (gather)
3706                 free_data_ref (dr);
3707             }
3708           return false;
3709         }
3710       else
3711         {
3712           if (dump_enabled_p ())
3713             {
3714               dump_printf_loc (MSG_NOTE, vect_location,
3715                                "got vectype for stmt: ");
3716               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3717               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3718                                  STMT_VINFO_VECTYPE (stmt_info));
3719               dump_printf (MSG_NOTE, "\n");
3720             }
3721         }
3722
3723       /* Adjust the minimal vectorization factor according to the
3724          vector type.  */
3725       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3726       if (vf > *min_vf)
3727         *min_vf = vf;
3728
3729       if (gather)
3730         {
3731           tree off;
3732
3733           gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3734           if (gather
3735               && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3736             gather = false;
3737           if (!gather)
3738             {
3739               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3740               free_data_ref (dr);
3741               if (dump_enabled_p ())
3742                 {
3743                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3744                                    "not vectorized: not suitable for gather "
3745                                    "load ");
3746                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3747                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3748                 }
3749               return false;
3750             }
3751           free_data_ref (datarefs[i]);  /* Superseded by NEWDR (now DR).  */
3752           datarefs[i] = dr;
3753           STMT_VINFO_GATHER_P (stmt_info) = true;
3754         }
3755       else if (loop_vinfo
3756                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3757         {
3758           if (nested_in_vect_loop_p (loop, stmt))
3759             {
3760               if (dump_enabled_p ())
3761                 {
3762                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3763                                    "not vectorized: not suitable for strided "
3764                                    "load ");
3765                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3766                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3767                 }
3768               return false;
3769             }
3770           STMT_VINFO_STRIDED_P (stmt_info) = true;
3771         }
3772     }
3773
3774   /* If we stopped analysis at the first dataref we could not analyze
3775      when trying to vectorize a basic-block mark the rest of the datarefs
3776      as not vectorizable and truncate the vector of datarefs.  That
3777      avoids spending useless time in analyzing their dependence.  */
3778   if (i != datarefs.length ())
3779     {
3780       gcc_assert (bb_vinfo != NULL);
3781       for (unsigned j = i; j < datarefs.length (); ++j)
3782         {
3783           data_reference_p dr = datarefs[j];
3784           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3785           free_data_ref (dr);
3786         }
3787       datarefs.truncate (i);
3788     }
3789
3790   return true;
3791 }
3792
3793
3794 /* Function vect_get_new_vect_var.
3795
3796    Return the result of create_tmp_reg for TYPE.  VAR_KIND selects the
3797    name prefix: "vect" for vect_simple_var, "stmp" for vect_scalar_var and
3798    "vectp" for vect_pointer_var.  When NAME is non-NULL the name passed
3799    on is PREFIX, "_" and NAME concatenated; otherwise just PREFIX.  */
3800
3801 tree
3802 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3803 {
3804   const char *stem;
3805
3806   switch (var_kind)
3807     {
3808     case vect_simple_var:
3809       stem = "vect";
3810       break;
3811     case vect_scalar_var:
3812       stem = "stmp";
3813       break;
3814     case vect_pointer_var:
3815       stem = "vectp";
3816       break;
3817     default:
3818       gcc_unreachable ();
3819     }
3820
3821   /* No NAME: the bare prefix is all create_tmp_reg gets.  */
3822   if (!name)
3823     return create_tmp_reg (type, stem);
3824
3825   /* Otherwise build "<prefix>_<NAME>"; the concatenated string is freed
3826      right after the create_tmp_reg call.  */
3827   char *full_name = concat (stem, "_", name, NULL);
3828   tree result = create_tmp_reg (type, full_name);
3829   free (full_name);
3830
3831   return result;
3832 }
3833
3834 /* Copy DR's ptr info to NAME and set NAME's alignment/misalignment from DR.  */
3835
3836 static void
3837 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3838                                   stmt_vec_info stmt_info)
3839 {
3840   int misalignment = DR_MISALIGNMENT (dr);
3841   unsigned int vec_align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3842   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3843   if (misalignment != -1)
3844     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), vec_align, misalignment);
3845   else  /* -1: record the alignment as unknown.  */
3846     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3847 }
3848
3849 /* Function vect_create_addr_base_for_vector_ref.
3850
3851    Create an expression that computes the address of the first memory location
3852    that will be accessed for a data reference.
3853
3854    Input:
3855    STMT: The statement containing the data reference.
3856    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3857    OFFSET: Optional. If supplied, it is added to the initial address.
3858    LOOP:    Specify relative to which loop-nest should the address be computed.
3859             For example, when the dataref is in an inner-loop nested in an
3860             outer-loop that is now being vectorized, LOOP can be either the
3861             outer-loop, or the inner-loop.  The first memory location accessed
3862             by the following dataref ('in' points to short):
3863
3864                 for (i=0; i<N; i++)
3865                    for (j=0; j<M; j++)
3866                      s += in[i+j]
3867
3868             is as follows:
3869             if LOOP=i_loop:     &in             (relative to i_loop)
3870             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
3871    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
3872             initial address.  Unlike OFFSET, which is number of elements to
3873             be added, BYTE_OFFSET is measured in bytes.
3874
3875    Output:
3876    1. Return an SSA_NAME whose value is the address of the memory location of
3877       the first vector of the data reference.
3878    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3879       these statement(s) which define the returned SSA_NAME.
3880
3881    FORNOW: We are only handling array accesses with step 1.  */
3882
3883 tree
3884 vect_create_addr_base_for_vector_ref (gimple stmt,
3885                                       gimple_seq *new_stmt_list,
3886                                       tree offset,
3887                                       struct loop *loop,
3888                                       tree byte_offset)
3889 {
3890   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3891   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3892   tree data_ref_base;
3893   const char *base_name;
3894   tree addr_base;
3895   tree dest;
3896   gimple_seq seq = NULL;
3897   tree base_offset;
3898   tree init;
3899   tree vect_ptr_type;
3900   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3901   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3902   /* If LOOP is not STMT's own loop, use STMT_INFO's base/offset/init, else DR's.  */
3903   if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3904     {
3905       struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3906
3907       gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3908
3909       data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3910       base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3911       init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3912     }
3913   else
3914     {
3915       data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3916       base_offset = unshare_expr (DR_OFFSET (dr));
3917       init = unshare_expr (DR_INIT (dr));
3918     }
3919   /* Without loop info, offset and init are zeroed and the name comes from DR_REF.  */
3920   if (loop_vinfo)
3921     base_name = get_name (data_ref_base);
3922   else
3923     {
3924       base_offset = ssize_int (0);
3925       init = ssize_int (0);
3926       base_name = get_name (DR_REF (dr));
3927     }
3928
3929   /* Create base_offset = offset + init, both converted to sizetype.  */
3930   base_offset = size_binop (PLUS_EXPR,
3931                             fold_convert (sizetype, base_offset),
3932                             fold_convert (sizetype, init));
3933   /* OFFSET counts elements, so scale it by the element size STEP.  */
3934   if (offset)
3935     {
3936       offset = fold_build2 (MULT_EXPR, sizetype,
3937                             fold_convert (sizetype, offset), step);
3938       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3939                                  base_offset, offset);
3940     }
3941   if (byte_offset)
3942     {
3943       byte_offset = fold_convert (sizetype, byte_offset);
3944       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3945                                  base_offset, byte_offset);
3946     }
3947   /* NOTE(review): without loop info only &DR_REF is used; base_offset is not added.  */
3948   /* base + base_offset */
3949   if (loop_vinfo)
3950     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3951   else
3952     {
3953       addr_base = build1 (ADDR_EXPR,
3954                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
3955                           unshare_expr (DR_REF (dr)));
3956     }
3957   /* Convert to pointer-to-vectype and gimplify; new stmts go to NEW_STMT_LIST.  */
3958   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3959   addr_base = fold_convert (vect_ptr_type, addr_base);
3960   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3961   addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3962   gimple_seq_add_seq (new_stmt_list, seq);
3963   /* Copy points-to info; alignment is marked unknown if an offset was requested.  */
3964   if (DR_PTR_INFO (dr)
3965       && TREE_CODE (addr_base) == SSA_NAME)
3966     {
3967       vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
3968       if (offset || byte_offset)
3969         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3970     }
3971
3972   if (dump_enabled_p ())
3973     {
3974       dump_printf_loc (MSG_NOTE, vect_location, "created ");
3975       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3976       dump_printf (MSG_NOTE, "\n");
3977     }
3978
3979   return addr_base;
3980 }
3981
3982
3983 /* Function vect_create_data_ref_ptr.
3984
3985    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3986    location accessed in the loop by STMT, along with the def-use update
3987    chain to appropriately advance the pointer through the loop iterations.
3988    Also set aliasing information for the pointer.  This pointer is used by
3989    the callers to this function to create a memory reference expression for
3990    vector load/store access.
3991
3992    Input:
3993    1. STMT: a stmt that references memory. Expected to be of the form
3994          GIMPLE_ASSIGN <name, data-ref> or
3995          GIMPLE_ASSIGN <data-ref, name>.
3996    2. AGGR_TYPE: the type of the reference, which should be either a vector
3997         or an array.
3998    3. AT_LOOP: the loop where the vector memref is to be created.
3999    4. OFFSET (optional): an offset to be added to the initial address accessed
4000         by the data-ref in STMT.
4001    5. GSI: location where the new stmts are to be placed if there is no loop
4002    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4003         pointing to the initial address.
4004    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4005         to the initial address accessed by the data-ref in STMT.  This is
4006         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4007         in bytes.
4008
4009    Output:
4010    1. Declare a new ptr to vector_type, and have it point to the base of the
4011       data reference (initial address accessed by the data reference).
4012       For example, for vector of type V8HI, the following code is generated:
4013
4014       v8hi *ap;
4015       ap = (v8hi *)initial_address;
4016
4017       if OFFSET is not supplied:
4018          initial_address = &a[init];
4019       if OFFSET is supplied:
4020          initial_address = &a[init + OFFSET];
4021       if BYTE_OFFSET is supplied:
4022          initial_address = &a[init] + BYTE_OFFSET;
4023
4024       Return the initial_address in INITIAL_ADDRESS.
4025
4026    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4027       update the pointer in each iteration of the loop.
4028
4029       Return the increment stmt in PTR_INCR if non-NULL (NULL when there is no loop).
4030
4031    3. Set INV_P to true if the access pattern of the data reference in the
4032       vectorized loop is invariant.  Set it to false otherwise.
4033
4034    4. Return the pointer.  */
4035
4036 tree
4037 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4038                           tree offset, tree *initial_address,
4039                           gimple_stmt_iterator *gsi, gimple *ptr_incr,
4040                           bool only_init, bool *inv_p, tree byte_offset)
4041 {
4042   const char *base_name;
4043   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4044   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4045   struct loop *loop = NULL;
4046   bool nested_in_vect_loop = false;
4047   struct loop *containing_loop = NULL;
4048   tree aggr_ptr_type;
4049   tree aggr_ptr;
4050   tree new_temp;
4051   gimple vec_stmt;
4052   gimple_seq new_stmt_list = NULL;
4053   edge pe = NULL;
4054   basic_block new_bb;
4055   tree aggr_ptr_init;
4056   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4057   tree aptr;
4058   gimple_stmt_iterator incr_gsi;
4059   bool insert_after;
4060   tree indx_before_incr, indx_after_incr;
4061   gimple incr;
4062   tree step;
4063   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4064
4065   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4066               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4067
4068   if (loop_vinfo)
4069     {
4070       loop = LOOP_VINFO_LOOP (loop_vinfo);
4071       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4072       containing_loop = (gimple_bb (stmt))->loop_father;
4073       pe = loop_preheader_edge (loop);
4074     }
4075   else
4076     {
4077       gcc_assert (bb_vinfo);
4078       only_init = true;
4079       if (ptr_incr)
4080         *ptr_incr = NULL;  /* No loop, hence no increment stmt.  */
4081     }
4082   /* Check the step (evolution) of the load in LOOP, and record
4083      whether it's invariant.  */
4084   if (nested_in_vect_loop)
4085     step = STMT_VINFO_DR_STEP (stmt_info);
4086   else
4087     step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4088
4089   if (integer_zerop (step))
4090     *inv_p = true;
4091   else
4092     *inv_p = false;
4093
4094   /* Create an expression for the first address accessed by this load
4095      in LOOP.  */
4096   base_name = get_name (DR_BASE_ADDRESS (dr));
4097
4098   if (dump_enabled_p ())
4099     {
4100       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4101       dump_printf_loc (MSG_NOTE, vect_location,
4102                        "create %s-pointer variable to type: ",
4103                        get_tree_code_name (TREE_CODE (aggr_type)));
4104       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4105       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4106         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4107       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4108         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4109       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4110         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4111       else
4112         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4113       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4114       dump_printf (MSG_NOTE, "\n");
4115     }
4116
4117   /* (1) Create the new aggregate-pointer variable.
4118      Vector and array types inherit the alias set of their component
4119      type by default so we need to use a ref-all pointer if the data
4120      reference does not conflict with the created aggregated data
4121      reference because it is not addressable.  */
4122   bool need_ref_all = false;
4123   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4124                               get_alias_set (DR_REF (dr))))
4125     need_ref_all = true;
4126   /* Likewise for any of the data references in the stmt group.  */
4127   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4128     {
4129       gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4130       do
4131         {
4132           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4133           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4134           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4135                                       get_alias_set (DR_REF (sdr))))
4136             {
4137               need_ref_all = true;
4138               break;
4139             }
4140           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4141         }
4142       while (orig_stmt);
4143     }
4144   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4145                                                need_ref_all);
4146   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4147
4148
4149   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4150      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4151      def-use update cycles for the pointer: one relative to the outer-loop
4152      (LOOP), which is what steps (2) and (3) below do.  The other is relative
4153      to the inner-loop (which is the inner-most loop containing the dataref),
4154      and this is done by step (4) below.
4155
4156      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4157      inner-most loop, and so steps (2),(3) work the same, and step (4) is
4158      redundant.  Steps (2),(3) create the following:
4159
4160         vp0 = &base_addr;
4161         LOOP:   vp1 = phi(vp0,vp2)
4162                 ...
4163                 ...
4164                 vp2 = vp1 + step
4165                 goto LOOP
4166
4167      If there is an inner-loop nested in loop, then step (4) will also be
4168      applied, and an additional update in the inner-loop will be created:
4169
4170         vp0 = &base_addr;
4171         LOOP:   vp1 = phi(vp0,vp2)
4172                 ...
4173         inner:     vp3 = phi(vp1,vp4)
4174                    vp4 = vp3 + inner_step
4175                    if () goto inner
4176                 ...
4177                 vp2 = vp1 + step
4178                 if () goto LOOP   */
4179
4180   /* (2) Calculate the initial address of the aggregate-pointer, and set
4181      the aggregate-pointer to point to it before the loop.  */
4182
4183   /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader.  */
4184   /* Without a preheader edge the new stmts are inserted before GSI instead.  */
4185   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4186                                                    offset, loop, byte_offset);
4187   if (new_stmt_list)
4188     {
4189       if (pe)
4190         {
4191           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4192           gcc_assert (!new_bb);
4193         }
4194       else
4195         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4196     }
4197
4198   *initial_address = new_temp;
4199
4200   /* Create: p = (aggr_type *) initial_base  */
4201   if (TREE_CODE (new_temp) != SSA_NAME
4202       || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4203     {
4204       vec_stmt = gimple_build_assign (aggr_ptr,
4205                                       fold_convert (aggr_ptr_type, new_temp));
4206       aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4207       /* Copy the points-to information if it exists. */
4208       if (DR_PTR_INFO (dr))
4209         vect_duplicate_ssa_name_ptr_info (aggr_ptr_init, dr, stmt_info);
4210       gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4211       if (pe)
4212         {
4213           new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4214           gcc_assert (!new_bb);
4215         }
4216       else
4217         gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4218     }
4219   else
4220     aggr_ptr_init = new_temp;
4221
4222   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4223      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4224      inner-loop nested in LOOP (during outer-loop vectorization).  */
4225
4226   /* No update in loop is required.  */
4227   if (only_init && (!loop_vinfo || at_loop == loop))
4228     aptr = aggr_ptr_init;
4229   else
4230     {
4231       /* The step of the aggregate pointer is the type size.  */
4232       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4233       /* One exception to the above is when the scalar step of the load in
4234          LOOP is zero. In this case the step here is also zero.  */
4235       if (*inv_p)
4236         iv_step = size_zero_node;
4237       else if (tree_int_cst_sgn (step) == -1)
4238         iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4239
4240       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4241
4242       create_iv (aggr_ptr_init,
4243                  fold_convert (aggr_ptr_type, iv_step),
4244                  aggr_ptr, loop, &incr_gsi, insert_after,
4245                  &indx_before_incr, &indx_after_incr);
4246       incr = gsi_stmt (incr_gsi);
4247       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4248
4249       /* Copy the points-to information if it exists. */
4250       if (DR_PTR_INFO (dr))
4251         {
4252           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4253           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4254         }
4255       if (ptr_incr)
4256         *ptr_incr = incr;
4257
4258       aptr = indx_before_incr;
4259     }
4260
4261   if (!nested_in_vect_loop || only_init)
4262     return aptr;
4263
4264
4265   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4266      nested in LOOP, if exists.  */
4267   /* The early return above guarantees nested_in_vect_loop && !only_init here.  */
4268   gcc_assert (nested_in_vect_loop);
4269   if (!only_init)
4270     {
4271       standard_iv_increment_position (containing_loop, &incr_gsi,
4272                                       &insert_after);
4273       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4274                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4275                  &indx_after_incr);
4276       incr = gsi_stmt (incr_gsi);
4277       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4278
4279       /* Copy the points-to information if it exists. */
4280       if (DR_PTR_INFO (dr))
4281         {
4282           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4283           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4284         }
4285       if (ptr_incr)
4286         *ptr_incr = incr;
4287
4288       return indx_before_incr;
4289     }
4290   else
4291     gcc_unreachable ();
4292 }
4293
4294
4295 /* Function bump_vector_ptr
4296
4297    Increment a pointer (to a vector type) by vector-size. If requested,
4298    i.e. if PTR-INCR is given, then also connect the new increment stmt
4299    to the existing def-use update-chain of the pointer, by modifying
4300    the PTR_INCR as illustrated below:
4301
4302    The pointer def-use update-chain before this function:
4303                         DATAREF_PTR = phi (p_0, p_2)
4304                         ....
4305         PTR_INCR:       p_2 = DATAREF_PTR + step
4306
4307    The pointer def-use update-chain after this function:
4308                         DATAREF_PTR = phi (p_0, p_2)
4309                         ....
4310                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4311                         ....
4312         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4313
4314    Input:
4315    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4316                  in the loop.
4317    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4318               the loop.  The increment amount across iterations is expected
4319               to be vector_size.
4320    GSI - location where the new update stmt is to be placed.
4321    STMT - the original scalar memory-access stmt that is being vectorized.
4322    BUMP - optional. The offset by which to bump the pointer. If not given,
4323           the offset is assumed to be vector_size.
4324
4325    Output: Return NEW_DATAREF_PTR as illustrated above.
4326
4327 */
4328
4329 tree
4330 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4331                  gimple stmt, tree bump)
4332 {
4333   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4334   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4335   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4336   tree update = TYPE_SIZE_UNIT (vectype);
4337   gassign *incr_stmt;
4338   ssa_op_iter iter;
4339   use_operand_p use_p;
4340   tree new_dataref_ptr;
4341   /* The default bump is the size of the vector type; BUMP overrides it.  */
4342   if (bump)
4343     update = bump;
4344   /* Emit NEW_DATAREF_PTR = DATAREF_PTR + UPDATE.  */
4345   new_dataref_ptr = copy_ssa_name (dataref_ptr);
4346   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4347                                    dataref_ptr, update);
4348   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4349
4350   /* Copy the points-to information if it exists, marking alignment unknown.  */
4351   if (DR_PTR_INFO (dr))
4352     {
4353       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4354       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4355     }
4356   /* Without PTR_INCR there is no update chain to reconnect.  */
4357   if (!ptr_incr)
4358     return new_dataref_ptr;
4359
4360   /* Update the vector-pointer's cross-iteration increment.  */
4361   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4362     {
4363       tree use = USE_FROM_PTR (use_p);
4364       /* Redirect uses of DATAREF_PTR; any other use must compare equal to UPDATE.  */
4365       if (use == dataref_ptr)
4366         SET_USE (use_p, new_dataref_ptr);
4367       else
4368         gcc_assert (tree_int_cst_compare (use, update) == 0);
4369     }
4370
4371   return new_dataref_ptr;
4372 }
4373
4374
4375 /* Function vect_create_destination_var.
4376
4377    Create a new temporary of type VECTYPE, or of SCALAR_DEST's type if VECTYPE is NULL.  */
4378
4379 tree
4380 vect_create_destination_var (tree scalar_dest, tree vectype)
4381 {
4382   tree vec_dest;
4383   const char *name;
4384   char *new_name;
4385   tree type;
4386   enum vect_var_kind kind;
4387   /* A NULL VECTYPE requests a scalar variable of SCALAR_DEST's own type.  */
4388   kind = vectype ? vect_simple_var : vect_scalar_var;
4389   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4390
4391   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4392   /* The name is "<name>_<version>", or "_<version>" if SCALAR_DEST has no name.  */
4393   name = get_name (scalar_dest);
4394   if (name)
4395     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4396   else
4397     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4398   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4399   free (new_name);
4400
4401   return vec_dest;
4402 }
4403
4404 /* Function vect_grouped_store_supported.
4405
4406    Returns TRUE if the permutations needed to interleave a group of COUNT
4407    vectors of type VECTYPE are supported, and FALSE otherwise.  */
4408
4409 bool
4410 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4411 {
4412   machine_mode mode = TYPE_MODE (vectype);
4413
4414   /* vect_permute_store_chain requires the group size to be equal to 3 or
4415      be a power of two.  */
4416   if (count != 3 && exact_log2 (count) == -1)
4417     {
4418       if (dump_enabled_p ())
4419         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4420                          "the size of the group of accesses"
4421                          " is not a power of 2 or not equal to 3\n");
4422       return false;
4423     }
4424
4425   /* Check that the permutation is supported.  */
4426   if (VECTOR_MODE_P (mode))
4427     {
4428       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4429       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4430
4431       if (count == 3)
4432         {
4433           unsigned int j0 = 0, j1 = 0, j2 = 0;
4434           unsigned int j;
4435           /* Check both shuffle masks for each of the three iterations.  */
4436           for (j = 0; j < 3; j++)
4437             {
4438               int nelt0 = ((3 - j) * nelt) % 3;
4439               int nelt1 = ((3 - j) * nelt + 1) % 3;
4440               int nelt2 = ((3 - j) * nelt + 2) % 3;
4441               for (i = 0; i < nelt; i++)
4442                 {
4443                   if (3 * i + nelt0 < nelt)
4444                     sel[3 * i + nelt0] = j0++;
4445                   if (3 * i + nelt1 < nelt)
4446                     sel[3 * i + nelt1] = nelt + j1++;
4447                   if (3 * i + nelt2 < nelt)
4448                     sel[3 * i + nelt2] = 0;
4449                 }
4450               if (!can_vec_perm_p (mode, false, sel))
4451                 {
4452                   if (dump_enabled_p ())
4453                     dump_printf (MSG_MISSED_OPTIMIZATION,
4454                                  "permutation op not supported by target.\n");
4455                   return false;
4456                 }
4457
4458               for (i = 0; i < nelt; i++)
4459                 {
4460                   if (3 * i + nelt0 < nelt)
4461                     sel[3 * i + nelt0] = 3 * i + nelt0;
4462                   if (3 * i + nelt1 < nelt)
4463                     sel[3 * i + nelt1] = 3 * i + nelt1;
4464                   if (3 * i + nelt2 < nelt)
4465                     sel[3 * i + nelt2] = nelt + j2++;
4466                 }
4467               if (!can_vec_perm_p (mode, false, sel))
4468                 {
4469                   if (dump_enabled_p ())
4470                     dump_printf (MSG_MISSED_OPTIMIZATION,
4471                                  "permutation op not supported by target.\n");
4472                   return false;
4473                 }
4474             }
4475           return true;
4476         }
4477       else
4478         {
4479           /* If length is not equal to 3 then only power of 2 is supported.  */
4480           gcc_assert (exact_log2 (count) != -1);
4481           /* First mask: {0, nelt, 1, nelt + 1, ...}.  */
4482           for (i = 0; i < nelt / 2; i++)
4483             {
4484               sel[i * 2] = i;
4485               sel[i * 2 + 1] = i + nelt;
4486             }
4487           if (can_vec_perm_p (mode, false, sel))
4488             {
4489               for (i = 0; i < nelt; i++)
4490                 sel[i] += nelt / 2;  /* Second mask: first mask shifted by nelt / 2.  */
4491               if (can_vec_perm_p (mode, false, sel))
4492                 return true;
4493             }
4494         }
4495     }
4496
4497   if (dump_enabled_p ())
4498     dump_printf (MSG_MISSED_OPTIMIZATION,
4499                  "permutation op not supported by target.\n");
4500   return false;
4501 }
4502
4503
4504 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4505    type VECTYPE, as reported by vect_lanes_optab_supported_p.  */
4506
4507 bool
4508 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4509 {
4510   return vect_lanes_optab_supported_p ("vec_store_lanes",
4511                                        vec_store_lanes_optab,
4512                                        vectype, count);
4513 }
4514
4515
4516 /* Function vect_permute_store_chain.
4517
4518    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4519    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4520    the data correctly for the stores.  Return the final references for stores
4521    in RESULT_CHAIN.
4522
4523    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4524    The input is 4 vectors each containing 8 elements.  We assign a number to
4525    each element, the input sequence is:
4526
4527    1st vec:   0  1  2  3  4  5  6  7
4528    2nd vec:   8  9 10 11 12 13 14 15
4529    3rd vec:  16 17 18 19 20 21 22 23
4530    4th vec:  24 25 26 27 28 29 30 31
4531
4532    The output sequence should be:
4533
4534    1st vec:  0  8 16 24  1  9 17 25
4535    2nd vec:  2 10 18 26  3 11 19 27
4536    3rd vec:  4 12 20 28  5 13 21 29
4537    4th vec:  6 14 22 30  7 15 23 31
4538
4539    i.e., we interleave the contents of the four vectors in their order.
4540
4541    We use interleave_high/low instructions to create such output.  The input of
4542    each interleave_high/low operation is two vectors:
4543    1st vec    2nd vec
4544    0 1 2 3    4 5 6 7
4545    the even elements of the result vector are obtained left-to-right from the
4546    high/low elements of the first vector.  The odd elements of the result are
4547    obtained left-to-right from the high/low elements of the second vector.
4548    The output of interleave_high will be:   0 4 1 5
4549    and of interleave_low:                   2 6 3 7
4550
4551
4552    The permutation is done in log LENGTH stages.  In each stage interleave_high
4553    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4554    where the first argument is taken from the first half of DR_CHAIN and the
4555    second argument from its second half.
4556    In our example,
4557
4558    I1: interleave_high (1st vec, 3rd vec)
4559    I2: interleave_low (1st vec, 3rd vec)
4560    I3: interleave_high (2nd vec, 4th vec)
4561    I4: interleave_low (2nd vec, 4th vec)
4562
4563    The output for the first stage is:
4564
4565    I1:  0 16  1 17  2 18  3 19
4566    I2:  4 20  5 21  6 22  7 23
4567    I3:  8 24  9 25 10 26 11 27
4568    I4: 12 28 13 29 14 30 15 31
4569
4570    The output of the second stage, i.e. the final result is:
4571
4572    I1:  0  8 16 24  1  9 17 25
4573    I2:  2 10 18 26  3 11 19 27
4574    I3:  4 12 20 28  5 13 21 29
4575    I4:  6 14 22 30  7 15 23 31.  */
4576
4577 void
4578 vect_permute_store_chain (vec<tree> dr_chain,
4579                           unsigned int length,
4580                           gimple stmt,
4581                           gimple_stmt_iterator *gsi,
4582                           vec<tree> *result_chain)
4583 {
4584   tree vect1, vect2, high, low;
4585   gimple perm_stmt;
4586   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4587   tree perm_mask_low, perm_mask_high;
4588   tree data_ref;
4589   tree perm3_mask_low, perm3_mask_high;
4590   unsigned int i, n, log_length = exact_log2 (length);
4591   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4592   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4593   /* Start RESULT_CHAIN as a copy of the first LENGTH entries of DR_CHAIN.  */
4594   result_chain->quick_grow (length);
4595   memcpy (result_chain->address (), dr_chain.address (),
4596           length * sizeof (tree));
4597
4598   if (length == 3)
4599     {
4600       unsigned int j0 = 0, j1 = 0, j2 = 0;
4601       /* Iteration J produces entry J of RESULT_CHAIN from all three inputs.  */
4602       for (j = 0; j < 3; j++)
4603         {
4604           int nelt0 = ((3 - j) * nelt) % 3;
4605           int nelt1 = ((3 - j) * nelt + 1) % 3;
4606           int nelt2 = ((3 - j) * nelt + 2) % 3;
4607
4608           for (i = 0; i < nelt; i++)
4609             {
4610               if (3 * i + nelt0 < nelt)
4611                 sel[3 * i + nelt0] = j0++;
4612               if (3 * i + nelt1 < nelt)
4613                 sel[3 * i + nelt1] = nelt + j1++;
4614               if (3 * i + nelt2 < nelt)
4615                 sel[3 * i + nelt2] = 0;
4616             }
4617           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4618
4619           for (i = 0; i < nelt; i++)
4620             {
4621               if (3 * i + nelt0 < nelt)
4622                 sel[3 * i + nelt0] = 3 * i + nelt0;
4623               if (3 * i + nelt1 < nelt)
4624                 sel[3 * i + nelt1] = 3 * i + nelt1;
4625               if (3 * i + nelt2 < nelt)
4626                 sel[3 * i + nelt2] = nelt + j2++;
4627             }
4628           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4629
4630           vect1 = dr_chain[0];
4631           vect2 = dr_chain[1];
4632
4633           /* Create interleaving stmt:
4634              low = VEC_PERM_EXPR <vect1, vect2,
4635                                   {j, nelt, *, j + 1, nelt + j + 1, *,
4636                                    j + 2, nelt + j + 2, *, ...}>  */
4637           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4638           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4639                                            vect2, perm3_mask_low);
4640           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4641
4642           vect1 = data_ref;
4643           vect2 = dr_chain[2];
4644           /* Create interleaving stmt:
4645              high = VEC_PERM_EXPR <vect1, vect2,
4646                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
4647                                    6, 7, nelt + j + 2, ...}>  */
4648           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4649           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4650                                            vect2, perm3_mask_high);
4651           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4652           (*result_chain)[j] = data_ref;
4653         }
4654     }
4655   else
4656     {
4657       /* If length is not equal to 3 then only power of 2 is supported.  */
4658       gcc_assert (exact_log2 (length) != -1);
4659       /* Build the interleave-high mask {0, nelt, 1, nelt + 1, ...}.  */
4660       for (i = 0, n = nelt / 2; i < n; i++)
4661         {
4662           sel[i * 2] = i;
4663           sel[i * 2 + 1] = i + nelt;
4664         }
4665         perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4666         /* Shifting every index by nelt / 2 gives the interleave-low mask.  */
4667         for (i = 0; i < nelt; i++)
4668           sel[i] += nelt / 2;
4669         perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4670         /* One stage per iteration; each stage's output is copied back as input.  */
4671         for (i = 0, n = log_length; i < n; i++)
4672           {
4673             for (j = 0; j < length/2; j++)
4674               {
4675                 vect1 = dr_chain[j];
4676                 vect2 = dr_chain[j+length/2];
4677
4678                 /* Create interleaving stmt:
4679                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4680                                                         ...}>  */
4681                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4682                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4683                                                  vect2, perm_mask_high);
4684                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4685                 (*result_chain)[2*j] = high;
4686
4687                 /* Create interleaving stmt:
4688                    low = VEC_PERM_EXPR <vect1, vect2,
4689                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4690                                          ...}>  */
4691                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4692                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4693                                                  vect2, perm_mask_low);
4694                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4695                 (*result_chain)[2*j+1] = low;
4696               }
4697             memcpy (dr_chain.address (), result_chain->address (),
4698                     length * sizeof (tree));
4699           }
4700     }
4701 }
4702
4703 /* Function vect_setup_realignment
4704
4705    This function is called when vectorizing an unaligned load using
4706    the dr_explicit_realign[_optimized] scheme.
4707    This function generates the following code at the loop prolog:
4708
4709       p = initial_addr;
4710    x  msq_init = *(floor(p));   # prolog load
4711       realignment_token = call target_builtin;
4712     loop:
4713    x  msq = phi (msq_init, ---)
4714
4715    The stmts marked with x are generated only for the case of
4716    dr_explicit_realign_optimized.
4717
4718    The code above sets up a new (vector) pointer, pointing to the first
4719    location accessed by STMT, and a "floor-aligned" load using that pointer.
4720    It also generates code to compute the "realignment-token" (if the relevant
4721    target hook was defined), and creates a phi-node at the loop-header bb
4722    whose arguments are the result of the prolog-load (created by this
4723    function) and the result of a load that takes place in the loop (to be
4724    created by the caller to this function).
4725
4726    For the case of dr_explicit_realign_optimized:
4727    The caller to this function uses the phi-result (msq) to create the
4728    realignment code inside the loop, and sets up the missing phi argument,
4729    as follows:
4730     loop:
4731       msq = phi (msq_init, lsq)
4732       lsq = *(floor(p'));        # load in loop
4733       result = realign_load (msq, lsq, realignment_token);
4734
4735    For the case of dr_explicit_realign:
4736     loop:
4737       msq = *(floor(p));        # load in loop
4738       p' = p + (VS-1);
4739       lsq = *(floor(p'));       # load in loop
4740       result = realign_load (msq, lsq, realignment_token);
4741
4742    Input:
4743    STMT - (scalar) load stmt to be vectorized. This load accesses
4744           a memory location that may be unaligned.
4745    GSI - place where new code is to be inserted.
4746    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4747                               is used.
        INIT_ADDR - if not NULL_TREE, the initial address of the access as
                    already computed by the caller.  Passing it selects the
                    COMPUTE_IN_LOOP mode described below, which is only
                    accepted for dr_explicit_realign.
4748
4749    Output:
4750    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4751                        target hook, if defined.
        AT_LOOP - if non-NULL, *AT_LOOP is set to the loop selected for the
                  initial vector load (NULL when STMT has no loop_vinfo).
4752    Return value - the result of the loop-header phi node for
        dr_explicit_realign_optimized, and NULL_TREE for
        dr_explicit_realign.  */
4753
4754 tree
4755 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4756                         tree *realignment_token,
4757                         enum dr_alignment_support alignment_support_scheme,
4758                         tree init_addr,
4759                         struct loop **at_loop)
4760 {
4761   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4762   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4763   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4764   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4765   struct loop *loop = NULL;
4766   edge pe = NULL;
4767   tree scalar_dest = gimple_assign_lhs (stmt);
4768   tree vec_dest;
4769   gimple inc;
4770   tree ptr;
4771   tree data_ref;
4772   basic_block new_bb;
4773   tree msq_init = NULL_TREE;
4774   tree new_temp;
4775   gphi *phi_stmt;
4776   tree msq = NULL_TREE;
4777   gimple_seq stmts = NULL;
4778   bool inv_p;
4779   bool compute_in_loop = false;
4780   bool nested_in_vect_loop = false;
4781   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4782   struct loop *loop_for_initial_load = NULL;
4783
       /* LOOP and NESTED_IN_VECT_LOOP keep their defaults (NULL / false) when
          STMT has no loop_vinfo.  */
4784   if (loop_vinfo)
4785     {
4786       loop = LOOP_VINFO_LOOP (loop_vinfo);
4787       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4788     }
4789
4790   gcc_assert (alignment_support_scheme == dr_explicit_realign
4791               || alignment_support_scheme == dr_explicit_realign_optimized);
4792
4793   /* We need to generate three things:
4794      1. the misalignment computation
4795      2. the extra vector load (for the optimized realignment scheme).
4796      3. the phi node for the two vectors from which the realignment is
4797       done (for the optimized realignment scheme).  */
4798
4799   /* 1. Determine where to generate the misalignment computation.
4800
4801      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4802      calculation will be generated by this function, outside the loop (in the
4803      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4804      caller, inside the loop.
4805
4806      Background: If the misalignment remains fixed throughout the iterations of
4807      the loop, then both realignment schemes are applicable, and also the
4808      misalignment computation can be done outside LOOP.  This is because we are
4809      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4810      are a multiple of VS (the Vector Size), and therefore the misalignment in
4811      different vectorized LOOP iterations is always the same.
4812      The problem arises only if the memory access is in an inner-loop nested
4813      inside LOOP, which is now being vectorized using outer-loop vectorization.
4814      This is the only case when the misalignment of the memory access may not
4815      remain fixed throughout the iterations of the inner-loop (as explained in
4816      detail in vect_supportable_dr_alignment).  In this case, not only is the
4817      optimized realignment scheme not applicable, but also the misalignment
4818      computation (and generation of the realignment token that is passed to
4819      REALIGN_LOAD) have to be done inside the loop.
4820
4821      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4822      or not, which in turn determines if the misalignment is computed inside
4823      the inner-loop, or outside LOOP.  */
4824
       /* Besides a caller-provided INIT_ADDR, the absence of a loop_vinfo also
          selects the COMPUTE_IN_LOOP mode.  */
4825   if (init_addr != NULL_TREE || !loop_vinfo)
4826     {
4827       compute_in_loop = true;
4828       gcc_assert (alignment_support_scheme == dr_explicit_realign);
4829     }
4830
4831
4832   /* 2. Determine where to generate the extra vector load.
4833
4834      For the optimized realignment scheme, instead of generating two vector
4835      loads in each iteration, we generate a single extra vector load in the
4836      preheader of the loop, and in each iteration reuse the result of the
4837      vector load from the previous iteration.  In case the memory access is in
4838      an inner-loop nested inside LOOP, which is now being vectorized using
4839      outer-loop vectorization, we need to determine whether this initial vector
4840      load should be generated at the preheader of the inner-loop, or can be
4841      generated at the preheader of LOOP.  If the memory access has no evolution
4842      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4843      to be generated inside LOOP (in the preheader of the inner-loop).  */
4844
4845   if (nested_in_vect_loop)
4846     {
4847       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4848       bool invariant_in_outerloop =
4849             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4850       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4851     }
4852   else
4853     loop_for_initial_load = loop;
4854   if (at_loop)
4855     *at_loop = loop_for_initial_load;
4856
       /* PE stays NULL when no loop was selected above.  */
4857   if (loop_for_initial_load)
4858     pe = loop_preheader_edge (loop_for_initial_load);
4859
4860   /* 3. For the case of the optimized realignment, create the first vector
4861       load at the loop preheader.  */
4862
4863   if (alignment_support_scheme == dr_explicit_realign_optimized)
4864     {
4865       /* Create msq_init = *(floor(p1)) in the loop preheader  */
4866       gassign *new_stmt;
4867
4868       gcc_assert (!compute_in_loop);
4869       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4870       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4871                                       NULL_TREE, &init_addr, NULL, &inc,
4872                                       true, &inv_p);
           /* Compute floor(p1): AND the pointer with the negated alignment
              unit of VECTYPE.  */
4873       new_temp = copy_ssa_name (ptr);
4874       new_stmt = gimple_build_assign
4875                    (new_temp, BIT_AND_EXPR, ptr,
4876                     build_int_cst (TREE_TYPE (ptr),
4877                                    -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
           /* NOTE(review): presumably a non-NULL result would mean the
              insertion created a new basic block on PE -- confirm.  */
4878       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4879       gcc_assert (!new_bb);
           /* Load a whole vector from the floor-aligned address, using the
              alias pointer type of the original reference.  */
4880       data_ref
4881         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4882                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4883       new_stmt = gimple_build_assign (vec_dest, data_ref);
4884       new_temp = make_ssa_name (vec_dest, new_stmt);
4885       gimple_assign_set_lhs (new_stmt, new_temp);
           /* NOTE(review): PE was already used unconditionally for the
              BIT_AND_EXPR insertion above, so the else arm below looks
              unreachable on this path -- confirm.  */
4886       if (pe)
4887         {
4888           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4889           gcc_assert (!new_bb);
4890         }
4891       else
4892          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4893
4894       msq_init = gimple_assign_lhs (new_stmt);
4895     }
4896
4897   /* 4. Create realignment token using a target builtin, if available.
4898       It is done either inside the containing loop, or before LOOP (as
4899       determined above).  */
4900
4901   if (targetm.vectorize.builtin_mask_for_load)
4902     {
4903       gcall *new_stmt;
4904       tree builtin_decl;
4905
4906       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
4907       if (!init_addr)
4908         {
4909           /* Generate the INIT_ADDR computation outside LOOP.  */
4910           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4911                                                         NULL_TREE, loop);
4912           if (loop)
4913             {
4914               pe = loop_preheader_edge (loop);
4915               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4916               gcc_assert (!new_bb);
4917             }
4918           else
4919              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4920         }
4921
           /* Build the call token = builtin (INIT_ADDR); the destination
              variable takes the call's return type.  */
4922       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4923       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4924       vec_dest =
4925         vect_create_destination_var (scalar_dest,
4926                                      gimple_call_return_type (new_stmt));
4927       new_temp = make_ssa_name (vec_dest, new_stmt);
4928       gimple_call_set_lhs (new_stmt, new_temp);
4929
4930       if (compute_in_loop)
4931         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4932       else
4933         {
4934           /* Generate the misalignment computation outside LOOP.  */
4935           pe = loop_preheader_edge (loop);
4936           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4937           gcc_assert (!new_bb);
4938         }
4939
4940       *realignment_token = gimple_call_lhs (new_stmt);
4941
4942       /* The result of the CALL_EXPR to this builtin is determined from
4943          the value of the parameter and no global variables are touched
4944          which makes the builtin a "const" function.  Requiring the
4945          builtin to have the "const" attribute makes it unnecessary
4946          to call mark_call_clobbered.  */
4947       gcc_assert (TREE_READONLY (builtin_decl));
4948     }
4949
       /* No phi node is created for dr_explicit_realign; MSQ is still
          NULL_TREE at this point.  */
4950   if (alignment_support_scheme == dr_explicit_realign)
4951     return msq;
4952
4953   gcc_assert (!compute_in_loop);
4954   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4955
4956
4957   /* 5. Create msq = phi <msq_init, lsq> in loop  */
4958
       /* Only the preheader argument (MSQ_INIT) is added here; per the function
          comment, the caller sets up the remaining phi argument.  */
4959   pe = loop_preheader_edge (containing_loop);
4960   vec_dest = vect_create_destination_var (scalar_dest, vectype);
4961   msq = make_ssa_name (vec_dest);
4962   phi_stmt = create_phi_node (msq, containing_loop->header);
4963   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4964
4965   return msq;
4966 }
4967
4968
4969 /* Function vect_grouped_load_supported.
4970
4971    Returns TRUE if even and odd permutations are supported,
4972    and FALSE otherwise.  */
4973
4974 bool
4975 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4976 {
4977   machine_mode mode = TYPE_MODE (vectype);
4978
4979   /* vect_permute_load_chain requires the group size to be equal to 3 or
4980      be a power of two.  */
4981   if (count != 3 && exact_log2 (count) == -1)
4982     {
4983       if (dump_enabled_p ())
4984         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4985                          "the size of the group of accesses"
4986                          " is not a power of 2 or not equal to 3\n");
4987       return false;
4988     }
4989
4990   /* Check that the permutation is supported.  */
4991   if (VECTOR_MODE_P (mode))
4992     {
4993       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
4994       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4995
4996       if (count == 3)
4997         {
4998           unsigned int k;
4999           for (k = 0; k < 3; k++)
5000             {
5001               for (i = 0; i < nelt; i++)
5002                 if (3 * i + k < 2 * nelt)
5003                   sel[i] = 3 * i + k;
5004                 else
5005                   sel[i] = 0;
5006               if (!can_vec_perm_p (mode, false, sel))
5007                 {
5008                   if (dump_enabled_p ())
5009                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5010                                      "shuffle of 3 loads is not supported by"
5011                                      " target\n");
5012                   return false;
5013                 }
5014               for (i = 0, j = 0; i < nelt; i++)
5015                 if (3 * i + k < 2 * nelt)
5016                   sel[i] = i;
5017                 else
5018                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5019               if (!can_vec_perm_p (mode, false, sel))
5020                 {
5021                   if (dump_enabled_p ())
5022                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5023                                      "shuffle of 3 loads is not supported by"
5024                                      " target\n");
5025                   return false;
5026                 }
5027             }
5028           return true;
5029         }
5030       else
5031         {
5032           /* If length is not equal to 3 then only power of 2 is supported.  */
5033           gcc_assert (exact_log2 (count) != -1);
5034           for (i = 0; i < nelt; i++)
5035             sel[i] = i * 2;
5036           if (can_vec_perm_p (mode, false, sel))
5037             {
5038               for (i = 0; i < nelt; i++)
5039                 sel[i] = i * 2 + 1;
5040               if (can_vec_perm_p (mode, false, sel))
5041                 return true;
5042             }
5043         }
5044     }
5045
5046   if (dump_enabled_p ())
5047     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5048                      "extract even/odd not supported by target\n");
5049   return false;
5050 }
5051
5052 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5053    type VECTYPE.  */
5054
5055 bool
5056 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5057 {
5058   return vect_lanes_optab_supported_p ("vec_load_lanes",
5059                                        vec_load_lanes_optab,
5060                                        vectype, count);
5061 }
5062
5063 /* Function vect_permute_load_chain.
5064
5065    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5066    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5067    the input data correctly.  Return the final references for loads in
5068    RESULT_CHAIN.
5069
5070    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5071    The input is 4 vectors each containing 8 elements. We assign a number to each
5072    element, the input sequence is:
5073
5074    1st vec:   0  1  2  3  4  5  6  7
5075    2nd vec:   8  9 10 11 12 13 14 15
5076    3rd vec:  16 17 18 19 20 21 22 23
5077    4th vec:  24 25 26 27 28 29 30 31
5078
5079    The output sequence should be:
5080
5081    1st vec:  0 4  8 12 16 20 24 28
5082    2nd vec:  1 5  9 13 17 21 25 29
5083    3rd vec:  2 6 10 14 18 22 26 30
5084    4th vec:  3 7 11 15 19 23 27 31
5085
5086    i.e., the first output vector should contain the first elements of each
5087    interleaving group, etc.
5088
5089    We use extract_even/odd instructions to create such output.  The input of
5090    each extract_even/odd operation is two vectors
5091    1st vec    2nd vec
5092    0 1 2 3    4 5 6 7
5093
5094    and the output is the vector of extracted even/odd elements.  The output of
5095    extract_even will be:   0 2 4 6
5096    and of extract_odd:     1 3 5 7
5097
5098
5099    The permutation is done in log LENGTH stages.  In each stage extract_even
5100    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5101    their order.  In our example,
5102
5103    E1: extract_even (1st vec, 2nd vec)
5104    E2: extract_odd (1st vec, 2nd vec)
5105    E3: extract_even (3rd vec, 4th vec)
5106    E4: extract_odd (3rd vec, 4th vec)
5107
5108    The output for the first stage will be:
5109
5110    E1:  0  2  4  6  8 10 12 14
5111    E2:  1  3  5  7  9 11 13 15
5112    E3: 16 18 20 22 24 26 28 30
5113    E4: 17 19 21 23 25 27 29 31
5114
5115    In order to proceed and create the correct sequence for the next stage (or
5116    for the correct output, if the second stage is the last one, as in our
5117    example), we first put the output of extract_even operation and then the
5118    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5119    The input for the second stage is:
5120
5121    1st vec (E1):  0  2  4  6  8 10 12 14
5122    2nd vec (E3): 16 18 20 22 24 26 28 30
5123    3rd vec (E2):  1  3  5  7  9 11 13 15
5124    4th vec (E4): 17 19 21 23 25 27 29 31
5125
5126    The output of the second stage:
5127
5128    E1: 0 4  8 12 16 20 24 28
5129    E2: 2 6 10 14 18 22 26 30
5130    E3: 1 5  9 13 17 21 25 29
5131    E4: 3 7 11 15 19 23 27 31
5132
5133    And RESULT_CHAIN after reordering:
5134
5135    1st vec (E1):  0 4  8 12 16 20 24 28
5136    2nd vec (E3):  1 5  9 13 17 21 25 29
5137    3rd vec (E2):  2 6 10 14 18 22 26 30
5138    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5139
5140 static void
5141 vect_permute_load_chain (vec<tree> dr_chain,
5142                          unsigned int length,
5143                          gimple stmt,
5144                          gimple_stmt_iterator *gsi,
5145                          vec<tree> *result_chain)
5146 {
5147   tree data_ref, first_vect, second_vect;
5148   tree perm_mask_even, perm_mask_odd;
5149   tree perm3_mask_low, perm3_mask_high;
5150   gimple perm_stmt;
5151   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5152   unsigned int i, j, log_length = exact_log2 (length);
5153   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5154   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5155
5156   result_chain->quick_grow (length);
5157   memcpy (result_chain->address (), dr_chain.address (),
5158           length * sizeof (tree));
5159
5160   if (length == 3)
5161     {
5162       unsigned int k;
5163
5164       for (k = 0; k < 3; k++)
5165         {
5166           for (i = 0; i < nelt; i++)
5167             if (3 * i + k < 2 * nelt)
5168               sel[i] = 3 * i + k;
5169             else
5170               sel[i] = 0;
5171           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5172
5173           for (i = 0, j = 0; i < nelt; i++)
5174             if (3 * i + k < 2 * nelt)
5175               sel[i] = i;
5176             else
5177               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5178
5179           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5180
5181           first_vect = dr_chain[0];
5182           second_vect = dr_chain[1];
5183
5184           /* Create interleaving stmt (low part of):
5185              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5186                                                              ...}>  */
5187           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5188           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5189                                            second_vect, perm3_mask_low);
5190           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5191
5192           /* Create interleaving stmt (high part of):
5193              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5194                                                               ...}>  */
5195           first_vect = data_ref;
5196           second_vect = dr_chain[2];
5197           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5198           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5199                                            second_vect, perm3_mask_high);
5200           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5201           (*result_chain)[k] = data_ref;
5202         }
5203     }
5204   else
5205     {
5206       /* If length is not equal to 3 then only power of 2 is supported.  */
5207       gcc_assert (exact_log2 (length) != -1);
5208
5209       for (i = 0; i < nelt; ++i)
5210         sel[i] = i * 2;
5211       perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5212
5213       for (i = 0; i < nelt; ++i)
5214         sel[i] = i * 2 + 1;
5215       perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5216
5217       for (i = 0; i < log_length; i++)
5218         {
5219           for (j = 0; j < length; j += 2)
5220             {
5221               first_vect = dr_chain[j];
5222               second_vect = dr_chain[j+1];
5223
5224               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5225               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5226               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5227                                                first_vect, second_vect,
5228                                                perm_mask_even);
5229               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5230               (*result_chain)[j/2] = data_ref;
5231
5232               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5233               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5234               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5235                                                first_vect, second_vect,
5236                                                perm_mask_odd);
5237               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5238               (*result_chain)[j/2+length/2] = data_ref;
5239             }
5240           memcpy (dr_chain.address (), result_chain->address (),
5241                   length * sizeof (tree));
5242         }
5243     }
5244 }
5245
5246 /* Function vect_shift_permute_load_chain.
5247
5248    Given a chain of loads in DR_CHAIN of LENGTH 3 or a power of 2, generate
5249    sequence of stmts to reorder the input data accordingly.
5250    Return the final references for loads in RESULT_CHAIN.
5251    Return true if succeeded, false otherwise.
5252
5253    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5254    The input is 3 vectors each containing 8 elements.  We assign a
5255    number to each element, the input sequence is:
5256
5257    1st vec:   0  1  2  3  4  5  6  7
5258    2nd vec:   8  9 10 11 12 13 14 15
5259    3rd vec:  16 17 18 19 20 21 22 23
5260
5261    The output sequence should be:
5262
5263    1st vec:  0 3 6  9 12 15 18 21
5264    2nd vec:  1 4 7 10 13 16 19 22
5265    3rd vec:  2 5 8 11 14 17 20 23
5266
5267    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5268
5269    First we shuffle all 3 vectors to get correct elements order:
5270
5271    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5272    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5273    3rd vec:  (16 19 22) (17 20 23) (18 21)
5274
5275    Next we unite and shift vector 3 times:
5276
5277    1st step:
5278      shift right by 6 the concatenation of:
5279      "1st vec" and  "2nd vec"
5280        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5281      "2nd vec" and  "3rd vec"
5282        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5283      "3rd vec" and  "1st vec"
5284        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5285                              | New vectors                   |
5286
5287      So that now new vectors are:
5288
5289      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5290      2nd vec:  (10 13) (16 19 22) (17 20 23)
5291      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5292
5293    2nd step:
5294      shift right by 5 the concatenation of:
5295      "1st vec" and  "3rd vec"
5296        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5297      "2nd vec" and  "1st vec"
5298        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5299      "3rd vec" and  "2nd vec"
5300        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5301                           | New vectors                   |
5302
5303      So that now new vectors are:
5304
5305      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5306      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5307      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5308
5309    3rd step:
5310      shift right by 5 the concatenation of:
5311      "1st vec" and  "1st vec"
5312        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5313      shift right by 3 the concatenation of:
5314      "2nd vec" and  "2nd vec"
5315                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5316                           | New vectors                   |
5317
5318      So that now all vectors are READY:
5319      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5320      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5321      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5322
5323    This algorithm is faster than one in vect_permute_load_chain if:
5324      1.  "shift of a concatenation" is faster than general permutation.
5325          This is usually so.
5326      2.  The TARGET machine can't execute vector instructions in parallel.
5327          This is because each step of the algorithm depends on previous.
5328          The algorithm in vect_permute_load_chain is much more parallel.
5329
5330    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5331 */
5332
5333 static bool
5334 vect_shift_permute_load_chain (vec<tree> dr_chain,
5335                                unsigned int length,
5336                                gimple stmt,
5337                                gimple_stmt_iterator *gsi,
5338                                vec<tree> *result_chain)
5339 {
5340   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5341   tree perm2_mask1, perm2_mask2, perm3_mask;
5342   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5343   gimple perm_stmt;
5344
5345   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5346   unsigned int i;
5347   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5348   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5349   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5350   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5351
5352   result_chain->quick_grow (length);
5353   memcpy (result_chain->address (), dr_chain.address (),
5354           length * sizeof (tree));
5355
5356   if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5357     {
5358       unsigned int j, log_length = exact_log2 (length);
5359       for (i = 0; i < nelt / 2; ++i)
5360         sel[i] = i * 2;
5361       for (i = 0; i < nelt / 2; ++i)
5362         sel[nelt / 2 + i] = i * 2 + 1;
5363       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5364         {
5365           if (dump_enabled_p ())
5366             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5367                              "shuffle of 2 fields structure is not \
5368                               supported by target\n");
5369           return false;
5370         }
5371       perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5372
5373       for (i = 0; i < nelt / 2; ++i)
5374         sel[i] = i * 2 + 1;
5375       for (i = 0; i < nelt / 2; ++i)
5376         sel[nelt / 2 + i] = i * 2;
5377       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5378         {
5379           if (dump_enabled_p ())
5380             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5381                              "shuffle of 2 fields structure is not \
5382                               supported by target\n");
5383           return false;
5384         }
5385       perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5386
5387       /* Generating permutation constant to shift all elements.
5388          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5389       for (i = 0; i < nelt; i++)
5390         sel[i] = nelt / 2 + i;
5391       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5392         {
5393           if (dump_enabled_p ())
5394             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5395                              "shift permutation is not supported by target\n");
5396           return false;
5397         }
5398       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5399
5400       /* Generating permutation constant to select vector from 2.
5401          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5402       for (i = 0; i < nelt / 2; i++)
5403         sel[i] = i;
5404       for (i = nelt / 2; i < nelt; i++)
5405         sel[i] = nelt + i;
5406       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5407         {
5408           if (dump_enabled_p ())
5409             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5410                              "select is not supported by target\n");
5411           return false;
5412         }
5413       select_mask = vect_gen_perm_mask_checked (vectype, sel);
5414
5415       for (i = 0; i < log_length; i++)
5416         {
5417           for (j = 0; j < length; j += 2)
5418             {
5419               first_vect = dr_chain[j];
5420               second_vect = dr_chain[j + 1];
5421
5422               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5423               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5424                                                first_vect, first_vect,
5425                                                perm2_mask1);
5426               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5427               vect[0] = data_ref;
5428
5429               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5430               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5431                                                second_vect, second_vect,
5432                                                perm2_mask2);
5433               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5434               vect[1] = data_ref;
5435
5436               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5437               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5438                                                vect[0], vect[1], shift1_mask);
5439               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5440               (*result_chain)[j/2 + length/2] = data_ref;
5441
5442               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5443               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5444                                                vect[0], vect[1], select_mask);
5445               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5446               (*result_chain)[j/2] = data_ref;
5447             }
5448           memcpy (dr_chain.address (), result_chain->address (),
5449                   length * sizeof (tree));
5450         }
5451       return true;
5452     }
5453   if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5454     {
5455       unsigned int k = 0, l = 0;
5456
5457       /* Generating permutation constant to get all elements in rigth order.
5458          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5459       for (i = 0; i < nelt; i++)
5460         {
5461           if (3 * k + (l % 3) >= nelt)
5462             {
5463               k = 0;
5464               l += (3 - (nelt % 3));
5465             }
5466           sel[i] = 3 * k + (l % 3);
5467           k++;
5468         }
5469       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5470         {
5471           if (dump_enabled_p ())
5472             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5473                              "shuffle of 3 fields structure is not \
5474                               supported by target\n");
5475           return false;
5476         }
5477       perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5478
5479       /* Generating permutation constant to shift all elements.
5480          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5481       for (i = 0; i < nelt; i++)
5482         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5483       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5484         {
5485           if (dump_enabled_p ())
5486             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5487                              "shift permutation is not supported by target\n");
5488           return false;
5489         }
5490       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5491
5492       /* Generating permutation constant to shift all elements.
5493          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5494       for (i = 0; i < nelt; i++)
5495         sel[i] = 2 * (nelt / 3) + 1 + i;
5496       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5497         {
5498           if (dump_enabled_p ())
5499             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5500                              "shift permutation is not supported by target\n");
5501           return false;
5502         }
5503       shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5504
5505       /* Generating permutation constant to shift all elements.
5506          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5507       for (i = 0; i < nelt; i++)
5508         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5509       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5510         {
5511           if (dump_enabled_p ())
5512             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5513                              "shift permutation is not supported by target\n");
5514           return false;
5515         }
5516       shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5517
5518       /* Generating permutation constant to shift all elements.
5519          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5520       for (i = 0; i < nelt; i++)
5521         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5522       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5523         {
5524           if (dump_enabled_p ())
5525             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5526                              "shift permutation is not supported by target\n");
5527           return false;
5528         }
5529       shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5530
5531       for (k = 0; k < 3; k++)
5532         {
5533           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5534           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5535                                            dr_chain[k], dr_chain[k],
5536                                            perm3_mask);
5537           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5538           vect[k] = data_ref;
5539         }
5540
5541       for (k = 0; k < 3; k++)
5542         {
5543           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5544           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5545                                            vect[k % 3], vect[(k + 1) % 3],
5546                                            shift1_mask);
5547           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5548           vect_shift[k] = data_ref;
5549         }
5550
5551       for (k = 0; k < 3; k++)
5552         {
5553           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5554           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5555                                            vect_shift[(4 - k) % 3],
5556                                            vect_shift[(3 - k) % 3],
5557                                            shift2_mask);
5558           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5559           vect[k] = data_ref;
5560         }
5561
5562       (*result_chain)[3 - (nelt % 3)] = vect[2];
5563
5564       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5565       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5566                                        vect[0], shift3_mask);
5567       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5568       (*result_chain)[nelt % 3] = data_ref;
5569
5570       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5571       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5572                                        vect[1], shift4_mask);
5573       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5574       (*result_chain)[0] = data_ref;
5575       return true;
5576     }
5577   return false;
5578 }
5579
5580 /* Function vect_transform_grouped_load.
5581
5582    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5583    to perform their permutation and ascribe the result vectorized statements to
5584    the scalar statements.
5585 */
5586
5587 void
5588 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5589                              gimple_stmt_iterator *gsi)
5590 {
5591   machine_mode mode;
5592   vec<tree> result_chain = vNULL;
5593
5594   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5595      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5596      vectors, that are ready for vector computation.  */
5597   result_chain.create (size);
5598
5599   /* If reassociation width for vector type is 2 or greater target machine can
5600      execute 2 or more vector instructions in parallel.  Otherwise try to
5601      get chain for loads group using vect_shift_permute_load_chain.  */
5602   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5603   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5604       || exact_log2 (size) != -1
5605       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5606                                          gsi, &result_chain))
5607     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5608   vect_record_grouped_load_vectors (stmt, result_chain);
5609   result_chain.release ();
5610 }
5611
5612 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5613    generated as part of the vectorization of STMT.  Assign the statement
5614    for each vector to the associated scalar statement.  */
5615
5616 void
5617 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5618 {
5619   gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5620   gimple next_stmt, new_stmt;
5621   unsigned int i, gap_count;
5622   tree tmp_data_ref;
5623
5624   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5625      Since we scan the chain starting from it's first node, their order
5626      corresponds the order of data-refs in RESULT_CHAIN.  */
5627   next_stmt = first_stmt;
5628   gap_count = 1;
5629   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5630     {
5631       if (!next_stmt)
5632         break;
5633
5634       /* Skip the gaps.  Loads created for the gaps will be removed by dead
5635        code elimination pass later.  No need to check for the first stmt in
5636        the group, since it always exists.
5637        GROUP_GAP is the number of steps in elements from the previous
5638        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5639        correspond to the gaps.  */
5640       if (next_stmt != first_stmt
5641           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5642       {
5643         gap_count++;
5644         continue;
5645       }
5646
5647       while (next_stmt)
5648         {
5649           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5650           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5651              copies, and we put the new vector statement in the first available
5652              RELATED_STMT.  */
5653           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5654             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5655           else
5656             {
5657               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5658                 {
5659                   gimple prev_stmt =
5660                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5661                   gimple rel_stmt =
5662                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5663                   while (rel_stmt)
5664                     {
5665                       prev_stmt = rel_stmt;
5666                       rel_stmt =
5667                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5668                     }
5669
5670                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5671                     new_stmt;
5672                 }
5673             }
5674
5675           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5676           gap_count = 1;
5677           /* If NEXT_STMT accesses the same DR as the previous statement,
5678              put the same TMP_DATA_REF as its vectorized statement; otherwise
5679              get the next data-ref from RESULT_CHAIN.  */
5680           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5681             break;
5682         }
5683     }
5684 }
5685
5686 /* Function vect_force_dr_alignment_p.
5687
5688    Returns whether the alignment of a DECL can be forced to be aligned
5689    on ALIGNMENT bit boundary.  */
5690
5691 bool
5692 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5693 {
5694   if (TREE_CODE (decl) != VAR_DECL)
5695     return false;
5696
5697   if (decl_in_symtab_p (decl)
5698       && !symtab_node::get (decl)->can_increase_alignment_p ())
5699     return false;
5700
5701   if (TREE_STATIC (decl))
5702     return (alignment <= MAX_OFILE_ALIGNMENT);
5703   else
5704     return (alignment <= MAX_STACK_ALIGNMENT);
5705 }
5706
5707
5708 /* Return whether the data reference DR is supported with respect to its
5709    alignment.
5710    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5711    it is aligned, i.e., check if it is possible to vectorize it with different
5712    alignment.  */
5713
5714 enum dr_alignment_support
5715 vect_supportable_dr_alignment (struct data_reference *dr,
5716                                bool check_aligned_accesses)
5717 {
5718   gimple stmt = DR_STMT (dr);
5719   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5720   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5721   machine_mode mode = TYPE_MODE (vectype);
5722   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5723   struct loop *vect_loop = NULL;
5724   bool nested_in_vect_loop = false;
5725
5726   if (aligned_access_p (dr) && !check_aligned_accesses)
5727     return dr_aligned;
5728
5729   /* For now assume all conditional loads/stores support unaligned
5730      access without any special code.  */
5731   if (is_gimple_call (stmt)
5732       && gimple_call_internal_p (stmt)
5733       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5734           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5735     return dr_unaligned_supported;
5736
5737   if (loop_vinfo)
5738     {
5739       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5740       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5741     }
5742
5743   /* Possibly unaligned access.  */
5744
5745   /* We can choose between using the implicit realignment scheme (generating
5746      a misaligned_move stmt) and the explicit realignment scheme (generating
5747      aligned loads with a REALIGN_LOAD).  There are two variants to the
5748      explicit realignment scheme: optimized, and unoptimized.
5749      We can optimize the realignment only if the step between consecutive
5750      vector loads is equal to the vector size.  Since the vector memory
5751      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5752      is guaranteed that the misalignment amount remains the same throughout the
5753      execution of the vectorized loop.  Therefore, we can create the
5754      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5755      at the loop preheader.
5756
5757      However, in the case of outer-loop vectorization, when vectorizing a
5758      memory access in the inner-loop nested within the LOOP that is now being
5759      vectorized, while it is guaranteed that the misalignment of the
5760      vectorized memory access will remain the same in different outer-loop
5761      iterations, it is *not* guaranteed that is will remain the same throughout
5762      the execution of the inner-loop.  This is because the inner-loop advances
5763      with the original scalar step (and not in steps of VS).  If the inner-loop
5764      step happens to be a multiple of VS, then the misalignment remains fixed
5765      and we can use the optimized realignment scheme.  For example:
5766
5767       for (i=0; i<N; i++)
5768         for (j=0; j<M; j++)
5769           s += a[i+j];
5770
5771      When vectorizing the i-loop in the above example, the step between
5772      consecutive vector loads is 1, and so the misalignment does not remain
5773      fixed across the execution of the inner-loop, and the realignment cannot
5774      be optimized (as illustrated in the following pseudo vectorized loop):
5775
5776       for (i=0; i<N; i+=4)
5777         for (j=0; j<M; j++){
5778           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5779                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
5780                          // (assuming that we start from an aligned address).
5781           }
5782
5783      We therefore have to use the unoptimized realignment scheme:
5784
5785       for (i=0; i<N; i+=4)
5786           for (j=k; j<M; j+=4)
5787           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5788                            // that the misalignment of the initial address is
5789                            // 0).
5790
5791      The loop can then be vectorized as follows:
5792
5793       for (k=0; k<4; k++){
5794         rt = get_realignment_token (&vp[k]);
5795         for (i=0; i<N; i+=4){
5796           v1 = vp[i+k];
5797           for (j=k; j<M; j+=4){
5798             v2 = vp[i+j+VS-1];
5799             va = REALIGN_LOAD <v1,v2,rt>;
5800             vs += va;
5801             v1 = v2;
5802           }
5803         }
5804     } */
5805
5806   if (DR_IS_READ (dr))
5807     {
5808       bool is_packed = false;
5809       tree type = (TREE_TYPE (DR_REF (dr)));
5810
5811       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5812           && (!targetm.vectorize.builtin_mask_for_load
5813               || targetm.vectorize.builtin_mask_for_load ()))
5814         {
5815           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5816           if ((nested_in_vect_loop
5817                && (TREE_INT_CST_LOW (DR_STEP (dr))
5818                    != GET_MODE_SIZE (TYPE_MODE (vectype))))
5819               || !loop_vinfo)
5820             return dr_explicit_realign;
5821           else
5822             return dr_explicit_realign_optimized;
5823         }
5824       if (!known_alignment_for_access_p (dr))
5825         is_packed = not_size_aligned (DR_REF (dr));
5826
5827       if ((TYPE_USER_ALIGN (type) && !is_packed)
5828           || targetm.vectorize.
5829                support_vector_misalignment (mode, type,
5830                                             DR_MISALIGNMENT (dr), is_packed))
5831         /* Can't software pipeline the loads, but can at least do them.  */
5832         return dr_unaligned_supported;
5833     }
5834   else
5835     {
5836       bool is_packed = false;
5837       tree type = (TREE_TYPE (DR_REF (dr)));
5838
5839       if (!known_alignment_for_access_p (dr))
5840         is_packed = not_size_aligned (DR_REF (dr));
5841
5842      if ((TYPE_USER_ALIGN (type) && !is_packed)
5843          || targetm.vectorize.
5844               support_vector_misalignment (mode, type,
5845                                            DR_MISALIGNMENT (dr), is_packed))
5846        return dr_unaligned_supported;
5847     }
5848
5849   /* Unsupported.  */
5850   return dr_unaligned_unsupported;
5851 }