gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2015 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "hash-set.h"
  28 #include "machmode.h"
  29 #include "vec.h"
  30 #include "double-int.h"
  31 #include "input.h"
  32 #include "alias.h"
  33 #include "symtab.h"
  34 #include "wide-int.h"
  35 #include "inchash.h"
  36 #include "tree.h"
  37 #include "fold-const.h"
  38 #include "stor-layout.h"
  39 #include "tm_p.h"
  40 #include "target.h"
  41 #include "predict.h"
  42 #include "hard-reg-set.h"
  43 #include "input.h"
  44 #include "function.h"
  45 #include "dominance.h"
  46 #include "cfg.h"
  47 #include "basic-block.h"
  48 #include "gimple-pretty-print.h"
  49 #include "tree-ssa-alias.h"
  50 #include "internal-fn.h"
  51 #include "tree-eh.h"
  52 #include "gimple-expr.h"
  53 #include "is-a.h"
  54 #include "gimple.h"
  55 #include "gimplify.h"
  56 #include "gimple-iterator.h"
  57 #include "gimplify-me.h"
  58 #include "gimple-ssa.h"
  59 #include "tree-phinodes.h"
  60 #include "ssa-iterators.h"
  61 #include "stringpool.h"
  62 #include "tree-ssanames.h"
  63 #include "tree-ssa-loop-ivopts.h"
  64 #include "tree-ssa-loop-manip.h"
  65 #include "tree-ssa-loop.h"
  66 #include "dumpfile.h"
  67 #include "cfgloop.h"
  68 #include "tree-chrec.h"
  69 #include "tree-scalar-evolution.h"
  70 #include "tree-vectorizer.h"
  71 #include "diagnostic-core.h"
  72 #include "hash-map.h"
  73 #include "plugin-api.h"
  74 #include "ipa-ref.h"
  75 #include "cgraph.h"
  76 /* Need to include rtl.h, expr.h, etc. for optabs.  */
  77 #include "expr.h"
  78 #include "insn-codes.h"
  79 #include "optabs.h"
  80 #include "builtins.h"
  81 #include "varasm.h"
  82
  83 /* Return true if load- or store-lanes optab OPTAB is implemented for
  84    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  85
  86 static bool
  87 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  88                               tree vectype, unsigned HOST_WIDE_INT count)
  89 {
  90   machine_mode mode, array_mode;
  91   bool limit_p;
  92
  93   mode = TYPE_MODE (vectype);
  94   limit_p = !targetm.array_mode_supported_p (mode, count);
  95   array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
  96                               MODE_INT, limit_p);
  97
  98   if (array_mode == BLKmode)
  99     {
 100       if (dump_enabled_p ())
 101         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 102                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
 103                          GET_MODE_NAME (mode), count);
 104       return false;
 105     }
 106
 107   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
 108     {
 109       if (dump_enabled_p ())
 110         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 111                          "cannot use %s<%s><%s>\n", name,
 112                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
 113       return false;
 114     }
 115
 116   if (dump_enabled_p ())
 117     dump_printf_loc (MSG_NOTE, vect_location,
 118                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
 119                      GET_MODE_NAME (mode));
 120
 121   return true;
 122 }
 123
 124
 125 /* Return the smallest scalar part of STMT.
 126    This is used to determine the vectype of the stmt.  We generally set the
 127    vectype according to the type of the result (lhs).  For stmts whose
 128    result-type is different than the type of the arguments (e.g., demotion,
 129    promotion), vectype will be reset appropriately (later).  Note that we have
 130    to visit the smallest datatype in this function, because that determines the
 131    VF.  If the smallest datatype in the loop is present only as the rhs of a
 132    promotion operation - we'd miss it.
 133    Such a case, where a variable of this datatype does not appear in the lhs
 134    anywhere in the loop, can only occur if it's an invariant: e.g.:
 135    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 136    invariant motion.  However, we cannot rely on invariant motion to always
 137    take invariants out of the loop, and so in the case of promotion we also
 138    have to check the rhs.
 139    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 140    types.  */
 141
 142 tree
 143 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
 144                                HOST_WIDE_INT *rhs_size_unit)
 145 {
 146   tree scalar_type = gimple_expr_type (stmt);
 147   HOST_WIDE_INT lhs, rhs;
 148
 149   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 150
 151   if (is_gimple_assign (stmt)
 152       && (gimple_assign_cast_p (stmt)
 153           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 154           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 155           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 156     {
 157       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 158
 159       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 160       if (rhs < lhs)
 161         scalar_type = rhs_type;
 162     }
 163
 164   *lhs_size_unit = lhs;
 165   *rhs_size_unit = rhs;
 166   return scalar_type;
 167 }
 168
 169
 170 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 171    tested at run-time.  Return TRUE if DDR was successfully inserted.
 172    Return false if versioning is not supported.  */
 173
 174 static bool
 175 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 176 {
 177   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 178
 179   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 180     return false;
 181
 182   if (dump_enabled_p ())
 183     {
 184       dump_printf_loc (MSG_NOTE, vect_location,
 185                        "mark for run-time aliasing test between ");
 186       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
 187       dump_printf (MSG_NOTE,  " and ");
 188       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
 189       dump_printf (MSG_NOTE, "\n");
 190     }
 191
 192   if (optimize_loop_nest_for_size_p (loop))
 193     {
 194       if (dump_enabled_p ())
 195         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 196                          "versioning not supported when optimizing"
 197                          " for size.\n");
 198       return false;
 199     }
 200
 201   /* FORNOW: We don't support versioning with outer-loop vectorization.  */
 202   if (loop->inner)
 203     {
 204       if (dump_enabled_p ())
 205         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 206                          "versioning not yet supported for outer-loops.\n");
 207       return false;
 208     }
 209
 210   /* FORNOW: We don't support creating runtime alias tests for non-constant
 211      step.  */
 212   if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
 213       || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
 214     {
 215       if (dump_enabled_p ())
 216         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 217                          "versioning not yet supported for non-constant "
 218                          "step\n");
 219       return false;
 220     }
 221
 222   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 223   return true;
 224 }
 225
 226
 227 /* Function vect_analyze_data_ref_dependence.
 228
 229    Return TRUE if there (might) exist a dependence between a memory-reference
 230    DRA and a memory-reference DRB.  When versioning for alias may check a
 231    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 232    the data dependence.  */
 233
 234 static bool
 235 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 236                                   loop_vec_info loop_vinfo, int *max_vf)
 237 {
 238   unsigned int i;
 239   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 240   struct data_reference *dra = DDR_A (ddr);
 241   struct data_reference *drb = DDR_B (ddr);
 242   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 243   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 244   lambda_vector dist_v;
 245   unsigned int loop_depth;
 246
 247   /* In loop analysis all data references should be vectorizable.  */
 248   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 249       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 250     gcc_unreachable ();
 251
 252   /* Independent data accesses.  */
 253   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 254     return false;
 255
 256   if (dra == drb
 257       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 258     return false;
 259
 260   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 261      least two scalar iterations, there is always also a true dependence.
 262      As the vectorizer does not re-order loads and stores we can ignore
 263      the anti-dependence if TBAA can disambiguate both DRs similar to the
 264      case with known negative distance anti-dependences (positive
 265      distance anti-dependences would violate TBAA constraints).  */
 266   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 267        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 268       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 269                                  get_alias_set (DR_REF (drb))))
 270     return false;
 271
 272   /* Unknown data dependence.  */
 273   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 274     {
 275       /* If user asserted safelen consecutive iterations can be
 276          executed concurrently, assume independence.  */
 277       if (loop->safelen >= 2)
 278         {
 279           if (loop->safelen < *max_vf)
 280             *max_vf = loop->safelen;
 281           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 282           return false;
 283         }
 284
 285       if (STMT_VINFO_GATHER_P (stmtinfo_a)
 286           || STMT_VINFO_GATHER_P (stmtinfo_b))
 287         {
 288           if (dump_enabled_p ())
 289             {
 290               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 291                                "versioning for alias not supported for: "
 292                                "can't determine dependence between ");
 293               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 294                                  DR_REF (dra));
 295               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 296               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 297                                  DR_REF (drb));
 298               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 299             }
 300           return true;
 301         }
 302
 303       if (dump_enabled_p ())
 304         {
 305           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 306                            "versioning for alias required: "
 307                            "can't determine dependence between ");
 308           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 309                              DR_REF (dra));
 310           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 311           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 312                              DR_REF (drb));
 313           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 314         }
 315
 316       /* Add to list of ddrs that need to be tested at run-time.  */
 317       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 318     }
 319
 320   /* Known data dependence.  */
 321   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 322     {
 323       /* If user asserted safelen consecutive iterations can be
 324          executed concurrently, assume independence.  */
 325       if (loop->safelen >= 2)
 326         {
 327           if (loop->safelen < *max_vf)
 328             *max_vf = loop->safelen;
 329           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 330           return false;
 331         }
 332
 333       if (STMT_VINFO_GATHER_P (stmtinfo_a)
 334           || STMT_VINFO_GATHER_P (stmtinfo_b))
 335         {
 336           if (dump_enabled_p ())
 337             {
 338               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 339                                "versioning for alias not supported for: "
 340                                "bad dist vector for ");
 341               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 342                                  DR_REF (dra));
 343               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 344               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 345                                  DR_REF (drb));
 346               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 347             }
 348           return true;
 349         }
 350
 351       if (dump_enabled_p ())
 352         {
 353           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 354                            "versioning for alias required: "
 355                            "bad dist vector for ");
 356           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 357           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 358           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 359           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 360         }
 361       /* Add to list of ddrs that need to be tested at run-time.  */
 362       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 363     }
 364
 365   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 366   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 367     {
 368       int dist = dist_v[loop_depth];
 369
 370       if (dump_enabled_p ())
 371         dump_printf_loc (MSG_NOTE, vect_location,
 372                          "dependence distance  = %d.\n", dist);
 373
 374       if (dist == 0)
 375         {
 376           if (dump_enabled_p ())
 377             {
 378               dump_printf_loc (MSG_NOTE, vect_location,
 379                                "dependence distance == 0 between ");
 380               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 381               dump_printf (MSG_NOTE, " and ");
 382               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 383               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 384             }
 385
 386           /* When we perform grouped accesses and perform implicit CSE
 387              by detecting equal accesses and doing disambiguation with
 388              runtime alias tests like for
 389                 .. = a[i];
 390                 .. = a[i+1];
 391                 a[i] = ..;
 392                 a[i+1] = ..;
 393                 *p = ..;
 394                 .. = a[i];
 395                 .. = a[i+1];
 396              where we will end up loading { a[i], a[i+1] } once, make
 397              sure that inserting group loads before the first load and
 398              stores after the last store will do the right thing.
 399              Similar for groups like
 400                 a[i] = ...;
 401                 ... = a[i];
 402                 a[i+1] = ...;
 403              where loads from the group interleave with the store.  */
 404           if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 405               || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 406             {
 407               gimple earlier_stmt;
 408               earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 409               if (DR_IS_WRITE
 410                     (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 411                 {
 412                   if (dump_enabled_p ())
 413                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 414                                      "READ_WRITE dependence in interleaving."
 415                                      "\n");
 416                   return true;
 417                 }
 418             }
 419
 420           continue;
 421         }
 422
 423       if (dist > 0 && DDR_REVERSED_P (ddr))
 424         {
 425           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 426              reversed (to make distance vector positive), and the actual
 427              distance is negative.  */
 428           if (dump_enabled_p ())
 429             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 430                              "dependence distance negative.\n");
 431           /* Record a negative dependence distance to later limit the
 432              amount of stmt copying / unrolling we can perform.
 433              Only need to handle read-after-write dependence.  */
 434           if (DR_IS_READ (drb)
 435               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 436                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 437             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 438           continue;
 439         }
 440
 441       if (abs (dist) >= 2
 442           && abs (dist) < *max_vf)
 443         {
 444           /* The dependence distance requires reduction of the maximal
 445              vectorization factor.  */
 446           *max_vf = abs (dist);
 447           if (dump_enabled_p ())
 448             dump_printf_loc (MSG_NOTE, vect_location,
 449                              "adjusting maximal vectorization factor to %i\n",
 450                              *max_vf);
 451         }
 452
 453       if (abs (dist) >= *max_vf)
 454         {
 455           /* Dependence distance does not create dependence, as far as
 456              vectorization is concerned, in this case.  */
 457           if (dump_enabled_p ())
 458             dump_printf_loc (MSG_NOTE, vect_location,
 459                              "dependence distance >= VF.\n");
 460           continue;
 461         }
 462
 463       if (dump_enabled_p ())
 464         {
 465           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 466                        "not vectorized, possible dependence "
 467                        "between data-refs ");
 468           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 469           dump_printf (MSG_NOTE,  " and ");
 470           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 471           dump_printf (MSG_NOTE,  "\n");
 472         }
 473
 474       return true;
 475     }
 476
 477   return false;
 478 }
 479
 480 /* Function vect_analyze_data_ref_dependences.
 481
 482    Examine all the data references in the loop, and make sure there do not
 483    exist any data dependences between them.  Set *MAX_VF according to
 484    the maximum vectorization factor the data dependences allow.  */
 485
 486 bool
 487 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
 488 {
 489   unsigned int i;
 490   struct data_dependence_relation *ddr;
 491
 492   if (dump_enabled_p ())
 493     dump_printf_loc (MSG_NOTE, vect_location,
 494                      "=== vect_analyze_data_ref_dependences ===\n");
 495
 496   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 497   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 498                                 &LOOP_VINFO_DDRS (loop_vinfo),
 499                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 500     return false;
 501
 502   FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 503     if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 504       return false;
 505
 506   return true;
 507 }
 508
 509
 510 /* Function vect_slp_analyze_data_ref_dependence.
 511
 512    Return TRUE if there (might) exist a dependence between a memory-reference
 513    DRA and a memory-reference DRB.  When versioning for alias may check a
 514    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 515    the data dependence.  */
 516
 517 static bool
 518 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 519 {
 520   struct data_reference *dra = DDR_A (ddr);
 521   struct data_reference *drb = DDR_B (ddr);
 522
 523   /* We need to check dependences of statements marked as unvectorizable
 524      as well, they still can prohibit vectorization.  */
 525
 526   /* Independent data accesses.  */
 527   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 528     return false;
 529
 530   if (dra == drb)
 531     return false;
 532
 533   /* Read-read is OK.  */
 534   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 535     return false;
 536
 537   /* If dra and drb are part of the same interleaving chain consider
 538      them independent.  */
 539   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 540       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 541           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 542     return false;
 543
 544   /* Unknown data dependence.  */
 545   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 546     {
 547       if  (dump_enabled_p ())
 548         {
 549           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 550                            "can't determine dependence between ");
 551           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 552           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 553           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 554           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 555         }
 556     }
 557   else if (dump_enabled_p ())
 558     {
 559       dump_printf_loc (MSG_NOTE, vect_location,
 560                        "determined dependence between ");
 561       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 562       dump_printf (MSG_NOTE, " and ");
 563       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 564       dump_printf (MSG_NOTE,  "\n");
 565     }
 566
 567   /* We do not vectorize basic blocks with write-write dependencies.  */
 568   if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
 569     return true;
 570
 571   /* If we have a read-write dependence check that the load is before the store.
 572      When we vectorize basic blocks, vector load can be only before
 573      corresponding scalar load, and vector store can be only after its
 574      corresponding scalar store.  So the order of the acceses is preserved in
 575      case the load is before the store.  */
 576   gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 577   if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 578     {
 579       /* That only holds for load-store pairs taking part in vectorization.  */
 580       if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
 581           && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
 582         return false;
 583     }
 584
 585   return true;
 586 }
 587
 588
 589 /* Function vect_analyze_data_ref_dependences.
 590
 591    Examine all the data references in the basic-block, and make sure there
 592    do not exist any data dependences between them.  Set *MAX_VF according to
 593    the maximum vectorization factor the data dependences allow.  */
 594
 595 bool
 596 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
 597 {
 598   struct data_dependence_relation *ddr;
 599   unsigned int i;
 600
 601   if (dump_enabled_p ())
 602     dump_printf_loc (MSG_NOTE, vect_location,
 603                      "=== vect_slp_analyze_data_ref_dependences ===\n");
 604
 605   if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
 606                                 &BB_VINFO_DDRS (bb_vinfo),
 607                                 vNULL, true))
 608     return false;
 609
 610   FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
 611     if (vect_slp_analyze_data_ref_dependence (ddr))
 612       return false;
 613
 614   return true;
 615 }
 616
 617
 618 /* Function vect_compute_data_ref_alignment
 619
 620    Compute the misalignment of the data reference DR.
 621
 622    Output:
 623    1. If during the misalignment computation it is found that the data reference
 624       cannot be vectorized then false is returned.
 625    2. DR_MISALIGNMENT (DR) is defined.
 626
 627    FOR NOW: No analysis is actually performed. Misalignment is calculated
 628    only for trivial cases. TODO.  */
 629
 630 static bool
 631 vect_compute_data_ref_alignment (struct data_reference *dr)
 632 {
 633   gimple stmt = DR_STMT (dr);
 634   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 635   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 636   struct loop *loop = NULL;
 637   tree ref = DR_REF (dr);
 638   tree vectype;
 639   tree base, base_addr;
 640   bool base_aligned;
 641   tree misalign;
 642   tree aligned_to, alignment;
 643
 644   if (dump_enabled_p ())
 645     dump_printf_loc (MSG_NOTE, vect_location,
 646                      "vect_compute_data_ref_alignment:\n");
 647
 648   if (loop_vinfo)
 649     loop = LOOP_VINFO_LOOP (loop_vinfo);
 650
 651   /* Initialize misalignment to unknown.  */
 652   SET_DR_MISALIGNMENT (dr, -1);
 653
 654   /* Strided loads perform only component accesses, misalignment information
 655      is irrelevant for them.  */
 656   if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
 657     return true;
 658
 659   misalign = DR_INIT (dr);
 660   aligned_to = DR_ALIGNED_TO (dr);
 661   base_addr = DR_BASE_ADDRESS (dr);
 662   vectype = STMT_VINFO_VECTYPE (stmt_info);
 663
 664   /* In case the dataref is in an inner-loop of the loop that is being
 665      vectorized (LOOP), we use the base and misalignment information
 666      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 667      stays the same throughout the execution of the inner-loop, which is why
 668      we have to check that the stride of the dataref in the inner-loop evenly
 669      divides by the vector size.  */
 670   if (loop && nested_in_vect_loop_p (loop, stmt))
 671     {
 672       tree step = DR_STEP (dr);
 673       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
 674
 675       if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
 676         {
 677           if (dump_enabled_p ())
 678             dump_printf_loc (MSG_NOTE, vect_location,
 679                              "inner step divides the vector-size.\n");
 680           misalign = STMT_VINFO_DR_INIT (stmt_info);
 681           aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
 682           base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
 683         }
 684       else
 685         {
 686           if (dump_enabled_p ())
 687             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 688                              "inner step doesn't divide the vector-size.\n");
 689           misalign = NULL_TREE;
 690         }
 691     }
 692
 693   /* Similarly, if we're doing basic-block vectorization, we can only use
 694      base and misalignment information relative to an innermost loop if the
 695      misalignment stays the same throughout the execution of the loop.
 696      As above, this is the case if the stride of the dataref evenly divides
 697      by the vector size.  */
 698   if (!loop)
 699     {
 700       tree step = DR_STEP (dr);
 701       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
 702
 703       if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
 704         {
 705           if (dump_enabled_p ())
 706             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 707                              "SLP: step doesn't divide the vector-size.\n");
 708           misalign = NULL_TREE;
 709         }
 710     }
 711
 712   base = build_fold_indirect_ref (base_addr);
 713   alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
 714
 715   if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
 716       || !misalign)
 717     {
 718       if (dump_enabled_p ())
 719         {
 720           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 721                            "Unknown alignment for access: ");
 722           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
 723           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 724         }
 725       return true;
 726     }
 727
 728   if ((DECL_P (base)
 729        && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
 730                                 alignment) >= 0)
 731       || (TREE_CODE (base_addr) == SSA_NAME
 732           && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
 733                                                       TREE_TYPE (base_addr)))),
 734                                    alignment) >= 0)
 735       || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
 736     base_aligned = true;
 737   else
 738     base_aligned = false;
 739
 740   if (!base_aligned)
 741     {
 742       /* Do not change the alignment of global variables here if
 743          flag_section_anchors is enabled as we already generated
 744          RTL for other functions.  Most global variables should
 745          have been aligned during the IPA increase_alignment pass.  */
 746       if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
 747           || (TREE_STATIC (base) && flag_section_anchors))
 748         {
 749           if (dump_enabled_p ())
 750             {
 751               dump_printf_loc (MSG_NOTE, vect_location,
 752                                "can't force alignment of ref: ");
 753               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 754               dump_printf (MSG_NOTE, "\n");
 755             }
 756           return true;
 757         }
 758
 759       /* Force the alignment of the decl.
 760          NOTE: This is the only change to the code we make during
 761          the analysis phase, before deciding to vectorize the loop.  */
 762       if (dump_enabled_p ())
 763         {
 764           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 765           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 766           dump_printf (MSG_NOTE, "\n");
 767         }
 768
 769       ((dataref_aux *)dr->aux)->base_decl = base;
 770       ((dataref_aux *)dr->aux)->base_misaligned = true;
 771     }
 772
 773   /* If this is a backward running DR then first access in the larger
 774      vectype actually is N-1 elements before the address in the DR.
 775      Adjust misalign accordingly.  */
 776   if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
 777     {
 778       tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
 779       /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
 780          otherwise we wouldn't be here.  */
 781       offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
 782       /* PLUS because DR_STEP was negative.  */
 783       misalign = size_binop (PLUS_EXPR, misalign, offset);
 784     }
 785
 786   /* Modulo alignment.  */
 787   misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
 788
 789   if (!tree_fits_uhwi_p (misalign))
 790     {
 791       /* Negative or overflowed misalignment value.  */
 792       if (dump_enabled_p ())
 793         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 794                          "unexpected misalign value\n");
 795       return false;
 796     }
 797
 798   SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));
 799
 800   if (dump_enabled_p ())
 801     {
 802       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 803                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
 804       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 805       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 806     }
 807
 808   return true;
 809 }
 810
 811
 812 /* Function vect_compute_data_refs_alignment
 813
 814    Compute the misalignment of data references in the loop.
 815    Return FALSE if a data reference is found that cannot be vectorized.  */
 816
 817 static bool
 818 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
 819                                   bb_vec_info bb_vinfo)
 820 {
 821   vec<data_reference_p> datarefs;
 822   struct data_reference *dr;
 823   unsigned int i;
 824
 825   if (loop_vinfo)
 826     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
 827   else
 828     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
 829
 830   FOR_EACH_VEC_ELT (datarefs, i, dr)
 831     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
 832         && !vect_compute_data_ref_alignment (dr))
 833       {
 834         if (bb_vinfo)
 835           {
 836             /* Mark unsupported statement as unvectorizable.  */
 837             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
 838             continue;
 839           }
 840         else
 841           return false;
 842       }
 843
 844   return true;
 845 }
 846
 847
 848 /* Function vect_update_misalignment_for_peel
 849
 850    DR - the data reference whose misalignment is to be adjusted.
 851    DR_PEEL - the data reference whose misalignment is being made
 852              zero in the vector loop by the peel.
 853    NPEEL - the number of iterations in the peel loop if the misalignment
 854            of DR_PEEL is known at compile time.  */
 855
 856 static void
 857 vect_update_misalignment_for_peel (struct data_reference *dr,
 858                                    struct data_reference *dr_peel, int npeel)
 859 {
 860   unsigned int i;
 861   vec<dr_p> same_align_drs;
 862   struct data_reference *current_dr;
 863   int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
 864   int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
 865   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
 866   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
 867
 868  /* For interleaved data accesses the step in the loop must be multiplied by
 869      the size of the interleaving group.  */
 870   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 871     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
 872   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
 873     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
 874
 875   /* It can be assumed that the data refs with the same alignment as dr_peel
 876      are aligned in the vector loop.  */
 877   same_align_drs
 878     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
 879   FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
 880     {
 881       if (current_dr != dr)
 882         continue;
 883       gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
 884                   DR_MISALIGNMENT (dr_peel) / dr_peel_size);
 885       SET_DR_MISALIGNMENT (dr, 0);
 886       return;
 887     }
 888
 889   if (known_alignment_for_access_p (dr)
 890       && known_alignment_for_access_p (dr_peel))
 891     {
 892       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
 893       int misal = DR_MISALIGNMENT (dr);
 894       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 895       misal += negative ? -npeel * dr_size : npeel * dr_size;
 896       misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
 897       SET_DR_MISALIGNMENT (dr, misal);
 898       return;
 899     }
 900
 901   if (dump_enabled_p ())
 902     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
 903   SET_DR_MISALIGNMENT (dr, -1);
 904 }
 905
 906
 907 /* Function vect_verify_datarefs_alignment
 908
 909    Return TRUE if all data references in the loop can be
 910    handled with respect to alignment.  */
 911
 912 bool
 913 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
 914 {
 915   vec<data_reference_p> datarefs;
 916   struct data_reference *dr;
 917   enum dr_alignment_support supportable_dr_alignment;
 918   unsigned int i;
 919
 920   if (loop_vinfo)
 921     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
 922   else
 923     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
 924
 925   FOR_EACH_VEC_ELT (datarefs, i, dr)
 926     {
 927       gimple stmt = DR_STMT (dr);
 928       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 929
 930       if (!STMT_VINFO_RELEVANT_P (stmt_info))
 931         continue;
 932
 933       /* For interleaving, only the alignment of the first access matters.
 934          Skip statements marked as not vectorizable.  */
 935       if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
 936            && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
 937           || !STMT_VINFO_VECTORIZABLE (stmt_info))
 938         continue;
 939
 940       /* Strided loads perform only component accesses, alignment is
 941          irrelevant for them.  */
 942       if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
 943         continue;
 944
 945       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
 946       if (!supportable_dr_alignment)
 947         {
 948           if (dump_enabled_p ())
 949             {
 950               if (DR_IS_READ (dr))
 951                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 952                                  "not vectorized: unsupported unaligned load.");
 953               else
 954                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 955                                  "not vectorized: unsupported unaligned "
 956                                  "store.");
 957
 958               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 959                                  DR_REF (dr));
 960               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 961             }
 962           return false;
 963         }
 964       if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
 965         dump_printf_loc (MSG_NOTE, vect_location,
 966                          "Vectorizing an unaligned access.\n");
 967     }
 968   return true;
 969 }
 970
 971 /* Given an memory reference EXP return whether its alignment is less
 972    than its size.  */
 973
 974 static bool
 975 not_size_aligned (tree exp)
 976 {
 977   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
 978     return true;
 979
 980   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
 981           > get_object_alignment (exp));
 982 }
 983
 984 /* Function vector_alignment_reachable_p
 985
 986    Return true if vector alignment for DR is reachable by peeling
 987    a few loop iterations.  Return false otherwise.  */
 988
 989 static bool
 990 vector_alignment_reachable_p (struct data_reference *dr)
 991 {
 992   gimple stmt = DR_STMT (dr);
 993   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 994   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 995
 996   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 997     {
 998       /* For interleaved access we peel only if number of iterations in
 999          the prolog loop ({VF - misalignment}), is a multiple of the
1000          number of the interleaved accesses.  */
1001       int elem_size, mis_in_elements;
1002       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1003
1004       /* FORNOW: handle only known alignment.  */
1005       if (!known_alignment_for_access_p (dr))
1006         return false;
1007
1008       elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1009       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1010
1011       if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1012         return false;
1013     }
1014
1015   /* If misalignment is known at the compile time then allow peeling
1016      only if natural alignment is reachable through peeling.  */
1017   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1018     {
1019       HOST_WIDE_INT elmsize =
1020                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1021       if (dump_enabled_p ())
1022         {
1023           dump_printf_loc (MSG_NOTE, vect_location,
1024                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1025           dump_printf (MSG_NOTE,
1026                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1027         }
1028       if (DR_MISALIGNMENT (dr) % elmsize)
1029         {
1030           if (dump_enabled_p ())
1031             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1032                              "data size does not divide the misalignment.\n");
1033           return false;
1034         }
1035     }
1036
1037   if (!known_alignment_for_access_p (dr))
1038     {
1039       tree type = TREE_TYPE (DR_REF (dr));
1040       bool is_packed = not_size_aligned (DR_REF (dr));
1041       if (dump_enabled_p ())
1042         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1043                          "Unknown misalignment, is_packed = %d\n",is_packed);
1044       if ((TYPE_USER_ALIGN (type) && !is_packed)
1045           || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1046         return true;
1047       else
1048         return false;
1049     }
1050
1051   return true;
1052 }
1053
1054
1055 /* Calculate the cost of the memory access represented by DR.  */
1056
1057 static void
1058 vect_get_data_access_cost (struct data_reference *dr,
1059                            unsigned int *inside_cost,
1060                            unsigned int *outside_cost,
1061                            stmt_vector_for_cost *body_cost_vec)
1062 {
1063   gimple stmt = DR_STMT (dr);
1064   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1065   int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1066   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1067   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1068   int ncopies = vf / nunits;
1069
1070   if (DR_IS_READ (dr))
1071     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1072                         NULL, body_cost_vec, false);
1073   else
1074     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1075
1076   if (dump_enabled_p ())
1077     dump_printf_loc (MSG_NOTE, vect_location,
1078                      "vect_get_data_access_cost: inside_cost = %d, "
1079                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1080 }
1081
1082
1083 /* Insert DR into peeling hash table with NPEEL as key.  */
1084
1085 static void
1086 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1087                           int npeel)
1088 {
1089   struct _vect_peel_info elem, *slot;
1090   _vect_peel_info **new_slot;
1091   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1092
1093   elem.npeel = npeel;
1094   slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1095   if (slot)
1096     slot->count++;
1097   else
1098     {
1099       slot = XNEW (struct _vect_peel_info);
1100       slot->npeel = npeel;
1101       slot->dr = dr;
1102       slot->count = 1;
1103       new_slot
1104         = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1105       *new_slot = slot;
1106     }
1107
1108   if (!supportable_dr_alignment
1109       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1110     slot->count += VECT_MAX_COST;
1111 }
1112
1113
1114 /* Traverse peeling hash table to find peeling option that aligns maximum
1115    number of data accesses.  */
1116
1117 int
1118 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1119                                      _vect_peel_extended_info *max)
1120 {
1121   vect_peel_info elem = *slot;
1122
1123   if (elem->count > max->peel_info.count
1124       || (elem->count == max->peel_info.count
1125           && max->peel_info.npeel > elem->npeel))
1126     {
1127       max->peel_info.npeel = elem->npeel;
1128       max->peel_info.count = elem->count;
1129       max->peel_info.dr = elem->dr;
1130     }
1131
1132   return 1;
1133 }
1134
1135
1136 /* Traverse peeling hash table and calculate cost for each peeling option.
1137    Find the one with the lowest cost.  */
1138
1139 int
1140 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1141                                    _vect_peel_extended_info *min)
1142 {
1143   vect_peel_info elem = *slot;
1144   int save_misalignment, dummy;
1145   unsigned int inside_cost = 0, outside_cost = 0, i;
1146   gimple stmt = DR_STMT (elem->dr);
1147   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1148   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1149   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1150   struct data_reference *dr;
1151   stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1152   int single_iter_cost;
1153
1154   prologue_cost_vec.create (2);
1155   body_cost_vec.create (2);
1156   epilogue_cost_vec.create (2);
1157
1158   FOR_EACH_VEC_ELT (datarefs, i, dr)
1159     {
1160       stmt = DR_STMT (dr);
1161       stmt_info = vinfo_for_stmt (stmt);
1162       /* For interleaving, only the alignment of the first access
1163          matters.  */
1164       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1165           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1166         continue;
1167
1168       save_misalignment = DR_MISALIGNMENT (dr);
1169       vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1170       vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1171                                  &body_cost_vec);
1172       SET_DR_MISALIGNMENT (dr, save_misalignment);
1173     }
1174
1175   single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1176   outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1177                                                &dummy, single_iter_cost,
1178                                                &prologue_cost_vec,
1179                                                &epilogue_cost_vec);
1180
1181   /* Prologue and epilogue costs are added to the target model later.
1182      These costs depend only on the scalar iteration cost, the
1183      number of peeling iterations finally chosen, and the number of
1184      misaligned statements.  So discard the information found here.  */
1185   prologue_cost_vec.release ();
1186   epilogue_cost_vec.release ();
1187
1188   if (inside_cost < min->inside_cost
1189       || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1190     {
1191       min->inside_cost = inside_cost;
1192       min->outside_cost = outside_cost;
1193       min->body_cost_vec.release ();
1194       min->body_cost_vec = body_cost_vec;
1195       min->peel_info.dr = elem->dr;
1196       min->peel_info.npeel = elem->npeel;
1197     }
1198   else
1199     body_cost_vec.release ();
1200
1201   return 1;
1202 }
1203
1204
1205 /* Choose best peeling option by traversing peeling hash table and either
1206    choosing an option with the lowest cost (if cost model is enabled) or the
1207    option that aligns as many accesses as possible.  */
1208
1209 static struct data_reference *
1210 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1211                                        unsigned int *npeel,
1212                                        stmt_vector_for_cost *body_cost_vec)
1213 {
1214    struct _vect_peel_extended_info res;
1215
1216    res.peel_info.dr = NULL;
1217    res.body_cost_vec = stmt_vector_for_cost ();
1218
1219    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1220      {
1221        res.inside_cost = INT_MAX;
1222        res.outside_cost = INT_MAX;
1223        LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1224            ->traverse <_vect_peel_extended_info *,
1225                        vect_peeling_hash_get_lowest_cost> (&res);
1226      }
1227    else
1228      {
1229        res.peel_info.count = 0;
1230        LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1231            ->traverse <_vect_peel_extended_info *,
1232                        vect_peeling_hash_get_most_frequent> (&res);
1233      }
1234
1235    *npeel = res.peel_info.npeel;
1236    *body_cost_vec = res.body_cost_vec;
1237    return res.peel_info.dr;
1238 }
1239
1240
1241 /* Function vect_enhance_data_refs_alignment
1242
1243    This pass will use loop versioning and loop peeling in order to enhance
1244    the alignment of data references in the loop.
1245
1246    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1247    original loop is to be vectorized.  Any other loops that are created by
1248    the transformations performed in this pass - are not supposed to be
1249    vectorized.  This restriction will be relaxed.
1250
1251    This pass will require a cost model to guide it whether to apply peeling
1252    or versioning or a combination of the two.  For example, the scheme that
1253    intel uses when given a loop with several memory accesses, is as follows:
1254    choose one memory access ('p') which alignment you want to force by doing
1255    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1256    other accesses are not necessarily aligned, or (2) use loop versioning to
1257    generate one loop in which all accesses are aligned, and another loop in
1258    which only 'p' is necessarily aligned.
1259
1260    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1261    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1262    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1263
1264    Devising a cost model is the most critical aspect of this work.  It will
1265    guide us on which access to peel for, whether to use loop versioning, how
1266    many versions to create, etc.  The cost model will probably consist of
1267    generic considerations as well as target specific considerations (on
1268    powerpc for example, misaligned stores are more painful than misaligned
1269    loads).
1270
1271    Here are the general steps involved in alignment enhancements:
1272
1273      -- original loop, before alignment analysis:
1274         for (i=0; i<N; i++){
1275           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1276           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1277         }
1278
1279      -- After vect_compute_data_refs_alignment:
1280         for (i=0; i<N; i++){
1281           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1282           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1283         }
1284
1285      -- Possibility 1: we do loop versioning:
1286      if (p is aligned) {
1287         for (i=0; i<N; i++){    # loop 1A
1288           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1289           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1290         }
1291      }
1292      else {
1293         for (i=0; i<N; i++){    # loop 1B
1294           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1295           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1296         }
1297      }
1298
1299      -- Possibility 2: we do loop peeling:
1300      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1301         x = q[i];
1302         p[i] = y;
1303      }
1304      for (i = 3; i < N; i++){   # loop 2A
1305         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1306         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1307      }
1308
1309      -- Possibility 3: combination of loop peeling and versioning:
1310      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1311         x = q[i];
1312         p[i] = y;
1313      }
1314      if (p is aligned) {
1315         for (i = 3; i<N; i++){  # loop 3A
1316           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1317           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1318         }
1319      }
1320      else {
1321         for (i = 3; i<N; i++){  # loop 3B
1322           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1323           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1324         }
1325      }
1326
1327      These loops are later passed to loop_transform to be vectorized.  The
1328      vectorizer will use the alignment information to guide the transformation
1329      (whether to generate regular loads/stores, or with special handling for
1330      misalignment).  */
1331
1332 bool
1333 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1334 {
1335   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1336   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1337   enum dr_alignment_support supportable_dr_alignment;
1338   struct data_reference *dr0 = NULL, *first_store = NULL;
1339   struct data_reference *dr;
1340   unsigned int i, j;
1341   bool do_peeling = false;
1342   bool do_versioning = false;
1343   bool stat;
1344   gimple stmt;
1345   stmt_vec_info stmt_info;
1346   unsigned int npeel = 0;
1347   bool all_misalignments_unknown = true;
1348   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1349   unsigned possible_npeel_number = 1;
1350   tree vectype;
1351   unsigned int nelements, mis, same_align_drs_max = 0;
1352   stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1353
1354   if (dump_enabled_p ())
1355     dump_printf_loc (MSG_NOTE, vect_location,
1356                      "=== vect_enhance_data_refs_alignment ===\n");
1357
1358   /* While cost model enhancements are expected in the future, the high level
1359      view of the code at this time is as follows:
1360
1361      A) If there is a misaligned access then see if peeling to align
1362         this access can make all data references satisfy
1363         vect_supportable_dr_alignment.  If so, update data structures
1364         as needed and return true.
1365
1366      B) If peeling wasn't possible and there is a data reference with an
1367         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1368         then see if loop versioning checks can be used to make all data
1369         references satisfy vect_supportable_dr_alignment.  If so, update
1370         data structures as needed and return true.
1371
1372      C) If neither peeling nor versioning were successful then return false if
1373         any data reference does not satisfy vect_supportable_dr_alignment.
1374
1375      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1376
1377      Note, Possibility 3 above (which is peeling and versioning together) is not
1378      being done at this time.  */
1379
1380   /* (1) Peeling to force alignment.  */
1381
1382   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1383      Considerations:
1384      + How many accesses will become aligned due to the peeling
1385      - How many accesses will become unaligned due to the peeling,
1386        and the cost of misaligned accesses.
1387      - The cost of peeling (the extra runtime checks, the increase
1388        in code size).  */
1389
1390   FOR_EACH_VEC_ELT (datarefs, i, dr)
1391     {
1392       stmt = DR_STMT (dr);
1393       stmt_info = vinfo_for_stmt (stmt);
1394
1395       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1396         continue;
1397
1398       /* For interleaving, only the alignment of the first access
1399          matters.  */
1400       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1401           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1402         continue;
1403
1404       /* For invariant accesses there is nothing to enhance.  */
1405       if (integer_zerop (DR_STEP (dr)))
1406         continue;
1407
1408       /* Strided loads perform only component accesses, alignment is
1409          irrelevant for them.  */
1410       if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1411         continue;
1412
1413       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1414       do_peeling = vector_alignment_reachable_p (dr);
1415       if (do_peeling)
1416         {
1417           if (known_alignment_for_access_p (dr))
1418             {
1419               unsigned int npeel_tmp;
1420               bool negative = tree_int_cst_compare (DR_STEP (dr),
1421                                                     size_zero_node) < 0;
1422
1423               /* Save info about DR in the hash table.  */
1424               if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1425                 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1426                   = new hash_table<peel_info_hasher> (1);
1427
1428               vectype = STMT_VINFO_VECTYPE (stmt_info);
1429               nelements = TYPE_VECTOR_SUBPARTS (vectype);
1430               mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1431                                                 TREE_TYPE (DR_REF (dr))));
1432               npeel_tmp = (negative
1433                            ? (mis - nelements) : (nelements - mis))
1434                   & (nelements - 1);
1435
1436               /* For multiple types, it is possible that the bigger type access
1437                  will have more than one peeling option.  E.g., a loop with two
1438                  types: one of size (vector size / 4), and the other one of
1439                  size (vector size / 8).  Vectorization factor will 8.  If both
1440                  access are misaligned by 3, the first one needs one scalar
1441                  iteration to be aligned, and the second one needs 5.  But the
1442                  the first one will be aligned also by peeling 5 scalar
1443                  iterations, and in that case both accesses will be aligned.
1444                  Hence, except for the immediate peeling amount, we also want
1445                  to try to add full vector size, while we don't exceed
1446                  vectorization factor.
1447                  We do this automtically for cost model, since we calculate cost
1448                  for every peeling option.  */
1449               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1450                 possible_npeel_number = vf /nelements;
1451
1452               /* Handle the aligned case. We may decide to align some other
1453                  access, making DR unaligned.  */
1454               if (DR_MISALIGNMENT (dr) == 0)
1455                 {
1456                   npeel_tmp = 0;
1457                   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1458                     possible_npeel_number++;
1459                 }
1460
1461               for (j = 0; j < possible_npeel_number; j++)
1462                 {
1463                   gcc_assert (npeel_tmp <= vf);
1464                   vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1465                   npeel_tmp += nelements;
1466                 }
1467
1468               all_misalignments_unknown = false;
1469               /* Data-ref that was chosen for the case that all the
1470                  misalignments are unknown is not relevant anymore, since we
1471                  have a data-ref with known alignment.  */
1472               dr0 = NULL;
1473             }
1474           else
1475             {
1476               /* If we don't know any misalignment values, we prefer
1477                  peeling for data-ref that has the maximum number of data-refs
1478                  with the same alignment, unless the target prefers to align
1479                  stores over load.  */
1480               if (all_misalignments_unknown)
1481                 {
1482                   unsigned same_align_drs
1483                     = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1484                   if (!dr0
1485                       || same_align_drs_max < same_align_drs)
1486                     {
1487                       same_align_drs_max = same_align_drs;
1488                       dr0 = dr;
1489                     }
1490                   /* For data-refs with the same number of related
1491                      accesses prefer the one where the misalign
1492                      computation will be invariant in the outermost loop.  */
1493                   else if (same_align_drs_max == same_align_drs)
1494                     {
1495                       struct loop *ivloop0, *ivloop;
1496                       ivloop0 = outermost_invariant_loop_for_expr
1497                           (loop, DR_BASE_ADDRESS (dr0));
1498                       ivloop = outermost_invariant_loop_for_expr
1499                           (loop, DR_BASE_ADDRESS (dr));
1500                       if ((ivloop && !ivloop0)
1501                           || (ivloop && ivloop0
1502                               && flow_loop_nested_p (ivloop, ivloop0)))
1503                         dr0 = dr;
1504                     }
1505
1506                   if (!first_store && DR_IS_WRITE (dr))
1507                     first_store = dr;
1508                 }
1509
1510               /* If there are both known and unknown misaligned accesses in the
1511                  loop, we choose peeling amount according to the known
1512                  accesses.  */
1513               if (!supportable_dr_alignment)
1514                 {
1515                   dr0 = dr;
1516                   if (!first_store && DR_IS_WRITE (dr))
1517                     first_store = dr;
1518                 }
1519             }
1520         }
1521       else
1522         {
1523           if (!aligned_access_p (dr))
1524             {
1525               if (dump_enabled_p ())
1526                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1527                                  "vector alignment may not be reachable\n");
1528               break;
1529             }
1530         }
1531     }
1532
1533   /* Check if we can possibly peel the loop.  */
1534   if (!vect_can_advance_ivs_p (loop_vinfo)
1535       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1536     do_peeling = false;
1537
1538   /* If we don't know how many times the peeling loop will run
1539      assume it will run VF-1 times and disable peeling if the remaining
1540      iters are less than the vectorization factor.  */
1541   if (do_peeling
1542       && all_misalignments_unknown
1543       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1544       && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1545           < 2 * (unsigned) LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1))
1546     do_peeling = false;
1547
1548   if (do_peeling
1549       && all_misalignments_unknown
1550       && vect_supportable_dr_alignment (dr0, false))
1551     {
1552       /* Check if the target requires to prefer stores over loads, i.e., if
1553          misaligned stores are more expensive than misaligned loads (taking
1554          drs with same alignment into account).  */
1555       if (first_store && DR_IS_READ (dr0))
1556         {
1557           unsigned int load_inside_cost = 0, load_outside_cost = 0;
1558           unsigned int store_inside_cost = 0, store_outside_cost = 0;
1559           unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1560           unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1561           stmt_vector_for_cost dummy;
1562           dummy.create (2);
1563
1564           vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1565                                      &dummy);
1566           vect_get_data_access_cost (first_store, &store_inside_cost,
1567                                      &store_outside_cost, &dummy);
1568
1569           dummy.release ();
1570
1571           /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1572              aligning the load DR0).  */
1573           load_inside_penalty = store_inside_cost;
1574           load_outside_penalty = store_outside_cost;
1575           for (i = 0;
1576                STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1577                           DR_STMT (first_store))).iterate (i, &dr);
1578                i++)
1579             if (DR_IS_READ (dr))
1580               {
1581                 load_inside_penalty += load_inside_cost;
1582                 load_outside_penalty += load_outside_cost;
1583               }
1584             else
1585               {
1586                 load_inside_penalty += store_inside_cost;
1587                 load_outside_penalty += store_outside_cost;
1588               }
1589
1590           /* Calculate the penalty for leaving DR0 unaligned (by
1591              aligning the FIRST_STORE).  */
1592           store_inside_penalty = load_inside_cost;
1593           store_outside_penalty = load_outside_cost;
1594           for (i = 0;
1595                STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1596                       DR_STMT (dr0))).iterate (i, &dr);
1597                i++)
1598             if (DR_IS_READ (dr))
1599               {
1600                 store_inside_penalty += load_inside_cost;
1601                 store_outside_penalty += load_outside_cost;
1602               }
1603             else
1604               {
1605                 store_inside_penalty += store_inside_cost;
1606                 store_outside_penalty += store_outside_cost;
1607               }
1608
1609           if (load_inside_penalty > store_inside_penalty
1610               || (load_inside_penalty == store_inside_penalty
1611                   && load_outside_penalty > store_outside_penalty))
1612             dr0 = first_store;
1613         }
1614
1615       /* In case there are only loads with different unknown misalignments, use
1616          peeling only if it may help to align other accesses in the loop.  */
1617       if (!first_store
1618           && !STMT_VINFO_SAME_ALIGN_REFS (
1619                   vinfo_for_stmt (DR_STMT (dr0))).length ()
1620           && vect_supportable_dr_alignment (dr0, false)
1621               != dr_unaligned_supported)
1622         do_peeling = false;
1623     }
1624
1625   if (do_peeling && !dr0)
1626     {
1627       /* Peeling is possible, but there is no data access that is not supported
1628          unless aligned. So we try to choose the best possible peeling.  */
1629
1630       /* We should get here only if there are drs with known misalignment.  */
1631       gcc_assert (!all_misalignments_unknown);
1632
1633       /* Choose the best peeling from the hash table.  */
1634       dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1635                                                    &body_cost_vec);
1636       if (!dr0 || !npeel)
1637         do_peeling = false;
1638
1639       /* If peeling by npeel will result in a remaining loop not iterating
1640          enough to be vectorized then do not peel.  */
1641       if (do_peeling
1642           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1643           && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1644               < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + npeel))
1645         do_peeling = false;
1646     }
1647
1648   if (do_peeling)
1649     {
1650       stmt = DR_STMT (dr0);
1651       stmt_info = vinfo_for_stmt (stmt);
1652       vectype = STMT_VINFO_VECTYPE (stmt_info);
1653       nelements = TYPE_VECTOR_SUBPARTS (vectype);
1654
1655       if (known_alignment_for_access_p (dr0))
1656         {
1657           bool negative = tree_int_cst_compare (DR_STEP (dr0),
1658                                                 size_zero_node) < 0;
1659           if (!npeel)
1660             {
1661               /* Since it's known at compile time, compute the number of
1662                  iterations in the peeled loop (the peeling factor) for use in
1663                  updating DR_MISALIGNMENT values.  The peeling factor is the
1664                  vectorization factor minus the misalignment as an element
1665                  count.  */
1666               mis = DR_MISALIGNMENT (dr0);
1667               mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1668               npeel = ((negative ? mis - nelements : nelements - mis)
1669                        & (nelements - 1));
1670             }
1671
1672           /* For interleaved data access every iteration accesses all the
1673              members of the group, therefore we divide the number of iterations
1674              by the group size.  */
1675           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1676           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1677             npeel /= GROUP_SIZE (stmt_info);
1678
1679           if (dump_enabled_p ())
1680             dump_printf_loc (MSG_NOTE, vect_location,
1681                              "Try peeling by %d\n", npeel);
1682         }
1683
1684       /* Ensure that all data refs can be vectorized after the peel.  */
1685       FOR_EACH_VEC_ELT (datarefs, i, dr)
1686         {
1687           int save_misalignment;
1688
1689           if (dr == dr0)
1690             continue;
1691
1692           stmt = DR_STMT (dr);
1693           stmt_info = vinfo_for_stmt (stmt);
1694           /* For interleaving, only the alignment of the first access
1695             matters.  */
1696           if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1697               && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1698             continue;
1699
1700           /* Strided loads perform only component accesses, alignment is
1701              irrelevant for them.  */
1702           if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1703             continue;
1704
1705           save_misalignment = DR_MISALIGNMENT (dr);
1706           vect_update_misalignment_for_peel (dr, dr0, npeel);
1707           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1708           SET_DR_MISALIGNMENT (dr, save_misalignment);
1709
1710           if (!supportable_dr_alignment)
1711             {
1712               do_peeling = false;
1713               break;
1714             }
1715         }
1716
1717       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1718         {
1719           stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1720           if (!stat)
1721             do_peeling = false;
1722           else
1723             {
1724               body_cost_vec.release ();
1725               return stat;
1726             }
1727         }
1728
1729       if (do_peeling)
1730         {
1731           unsigned max_allowed_peel
1732             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1733           if (max_allowed_peel != (unsigned)-1)
1734             {
1735               unsigned max_peel = npeel;
1736               if (max_peel == 0)
1737                 {
1738                   gimple dr_stmt = DR_STMT (dr0);
1739                   stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1740                   tree vtype = STMT_VINFO_VECTYPE (vinfo);
1741                   max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1742                 }
1743               if (max_peel > max_allowed_peel)
1744                 {
1745                   do_peeling = false;
1746                   if (dump_enabled_p ())
1747                     dump_printf_loc (MSG_NOTE, vect_location,
1748                         "Disable peeling, max peels reached: %d\n", max_peel);
1749                 }
1750             }
1751         }
1752
1753       if (do_peeling)
1754         {
1755           stmt_info_for_cost *si;
1756           void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1757
1758           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1759              If the misalignment of DR_i is identical to that of dr0 then set
1760              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
1761              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1762              by the peeling factor times the element size of DR_i (MOD the
1763              vectorization factor times the size).  Otherwise, the
1764              misalignment of DR_i must be set to unknown.  */
1765           FOR_EACH_VEC_ELT (datarefs, i, dr)
1766             if (dr != dr0)
1767               vect_update_misalignment_for_peel (dr, dr0, npeel);
1768
1769           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1770           if (npeel)
1771             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1772           else
1773             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1774               = DR_MISALIGNMENT (dr0);
1775           SET_DR_MISALIGNMENT (dr0, 0);
1776           if (dump_enabled_p ())
1777             {
1778               dump_printf_loc (MSG_NOTE, vect_location,
1779                                "Alignment of access forced using peeling.\n");
1780               dump_printf_loc (MSG_NOTE, vect_location,
1781                                "Peeling for alignment will be applied.\n");
1782             }
1783           /* We've delayed passing the inside-loop peeling costs to the
1784              target cost model until we were sure peeling would happen.
1785              Do so now.  */
1786           if (body_cost_vec.exists ())
1787             {
1788               FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1789                 {
1790                   struct _stmt_vec_info *stmt_info
1791                     = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1792                   (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1793                                         si->misalign, vect_body);
1794                 }
1795               body_cost_vec.release ();
1796             }
1797
1798           stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1799           gcc_assert (stat);
1800           return stat;
1801         }
1802     }
1803
1804   body_cost_vec.release ();
1805
1806   /* (2) Versioning to force alignment.  */
1807
1808   /* Try versioning if:
1809      1) optimize loop for speed
1810      2) there is at least one unsupported misaligned data ref with an unknown
1811         misalignment, and
1812      3) all misaligned data refs with a known misalignment are supported, and
1813      4) the number of runtime alignment checks is within reason.  */
1814
1815   do_versioning =
1816         optimize_loop_nest_for_speed_p (loop)
1817         && (!loop->inner); /* FORNOW */
1818
1819   if (do_versioning)
1820     {
1821       FOR_EACH_VEC_ELT (datarefs, i, dr)
1822         {
1823           stmt = DR_STMT (dr);
1824           stmt_info = vinfo_for_stmt (stmt);
1825
1826           /* For interleaving, only the alignment of the first access
1827              matters.  */
1828           if (aligned_access_p (dr)
1829               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1830                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1831             continue;
1832
1833           /* Strided loads perform only component accesses, alignment is
1834              irrelevant for them.  */
1835           if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1836             continue;
1837
1838           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1839
1840           if (!supportable_dr_alignment)
1841             {
1842               gimple stmt;
1843               int mask;
1844               tree vectype;
1845
1846               if (known_alignment_for_access_p (dr)
1847                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1848                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1849                 {
1850                   do_versioning = false;
1851                   break;
1852                 }
1853
1854               stmt = DR_STMT (dr);
1855               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1856               gcc_assert (vectype);
1857
1858               /* The rightmost bits of an aligned address must be zeros.
1859                  Construct the mask needed for this test.  For example,
1860                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1861                  mask must be 15 = 0xf. */
1862               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1863
1864               /* FORNOW: use the same mask to test all potentially unaligned
1865                  references in the loop.  The vectorizer currently supports
1866                  a single vector size, see the reference to
1867                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1868                  vectorization factor is computed.  */
1869               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1870                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1871               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1872               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1873                       DR_STMT (dr));
1874             }
1875         }
1876
1877       /* Versioning requires at least one misaligned data reference.  */
1878       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1879         do_versioning = false;
1880       else if (!do_versioning)
1881         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1882     }
1883
1884   if (do_versioning)
1885     {
1886       vec<gimple> may_misalign_stmts
1887         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1888       gimple stmt;
1889
1890       /* It can now be assumed that the data references in the statements
1891          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1892          of the loop being vectorized.  */
1893       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1894         {
1895           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1896           dr = STMT_VINFO_DATA_REF (stmt_info);
1897           SET_DR_MISALIGNMENT (dr, 0);
1898           if (dump_enabled_p ())
1899             dump_printf_loc (MSG_NOTE, vect_location,
1900                              "Alignment of access forced using versioning.\n");
1901         }
1902
1903       if (dump_enabled_p ())
1904         dump_printf_loc (MSG_NOTE, vect_location,
1905                          "Versioning for alignment will be applied.\n");
1906
1907       /* Peeling and versioning can't be done together at this time.  */
1908       gcc_assert (! (do_peeling && do_versioning));
1909
1910       stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1911       gcc_assert (stat);
1912       return stat;
1913     }
1914
1915   /* This point is reached if neither peeling nor versioning is being done.  */
1916   gcc_assert (! (do_peeling || do_versioning));
1917
1918   stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1919   return stat;
1920 }
1921
1922
1923 /* Function vect_find_same_alignment_drs.
1924
1925    Update group and alignment relations according to the chosen
1926    vectorization factor.  */
1927
1928 static void
1929 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1930                               loop_vec_info loop_vinfo)
1931 {
1932   unsigned int i;
1933   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1934   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1935   struct data_reference *dra = DDR_A (ddr);
1936   struct data_reference *drb = DDR_B (ddr);
1937   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1938   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1939   int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1940   int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1941   lambda_vector dist_v;
1942   unsigned int loop_depth;
1943
1944   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1945     return;
1946
1947   if (dra == drb)
1948     return;
1949
1950   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1951     return;
1952
1953   /* Loop-based vectorization and known data dependence.  */
1954   if (DDR_NUM_DIST_VECTS (ddr) == 0)
1955     return;
1956
1957   /* Data-dependence analysis reports a distance vector of zero
1958      for data-references that overlap only in the first iteration
1959      but have different sign step (see PR45764).
1960      So as a sanity check require equal DR_STEP.  */
1961   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1962     return;
1963
1964   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1965   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1966     {
1967       int dist = dist_v[loop_depth];
1968
1969       if (dump_enabled_p ())
1970         dump_printf_loc (MSG_NOTE, vect_location,
1971                          "dependence distance  = %d.\n", dist);
1972
1973       /* Same loop iteration.  */
1974       if (dist == 0
1975           || (dist % vectorization_factor == 0 && dra_size == drb_size))
1976         {
1977           /* Two references with distance zero have the same alignment.  */
1978           STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1979           STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1980           if (dump_enabled_p ())
1981             {
1982               dump_printf_loc (MSG_NOTE, vect_location,
1983                                "accesses have the same alignment.\n");
1984               dump_printf (MSG_NOTE,
1985                            "dependence distance modulo vf == 0 between ");
1986               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1987               dump_printf (MSG_NOTE,  " and ");
1988               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1989               dump_printf (MSG_NOTE, "\n");
1990             }
1991         }
1992     }
1993 }
1994
1995
1996 /* Function vect_analyze_data_refs_alignment
1997
1998    Analyze the alignment of the data-references in the loop.
1999    Return FALSE if a data reference is found that cannot be vectorized.  */
2000
2001 bool
2002 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
2003                                   bb_vec_info bb_vinfo)
2004 {
2005   if (dump_enabled_p ())
2006     dump_printf_loc (MSG_NOTE, vect_location,
2007                      "=== vect_analyze_data_refs_alignment ===\n");
2008
2009   /* Mark groups of data references with same alignment using
2010      data dependence information.  */
2011   if (loop_vinfo)
2012     {
2013       vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2014       struct data_dependence_relation *ddr;
2015       unsigned int i;
2016
2017       FOR_EACH_VEC_ELT (ddrs, i, ddr)
2018         vect_find_same_alignment_drs (ddr, loop_vinfo);
2019     }
2020
2021   if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2022     {
2023       if (dump_enabled_p ())
2024         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2025                          "not vectorized: can't calculate alignment "
2026                          "for data ref.\n");
2027       return false;
2028     }
2029
2030   return true;
2031 }
2032
2033
2034 /* Analyze groups of accesses: check that DR belongs to a group of
2035    accesses of legal size, step, etc.  Detect gaps, single element
2036    interleaving, and other special cases. Set grouped access info.
2037    Collect groups of strided stores for further use in SLP analysis.  */
2038
2039 static bool
2040 vect_analyze_group_access (struct data_reference *dr)
2041 {
2042   tree step = DR_STEP (dr);
2043   tree scalar_type = TREE_TYPE (DR_REF (dr));
2044   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2045   gimple stmt = DR_STMT (dr);
2046   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2047   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2048   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2049   HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2050   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2051   bool slp_impossible = false;
2052   struct loop *loop = NULL;
2053
2054   if (loop_vinfo)
2055     loop = LOOP_VINFO_LOOP (loop_vinfo);
2056
2057   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2058      size of the interleaving group (including gaps).  */
2059   groupsize = absu_hwi (dr_step) / type_size;
2060
2061   /* Not consecutive access is possible only if it is a part of interleaving.  */
2062   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2063     {
2064       /* Check if it this DR is a part of interleaving, and is a single
2065          element of the group that is accessed in the loop.  */
2066
2067       /* Gaps are supported only for loads. STEP must be a multiple of the type
2068          size.  The size of the group must be a power of 2.  */
2069       if (DR_IS_READ (dr)
2070           && (dr_step % type_size) == 0
2071           && groupsize > 0
2072           && exact_log2 (groupsize) != -1)
2073         {
2074           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2075           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2076           if (dump_enabled_p ())
2077             {
2078               dump_printf_loc (MSG_NOTE, vect_location,
2079                                "Detected single element interleaving ");
2080               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2081               dump_printf (MSG_NOTE, " step ");
2082               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2083               dump_printf (MSG_NOTE, "\n");
2084             }
2085
2086           if (loop_vinfo)
2087             {
2088               if (dump_enabled_p ())
2089                 dump_printf_loc (MSG_NOTE, vect_location,
2090                                  "Data access with gaps requires scalar "
2091                                  "epilogue loop\n");
2092               if (loop->inner)
2093                 {
2094                   if (dump_enabled_p ())
2095                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2096                                      "Peeling for outer loop is not"
2097                                      " supported\n");
2098                   return false;
2099                 }
2100
2101               LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2102             }
2103
2104           return true;
2105         }
2106
2107       if (dump_enabled_p ())
2108         {
2109           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2110                            "not consecutive access ");
2111           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2112           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2113         }
2114
2115       if (bb_vinfo)
2116         {
2117           /* Mark the statement as unvectorizable.  */
2118           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2119           return true;
2120         }
2121
2122       return false;
2123     }
2124
2125   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2126     {
2127       /* First stmt in the interleaving chain. Check the chain.  */
2128       gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2129       struct data_reference *data_ref = dr;
2130       unsigned int count = 1;
2131       tree prev_init = DR_INIT (data_ref);
2132       gimple prev = stmt;
2133       HOST_WIDE_INT diff, gaps = 0;
2134       unsigned HOST_WIDE_INT count_in_bytes;
2135
2136       while (next)
2137         {
2138           /* Skip same data-refs.  In case that two or more stmts share
2139              data-ref (supported only for loads), we vectorize only the first
2140              stmt, and the rest get their vectorized loads from the first
2141              one.  */
2142           if (!tree_int_cst_compare (DR_INIT (data_ref),
2143                                      DR_INIT (STMT_VINFO_DATA_REF (
2144                                                    vinfo_for_stmt (next)))))
2145             {
2146               if (DR_IS_WRITE (data_ref))
2147                 {
2148                   if (dump_enabled_p ())
2149                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2150                                      "Two store stmts share the same dr.\n");
2151                   return false;
2152                 }
2153
2154               /* For load use the same data-ref load.  */
2155               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2156
2157               prev = next;
2158               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2159               continue;
2160             }
2161
2162           prev = next;
2163           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2164
2165           /* All group members have the same STEP by construction.  */
2166           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2167
2168           /* Check that the distance between two accesses is equal to the type
2169              size. Otherwise, we have gaps.  */
2170           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2171                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2172           if (diff != 1)
2173             {
2174               /* FORNOW: SLP of accesses with gaps is not supported.  */
2175               slp_impossible = true;
2176               if (DR_IS_WRITE (data_ref))
2177                 {
2178                   if (dump_enabled_p ())
2179                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2180                                      "interleaved store with gaps\n");
2181                   return false;
2182                 }
2183
2184               gaps += diff - 1;
2185             }
2186
2187           last_accessed_element += diff;
2188
2189           /* Store the gap from the previous member of the group. If there is no
2190              gap in the access, GROUP_GAP is always 1.  */
2191           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2192
2193           prev_init = DR_INIT (data_ref);
2194           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2195           /* Count the number of data-refs in the chain.  */
2196           count++;
2197         }
2198
2199       /* COUNT is the number of accesses found, we multiply it by the size of
2200          the type to get COUNT_IN_BYTES.  */
2201       count_in_bytes = type_size * count;
2202
2203       /* Check that the size of the interleaving (including gaps) is not
2204          greater than STEP.  */
2205       if (dr_step != 0
2206           && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2207         {
2208           if (dump_enabled_p ())
2209             {
2210               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2211                                "interleaving size is greater than step for ");
2212               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2213                                  DR_REF (dr));
2214               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2215             }
2216           return false;
2217         }
2218
2219       /* Check that the size of the interleaving is equal to STEP for stores,
2220          i.e., that there are no gaps.  */
2221       if (dr_step != 0
2222           && absu_hwi (dr_step) != count_in_bytes)
2223         {
2224           if (DR_IS_READ (dr))
2225             {
2226               slp_impossible = true;
2227               /* There is a gap after the last load in the group. This gap is a
2228                  difference between the groupsize and the number of elements.
2229                  When there is no gap, this difference should be 0.  */
2230               GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2231             }
2232           else
2233             {
2234               if (dump_enabled_p ())
2235                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2236                                  "interleaved store with gaps\n");
2237               return false;
2238             }
2239         }
2240
2241       /* Check that STEP is a multiple of type size.  */
2242       if (dr_step != 0
2243           && (dr_step % type_size) != 0)
2244         {
2245           if (dump_enabled_p ())
2246             {
2247               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2248                                "step is not a multiple of type size: step ");
2249               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2250               dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2251               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2252                                  TYPE_SIZE_UNIT (scalar_type));
2253               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2254             }
2255           return false;
2256         }
2257
2258       if (groupsize == 0)
2259         groupsize = count;
2260
2261       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2262       if (dump_enabled_p ())
2263         dump_printf_loc (MSG_NOTE, vect_location,
2264                          "Detected interleaving of size %d\n", (int)groupsize);
2265
2266       /* SLP: create an SLP data structure for every interleaving group of
2267          stores for further analysis in vect_analyse_slp.  */
2268       if (DR_IS_WRITE (dr) && !slp_impossible)
2269         {
2270           if (loop_vinfo)
2271             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2272           if (bb_vinfo)
2273             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2274         }
2275
2276       /* There is a gap in the end of the group.  */
2277       if (groupsize - last_accessed_element > 0 && loop_vinfo)
2278         {
2279           if (dump_enabled_p ())
2280             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2281                              "Data access with gaps requires scalar "
2282                              "epilogue loop\n");
2283           if (loop->inner)
2284             {
2285               if (dump_enabled_p ())
2286                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2287                                  "Peeling for outer loop is not supported\n");
2288               return false;
2289             }
2290
2291           LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2292         }
2293     }
2294
2295   return true;
2296 }
2297
2298
2299 /* Analyze the access pattern of the data-reference DR.
2300    In case of non-consecutive accesses call vect_analyze_group_access() to
2301    analyze groups of accesses.  */
2302
2303 static bool
2304 vect_analyze_data_ref_access (struct data_reference *dr)
2305 {
2306   tree step = DR_STEP (dr);
2307   tree scalar_type = TREE_TYPE (DR_REF (dr));
2308   gimple stmt = DR_STMT (dr);
2309   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2310   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2311   struct loop *loop = NULL;
2312
2313   if (loop_vinfo)
2314     loop = LOOP_VINFO_LOOP (loop_vinfo);
2315
2316   if (loop_vinfo && !step)
2317     {
2318       if (dump_enabled_p ())
2319         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2320                          "bad data-ref access in loop\n");
2321       return false;
2322     }
2323
2324   /* Allow invariant loads in not nested loops.  */
2325   if (loop_vinfo && integer_zerop (step))
2326     {
2327       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2328       if (nested_in_vect_loop_p (loop, stmt))
2329         {
2330           if (dump_enabled_p ())
2331             dump_printf_loc (MSG_NOTE, vect_location,
2332                              "zero step in inner loop of nest\n");
2333           return false;
2334         }
2335       return DR_IS_READ (dr);
2336     }
2337
2338   if (loop && nested_in_vect_loop_p (loop, stmt))
2339     {
2340       /* Interleaved accesses are not yet supported within outer-loop
2341         vectorization for references in the inner-loop.  */
2342       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2343
2344       /* For the rest of the analysis we use the outer-loop step.  */
2345       step = STMT_VINFO_DR_STEP (stmt_info);
2346       if (integer_zerop (step))
2347         {
2348           if (dump_enabled_p ())
2349             dump_printf_loc (MSG_NOTE, vect_location,
2350                              "zero step in outer loop.\n");
2351           if (DR_IS_READ (dr))
2352             return true;
2353           else
2354             return false;
2355         }
2356     }
2357
2358   /* Consecutive?  */
2359   if (TREE_CODE (step) == INTEGER_CST)
2360     {
2361       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2362       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2363           || (dr_step < 0
2364               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2365         {
2366           /* Mark that it is not interleaving.  */
2367           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2368           return true;
2369         }
2370     }
2371
2372   if (loop && nested_in_vect_loop_p (loop, stmt))
2373     {
2374       if (dump_enabled_p ())
2375         dump_printf_loc (MSG_NOTE, vect_location,
2376                          "grouped access in outer loop.\n");
2377       return false;
2378     }
2379
2380   /* Assume this is a DR handled by non-constant strided load case.  */
2381   if (TREE_CODE (step) != INTEGER_CST)
2382     return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2383
2384   /* Not consecutive access - check if it's a part of interleaving group.  */
2385   return vect_analyze_group_access (dr);
2386 }
2387
2388
2389
2390 /*  A helper function used in the comparator function to sort data
2391     references.  T1 and T2 are two data references to be compared.
2392     The function returns -1, 0, or 1.  */
2393
2394 static int
2395 compare_tree (tree t1, tree t2)
2396 {
2397   int i, cmp;
2398   enum tree_code code;
2399   char tclass;
2400
2401   if (t1 == t2)
2402     return 0;
2403   if (t1 == NULL)
2404     return -1;
2405   if (t2 == NULL)
2406     return 1;
2407
2408
2409   if (TREE_CODE (t1) != TREE_CODE (t2))
2410     return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2411
2412   code = TREE_CODE (t1);
2413   switch (code)
2414     {
2415     /* For const values, we can just use hash values for comparisons.  */
2416     case INTEGER_CST:
2417     case REAL_CST:
2418     case FIXED_CST:
2419     case STRING_CST:
2420     case COMPLEX_CST:
2421     case VECTOR_CST:
2422       {
2423         hashval_t h1 = iterative_hash_expr (t1, 0);
2424         hashval_t h2 = iterative_hash_expr (t2, 0);
2425         if (h1 != h2)
2426           return h1 < h2 ? -1 : 1;
2427         break;
2428       }
2429
2430     case SSA_NAME:
2431       cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2432       if (cmp != 0)
2433         return cmp;
2434
2435       if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2436         return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2437       break;
2438
2439     default:
2440       tclass = TREE_CODE_CLASS (code);
2441
2442       /* For var-decl, we could compare their UIDs.  */
2443       if (tclass == tcc_declaration)
2444         {
2445           if (DECL_UID (t1) != DECL_UID (t2))
2446             return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2447           break;
2448         }
2449
2450       /* For expressions with operands, compare their operands recursively.  */
2451       for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2452         {
2453           cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2454           if (cmp != 0)
2455             return cmp;
2456         }
2457     }
2458
2459   return 0;
2460 }
2461
2462
2463 /* Compare two data-references DRA and DRB to group them into chunks
2464    suitable for grouping.  */
2465
2466 static int
2467 dr_group_sort_cmp (const void *dra_, const void *drb_)
2468 {
2469   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2470   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2471   int cmp;
2472
2473   /* Stabilize sort.  */
2474   if (dra == drb)
2475     return 0;
2476
2477   /* Ordering of DRs according to base.  */
2478   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2479     {
2480       cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2481       if (cmp != 0)
2482         return cmp;
2483     }
2484
2485   /* And according to DR_OFFSET.  */
2486   if (!dr_equal_offsets_p (dra, drb))
2487     {
2488       cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2489       if (cmp != 0)
2490         return cmp;
2491     }
2492
2493   /* Put reads before writes.  */
2494   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2495     return DR_IS_READ (dra) ? -1 : 1;
2496
2497   /* Then sort after access size.  */
2498   if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2499                         TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2500     {
2501       cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2502                           TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2503       if (cmp != 0)
2504         return cmp;
2505     }
2506
2507   /* And after step.  */
2508   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2509     {
2510       cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2511       if (cmp != 0)
2512         return cmp;
2513     }
2514
2515   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2516   cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2517   if (cmp == 0)
2518     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2519   return cmp;
2520 }
2521
2522 /* Function vect_analyze_data_ref_accesses.
2523
2524    Analyze the access pattern of all the data references in the loop.
2525
2526    FORNOW: the only access pattern that is considered vectorizable is a
2527            simple step 1 (consecutive) access.
2528
2529    FORNOW: handle only arrays and pointer accesses.  */
2530
2531 bool
2532 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2533 {
2534   unsigned int i;
2535   vec<data_reference_p> datarefs;
2536   struct data_reference *dr;
2537
2538   if (dump_enabled_p ())
2539     dump_printf_loc (MSG_NOTE, vect_location,
2540                      "=== vect_analyze_data_ref_accesses ===\n");
2541
2542   if (loop_vinfo)
2543     datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2544   else
2545     datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2546
2547   if (datarefs.is_empty ())
2548     return true;
2549
2550   /* Sort the array of datarefs to make building the interleaving chains
2551      linear.  Don't modify the original vector's order, it is needed for
2552      determining what dependencies are reversed.  */
2553   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2554   datarefs_copy.qsort (dr_group_sort_cmp);
2555
2556   /* Build the interleaving chains.  */
2557   for (i = 0; i < datarefs_copy.length () - 1;)
2558     {
2559       data_reference_p dra = datarefs_copy[i];
2560       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2561       stmt_vec_info lastinfo = NULL;
2562       for (i = i + 1; i < datarefs_copy.length (); ++i)
2563         {
2564           data_reference_p drb = datarefs_copy[i];
2565           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2566
2567           /* ???  Imperfect sorting (non-compatible types, non-modulo
2568              accesses, same accesses) can lead to a group to be artificially
2569              split here as we don't just skip over those.  If it really
2570              matters we can push those to a worklist and re-iterate
2571              over them.  The we can just skip ahead to the next DR here.  */
2572
2573           /* Check that the data-refs have same first location (except init)
2574              and they are both either store or load (not load and store,
2575              not masked loads or stores).  */
2576           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2577               || !operand_equal_p (DR_BASE_ADDRESS (dra),
2578                                    DR_BASE_ADDRESS (drb), 0)
2579               || !dr_equal_offsets_p (dra, drb)
2580               || !gimple_assign_single_p (DR_STMT (dra))
2581               || !gimple_assign_single_p (DR_STMT (drb)))
2582             break;
2583
2584           /* Check that the data-refs have the same constant size and step.  */
2585           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2586           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2587           if (!tree_fits_uhwi_p (sza)
2588               || !tree_fits_uhwi_p (szb)
2589               || !tree_int_cst_equal (sza, szb)
2590               || !tree_fits_shwi_p (DR_STEP (dra))
2591               || !tree_fits_shwi_p (DR_STEP (drb))
2592               || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2593             break;
2594
2595           /* Do not place the same access in the interleaving chain twice.  */
2596           if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2597             break;
2598
2599           /* Check the types are compatible.
2600              ???  We don't distinguish this during sorting.  */
2601           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2602                                    TREE_TYPE (DR_REF (drb))))
2603             break;
2604
2605           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2606           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2607           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2608           gcc_assert (init_a < init_b);
2609
2610           /* If init_b == init_a + the size of the type * k, we have an
2611              interleaving, and DRA is accessed before DRB.  */
2612           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2613           if ((init_b - init_a) % type_size_a != 0)
2614             break;
2615
2616           /* The step (if not zero) is greater than the difference between
2617              data-refs' inits.  This splits groups into suitable sizes.  */
2618           HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2619           if (step != 0 && step <= (init_b - init_a))
2620             break;
2621
2622           if (dump_enabled_p ())
2623             {
2624               dump_printf_loc (MSG_NOTE, vect_location,
2625                                "Detected interleaving ");
2626               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2627               dump_printf (MSG_NOTE,  " and ");
2628               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2629               dump_printf (MSG_NOTE, "\n");
2630             }
2631
2632           /* Link the found element into the group list.  */
2633           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2634             {
2635               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2636               lastinfo = stmtinfo_a;
2637             }
2638           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2639           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2640           lastinfo = stmtinfo_b;
2641         }
2642     }
2643
2644   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2645     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2646         && !vect_analyze_data_ref_access (dr))
2647       {
2648         if (dump_enabled_p ())
2649           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2650                            "not vectorized: complicated access pattern.\n");
2651
2652         if (bb_vinfo)
2653           {
2654             /* Mark the statement as not vectorizable.  */
2655             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2656             continue;
2657           }
2658         else
2659           {
2660             datarefs_copy.release ();
2661             return false;
2662           }
2663       }
2664
2665   datarefs_copy.release ();
2666   return true;
2667 }
2668
2669
2670 /* Operator == between two dr_with_seg_len objects.
2671
2672    This equality operator is used to make sure two data refs
2673    are the same one so that we will consider to combine the
2674    aliasing checks of those two pairs of data dependent data
2675    refs.  */
2676
2677 static bool
2678 operator == (const dr_with_seg_len& d1,
2679              const dr_with_seg_len& d2)
2680 {
2681   return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2682                           DR_BASE_ADDRESS (d2.dr), 0)
2683            && compare_tree (d1.offset, d2.offset) == 0
2684            && compare_tree (d1.seg_len, d2.seg_len) == 0;
2685 }
2686
2687 /* Function comp_dr_with_seg_len_pair.
2688
2689    Comparison function for sorting objects of dr_with_seg_len_pair_t
2690    so that we can combine aliasing checks in one scan.  */
2691
2692 static int
2693 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2694 {
2695   const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2696   const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2697
2698   const dr_with_seg_len &p11 = p1->first,
2699                         &p12 = p1->second,
2700                         &p21 = p2->first,
2701                         &p22 = p2->second;
2702
2703   /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2704      if a and c have the same basic address snd step, and b and d have the same
2705      address and step.  Therefore, if any a&c or b&d don't have the same address
2706      and step, we don't care the order of those two pairs after sorting.  */
2707   int comp_res;
2708
2709   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2710                                 DR_BASE_ADDRESS (p21.dr))) != 0)
2711     return comp_res;
2712   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2713                                 DR_BASE_ADDRESS (p22.dr))) != 0)
2714     return comp_res;
2715   if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2716     return comp_res;
2717   if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2718     return comp_res;
2719   if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2720     return comp_res;
2721   if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2722     return comp_res;
2723
2724   return 0;
2725 }
2726
2727 /* Function vect_vfa_segment_size.
2728
2729    Create an expression that computes the size of segment
2730    that will be accessed for a data reference.  The functions takes into
2731    account that realignment loads may access one more vector.
2732
2733    Input:
2734      DR: The data reference.
2735      LENGTH_FACTOR: segment length to consider.
2736
2737    Return an expression whose value is the size of segment which will be
2738    accessed by DR.  */
2739
2740 static tree
2741 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2742 {
2743   tree segment_length;
2744
2745   if (integer_zerop (DR_STEP (dr)))
2746     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2747   else
2748     segment_length = size_binop (MULT_EXPR,
2749                                  fold_convert (sizetype, DR_STEP (dr)),
2750                                  fold_convert (sizetype, length_factor));
2751
2752   if (vect_supportable_dr_alignment (dr, false)
2753         == dr_explicit_realign_optimized)
2754     {
2755       tree vector_size = TYPE_SIZE_UNIT
2756                           (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2757
2758       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2759     }
2760   return segment_length;
2761 }
2762
2763 /* Function vect_prune_runtime_alias_test_list.
2764
2765    Prune a list of ddrs to be tested at run-time by versioning for alias.
2766    Merge several alias checks into one if possible.
2767    Return FALSE if resulting list of ddrs is longer then allowed by
2768    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
2769
2770 bool
2771 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2772 {
2773   vec<ddr_p> may_alias_ddrs =
2774     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2775   vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2776     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2777   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2778   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2779
2780   ddr_p ddr;
2781   unsigned int i;
2782   tree length_factor;
2783
2784   if (dump_enabled_p ())
2785     dump_printf_loc (MSG_NOTE, vect_location,
2786                      "=== vect_prune_runtime_alias_test_list ===\n");
2787
2788   if (may_alias_ddrs.is_empty ())
2789     return true;
2790
2791   /* Basically, for each pair of dependent data refs store_ptr_0
2792      and load_ptr_0, we create an expression:
2793
2794      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2795      || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2796
2797      for aliasing checks.  However, in some cases we can decrease
2798      the number of checks by combining two checks into one.  For
2799      example, suppose we have another pair of data refs store_ptr_0
2800      and load_ptr_1, and if the following condition is satisfied:
2801
2802      load_ptr_0 < load_ptr_1  &&
2803      load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2804
2805      (this condition means, in each iteration of vectorized loop,
2806      the accessed memory of store_ptr_0 cannot be between the memory
2807      of load_ptr_0 and load_ptr_1.)
2808
2809      we then can use only the following expression to finish the
2810      alising checks between store_ptr_0 & load_ptr_0 and
2811      store_ptr_0 & load_ptr_1:
2812
2813      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2814      || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2815
2816      Note that we only consider that load_ptr_0 and load_ptr_1 have the
2817      same basic address.  */
2818
2819   comp_alias_ddrs.create (may_alias_ddrs.length ());
2820
2821   /* First, we collect all data ref pairs for aliasing checks.  */
2822   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2823     {
2824       struct data_reference *dr_a, *dr_b;
2825       gimple dr_group_first_a, dr_group_first_b;
2826       tree segment_length_a, segment_length_b;
2827       gimple stmt_a, stmt_b;
2828
2829       dr_a = DDR_A (ddr);
2830       stmt_a = DR_STMT (DDR_A (ddr));
2831       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2832       if (dr_group_first_a)
2833         {
2834           stmt_a = dr_group_first_a;
2835           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2836         }
2837
2838       dr_b = DDR_B (ddr);
2839       stmt_b = DR_STMT (DDR_B (ddr));
2840       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2841       if (dr_group_first_b)
2842         {
2843           stmt_b = dr_group_first_b;
2844           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2845         }
2846
2847       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2848         length_factor = scalar_loop_iters;
2849       else
2850         length_factor = size_int (vect_factor);
2851       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2852       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2853
2854       dr_with_seg_len_pair_t dr_with_seg_len_pair
2855           (dr_with_seg_len (dr_a, segment_length_a),
2856            dr_with_seg_len (dr_b, segment_length_b));
2857
2858       if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2859         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2860
2861       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2862     }
2863
2864   /* Second, we sort the collected data ref pairs so that we can scan
2865      them once to combine all possible aliasing checks.  */
2866   comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2867
2868   /* Third, we scan the sorted dr pairs and check if we can combine
2869      alias checks of two neighbouring dr pairs.  */
2870   for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2871     {
2872       /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2).  */
2873       dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2874                       *dr_b1 = &comp_alias_ddrs[i-1].second,
2875                       *dr_a2 = &comp_alias_ddrs[i].first,
2876                       *dr_b2 = &comp_alias_ddrs[i].second;
2877
2878       /* Remove duplicate data ref pairs.  */
2879       if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2880         {
2881           if (dump_enabled_p ())
2882             {
2883               dump_printf_loc (MSG_NOTE, vect_location,
2884                                "found equal ranges ");
2885               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2886                                  DR_REF (dr_a1->dr));
2887               dump_printf (MSG_NOTE,  ", ");
2888               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2889                                  DR_REF (dr_b1->dr));
2890               dump_printf (MSG_NOTE,  " and ");
2891               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2892                                  DR_REF (dr_a2->dr));
2893               dump_printf (MSG_NOTE,  ", ");
2894               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2895                                  DR_REF (dr_b2->dr));
2896               dump_printf (MSG_NOTE, "\n");
2897             }
2898
2899           comp_alias_ddrs.ordered_remove (i--);
2900           continue;
2901         }
2902
2903       if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2904         {
2905           /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2906              and DR_A1 and DR_A2 are two consecutive memrefs.  */
2907           if (*dr_a1 == *dr_a2)
2908             {
2909               std::swap (dr_a1, dr_b1);
2910               std::swap (dr_a2, dr_b2);
2911             }
2912
2913           if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2914                                 DR_BASE_ADDRESS (dr_a2->dr),
2915                                 0)
2916               || !tree_fits_shwi_p (dr_a1->offset)
2917               || !tree_fits_shwi_p (dr_a2->offset))
2918             continue;
2919
2920           HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2921                                 - tree_to_shwi (dr_a1->offset));
2922
2923
2924           /* Now we check if the following condition is satisfied:
2925
2926              DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2927
2928              where DIFF = DR_A2->OFFSET - DR_A1->OFFSET.  However,
2929              SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2930              have to make a best estimation.  We can get the minimum value
2931              of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2932              then either of the following two conditions can guarantee the
2933              one above:
2934
2935              1: DIFF <= MIN_SEG_LEN_B
2936              2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2937
2938              */
2939
2940           HOST_WIDE_INT  min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2941                                           ? tree_to_shwi (dr_b1->seg_len)
2942                                           : vect_factor);
2943
2944           if (diff <= min_seg_len_b
2945               || (tree_fits_shwi_p (dr_a1->seg_len)
2946                   && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2947             {
2948               if (dump_enabled_p ())
2949                 {
2950                   dump_printf_loc (MSG_NOTE, vect_location,
2951                                    "merging ranges for ");
2952                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2953                                      DR_REF (dr_a1->dr));
2954                   dump_printf (MSG_NOTE,  ", ");
2955                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2956                                      DR_REF (dr_b1->dr));
2957                   dump_printf (MSG_NOTE,  " and ");
2958                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2959                                      DR_REF (dr_a2->dr));
2960                   dump_printf (MSG_NOTE,  ", ");
2961                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2962                                      DR_REF (dr_b2->dr));
2963                   dump_printf (MSG_NOTE, "\n");
2964                 }
2965
2966               dr_a1->seg_len = size_binop (PLUS_EXPR,
2967                                            dr_a2->seg_len, size_int (diff));
2968               comp_alias_ddrs.ordered_remove (i--);
2969             }
2970         }
2971     }
2972
2973   dump_printf_loc (MSG_NOTE, vect_location,
2974                    "improved number of alias checks from %d to %d\n",
2975                    may_alias_ddrs.length (), comp_alias_ddrs.length ());
2976   if ((int) comp_alias_ddrs.length () >
2977       PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2978     return false;
2979
2980   return true;
2981 }
2982
2983 /* Check whether a non-affine read in stmt is suitable for gather load
2984    and if so, return a builtin decl for that operation.  */
2985
2986 tree
2987 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2988                    tree *offp, int *scalep)
2989 {
2990   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2991   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2992   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2993   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2994   tree offtype = NULL_TREE;
2995   tree decl, base, off;
2996   machine_mode pmode;
2997   int punsignedp, pvolatilep;
2998
2999   base = DR_REF (dr);
3000   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3001      see if we can use the def stmt of the address.  */
3002   if (is_gimple_call (stmt)
3003       && gimple_call_internal_p (stmt)
3004       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3005           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3006       && TREE_CODE (base) == MEM_REF
3007       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3008       && integer_zerop (TREE_OPERAND (base, 1))
3009       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3010     {
3011       gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3012       if (is_gimple_assign (def_stmt)
3013           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3014         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3015     }
3016
3017   /* The gather builtins need address of the form
3018      loop_invariant + vector * {1, 2, 4, 8}
3019      or
3020      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3021      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3022      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3023      multiplications and additions in it.  To get a vector, we need
3024      a single SSA_NAME that will be defined in the loop and will
3025      contain everything that is not loop invariant and that can be
3026      vectorized.  The following code attempts to find such a preexistng
3027      SSA_NAME OFF and put the loop invariants into a tree BASE
3028      that can be gimplified before the loop.  */
3029   base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3030                               &pmode, &punsignedp, &pvolatilep, false);
3031   gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3032
3033   if (TREE_CODE (base) == MEM_REF)
3034     {
3035       if (!integer_zerop (TREE_OPERAND (base, 1)))
3036         {
3037           if (off == NULL_TREE)
3038             {
3039               offset_int moff = mem_ref_offset (base);
3040               off = wide_int_to_tree (sizetype, moff);
3041             }
3042           else
3043             off = size_binop (PLUS_EXPR, off,
3044                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3045         }
3046       base = TREE_OPERAND (base, 0);
3047     }
3048   else
3049     base = build_fold_addr_expr (base);
3050
3051   if (off == NULL_TREE)
3052     off = size_zero_node;
3053
3054   /* If base is not loop invariant, either off is 0, then we start with just
3055      the constant offset in the loop invariant BASE and continue with base
3056      as OFF, otherwise give up.
3057      We could handle that case by gimplifying the addition of base + off
3058      into some SSA_NAME and use that as off, but for now punt.  */
3059   if (!expr_invariant_in_loop_p (loop, base))
3060     {
3061       if (!integer_zerop (off))
3062         return NULL_TREE;
3063       off = base;
3064       base = size_int (pbitpos / BITS_PER_UNIT);
3065     }
3066   /* Otherwise put base + constant offset into the loop invariant BASE
3067      and continue with OFF.  */
3068   else
3069     {
3070       base = fold_convert (sizetype, base);
3071       base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3072     }
3073
3074   /* OFF at this point may be either a SSA_NAME or some tree expression
3075      from get_inner_reference.  Try to peel off loop invariants from it
3076      into BASE as long as possible.  */
3077   STRIP_NOPS (off);
3078   while (offtype == NULL_TREE)
3079     {
3080       enum tree_code code;
3081       tree op0, op1, add = NULL_TREE;
3082
3083       if (TREE_CODE (off) == SSA_NAME)
3084         {
3085           gimple def_stmt = SSA_NAME_DEF_STMT (off);
3086
3087           if (expr_invariant_in_loop_p (loop, off))
3088             return NULL_TREE;
3089
3090           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3091             break;
3092
3093           op0 = gimple_assign_rhs1 (def_stmt);
3094           code = gimple_assign_rhs_code (def_stmt);
3095           op1 = gimple_assign_rhs2 (def_stmt);
3096         }
3097       else
3098         {
3099           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3100             return NULL_TREE;
3101           code = TREE_CODE (off);
3102           extract_ops_from_tree (off, &code, &op0, &op1);
3103         }
3104       switch (code)
3105         {
3106         case POINTER_PLUS_EXPR:
3107         case PLUS_EXPR:
3108           if (expr_invariant_in_loop_p (loop, op0))
3109             {
3110               add = op0;
3111               off = op1;
3112             do_add:
3113               add = fold_convert (sizetype, add);
3114               if (scale != 1)
3115                 add = size_binop (MULT_EXPR, add, size_int (scale));
3116               base = size_binop (PLUS_EXPR, base, add);
3117               continue;
3118             }
3119           if (expr_invariant_in_loop_p (loop, op1))
3120             {
3121               add = op1;
3122               off = op0;
3123               goto do_add;
3124             }
3125           break;
3126         case MINUS_EXPR:
3127           if (expr_invariant_in_loop_p (loop, op1))
3128             {
3129               add = fold_convert (sizetype, op1);
3130               add = size_binop (MINUS_EXPR, size_zero_node, add);
3131               off = op0;
3132               goto do_add;
3133             }
3134           break;
3135         case MULT_EXPR:
3136           if (scale == 1 && tree_fits_shwi_p (op1))
3137             {
3138               scale = tree_to_shwi (op1);
3139               off = op0;
3140               continue;
3141             }
3142           break;
3143         case SSA_NAME:
3144           off = op0;
3145           continue;
3146         CASE_CONVERT:
3147           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3148               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3149             break;
3150           if (TYPE_PRECISION (TREE_TYPE (op0))
3151               == TYPE_PRECISION (TREE_TYPE (off)))
3152             {
3153               off = op0;
3154               continue;
3155             }
3156           if (TYPE_PRECISION (TREE_TYPE (op0))
3157               < TYPE_PRECISION (TREE_TYPE (off)))
3158             {
3159               off = op0;
3160               offtype = TREE_TYPE (off);
3161               STRIP_NOPS (off);
3162               continue;
3163             }
3164           break;
3165         default:
3166           break;
3167         }
3168       break;
3169     }
3170
3171   /* If at the end OFF still isn't a SSA_NAME or isn't
3172      defined in the loop, punt.  */
3173   if (TREE_CODE (off) != SSA_NAME
3174       || expr_invariant_in_loop_p (loop, off))
3175     return NULL_TREE;
3176
3177   if (offtype == NULL_TREE)
3178     offtype = TREE_TYPE (off);
3179
3180   decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3181                                            offtype, scale);
3182   if (decl == NULL_TREE)
3183     return NULL_TREE;
3184
3185   if (basep)
3186     *basep = base;
3187   if (offp)
3188     *offp = off;
3189   if (scalep)
3190     *scalep = scale;
3191   return decl;
3192 }
3193
3194 /* Function vect_analyze_data_refs.
3195
3196   Find all the data references in the loop or basic block.
3197
3198    The general structure of the analysis of data refs in the vectorizer is as
3199    follows:
3200    1- vect_analyze_data_refs(loop/bb): call
3201       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3202       in the loop/bb and their dependences.
3203    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3204    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3205    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3206
3207 */
3208
3209 bool
3210 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3211                         bb_vec_info bb_vinfo,
3212                         int *min_vf, unsigned *n_stmts)
3213 {
3214   struct loop *loop = NULL;
3215   basic_block bb = NULL;
3216   unsigned int i;
3217   vec<data_reference_p> datarefs;
3218   struct data_reference *dr;
3219   tree scalar_type;
3220
3221   if (dump_enabled_p ())
3222     dump_printf_loc (MSG_NOTE, vect_location,
3223                      "=== vect_analyze_data_refs ===\n");
3224
3225   if (loop_vinfo)
3226     {
3227       basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3228
3229       loop = LOOP_VINFO_LOOP (loop_vinfo);
3230       datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3231       if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3232         {
3233           if (dump_enabled_p ())
3234             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3235                              "not vectorized: loop contains function calls"
3236                              " or data references that cannot be analyzed\n");
3237           return false;
3238         }
3239
3240       for (i = 0; i < loop->num_nodes; i++)
3241         {
3242           gimple_stmt_iterator gsi;
3243
3244           for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3245             {
3246               gimple stmt = gsi_stmt (gsi);
3247               if (is_gimple_debug (stmt))
3248                 continue;
3249               ++*n_stmts;
3250               if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3251                 {
3252                   if (is_gimple_call (stmt) && loop->safelen)
3253                     {
3254                       tree fndecl = gimple_call_fndecl (stmt), op;
3255                       if (fndecl != NULL_TREE)
3256                         {
3257                           struct cgraph_node *node = cgraph_node::get (fndecl);
3258                           if (node != NULL && node->simd_clones != NULL)
3259                             {
3260                               unsigned int j, n = gimple_call_num_args (stmt);
3261                               for (j = 0; j < n; j++)
3262                                 {
3263                                   op = gimple_call_arg (stmt, j);
3264                                   if (DECL_P (op)
3265                                       || (REFERENCE_CLASS_P (op)
3266                                           && get_base_address (op)))
3267                                     break;
3268                                 }
3269                               op = gimple_call_lhs (stmt);
3270                               /* Ignore #pragma omp declare simd functions
3271                                  if they don't have data references in the
3272                                  call stmt itself.  */
3273                               if (j == n
3274                                   && !(op
3275                                        && (DECL_P (op)
3276                                            || (REFERENCE_CLASS_P (op)
3277                                                && get_base_address (op)))))
3278                                 continue;
3279                             }
3280                         }
3281                     }
3282                   LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3283                   if (dump_enabled_p ())
3284                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3285                                      "not vectorized: loop contains function "
3286                                      "calls or data references that cannot "
3287                                      "be analyzed\n");
3288                   return false;
3289                 }
3290             }
3291         }
3292
3293       LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3294     }
3295   else
3296     {
3297       gimple_stmt_iterator gsi;
3298
3299       bb = BB_VINFO_BB (bb_vinfo);
3300       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3301         {
3302           gimple stmt = gsi_stmt (gsi);
3303           if (is_gimple_debug (stmt))
3304             continue;
3305           ++*n_stmts;
3306           if (!find_data_references_in_stmt (NULL, stmt,
3307                                              &BB_VINFO_DATAREFS (bb_vinfo)))
3308             {
3309               /* Mark the rest of the basic-block as unvectorizable.  */
3310               for (; !gsi_end_p (gsi); gsi_next (&gsi))
3311                 {
3312                   stmt = gsi_stmt (gsi);
3313                   STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3314                 }
3315               break;
3316             }
3317         }
3318
3319       datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3320     }
3321
3322   /* Go through the data-refs, check that the analysis succeeded.  Update
3323      pointer from stmt_vec_info struct to DR and vectype.  */
3324
3325   FOR_EACH_VEC_ELT (datarefs, i, dr)
3326     {
3327       gimple stmt;
3328       stmt_vec_info stmt_info;
3329       tree base, offset, init;
3330       bool gather = false;
3331       bool simd_lane_access = false;
3332       int vf;
3333
3334 again:
3335       if (!dr || !DR_REF (dr))
3336         {
3337           if (dump_enabled_p ())
3338             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3339                              "not vectorized: unhandled data-ref\n");
3340           return false;
3341         }
3342
3343       stmt = DR_STMT (dr);
3344       stmt_info = vinfo_for_stmt (stmt);
3345
3346       /* Discard clobbers from the dataref vector.  We will remove
3347          clobber stmts during vectorization.  */
3348       if (gimple_clobber_p (stmt))
3349         {
3350           free_data_ref (dr);
3351           if (i == datarefs.length () - 1)
3352             {
3353               datarefs.pop ();
3354               break;
3355             }
3356           datarefs.ordered_remove (i);
3357           dr = datarefs[i];
3358           goto again;
3359         }
3360
3361       /* Check that analysis of the data-ref succeeded.  */
3362       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3363           || !DR_STEP (dr))
3364         {
3365           bool maybe_gather
3366             = DR_IS_READ (dr)
3367               && !TREE_THIS_VOLATILE (DR_REF (dr))
3368               && targetm.vectorize.builtin_gather != NULL;
3369           bool maybe_simd_lane_access
3370             = loop_vinfo && loop->simduid;
3371
3372           /* If target supports vector gather loads, or if this might be
3373              a SIMD lane access, see if they can't be used.  */
3374           if (loop_vinfo
3375               && (maybe_gather || maybe_simd_lane_access)
3376               && !nested_in_vect_loop_p (loop, stmt))
3377             {
3378               struct data_reference *newdr
3379                 = create_data_ref (NULL, loop_containing_stmt (stmt),
3380                                    DR_REF (dr), stmt, true);
3381               gcc_assert (newdr != NULL && DR_REF (newdr));
3382               if (DR_BASE_ADDRESS (newdr)
3383                   && DR_OFFSET (newdr)
3384                   && DR_INIT (newdr)
3385                   && DR_STEP (newdr)
3386                   && integer_zerop (DR_STEP (newdr)))
3387                 {
3388                   if (maybe_simd_lane_access)
3389                     {
3390                       tree off = DR_OFFSET (newdr);
3391                       STRIP_NOPS (off);
3392                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3393                           && TREE_CODE (off) == MULT_EXPR
3394                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3395                         {
3396                           tree step = TREE_OPERAND (off, 1);
3397                           off = TREE_OPERAND (off, 0);
3398                           STRIP_NOPS (off);
3399                           if (CONVERT_EXPR_P (off)
3400                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3401                                                                           0)))
3402                                  < TYPE_PRECISION (TREE_TYPE (off)))
3403                             off = TREE_OPERAND (off, 0);
3404                           if (TREE_CODE (off) == SSA_NAME)
3405                             {
3406                               gimple def = SSA_NAME_DEF_STMT (off);
3407                               tree reft = TREE_TYPE (DR_REF (newdr));
3408                               if (is_gimple_call (def)
3409                                   && gimple_call_internal_p (def)
3410                                   && (gimple_call_internal_fn (def)
3411                                       == IFN_GOMP_SIMD_LANE))
3412                                 {
3413                                   tree arg = gimple_call_arg (def, 0);
3414                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
3415                                   arg = SSA_NAME_VAR (arg);
3416                                   if (arg == loop->simduid
3417                                       /* For now.  */
3418                                       && tree_int_cst_equal
3419                                            (TYPE_SIZE_UNIT (reft),
3420                                             step))
3421                                     {
3422                                       DR_OFFSET (newdr) = ssize_int (0);
3423                                       DR_STEP (newdr) = step;
3424                                       DR_ALIGNED_TO (newdr)
3425                                         = size_int (BIGGEST_ALIGNMENT);
3426                                       dr = newdr;
3427                                       simd_lane_access = true;
3428                                     }
3429                                 }
3430                             }
3431                         }
3432                     }
3433                   if (!simd_lane_access && maybe_gather)
3434                     {
3435                       dr = newdr;
3436                       gather = true;
3437                     }
3438                 }
3439               if (!gather && !simd_lane_access)
3440                 free_data_ref (newdr);
3441             }
3442
3443           if (!gather && !simd_lane_access)
3444             {
3445               if (dump_enabled_p ())
3446                 {
3447                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3448                                    "not vectorized: data ref analysis "
3449                                    "failed ");
3450                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3451                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3452                 }
3453
3454               if (bb_vinfo)
3455                 break;
3456
3457               return false;
3458             }
3459         }
3460
3461       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3462         {
3463           if (dump_enabled_p ())
3464             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3465                              "not vectorized: base addr of dr is a "
3466                              "constant\n");
3467
3468           if (bb_vinfo)
3469             break;
3470
3471           if (gather || simd_lane_access)
3472             free_data_ref (dr);
3473           return false;
3474         }
3475
3476       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3477         {
3478           if (dump_enabled_p ())
3479             {
3480               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3481                                "not vectorized: volatile type ");
3482               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3483               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3484             }
3485
3486           if (bb_vinfo)
3487             break;
3488
3489           return false;
3490         }
3491
3492       if (stmt_can_throw_internal (stmt))
3493         {
3494           if (dump_enabled_p ())
3495             {
3496               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3497                                "not vectorized: statement can throw an "
3498                                "exception ");
3499               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3500               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3501             }
3502
3503           if (bb_vinfo)
3504             break;
3505
3506           if (gather || simd_lane_access)
3507             free_data_ref (dr);
3508           return false;
3509         }
3510
3511       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3512           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3513         {
3514           if (dump_enabled_p ())
3515             {
3516               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3517                                "not vectorized: statement is bitfield "
3518                                "access ");
3519               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3520               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3521             }
3522
3523           if (bb_vinfo)
3524             break;
3525
3526           if (gather || simd_lane_access)
3527             free_data_ref (dr);
3528           return false;
3529         }
3530
3531       base = unshare_expr (DR_BASE_ADDRESS (dr));
3532       offset = unshare_expr (DR_OFFSET (dr));
3533       init = unshare_expr (DR_INIT (dr));
3534
3535       if (is_gimple_call (stmt)
3536           && (!gimple_call_internal_p (stmt)
3537               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3538                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3539         {
3540           if (dump_enabled_p ())
3541             {
3542               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3543                                "not vectorized: dr in a call ");
3544               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3545               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3546             }
3547
3548           if (bb_vinfo)
3549             break;
3550
3551           if (gather || simd_lane_access)
3552             free_data_ref (dr);
3553           return false;
3554         }
3555
3556       /* Update DR field in stmt_vec_info struct.  */
3557
3558       /* If the dataref is in an inner-loop of the loop that is considered for
3559          for vectorization, we also want to analyze the access relative to
3560          the outer-loop (DR contains information only relative to the
3561          inner-most enclosing loop).  We do that by building a reference to the
3562          first location accessed by the inner-loop, and analyze it relative to
3563          the outer-loop.  */
3564       if (loop && nested_in_vect_loop_p (loop, stmt))
3565         {
3566           tree outer_step, outer_base, outer_init;
3567           HOST_WIDE_INT pbitsize, pbitpos;
3568           tree poffset;
3569           machine_mode pmode;
3570           int punsignedp, pvolatilep;
3571           affine_iv base_iv, offset_iv;
3572           tree dinit;
3573
3574           /* Build a reference to the first location accessed by the
3575              inner-loop: *(BASE+INIT).  (The first location is actually
3576              BASE+INIT+OFFSET, but we add OFFSET separately later).  */
3577           tree inner_base = build_fold_indirect_ref
3578                                 (fold_build_pointer_plus (base, init));
3579
3580           if (dump_enabled_p ())
3581             {
3582               dump_printf_loc (MSG_NOTE, vect_location,
3583                                "analyze in outer-loop: ");
3584               dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3585               dump_printf (MSG_NOTE, "\n");
3586             }
3587
3588           outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3589                           &poffset, &pmode, &punsignedp, &pvolatilep, false);
3590           gcc_assert (outer_base != NULL_TREE);
3591
3592           if (pbitpos % BITS_PER_UNIT != 0)
3593             {
3594               if (dump_enabled_p ())
3595                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3596                                  "failed: bit offset alignment.\n");
3597               return false;
3598             }
3599
3600           outer_base = build_fold_addr_expr (outer_base);
3601           if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3602                           &base_iv, false))
3603             {
3604               if (dump_enabled_p ())
3605                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3606                                  "failed: evolution of base is not affine.\n");
3607               return false;
3608             }
3609
3610           if (offset)
3611             {
3612               if (poffset)
3613                 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3614                                        poffset);
3615               else
3616                 poffset = offset;
3617             }
3618
3619           if (!poffset)
3620             {
3621               offset_iv.base = ssize_int (0);
3622               offset_iv.step = ssize_int (0);
3623             }
3624           else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3625                                &offset_iv, false))
3626             {
3627               if (dump_enabled_p ())
3628                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3629                                  "evolution of offset is not affine.\n");
3630               return false;
3631             }
3632
3633           outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3634           split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3635           outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3636           split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3637           outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3638
3639           outer_step = size_binop (PLUS_EXPR,
3640                                 fold_convert (ssizetype, base_iv.step),
3641                                 fold_convert (ssizetype, offset_iv.step));
3642
3643           STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3644           /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3645           STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3646           STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3647           STMT_VINFO_DR_OFFSET (stmt_info) =
3648                                 fold_convert (ssizetype, offset_iv.base);
3649           STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3650                                 size_int (highest_pow2_factor (offset_iv.base));
3651
3652           if (dump_enabled_p ())
3653             {
3654               dump_printf_loc (MSG_NOTE, vect_location,
3655                                "\touter base_address: ");
3656               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3657                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3658               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3659               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3660                                  STMT_VINFO_DR_OFFSET (stmt_info));
3661               dump_printf (MSG_NOTE,
3662                            "\n\touter constant offset from base address: ");
3663               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3664                                  STMT_VINFO_DR_INIT (stmt_info));
3665               dump_printf (MSG_NOTE, "\n\touter step: ");
3666               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3667                                  STMT_VINFO_DR_STEP (stmt_info));
3668               dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3669               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3670                                  STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3671               dump_printf (MSG_NOTE, "\n");
3672             }
3673         }
3674
3675       if (STMT_VINFO_DATA_REF (stmt_info))
3676         {
3677           if (dump_enabled_p ())
3678             {
3679               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3680                                "not vectorized: more than one data ref "
3681                                "in stmt: ");
3682               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3683               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3684             }
3685
3686           if (bb_vinfo)
3687             break;
3688
3689           if (gather || simd_lane_access)
3690             free_data_ref (dr);
3691           return false;
3692         }
3693
3694       STMT_VINFO_DATA_REF (stmt_info) = dr;
3695       if (simd_lane_access)
3696         {
3697           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3698           free_data_ref (datarefs[i]);
3699           datarefs[i] = dr;
3700         }
3701
3702       /* Set vectype for STMT.  */
3703       scalar_type = TREE_TYPE (DR_REF (dr));
3704       STMT_VINFO_VECTYPE (stmt_info)
3705         = get_vectype_for_scalar_type (scalar_type);
3706       if (!STMT_VINFO_VECTYPE (stmt_info))
3707         {
3708           if (dump_enabled_p ())
3709             {
3710               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3711                                "not vectorized: no vectype for stmt: ");
3712               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3713               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3714               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3715                                  scalar_type);
3716               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3717             }
3718
3719           if (bb_vinfo)
3720             break;
3721
3722           if (gather || simd_lane_access)
3723             {
3724               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3725               if (gather)
3726                 free_data_ref (dr);
3727             }
3728           return false;
3729         }
3730       else
3731         {
3732           if (dump_enabled_p ())
3733             {
3734               dump_printf_loc (MSG_NOTE, vect_location,
3735                                "got vectype for stmt: ");
3736               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3737               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3738                                  STMT_VINFO_VECTYPE (stmt_info));
3739               dump_printf (MSG_NOTE, "\n");
3740             }
3741         }
3742
3743       /* Adjust the minimal vectorization factor according to the
3744          vector type.  */
3745       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3746       if (vf > *min_vf)
3747         *min_vf = vf;
3748
3749       if (gather)
3750         {
3751           tree off;
3752
3753           gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3754           if (gather
3755               && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3756             gather = false;
3757           if (!gather)
3758             {
3759               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3760               free_data_ref (dr);
3761               if (dump_enabled_p ())
3762                 {
3763                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3764                                    "not vectorized: not suitable for gather "
3765                                    "load ");
3766                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3767                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3768                 }
3769               return false;
3770             }
3771
3772           datarefs[i] = dr;
3773           STMT_VINFO_GATHER_P (stmt_info) = true;
3774         }
3775       else if (loop_vinfo
3776                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3777         {
3778           if (nested_in_vect_loop_p (loop, stmt)
3779               || !DR_IS_READ (dr))
3780             {
3781               if (dump_enabled_p ())
3782                 {
3783                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3784                                    "not vectorized: not suitable for strided "
3785                                    "load ");
3786                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3787                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3788                 }
3789               return false;
3790             }
3791           STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3792         }
3793     }
3794
3795   /* If we stopped analysis at the first dataref we could not analyze
3796      when trying to vectorize a basic-block mark the rest of the datarefs
3797      as not vectorizable and truncate the vector of datarefs.  That
3798      avoids spending useless time in analyzing their dependence.  */
3799   if (i != datarefs.length ())
3800     {
3801       gcc_assert (bb_vinfo != NULL);
3802       for (unsigned j = i; j < datarefs.length (); ++j)
3803         {
3804           data_reference_p dr = datarefs[j];
3805           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3806           free_data_ref (dr);
3807         }
3808       datarefs.truncate (i);
3809     }
3810
3811   return true;
3812 }
3813
3814
3815 /* Function vect_get_new_vect_var.
3816
3817    Returns a name for a new variable.  The current naming scheme appends the
3818    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3819    the name of vectorizer generated variables, and appends that to NAME if
3820    provided.  */
3821
3822 tree
3823 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3824 {
3825   const char *prefix;
3826   tree new_vect_var;
3827
3828   switch (var_kind)
3829   {
3830   case vect_simple_var:
3831     prefix = "vect";
3832     break;
3833   case vect_scalar_var:
3834     prefix = "stmp";
3835     break;
3836   case vect_pointer_var:
3837     prefix = "vectp";
3838     break;
3839   default:
3840     gcc_unreachable ();
3841   }
3842
3843   if (name)
3844     {
3845       char* tmp = concat (prefix, "_", name, NULL);
3846       new_vect_var = create_tmp_reg (type, tmp);
3847       free (tmp);
3848     }
3849   else
3850     new_vect_var = create_tmp_reg (type, prefix);
3851
3852   return new_vect_var;
3853 }
3854
3855
3856 /* Function vect_create_addr_base_for_vector_ref.
3857
3858    Create an expression that computes the address of the first memory location
3859    that will be accessed for a data reference.
3860
3861    Input:
3862    STMT: The statement containing the data reference.
3863    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3864    OFFSET: Optional. If supplied, it is be added to the initial address.
3865    LOOP:    Specify relative to which loop-nest should the address be computed.
3866             For example, when the dataref is in an inner-loop nested in an
3867             outer-loop that is now being vectorized, LOOP can be either the
3868             outer-loop, or the inner-loop.  The first memory location accessed
3869             by the following dataref ('in' points to short):
3870
3871                 for (i=0; i<N; i++)
3872                    for (j=0; j<M; j++)
3873                      s += in[i+j]
3874
3875             is as follows:
3876             if LOOP=i_loop:     &in             (relative to i_loop)
3877             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
3878    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
3879             initial address.  Unlike OFFSET, which is number of elements to
3880             be added, BYTE_OFFSET is measured in bytes.
3881
3882    Output:
3883    1. Return an SSA_NAME whose value is the address of the memory location of
3884       the first vector of the data reference.
3885    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3886       these statement(s) which define the returned SSA_NAME.
3887
3888    FORNOW: We are only handling array accesses with step 1.  */
3889
3890 tree
3891 vect_create_addr_base_for_vector_ref (gimple stmt,
3892                                       gimple_seq *new_stmt_list,
3893                                       tree offset,
3894                                       struct loop *loop,
3895                                       tree byte_offset)
3896 {
3897   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3898   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3899   tree data_ref_base;
3900   const char *base_name;
3901   tree addr_base;
3902   tree dest;
3903   gimple_seq seq = NULL;
3904   tree base_offset;
3905   tree init;
3906   tree vect_ptr_type;
3907   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3908   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3909
3910   if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3911     {
3912       struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3913
3914       gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3915
3916       data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3917       base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3918       init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3919     }
3920   else
3921     {
3922       data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3923       base_offset = unshare_expr (DR_OFFSET (dr));
3924       init = unshare_expr (DR_INIT (dr));
3925     }
3926
3927   if (loop_vinfo)
3928     base_name = get_name (data_ref_base);
3929   else
3930     {
3931       base_offset = ssize_int (0);
3932       init = ssize_int (0);
3933       base_name = get_name (DR_REF (dr));
3934     }
3935
3936   /* Create base_offset */
3937   base_offset = size_binop (PLUS_EXPR,
3938                             fold_convert (sizetype, base_offset),
3939                             fold_convert (sizetype, init));
3940
3941   if (offset)
3942     {
3943       offset = fold_build2 (MULT_EXPR, sizetype,
3944                             fold_convert (sizetype, offset), step);
3945       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3946                                  base_offset, offset);
3947     }
3948   if (byte_offset)
3949     {
3950       byte_offset = fold_convert (sizetype, byte_offset);
3951       base_offset = fold_build2 (PLUS_EXPR, sizetype,
3952                                  base_offset, byte_offset);
3953     }
3954
3955   /* base + base_offset */
3956   if (loop_vinfo)
3957     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3958   else
3959     {
3960       addr_base = build1 (ADDR_EXPR,
3961                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
3962                           unshare_expr (DR_REF (dr)));
3963     }
3964
3965   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3966   addr_base = fold_convert (vect_ptr_type, addr_base);
3967   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3968   addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3969   gimple_seq_add_seq (new_stmt_list, seq);
3970
3971   if (DR_PTR_INFO (dr)
3972       && TREE_CODE (addr_base) == SSA_NAME)
3973     {
3974       duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3975       unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3976       int misalign = DR_MISALIGNMENT (dr);
3977       if (offset || byte_offset || (misalign == -1))
3978         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3979       else
3980         set_ptr_info_alignment (SSA_NAME_PTR_INFO (addr_base), align, misalign);
3981     }
3982
3983   if (dump_enabled_p ())
3984     {
3985       dump_printf_loc (MSG_NOTE, vect_location, "created ");
3986       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3987       dump_printf (MSG_NOTE, "\n");
3988     }
3989
3990   return addr_base;
3991 }
3992
3993
3994 /* Function vect_create_data_ref_ptr.
3995
3996    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3997    location accessed in the loop by STMT, along with the def-use update
3998    chain to appropriately advance the pointer through the loop iterations.
3999    Also set aliasing information for the pointer.  This pointer is used by
4000    the callers to this function to create a memory reference expression for
4001    vector load/store access.
4002
4003    Input:
4004    1. STMT: a stmt that references memory. Expected to be of the form
4005          GIMPLE_ASSIGN <name, data-ref> or
4006          GIMPLE_ASSIGN <data-ref, name>.
4007    2. AGGR_TYPE: the type of the reference, which should be either a vector
4008         or an array.
4009    3. AT_LOOP: the loop where the vector memref is to be created.
4010    4. OFFSET (optional): an offset to be added to the initial address accessed
4011         by the data-ref in STMT.
4012    5. BSI: location where the new stmts are to be placed if there is no loop
4013    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4014         pointing to the initial address.
4015    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4016         to the initial address accessed by the data-ref in STMT.  This is
4017         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4018         in bytes.
4019
4020    Output:
4021    1. Declare a new ptr to vector_type, and have it point to the base of the
4022       data reference (initial addressed accessed by the data reference).
4023       For example, for vector of type V8HI, the following code is generated:
4024
4025       v8hi *ap;
4026       ap = (v8hi *)initial_address;
4027
4028       if OFFSET is not supplied:
4029          initial_address = &a[init];
4030       if OFFSET is supplied:
4031          initial_address = &a[init + OFFSET];
4032       if BYTE_OFFSET is supplied:
4033          initial_address = &a[init] + BYTE_OFFSET;
4034
4035       Return the initial_address in INITIAL_ADDRESS.
4036
4037    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4038       update the pointer in each iteration of the loop.
4039
4040       Return the increment stmt that updates the pointer in PTR_INCR.
4041
4042    3. Set INV_P to true if the access pattern of the data reference in the
4043       vectorized loop is invariant.  Set it to false otherwise.
4044
4045    4. Return the pointer.  */
4046
4047 tree
4048 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4049                           tree offset, tree *initial_address,
4050                           gimple_stmt_iterator *gsi, gimple *ptr_incr,
4051                           bool only_init, bool *inv_p, tree byte_offset)
4052 {
4053   const char *base_name;
4054   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4055   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4056   struct loop *loop = NULL;
4057   bool nested_in_vect_loop = false;
4058   struct loop *containing_loop = NULL;
4059   tree aggr_ptr_type;
4060   tree aggr_ptr;
4061   tree new_temp;
4062   gimple vec_stmt;
4063   gimple_seq new_stmt_list = NULL;
4064   edge pe = NULL;
4065   basic_block new_bb;
4066   tree aggr_ptr_init;
4067   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4068   tree aptr;
4069   gimple_stmt_iterator incr_gsi;
4070   bool insert_after;
4071   tree indx_before_incr, indx_after_incr;
4072   gimple incr;
4073   tree step;
4074   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4075
4076   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4077               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4078
4079   if (loop_vinfo)
4080     {
4081       loop = LOOP_VINFO_LOOP (loop_vinfo);
4082       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4083       containing_loop = (gimple_bb (stmt))->loop_father;
4084       pe = loop_preheader_edge (loop);
4085     }
4086   else
4087     {
4088       gcc_assert (bb_vinfo);
4089       only_init = true;
4090       *ptr_incr = NULL;
4091     }
4092
4093   /* Check the step (evolution) of the load in LOOP, and record
4094      whether it's invariant.  */
4095   if (nested_in_vect_loop)
4096     step = STMT_VINFO_DR_STEP (stmt_info);
4097   else
4098     step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4099
4100   if (integer_zerop (step))
4101     *inv_p = true;
4102   else
4103     *inv_p = false;
4104
4105   /* Create an expression for the first address accessed by this load
4106      in LOOP.  */
4107   base_name = get_name (DR_BASE_ADDRESS (dr));
4108
4109   if (dump_enabled_p ())
4110     {
4111       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4112       dump_printf_loc (MSG_NOTE, vect_location,
4113                        "create %s-pointer variable to type: ",
4114                        get_tree_code_name (TREE_CODE (aggr_type)));
4115       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4116       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4117         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4118       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4119         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4120       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4121         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4122       else
4123         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4124       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4125       dump_printf (MSG_NOTE, "\n");
4126     }
4127
4128   /* (1) Create the new aggregate-pointer variable.
4129      Vector and array types inherit the alias set of their component
4130      type by default so we need to use a ref-all pointer if the data
4131      reference does not conflict with the created aggregated data
4132      reference because it is not addressable.  */
4133   bool need_ref_all = false;
4134   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4135                               get_alias_set (DR_REF (dr))))
4136     need_ref_all = true;
4137   /* Likewise for any of the data references in the stmt group.  */
4138   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4139     {
4140       gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4141       do
4142         {
4143           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4144           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4145           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4146                                       get_alias_set (DR_REF (sdr))))
4147             {
4148               need_ref_all = true;
4149               break;
4150             }
4151           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4152         }
4153       while (orig_stmt);
4154     }
4155   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4156                                                need_ref_all);
4157   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4158
4159
4160   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4161      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4162      def-use update cycles for the pointer: one relative to the outer-loop
4163      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4164      to the inner-loop (which is the inner-most loop containing the dataref),
4165      and this is done be step (5) below.
4166
4167      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4168      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4169      redundant.  Steps (3),(4) create the following:
4170
4171         vp0 = &base_addr;
4172         LOOP:   vp1 = phi(vp0,vp2)
4173                 ...
4174                 ...
4175                 vp2 = vp1 + step
4176                 goto LOOP
4177
4178      If there is an inner-loop nested in loop, then step (5) will also be
4179      applied, and an additional update in the inner-loop will be created:
4180
4181         vp0 = &base_addr;
4182         LOOP:   vp1 = phi(vp0,vp2)
4183                 ...
4184         inner:     vp3 = phi(vp1,vp4)
4185                    vp4 = vp3 + inner_step
4186                    if () goto inner
4187                 ...
4188                 vp2 = vp1 + step
4189                 if () goto LOOP   */
4190
4191   /* (2) Calculate the initial address of the aggregate-pointer, and set
4192      the aggregate-pointer to point to it before the loop.  */
4193
4194   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4195
4196   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4197                                                    offset, loop, byte_offset);
4198   if (new_stmt_list)
4199     {
4200       if (pe)
4201         {
4202           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4203           gcc_assert (!new_bb);
4204         }
4205       else
4206         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4207     }
4208
4209   *initial_address = new_temp;
4210
4211   /* Create: p = (aggr_type *) initial_base  */
4212   if (TREE_CODE (new_temp) != SSA_NAME
4213       || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4214     {
4215       vec_stmt = gimple_build_assign (aggr_ptr,
4216                                       fold_convert (aggr_ptr_type, new_temp));
4217       aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4218       /* Copy the points-to information if it exists. */
4219       if (DR_PTR_INFO (dr))
4220         duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
4221       gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4222       if (pe)
4223         {
4224           new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4225           gcc_assert (!new_bb);
4226         }
4227       else
4228         gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4229     }
4230   else
4231     aggr_ptr_init = new_temp;
4232
4233   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4234      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4235      inner-loop nested in LOOP (during outer-loop vectorization).  */
4236
4237   /* No update in loop is required.  */
4238   if (only_init && (!loop_vinfo || at_loop == loop))
4239     aptr = aggr_ptr_init;
4240   else
4241     {
4242       /* The step of the aggregate pointer is the type size.  */
4243       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4244       /* One exception to the above is when the scalar step of the load in
4245          LOOP is zero. In this case the step here is also zero.  */
4246       if (*inv_p)
4247         iv_step = size_zero_node;
4248       else if (tree_int_cst_sgn (step) == -1)
4249         iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4250
4251       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4252
4253       create_iv (aggr_ptr_init,
4254                  fold_convert (aggr_ptr_type, iv_step),
4255                  aggr_ptr, loop, &incr_gsi, insert_after,
4256                  &indx_before_incr, &indx_after_incr);
4257       incr = gsi_stmt (incr_gsi);
4258       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4259
4260       /* Copy the points-to information if it exists. */
4261       if (DR_PTR_INFO (dr))
4262         {
4263           duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4264           duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4265         }
4266       if (ptr_incr)
4267         *ptr_incr = incr;
4268
4269       aptr = indx_before_incr;
4270     }
4271
4272   if (!nested_in_vect_loop || only_init)
4273     return aptr;
4274
4275
4276   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4277      nested in LOOP, if exists.  */
4278
4279   gcc_assert (nested_in_vect_loop);
4280   if (!only_init)
4281     {
4282       standard_iv_increment_position (containing_loop, &incr_gsi,
4283                                       &insert_after);
4284       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4285                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4286                  &indx_after_incr);
4287       incr = gsi_stmt (incr_gsi);
4288       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4289
4290       /* Copy the points-to information if it exists. */
4291       if (DR_PTR_INFO (dr))
4292         {
4293           duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4294           duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4295         }
4296       if (ptr_incr)
4297         *ptr_incr = incr;
4298
4299       return indx_before_incr;
4300     }
4301   else
4302     gcc_unreachable ();
4303 }
4304
4305
4306 /* Function bump_vector_ptr
4307
4308    Increment a pointer (to a vector type) by vector-size. If requested,
4309    i.e. if PTR-INCR is given, then also connect the new increment stmt
4310    to the existing def-use update-chain of the pointer, by modifying
4311    the PTR_INCR as illustrated below:
4312
4313    The pointer def-use update-chain before this function:
4314                         DATAREF_PTR = phi (p_0, p_2)
4315                         ....
4316         PTR_INCR:       p_2 = DATAREF_PTR + step
4317
4318    The pointer def-use update-chain after this function:
4319                         DATAREF_PTR = phi (p_0, p_2)
4320                         ....
4321                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4322                         ....
4323         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4324
4325    Input:
4326    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4327                  in the loop.
4328    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4329               the loop.  The increment amount across iterations is expected
4330               to be vector_size.
4331    BSI - location where the new update stmt is to be placed.
4332    STMT - the original scalar memory-access stmt that is being vectorized.
4333    BUMP - optional. The offset by which to bump the pointer. If not given,
4334           the offset is assumed to be vector_size.
4335
4336    Output: Return NEW_DATAREF_PTR as illustrated above.
4337
4338 */
4339
4340 tree
4341 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4342                  gimple stmt, tree bump)
4343 {
4344   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4345   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4346   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4347   tree update = TYPE_SIZE_UNIT (vectype);
4348   gassign *incr_stmt;
4349   ssa_op_iter iter;
4350   use_operand_p use_p;
4351   tree new_dataref_ptr;
4352
4353   if (bump)
4354     update = bump;
4355
4356   new_dataref_ptr = copy_ssa_name (dataref_ptr);
4357   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4358                                    dataref_ptr, update);
4359   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4360
4361   /* Copy the points-to information if it exists. */
4362   if (DR_PTR_INFO (dr))
4363     {
4364       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4365       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4366     }
4367
4368   if (!ptr_incr)
4369     return new_dataref_ptr;
4370
4371   /* Update the vector-pointer's cross-iteration increment.  */
4372   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4373     {
4374       tree use = USE_FROM_PTR (use_p);
4375
4376       if (use == dataref_ptr)
4377         SET_USE (use_p, new_dataref_ptr);
4378       else
4379         gcc_assert (tree_int_cst_compare (use, update) == 0);
4380     }
4381
4382   return new_dataref_ptr;
4383 }
4384
4385
4386 /* Function vect_create_destination_var.
4387
4388    Create a new temporary of type VECTYPE.  */
4389
4390 tree
4391 vect_create_destination_var (tree scalar_dest, tree vectype)
4392 {
4393   tree vec_dest;
4394   const char *name;
4395   char *new_name;
4396   tree type;
4397   enum vect_var_kind kind;
4398
4399   kind = vectype ? vect_simple_var : vect_scalar_var;
4400   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4401
4402   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4403
4404   name = get_name (scalar_dest);
4405   if (name)
4406     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4407   else
4408     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4409   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4410   free (new_name);
4411
4412   return vec_dest;
4413 }
4414
4415 /* Function vect_grouped_store_supported.
4416
4417    Returns TRUE if interleave high and interleave low permutations
4418    are supported, and FALSE otherwise.  */
4419
4420 bool
4421 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4422 {
4423   machine_mode mode = TYPE_MODE (vectype);
4424
4425   /* vect_permute_store_chain requires the group size to be equal to 3 or
4426      be a power of two.  */
4427   if (count != 3 && exact_log2 (count) == -1)
4428     {
4429       if (dump_enabled_p ())
4430         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4431                          "the size of the group of accesses"
4432                          " is not a power of 2 or not eqaul to 3\n");
4433       return false;
4434     }
4435
4436   /* Check that the permutation is supported.  */
4437   if (VECTOR_MODE_P (mode))
4438     {
4439       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4440       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4441
4442       if (count == 3)
4443         {
4444           unsigned int j0 = 0, j1 = 0, j2 = 0;
4445           unsigned int i, j;
4446
4447           for (j = 0; j < 3; j++)
4448             {
4449               int nelt0 = ((3 - j) * nelt) % 3;
4450               int nelt1 = ((3 - j) * nelt + 1) % 3;
4451               int nelt2 = ((3 - j) * nelt + 2) % 3;
4452               for (i = 0; i < nelt; i++)
4453                 {
4454                   if (3 * i + nelt0 < nelt)
4455                     sel[3 * i + nelt0] = j0++;
4456                   if (3 * i + nelt1 < nelt)
4457                     sel[3 * i + nelt1] = nelt + j1++;
4458                   if (3 * i + nelt2 < nelt)
4459                     sel[3 * i + nelt2] = 0;
4460                 }
4461               if (!can_vec_perm_p (mode, false, sel))
4462                 {
4463                   if (dump_enabled_p ())
4464                     dump_printf (MSG_MISSED_OPTIMIZATION,
4465                                  "permutaion op not supported by target.\n");
4466                   return false;
4467                 }
4468
4469               for (i = 0; i < nelt; i++)
4470                 {
4471                   if (3 * i + nelt0 < nelt)
4472                     sel[3 * i + nelt0] = 3 * i + nelt0;
4473                   if (3 * i + nelt1 < nelt)
4474                     sel[3 * i + nelt1] = 3 * i + nelt1;
4475                   if (3 * i + nelt2 < nelt)
4476                     sel[3 * i + nelt2] = nelt + j2++;
4477                 }
4478               if (!can_vec_perm_p (mode, false, sel))
4479                 {
4480                   if (dump_enabled_p ())
4481                     dump_printf (MSG_MISSED_OPTIMIZATION,
4482                                  "permutaion op not supported by target.\n");
4483                   return false;
4484                 }
4485             }
4486           return true;
4487         }
4488       else
4489         {
4490           /* If length is not equal to 3 then only power of 2 is supported.  */
4491           gcc_assert (exact_log2 (count) != -1);
4492
4493           for (i = 0; i < nelt / 2; i++)
4494             {
4495               sel[i * 2] = i;
4496               sel[i * 2 + 1] = i + nelt;
4497             }
4498             if (can_vec_perm_p (mode, false, sel))
4499               {
4500                 for (i = 0; i < nelt; i++)
4501                   sel[i] += nelt / 2;
4502                 if (can_vec_perm_p (mode, false, sel))
4503                   return true;
4504               }
4505         }
4506     }
4507
4508   if (dump_enabled_p ())
4509     dump_printf (MSG_MISSED_OPTIMIZATION,
4510                  "permutaion op not supported by target.\n");
4511   return false;
4512 }
4513
4514
4515 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4516    type VECTYPE.  */
4517
4518 bool
4519 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4520 {
4521   return vect_lanes_optab_supported_p ("vec_store_lanes",
4522                                        vec_store_lanes_optab,
4523                                        vectype, count);
4524 }
4525
4526
4527 /* Function vect_permute_store_chain.
4528
4529    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4530    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4531    the data correctly for the stores.  Return the final references for stores
4532    in RESULT_CHAIN.
4533
4534    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4535    The input is 4 vectors each containing 8 elements.  We assign a number to
4536    each element, the input sequence is:
4537
4538    1st vec:   0  1  2  3  4  5  6  7
4539    2nd vec:   8  9 10 11 12 13 14 15
4540    3rd vec:  16 17 18 19 20 21 22 23
4541    4th vec:  24 25 26 27 28 29 30 31
4542
4543    The output sequence should be:
4544
4545    1st vec:  0  8 16 24  1  9 17 25
4546    2nd vec:  2 10 18 26  3 11 19 27
4547    3rd vec:  4 12 20 28  5 13 21 30
4548    4th vec:  6 14 22 30  7 15 23 31
4549
4550    i.e., we interleave the contents of the four vectors in their order.
4551
4552    We use interleave_high/low instructions to create such output.  The input of
4553    each interleave_high/low operation is two vectors:
4554    1st vec    2nd vec
4555    0 1 2 3    4 5 6 7
4556    the even elements of the result vector are obtained left-to-right from the
4557    high/low elements of the first vector.  The odd elements of the result are
4558    obtained left-to-right from the high/low elements of the second vector.
4559    The output of interleave_high will be:   0 4 1 5
4560    and of interleave_low:                   2 6 3 7
4561
4562
4563    The permutation is done in log LENGTH stages.  In each stage interleave_high
4564    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4565    where the first argument is taken from the first half of DR_CHAIN and the
4566    second argument from it's second half.
4567    In our example,
4568
4569    I1: interleave_high (1st vec, 3rd vec)
4570    I2: interleave_low (1st vec, 3rd vec)
4571    I3: interleave_high (2nd vec, 4th vec)
4572    I4: interleave_low (2nd vec, 4th vec)
4573
4574    The output for the first stage is:
4575
4576    I1:  0 16  1 17  2 18  3 19
4577    I2:  4 20  5 21  6 22  7 23
4578    I3:  8 24  9 25 10 26 11 27
4579    I4: 12 28 13 29 14 30 15 31
4580
4581    The output of the second stage, i.e. the final result is:
4582
4583    I1:  0  8 16 24  1  9 17 25
4584    I2:  2 10 18 26  3 11 19 27
4585    I3:  4 12 20 28  5 13 21 30
4586    I4:  6 14 22 30  7 15 23 31.  */
4587
4588 void
4589 vect_permute_store_chain (vec<tree> dr_chain,
4590                           unsigned int length,
4591                           gimple stmt,
4592                           gimple_stmt_iterator *gsi,
4593                           vec<tree> *result_chain)
4594 {
4595   tree vect1, vect2, high, low;
4596   gimple perm_stmt;
4597   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4598   tree perm_mask_low, perm_mask_high;
4599   tree data_ref;
4600   tree perm3_mask_low, perm3_mask_high;
4601   unsigned int i, n, log_length = exact_log2 (length);
4602   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4603   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4604
4605   result_chain->quick_grow (length);
4606   memcpy (result_chain->address (), dr_chain.address (),
4607           length * sizeof (tree));
4608
4609   if (length == 3)
4610     {
4611       unsigned int j0 = 0, j1 = 0, j2 = 0;
4612
4613       for (j = 0; j < 3; j++)
4614         {
4615           int nelt0 = ((3 - j) * nelt) % 3;
4616           int nelt1 = ((3 - j) * nelt + 1) % 3;
4617           int nelt2 = ((3 - j) * nelt + 2) % 3;
4618
4619           for (i = 0; i < nelt; i++)
4620             {
4621               if (3 * i + nelt0 < nelt)
4622                 sel[3 * i + nelt0] = j0++;
4623               if (3 * i + nelt1 < nelt)
4624                 sel[3 * i + nelt1] = nelt + j1++;
4625               if (3 * i + nelt2 < nelt)
4626                 sel[3 * i + nelt2] = 0;
4627             }
4628           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4629
4630           for (i = 0; i < nelt; i++)
4631             {
4632               if (3 * i + nelt0 < nelt)
4633                 sel[3 * i + nelt0] = 3 * i + nelt0;
4634               if (3 * i + nelt1 < nelt)
4635                 sel[3 * i + nelt1] = 3 * i + nelt1;
4636               if (3 * i + nelt2 < nelt)
4637                 sel[3 * i + nelt2] = nelt + j2++;
4638             }
4639           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4640
4641           vect1 = dr_chain[0];
4642           vect2 = dr_chain[1];
4643
4644           /* Create interleaving stmt:
4645              low = VEC_PERM_EXPR <vect1, vect2,
4646                                   {j, nelt, *, j + 1, nelt + j + 1, *,
4647                                    j + 2, nelt + j + 2, *, ...}>  */
4648           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4649           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4650                                            vect2, perm3_mask_low);
4651           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4652
4653           vect1 = data_ref;
4654           vect2 = dr_chain[2];
4655           /* Create interleaving stmt:
4656              low = VEC_PERM_EXPR <vect1, vect2,
4657                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
4658                                    6, 7, nelt + j + 2, ...}>  */
4659           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4660           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4661                                            vect2, perm3_mask_high);
4662           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4663           (*result_chain)[j] = data_ref;
4664         }
4665     }
4666   else
4667     {
4668       /* If length is not equal to 3 then only power of 2 is supported.  */
4669       gcc_assert (exact_log2 (length) != -1);
4670
4671       for (i = 0, n = nelt / 2; i < n; i++)
4672         {
4673           sel[i * 2] = i;
4674           sel[i * 2 + 1] = i + nelt;
4675         }
4676         perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4677
4678         for (i = 0; i < nelt; i++)
4679           sel[i] += nelt / 2;
4680         perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4681
4682         for (i = 0, n = log_length; i < n; i++)
4683           {
4684             for (j = 0; j < length/2; j++)
4685               {
4686                 vect1 = dr_chain[j];
4687                 vect2 = dr_chain[j+length/2];
4688
4689                 /* Create interleaving stmt:
4690                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4691                                                         ...}>  */
4692                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4693                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4694                                                  vect2, perm_mask_high);
4695                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4696                 (*result_chain)[2*j] = high;
4697
4698                 /* Create interleaving stmt:
4699                    low = VEC_PERM_EXPR <vect1, vect2,
4700                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4701                                          ...}>  */
4702                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4703                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4704                                                  vect2, perm_mask_low);
4705                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4706                 (*result_chain)[2*j+1] = low;
4707               }
4708             memcpy (dr_chain.address (), result_chain->address (),
4709                     length * sizeof (tree));
4710           }
4711     }
4712 }
4713
4714 /* Function vect_setup_realignment
4715
4716    This function is called when vectorizing an unaligned load using
4717    the dr_explicit_realign[_optimized] scheme.
4718    This function generates the following code at the loop prolog:
4719
4720       p = initial_addr;
4721    x  msq_init = *(floor(p));   # prolog load
4722       realignment_token = call target_builtin;
4723     loop:
4724    x  msq = phi (msq_init, ---)
4725
4726    The stmts marked with x are generated only for the case of
4727    dr_explicit_realign_optimized.
4728
4729    The code above sets up a new (vector) pointer, pointing to the first
4730    location accessed by STMT, and a "floor-aligned" load using that pointer.
4731    It also generates code to compute the "realignment-token" (if the relevant
4732    target hook was defined), and creates a phi-node at the loop-header bb
4733    whose arguments are the result of the prolog-load (created by this
4734    function) and the result of a load that takes place in the loop (to be
4735    created by the caller to this function).
4736
4737    For the case of dr_explicit_realign_optimized:
4738    The caller to this function uses the phi-result (msq) to create the
4739    realignment code inside the loop, and sets up the missing phi argument,
4740    as follows:
4741     loop:
4742       msq = phi (msq_init, lsq)
4743       lsq = *(floor(p'));        # load in loop
4744       result = realign_load (msq, lsq, realignment_token);
4745
4746    For the case of dr_explicit_realign:
4747     loop:
4748       msq = *(floor(p));        # load in loop
4749       p' = p + (VS-1);
4750       lsq = *(floor(p'));       # load in loop
4751       result = realign_load (msq, lsq, realignment_token);
4752
4753    Input:
4754    STMT - (scalar) load stmt to be vectorized. This load accesses
4755           a memory location that may be unaligned.
4756    BSI - place where new code is to be inserted.
4757    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4758                               is used.
4759
4760    Output:
4761    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4762                        target hook, if defined.
4763    Return value - the result of the loop-header phi node.  */
4764
4765 tree
4766 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4767                         tree *realignment_token,
4768                         enum dr_alignment_support alignment_support_scheme,
4769                         tree init_addr,
4770                         struct loop **at_loop)
4771 {
4772   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4773   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4774   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4775   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4776   struct loop *loop = NULL;
4777   edge pe = NULL;
4778   tree scalar_dest = gimple_assign_lhs (stmt);
4779   tree vec_dest;
4780   gimple inc;
4781   tree ptr;
4782   tree data_ref;
4783   basic_block new_bb;
4784   tree msq_init = NULL_TREE;
4785   tree new_temp;
4786   gphi *phi_stmt;
4787   tree msq = NULL_TREE;
4788   gimple_seq stmts = NULL;
4789   bool inv_p;
4790   bool compute_in_loop = false;
4791   bool nested_in_vect_loop = false;
4792   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4793   struct loop *loop_for_initial_load = NULL;
4794
4795   if (loop_vinfo)
4796     {
4797       loop = LOOP_VINFO_LOOP (loop_vinfo);
4798       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4799     }
4800
4801   gcc_assert (alignment_support_scheme == dr_explicit_realign
4802               || alignment_support_scheme == dr_explicit_realign_optimized);
4803
4804   /* We need to generate three things:
4805      1. the misalignment computation
4806      2. the extra vector load (for the optimized realignment scheme).
4807      3. the phi node for the two vectors from which the realignment is
4808       done (for the optimized realignment scheme).  */
4809
4810   /* 1. Determine where to generate the misalignment computation.
4811
4812      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4813      calculation will be generated by this function, outside the loop (in the
4814      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4815      caller, inside the loop.
4816
4817      Background: If the misalignment remains fixed throughout the iterations of
4818      the loop, then both realignment schemes are applicable, and also the
4819      misalignment computation can be done outside LOOP.  This is because we are
4820      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4821      are a multiple of VS (the Vector Size), and therefore the misalignment in
4822      different vectorized LOOP iterations is always the same.
4823      The problem arises only if the memory access is in an inner-loop nested
4824      inside LOOP, which is now being vectorized using outer-loop vectorization.
4825      This is the only case when the misalignment of the memory access may not
4826      remain fixed throughout the iterations of the inner-loop (as explained in
4827      detail in vect_supportable_dr_alignment).  In this case, not only is the
4828      optimized realignment scheme not applicable, but also the misalignment
4829      computation (and generation of the realignment token that is passed to
4830      REALIGN_LOAD) have to be done inside the loop.
4831
4832      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4833      or not, which in turn determines if the misalignment is computed inside
4834      the inner-loop, or outside LOOP.  */
4835
4836   if (init_addr != NULL_TREE || !loop_vinfo)
4837     {
4838       compute_in_loop = true;
4839       gcc_assert (alignment_support_scheme == dr_explicit_realign);
4840     }
4841
4842
4843   /* 2. Determine where to generate the extra vector load.
4844
4845      For the optimized realignment scheme, instead of generating two vector
4846      loads in each iteration, we generate a single extra vector load in the
4847      preheader of the loop, and in each iteration reuse the result of the
4848      vector load from the previous iteration.  In case the memory access is in
4849      an inner-loop nested inside LOOP, which is now being vectorized using
4850      outer-loop vectorization, we need to determine whether this initial vector
4851      load should be generated at the preheader of the inner-loop, or can be
4852      generated at the preheader of LOOP.  If the memory access has no evolution
4853      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4854      to be generated inside LOOP (in the preheader of the inner-loop).  */
4855
4856   if (nested_in_vect_loop)
4857     {
4858       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4859       bool invariant_in_outerloop =
4860             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4861       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4862     }
4863   else
4864     loop_for_initial_load = loop;
4865   if (at_loop)
4866     *at_loop = loop_for_initial_load;
4867
4868   if (loop_for_initial_load)
4869     pe = loop_preheader_edge (loop_for_initial_load);
4870
4871   /* 3. For the case of the optimized realignment, create the first vector
4872       load at the loop preheader.  */
4873
4874   if (alignment_support_scheme == dr_explicit_realign_optimized)
4875     {
4876       /* Create msq_init = *(floor(p1)) in the loop preheader  */
4877       gassign *new_stmt;
4878
4879       gcc_assert (!compute_in_loop);
4880       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4881       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4882                                       NULL_TREE, &init_addr, NULL, &inc,
4883                                       true, &inv_p);
4884       new_temp = copy_ssa_name (ptr);
4885       new_stmt = gimple_build_assign
4886                    (new_temp, BIT_AND_EXPR, ptr,
4887                     build_int_cst (TREE_TYPE (ptr),
4888                                    -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4889       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4890       gcc_assert (!new_bb);
4891       data_ref
4892         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4893                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4894       new_stmt = gimple_build_assign (vec_dest, data_ref);
4895       new_temp = make_ssa_name (vec_dest, new_stmt);
4896       gimple_assign_set_lhs (new_stmt, new_temp);
4897       if (pe)
4898         {
4899           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4900           gcc_assert (!new_bb);
4901         }
4902       else
4903          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4904
4905       msq_init = gimple_assign_lhs (new_stmt);
4906     }
4907
4908   /* 4. Create realignment token using a target builtin, if available.
4909       It is done either inside the containing loop, or before LOOP (as
4910       determined above).  */
4911
4912   if (targetm.vectorize.builtin_mask_for_load)
4913     {
4914       gcall *new_stmt;
4915       tree builtin_decl;
4916
4917       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
4918       if (!init_addr)
4919         {
4920           /* Generate the INIT_ADDR computation outside LOOP.  */
4921           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4922                                                         NULL_TREE, loop);
4923           if (loop)
4924             {
4925               pe = loop_preheader_edge (loop);
4926               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4927               gcc_assert (!new_bb);
4928             }
4929           else
4930              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4931         }
4932
4933       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4934       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4935       vec_dest =
4936         vect_create_destination_var (scalar_dest,
4937                                      gimple_call_return_type (new_stmt));
4938       new_temp = make_ssa_name (vec_dest, new_stmt);
4939       gimple_call_set_lhs (new_stmt, new_temp);
4940
4941       if (compute_in_loop)
4942         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4943       else
4944         {
4945           /* Generate the misalignment computation outside LOOP.  */
4946           pe = loop_preheader_edge (loop);
4947           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4948           gcc_assert (!new_bb);
4949         }
4950
4951       *realignment_token = gimple_call_lhs (new_stmt);
4952
4953       /* The result of the CALL_EXPR to this builtin is determined from
4954          the value of the parameter and no global variables are touched
4955          which makes the builtin a "const" function.  Requiring the
4956          builtin to have the "const" attribute makes it unnecessary
4957          to call mark_call_clobbered.  */
4958       gcc_assert (TREE_READONLY (builtin_decl));
4959     }
4960
4961   if (alignment_support_scheme == dr_explicit_realign)
4962     return msq;
4963
4964   gcc_assert (!compute_in_loop);
4965   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4966
4967
4968   /* 5. Create msq = phi <msq_init, lsq> in loop  */
4969
4970   pe = loop_preheader_edge (containing_loop);
4971   vec_dest = vect_create_destination_var (scalar_dest, vectype);
4972   msq = make_ssa_name (vec_dest);
4973   phi_stmt = create_phi_node (msq, containing_loop->header);
4974   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4975
4976   return msq;
4977 }
4978
4979
4980 /* Function vect_grouped_load_supported.
4981
4982    Returns TRUE if even and odd permutations are supported,
4983    and FALSE otherwise.  */
4984
4985 bool
4986 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4987 {
4988   machine_mode mode = TYPE_MODE (vectype);
4989
4990   /* vect_permute_load_chain requires the group size to be equal to 3 or
4991      be a power of two.  */
4992   if (count != 3 && exact_log2 (count) == -1)
4993     {
4994       if (dump_enabled_p ())
4995         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4996                          "the size of the group of accesses"
4997                          " is not a power of 2 or not equal to 3\n");
4998       return false;
4999     }
5000
5001   /* Check that the permutation is supported.  */
5002   if (VECTOR_MODE_P (mode))
5003     {
5004       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5005       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5006
5007       if (count == 3)
5008         {
5009           unsigned int k;
5010           for (k = 0; k < 3; k++)
5011             {
5012               for (i = 0; i < nelt; i++)
5013                 if (3 * i + k < 2 * nelt)
5014                   sel[i] = 3 * i + k;
5015                 else
5016                   sel[i] = 0;
5017               if (!can_vec_perm_p (mode, false, sel))
5018                 {
5019                   if (dump_enabled_p ())
5020                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5021                                      "shuffle of 3 loads is not supported by"
5022                                      " target\n");
5023                     return false;
5024                 }
5025               for (i = 0, j = 0; i < nelt; i++)
5026                 if (3 * i + k < 2 * nelt)
5027                   sel[i] = i;
5028                 else
5029                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5030               if (!can_vec_perm_p (mode, false, sel))
5031                 {
5032                   if (dump_enabled_p ())
5033                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5034                                      "shuffle of 3 loads is not supported by"
5035                                      " target\n");
5036                   return false;
5037                 }
5038             }
5039           return true;
5040         }
5041       else
5042         {
5043           /* If length is not equal to 3 then only power of 2 is supported.  */
5044           gcc_assert (exact_log2 (count) != -1);
5045           for (i = 0; i < nelt; i++)
5046             sel[i] = i * 2;
5047           if (can_vec_perm_p (mode, false, sel))
5048             {
5049               for (i = 0; i < nelt; i++)
5050                 sel[i] = i * 2 + 1;
5051               if (can_vec_perm_p (mode, false, sel))
5052                 return true;
5053             }
5054         }
5055     }
5056
5057   if (dump_enabled_p ())
5058     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5059                      "extract even/odd not supported by target\n");
5060   return false;
5061 }
5062
5063 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5064    type VECTYPE.  */
5065
5066 bool
5067 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5068 {
5069   return vect_lanes_optab_supported_p ("vec_load_lanes",
5070                                        vec_load_lanes_optab,
5071                                        vectype, count);
5072 }
5073
5074 /* Function vect_permute_load_chain.
5075
5076    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5077    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5078    the input data correctly.  Return the final references for loads in
5079    RESULT_CHAIN.
5080
5081    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5082    The input is 4 vectors each containing 8 elements. We assign a number to each
5083    element, the input sequence is:
5084
5085    1st vec:   0  1  2  3  4  5  6  7
5086    2nd vec:   8  9 10 11 12 13 14 15
5087    3rd vec:  16 17 18 19 20 21 22 23
5088    4th vec:  24 25 26 27 28 29 30 31
5089
5090    The output sequence should be:
5091
5092    1st vec:  0 4  8 12 16 20 24 28
5093    2nd vec:  1 5  9 13 17 21 25 29
5094    3rd vec:  2 6 10 14 18 22 26 30
5095    4th vec:  3 7 11 15 19 23 27 31
5096
5097    i.e., the first output vector should contain the first elements of each
5098    interleaving group, etc.
5099
5100    We use extract_even/odd instructions to create such output.  The input of
5101    each extract_even/odd operation is two vectors
5102    1st vec    2nd vec
5103    0 1 2 3    4 5 6 7
5104
5105    and the output is the vector of extracted even/odd elements.  The output of
5106    extract_even will be:   0 2 4 6
5107    and of extract_odd:     1 3 5 7
5108
5109
5110    The permutation is done in log LENGTH stages.  In each stage extract_even
5111    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5112    their order.  In our example,
5113
5114    E1: extract_even (1st vec, 2nd vec)
5115    E2: extract_odd (1st vec, 2nd vec)
5116    E3: extract_even (3rd vec, 4th vec)
5117    E4: extract_odd (3rd vec, 4th vec)
5118
5119    The output for the first stage will be:
5120
5121    E1:  0  2  4  6  8 10 12 14
5122    E2:  1  3  5  7  9 11 13 15
5123    E3: 16 18 20 22 24 26 28 30
5124    E4: 17 19 21 23 25 27 29 31
5125
5126    In order to proceed and create the correct sequence for the next stage (or
5127    for the correct output, if the second stage is the last one, as in our
5128    example), we first put the output of extract_even operation and then the
5129    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5130    The input for the second stage is:
5131
5132    1st vec (E1):  0  2  4  6  8 10 12 14
5133    2nd vec (E3): 16 18 20 22 24 26 28 30
5134    3rd vec (E2):  1  3  5  7  9 11 13 15
5135    4th vec (E4): 17 19 21 23 25 27 29 31
5136
5137    The output of the second stage:
5138
5139    E1: 0 4  8 12 16 20 24 28
5140    E2: 2 6 10 14 18 22 26 30
5141    E3: 1 5  9 13 17 21 25 29
5142    E4: 3 7 11 15 19 23 27 31
5143
5144    And RESULT_CHAIN after reordering:
5145
5146    1st vec (E1):  0 4  8 12 16 20 24 28
5147    2nd vec (E3):  1 5  9 13 17 21 25 29
5148    3rd vec (E2):  2 6 10 14 18 22 26 30
5149    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5150
5151 static void
5152 vect_permute_load_chain (vec<tree> dr_chain,
5153                          unsigned int length,
5154                          gimple stmt,
5155                          gimple_stmt_iterator *gsi,
5156                          vec<tree> *result_chain)
5157 {
5158   tree data_ref, first_vect, second_vect;
5159   tree perm_mask_even, perm_mask_odd;
5160   tree perm3_mask_low, perm3_mask_high;
5161   gimple perm_stmt;
5162   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5163   unsigned int i, j, log_length = exact_log2 (length);
5164   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5165   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5166
5167   result_chain->quick_grow (length);
5168   memcpy (result_chain->address (), dr_chain.address (),
5169           length * sizeof (tree));
5170
5171   if (length == 3)
5172     {
5173       unsigned int k;
5174
5175       for (k = 0; k < 3; k++)
5176         {
5177           for (i = 0; i < nelt; i++)
5178             if (3 * i + k < 2 * nelt)
5179               sel[i] = 3 * i + k;
5180             else
5181               sel[i] = 0;
5182           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5183
5184           for (i = 0, j = 0; i < nelt; i++)
5185             if (3 * i + k < 2 * nelt)
5186               sel[i] = i;
5187             else
5188               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5189
5190           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5191
5192           first_vect = dr_chain[0];
5193           second_vect = dr_chain[1];
5194
5195           /* Create interleaving stmt (low part of):
5196              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5197                                                              ...}>  */
5198           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5199           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5200                                            second_vect, perm3_mask_low);
5201           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5202
5203           /* Create interleaving stmt (high part of):
5204              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5205                                                               ...}>  */
5206           first_vect = data_ref;
5207           second_vect = dr_chain[2];
5208           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5209           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5210                                            second_vect, perm3_mask_high);
5211           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5212           (*result_chain)[k] = data_ref;
5213         }
5214     }
5215   else
5216     {
5217       /* If length is not equal to 3 then only power of 2 is supported.  */
5218       gcc_assert (exact_log2 (length) != -1);
5219
5220       for (i = 0; i < nelt; ++i)
5221         sel[i] = i * 2;
5222       perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5223
5224       for (i = 0; i < nelt; ++i)
5225         sel[i] = i * 2 + 1;
5226       perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5227
5228       for (i = 0; i < log_length; i++)
5229         {
5230           for (j = 0; j < length; j += 2)
5231             {
5232               first_vect = dr_chain[j];
5233               second_vect = dr_chain[j+1];
5234
5235               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5236               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5237               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5238                                                first_vect, second_vect,
5239                                                perm_mask_even);
5240               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5241               (*result_chain)[j/2] = data_ref;
5242
5243               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5244               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5245               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5246                                                first_vect, second_vect,
5247                                                perm_mask_odd);
5248               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5249               (*result_chain)[j/2+length/2] = data_ref;
5250             }
5251           memcpy (dr_chain.address (), result_chain->address (),
5252                   length * sizeof (tree));
5253         }
5254     }
5255 }
5256
5257 /* Function vect_shift_permute_load_chain.
5258
5259    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5260    sequence of stmts to reorder the input data accordingly.
5261    Return the final references for loads in RESULT_CHAIN.
5262    Return true if successed, false otherwise.
5263
5264    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5265    The input is 3 vectors each containing 8 elements.  We assign a
5266    number to each element, the input sequence is:
5267
5268    1st vec:   0  1  2  3  4  5  6  7
5269    2nd vec:   8  9 10 11 12 13 14 15
5270    3rd vec:  16 17 18 19 20 21 22 23
5271
5272    The output sequence should be:
5273
5274    1st vec:  0 3 6  9 12 15 18 21
5275    2nd vec:  1 4 7 10 13 16 19 22
5276    3rd vec:  2 5 8 11 14 17 20 23
5277
5278    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5279
5280    First we shuffle all 3 vectors to get correct elements order:
5281
5282    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5283    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5284    3rd vec:  (16 19 22) (17 20 23) (18 21)
5285
5286    Next we unite and shift vector 3 times:
5287
5288    1st step:
5289      shift right by 6 the concatenation of:
5290      "1st vec" and  "2nd vec"
5291        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5292      "2nd vec" and  "3rd vec"
5293        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5294      "3rd vec" and  "1st vec"
5295        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5296                              | New vectors                   |
5297
5298      So that now new vectors are:
5299
5300      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5301      2nd vec:  (10 13) (16 19 22) (17 20 23)
5302      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5303
5304    2nd step:
5305      shift right by 5 the concatenation of:
5306      "1st vec" and  "3rd vec"
5307        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5308      "2nd vec" and  "1st vec"
5309        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5310      "3rd vec" and  "2nd vec"
5311        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5312                           | New vectors                   |
5313
5314      So that now new vectors are:
5315
5316      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5317      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5318      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5319
5320    3rd step:
5321      shift right by 5 the concatenation of:
5322      "1st vec" and  "1st vec"
5323        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5324      shift right by 3 the concatenation of:
5325      "2nd vec" and  "2nd vec"
5326                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5327                           | New vectors                   |
5328
5329      So that now all vectors are READY:
5330      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5331      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5332      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5333
5334    This algorithm is faster than one in vect_permute_load_chain if:
5335      1.  "shift of a concatination" is faster than general permutation.
5336          This is usually so.
5337      2.  The TARGET machine can't execute vector instructions in parallel.
5338          This is because each step of the algorithm depends on previous.
5339          The algorithm in vect_permute_load_chain is much more parallel.
5340
5341    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5342 */
5343
5344 static bool
5345 vect_shift_permute_load_chain (vec<tree> dr_chain,
5346                                unsigned int length,
5347                                gimple stmt,
5348                                gimple_stmt_iterator *gsi,
5349                                vec<tree> *result_chain)
5350 {
5351   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5352   tree perm2_mask1, perm2_mask2, perm3_mask;
5353   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5354   gimple perm_stmt;
5355
5356   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5357   unsigned int i;
5358   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5359   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5360   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5361   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5362
5363   result_chain->quick_grow (length);
5364   memcpy (result_chain->address (), dr_chain.address (),
5365           length * sizeof (tree));
5366
5367   if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5368     {
5369       unsigned int j, log_length = exact_log2 (length);
5370       for (i = 0; i < nelt / 2; ++i)
5371         sel[i] = i * 2;
5372       for (i = 0; i < nelt / 2; ++i)
5373         sel[nelt / 2 + i] = i * 2 + 1;
5374       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5375         {
5376           if (dump_enabled_p ())
5377             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5378                              "shuffle of 2 fields structure is not \
5379                               supported by target\n");
5380           return false;
5381         }
5382       perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5383
5384       for (i = 0; i < nelt / 2; ++i)
5385         sel[i] = i * 2 + 1;
5386       for (i = 0; i < nelt / 2; ++i)
5387         sel[nelt / 2 + i] = i * 2;
5388       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5389         {
5390           if (dump_enabled_p ())
5391             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5392                              "shuffle of 2 fields structure is not \
5393                               supported by target\n");
5394           return false;
5395         }
5396       perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5397
5398       /* Generating permutation constant to shift all elements.
5399          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5400       for (i = 0; i < nelt; i++)
5401         sel[i] = nelt / 2 + i;
5402       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5403         {
5404           if (dump_enabled_p ())
5405             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5406                              "shift permutation is not supported by target\n");
5407           return false;
5408         }
5409       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5410
5411       /* Generating permutation constant to select vector from 2.
5412          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5413       for (i = 0; i < nelt / 2; i++)
5414         sel[i] = i;
5415       for (i = nelt / 2; i < nelt; i++)
5416         sel[i] = nelt + i;
5417       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5418         {
5419           if (dump_enabled_p ())
5420             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5421                              "select is not supported by target\n");
5422           return false;
5423         }
5424       select_mask = vect_gen_perm_mask_checked (vectype, sel);
5425
5426       for (i = 0; i < log_length; i++)
5427         {
5428           for (j = 0; j < length; j += 2)
5429             {
5430               first_vect = dr_chain[j];
5431               second_vect = dr_chain[j + 1];
5432
5433               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5434               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5435                                                first_vect, first_vect,
5436                                                perm2_mask1);
5437               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5438               vect[0] = data_ref;
5439
5440               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5441               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5442                                                second_vect, second_vect,
5443                                                perm2_mask2);
5444               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5445               vect[1] = data_ref;
5446
5447               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5448               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5449                                                vect[0], vect[1], shift1_mask);
5450               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5451               (*result_chain)[j/2 + length/2] = data_ref;
5452
5453               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5454               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5455                                                vect[0], vect[1], select_mask);
5456               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5457               (*result_chain)[j/2] = data_ref;
5458             }
5459           memcpy (dr_chain.address (), result_chain->address (),
5460                   length * sizeof (tree));
5461         }
5462       return true;
5463     }
5464   if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5465     {
5466       unsigned int k = 0, l = 0;
5467
5468       /* Generating permutation constant to get all elements in rigth order.
5469          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5470       for (i = 0; i < nelt; i++)
5471         {
5472           if (3 * k + (l % 3) >= nelt)
5473             {
5474               k = 0;
5475               l += (3 - (nelt % 3));
5476             }
5477           sel[i] = 3 * k + (l % 3);
5478           k++;
5479         }
5480       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5481         {
5482           if (dump_enabled_p ())
5483             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5484                              "shuffle of 3 fields structure is not \
5485                               supported by target\n");
5486           return false;
5487         }
5488       perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5489
5490       /* Generating permutation constant to shift all elements.
5491          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5492       for (i = 0; i < nelt; i++)
5493         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5494       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5495         {
5496           if (dump_enabled_p ())
5497             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5498                              "shift permutation is not supported by target\n");
5499           return false;
5500         }
5501       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5502
5503       /* Generating permutation constant to shift all elements.
5504          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5505       for (i = 0; i < nelt; i++)
5506         sel[i] = 2 * (nelt / 3) + 1 + i;
5507       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5508         {
5509           if (dump_enabled_p ())
5510             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5511                              "shift permutation is not supported by target\n");
5512           return false;
5513         }
5514       shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5515
5516       /* Generating permutation constant to shift all elements.
5517          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5518       for (i = 0; i < nelt; i++)
5519         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5520       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5521         {
5522           if (dump_enabled_p ())
5523             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5524                              "shift permutation is not supported by target\n");
5525           return false;
5526         }
5527       shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5528
5529       /* Generating permutation constant to shift all elements.
5530          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5531       for (i = 0; i < nelt; i++)
5532         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5533       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5534         {
5535           if (dump_enabled_p ())
5536             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5537                              "shift permutation is not supported by target\n");
5538           return false;
5539         }
5540       shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5541
5542       for (k = 0; k < 3; k++)
5543         {
5544           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5545           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5546                                            dr_chain[k], dr_chain[k],
5547                                            perm3_mask);
5548           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5549           vect[k] = data_ref;
5550         }
5551
5552       for (k = 0; k < 3; k++)
5553         {
5554           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5555           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5556                                            vect[k % 3], vect[(k + 1) % 3],
5557                                            shift1_mask);
5558           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5559           vect_shift[k] = data_ref;
5560         }
5561
5562       for (k = 0; k < 3; k++)
5563         {
5564           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5565           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5566                                            vect_shift[(4 - k) % 3],
5567                                            vect_shift[(3 - k) % 3],
5568                                            shift2_mask);
5569           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5570           vect[k] = data_ref;
5571         }
5572
5573       (*result_chain)[3 - (nelt % 3)] = vect[2];
5574
5575       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5576       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5577                                        vect[0], shift3_mask);
5578       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5579       (*result_chain)[nelt % 3] = data_ref;
5580
5581       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5582       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5583                                        vect[1], shift4_mask);
5584       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5585       (*result_chain)[0] = data_ref;
5586       return true;
5587     }
5588   return false;
5589 }
5590
5591 /* Function vect_transform_grouped_load.
5592
5593    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5594    to perform their permutation and ascribe the result vectorized statements to
5595    the scalar statements.
5596 */
5597
5598 void
5599 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5600                              gimple_stmt_iterator *gsi)
5601 {
5602   machine_mode mode;
5603   vec<tree> result_chain = vNULL;
5604
5605   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5606      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5607      vectors, that are ready for vector computation.  */
5608   result_chain.create (size);
5609
5610   /* If reassociation width for vector type is 2 or greater target machine can
5611      execute 2 or more vector instructions in parallel.  Otherwise try to
5612      get chain for loads group using vect_shift_permute_load_chain.  */
5613   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5614   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5615       || exact_log2 (size) != -1
5616       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5617                                          gsi, &result_chain))
5618     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5619   vect_record_grouped_load_vectors (stmt, result_chain);
5620   result_chain.release ();
5621 }
5622
5623 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5624    generated as part of the vectorization of STMT.  Assign the statement
5625    for each vector to the associated scalar statement.  */
5626
5627 void
5628 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5629 {
5630   gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5631   gimple next_stmt, new_stmt;
5632   unsigned int i, gap_count;
5633   tree tmp_data_ref;
5634
5635   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5636      Since we scan the chain starting from it's first node, their order
5637      corresponds the order of data-refs in RESULT_CHAIN.  */
5638   next_stmt = first_stmt;
5639   gap_count = 1;
5640   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5641     {
5642       if (!next_stmt)
5643         break;
5644
5645       /* Skip the gaps.  Loads created for the gaps will be removed by dead
5646        code elimination pass later.  No need to check for the first stmt in
5647        the group, since it always exists.
5648        GROUP_GAP is the number of steps in elements from the previous
5649        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5650        correspond to the gaps.  */
5651       if (next_stmt != first_stmt
5652           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5653       {
5654         gap_count++;
5655         continue;
5656       }
5657
5658       while (next_stmt)
5659         {
5660           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5661           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5662              copies, and we put the new vector statement in the first available
5663              RELATED_STMT.  */
5664           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5665             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5666           else
5667             {
5668               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5669                 {
5670                   gimple prev_stmt =
5671                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5672                   gimple rel_stmt =
5673                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5674                   while (rel_stmt)
5675                     {
5676                       prev_stmt = rel_stmt;
5677                       rel_stmt =
5678                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5679                     }
5680
5681                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5682                     new_stmt;
5683                 }
5684             }
5685
5686           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5687           gap_count = 1;
5688           /* If NEXT_STMT accesses the same DR as the previous statement,
5689              put the same TMP_DATA_REF as its vectorized statement; otherwise
5690              get the next data-ref from RESULT_CHAIN.  */
5691           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5692             break;
5693         }
5694     }
5695 }
5696
5697 /* Function vect_force_dr_alignment_p.
5698
5699    Returns whether the alignment of a DECL can be forced to be aligned
5700    on ALIGNMENT bit boundary.  */
5701
5702 bool
5703 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5704 {
5705   if (TREE_CODE (decl) != VAR_DECL)
5706     return false;
5707
5708   /* With -fno-toplevel-reorder we may have already output the constant.  */
5709   if (TREE_ASM_WRITTEN (decl))
5710     return false;
5711
5712   /* Constant pool entries may be shared and not properly merged by LTO.  */
5713   if (DECL_IN_CONSTANT_POOL (decl))
5714     return false;
5715
5716   if (TREE_PUBLIC (decl) || DECL_EXTERNAL (decl))
5717     {
5718       symtab_node *snode;
5719
5720       /* We cannot change alignment of symbols that may bind to symbols
5721          in other translation unit that may contain a definition with lower
5722          alignment.  */
5723       if (!decl_binds_to_current_def_p (decl))
5724         return false;
5725
5726       /* When compiling partition, be sure the symbol is not output by other
5727          partition.  */
5728       snode = symtab_node::get (decl);
5729       if (flag_ltrans
5730           && (snode->in_other_partition
5731               || snode->get_partitioning_class () == SYMBOL_DUPLICATE))
5732         return false;
5733     }
5734
5735   /* Do not override the alignment as specified by the ABI when the used
5736      attribute is set.  */
5737   if (DECL_PRESERVE_P (decl))
5738     return false;
5739
5740   /* Do not override explicit alignment set by the user when an explicit
5741      section name is also used.  This is a common idiom used by many
5742      software projects.  */
5743   if (TREE_STATIC (decl)
5744       && DECL_SECTION_NAME (decl) != NULL
5745       && !symtab_node::get (decl)->implicit_section)
5746     return false;
5747
5748   /* If symbol is an alias, we need to check that target is OK.  */
5749   if (TREE_STATIC (decl))
5750     {
5751       tree target = symtab_node::get (decl)->ultimate_alias_target ()->decl;
5752       if (target != decl)
5753         {
5754           if (DECL_PRESERVE_P (target))
5755             return false;
5756           decl = target;
5757         }
5758     }
5759
5760   if (TREE_STATIC (decl))
5761     return (alignment <= MAX_OFILE_ALIGNMENT);
5762   else
5763     return (alignment <= MAX_STACK_ALIGNMENT);
5764 }
5765
5766
5767 /* Return whether the data reference DR is supported with respect to its
5768    alignment.
5769    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5770    it is aligned, i.e., check if it is possible to vectorize it with different
5771    alignment.  */
5772
5773 enum dr_alignment_support
5774 vect_supportable_dr_alignment (struct data_reference *dr,
5775                                bool check_aligned_accesses)
5776 {
5777   gimple stmt = DR_STMT (dr);
5778   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5779   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5780   machine_mode mode = TYPE_MODE (vectype);
5781   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5782   struct loop *vect_loop = NULL;
5783   bool nested_in_vect_loop = false;
5784
5785   if (aligned_access_p (dr) && !check_aligned_accesses)
5786     return dr_aligned;
5787
5788   /* For now assume all conditional loads/stores support unaligned
5789      access without any special code.  */
5790   if (is_gimple_call (stmt)
5791       && gimple_call_internal_p (stmt)
5792       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5793           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5794     return dr_unaligned_supported;
5795
5796   if (loop_vinfo)
5797     {
5798       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5799       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5800     }
5801
5802   /* Possibly unaligned access.  */
5803
5804   /* We can choose between using the implicit realignment scheme (generating
5805      a misaligned_move stmt) and the explicit realignment scheme (generating
5806      aligned loads with a REALIGN_LOAD).  There are two variants to the
5807      explicit realignment scheme: optimized, and unoptimized.
5808      We can optimize the realignment only if the step between consecutive
5809      vector loads is equal to the vector size.  Since the vector memory
5810      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5811      is guaranteed that the misalignment amount remains the same throughout the
5812      execution of the vectorized loop.  Therefore, we can create the
5813      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5814      at the loop preheader.
5815
5816      However, in the case of outer-loop vectorization, when vectorizing a
5817      memory access in the inner-loop nested within the LOOP that is now being
5818      vectorized, while it is guaranteed that the misalignment of the
5819      vectorized memory access will remain the same in different outer-loop
5820      iterations, it is *not* guaranteed that is will remain the same throughout
5821      the execution of the inner-loop.  This is because the inner-loop advances
5822      with the original scalar step (and not in steps of VS).  If the inner-loop
5823      step happens to be a multiple of VS, then the misalignment remains fixed
5824      and we can use the optimized realignment scheme.  For example:
5825
5826       for (i=0; i<N; i++)
5827         for (j=0; j<M; j++)
5828           s += a[i+j];
5829
5830      When vectorizing the i-loop in the above example, the step between
5831      consecutive vector loads is 1, and so the misalignment does not remain
5832      fixed across the execution of the inner-loop, and the realignment cannot
5833      be optimized (as illustrated in the following pseudo vectorized loop):
5834
5835       for (i=0; i<N; i+=4)
5836         for (j=0; j<M; j++){
5837           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5838                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
5839                          // (assuming that we start from an aligned address).
5840           }
5841
5842      We therefore have to use the unoptimized realignment scheme:
5843
5844       for (i=0; i<N; i+=4)
5845           for (j=k; j<M; j+=4)
5846           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5847                            // that the misalignment of the initial address is
5848                            // 0).
5849
5850      The loop can then be vectorized as follows:
5851
5852       for (k=0; k<4; k++){
5853         rt = get_realignment_token (&vp[k]);
5854         for (i=0; i<N; i+=4){
5855           v1 = vp[i+k];
5856           for (j=k; j<M; j+=4){
5857             v2 = vp[i+j+VS-1];
5858             va = REALIGN_LOAD <v1,v2,rt>;
5859             vs += va;
5860             v1 = v2;
5861           }
5862         }
5863     } */
5864
5865   if (DR_IS_READ (dr))
5866     {
5867       bool is_packed = false;
5868       tree type = (TREE_TYPE (DR_REF (dr)));
5869
5870       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5871           && (!targetm.vectorize.builtin_mask_for_load
5872               || targetm.vectorize.builtin_mask_for_load ()))
5873         {
5874           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5875           if ((nested_in_vect_loop
5876                && (TREE_INT_CST_LOW (DR_STEP (dr))
5877                    != GET_MODE_SIZE (TYPE_MODE (vectype))))
5878               || !loop_vinfo)
5879             return dr_explicit_realign;
5880           else
5881             return dr_explicit_realign_optimized;
5882         }
5883       if (!known_alignment_for_access_p (dr))
5884         is_packed = not_size_aligned (DR_REF (dr));
5885
5886       if ((TYPE_USER_ALIGN (type) && !is_packed)
5887           || targetm.vectorize.
5888                support_vector_misalignment (mode, type,
5889                                             DR_MISALIGNMENT (dr), is_packed))
5890         /* Can't software pipeline the loads, but can at least do them.  */
5891         return dr_unaligned_supported;
5892     }
5893   else
5894     {
5895       bool is_packed = false;
5896       tree type = (TREE_TYPE (DR_REF (dr)));
5897
5898       if (!known_alignment_for_access_p (dr))
5899         is_packed = not_size_aligned (DR_REF (dr));
5900
5901      if ((TYPE_USER_ALIGN (type) && !is_packed)
5902          || targetm.vectorize.
5903               support_vector_misalignment (mode, type,
5904                                            DR_MISALIGNMENT (dr), is_packed))
5905        return dr_unaligned_supported;
5906     }
5907
5908   /* Unsupported.  */
5909   return dr_unaligned_unsupported;
5910 }