gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2015 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "backend.h"
  27 #include "predict.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "rtl.h"
  31 #include "ssa.h"
  32 #include "alias.h"
  33 #include "fold-const.h"
  34 #include "stor-layout.h"
  35 #include "tm_p.h"
  36 #include "target.h"
  37 #include "gimple-pretty-print.h"
  38 #include "internal-fn.h"
  39 #include "tree-eh.h"
  40 #include "gimplify.h"
  41 #include "gimple-iterator.h"
  42 #include "gimplify-me.h"
  43 #include "tree-ssa-loop-ivopts.h"
  44 #include "tree-ssa-loop-manip.h"
  45 #include "tree-ssa-loop.h"
  46 #include "cfgloop.h"
  47 #include "tree-chrec.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "diagnostic-core.h"
  51 #include "cgraph.h"
  52 #include "expr.h"
  53 #include "insn-codes.h"
  54 #include "optabs-tree.h"
  55 #include "builtins.h"
  56 #include "params.h"
  57
  58 /* Return true if load- or store-lanes optab OPTAB is implemented for
  59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  60
  61 static bool
  62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  63                               tree vectype, unsigned HOST_WIDE_INT count)
  64 {
  65   machine_mode mode, array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   limit_p = !targetm.array_mode_supported_p (mode, count);
  70   array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
  71                               MODE_INT, limit_p);
  72
  73   if (array_mode == BLKmode)
  74     {
  75       if (dump_enabled_p ())
  76         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  77                          "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
  78                          GET_MODE_NAME (mode), count);
  79       return false;
  80     }
  81
  82   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  83     {
  84       if (dump_enabled_p ())
  85         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  86                          "cannot use %s<%s><%s>\n", name,
  87                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  88       return false;
  89     }
  90
  91   if (dump_enabled_p ())
  92     dump_printf_loc (MSG_NOTE, vect_location,
  93                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  94                      GET_MODE_NAME (mode));
  95
  96   return true;
  97 }
  98
  99
 100 /* Return the smallest scalar part of STMT.
 101    This is used to determine the vectype of the stmt.  We generally set the
 102    vectype according to the type of the result (lhs).  For stmts whose
 103    result-type is different than the type of the arguments (e.g., demotion,
 104    promotion), vectype will be reset appropriately (later).  Note that we have
 105    to visit the smallest datatype in this function, because that determines the
 106    VF.  If the smallest datatype in the loop is present only as the rhs of a
 107    promotion operation - we'd miss it.
 108    Such a case, where a variable of this datatype does not appear in the lhs
 109    anywhere in the loop, can only occur if it's an invariant: e.g.:
 110    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 111    invariant motion.  However, we cannot rely on invariant motion to always
 112    take invariants out of the loop, and so in the case of promotion we also
 113    have to check the rhs.
 114    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 115    types.  */
 116
 117 tree
 118 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
 119                                HOST_WIDE_INT *rhs_size_unit)
 120 {
 121   tree scalar_type = gimple_expr_type (stmt);
 122   HOST_WIDE_INT lhs, rhs;
 123
 124   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 125
 126   if (is_gimple_assign (stmt)
 127       && (gimple_assign_cast_p (stmt)
 128           || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
 129           || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
 130           || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
 131     {
 132       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 133
 134       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 135       if (rhs < lhs)
 136         scalar_type = rhs_type;
 137     }
 138
 139   *lhs_size_unit = lhs;
 140   *rhs_size_unit = rhs;
 141   return scalar_type;
 142 }
 143
 144
 145 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 146    tested at run-time.  Return TRUE if DDR was successfully inserted.
 147    Return false if versioning is not supported.  */
 148
 149 static bool
 150 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 151 {
 152   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 153
 154   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 155     return false;
 156
 157   if (dump_enabled_p ())
 158     {
 159       dump_printf_loc (MSG_NOTE, vect_location,
 160                        "mark for run-time aliasing test between ");
 161       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
 162       dump_printf (MSG_NOTE,  " and ");
 163       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
 164       dump_printf (MSG_NOTE, "\n");
 165     }
 166
 167   if (optimize_loop_nest_for_size_p (loop))
 168     {
 169       if (dump_enabled_p ())
 170         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 171                          "versioning not supported when optimizing"
 172                          " for size.\n");
 173       return false;
 174     }
 175
 176   /* FORNOW: We don't support versioning with outer-loop vectorization.  */
 177   if (loop->inner)
 178     {
 179       if (dump_enabled_p ())
 180         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 181                          "versioning not yet supported for outer-loops.\n");
 182       return false;
 183     }
 184
 185   /* FORNOW: We don't support creating runtime alias tests for non-constant
 186      step.  */
 187   if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
 188       || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
 189     {
 190       if (dump_enabled_p ())
 191         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 192                          "versioning not yet supported for non-constant "
 193                          "step\n");
 194       return false;
 195     }
 196
 197   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 198   return true;
 199 }
 200
 201
 202 /* Function vect_analyze_data_ref_dependence.
 203
 204    Return TRUE if there (might) exist a dependence between a memory-reference
 205    DRA and a memory-reference DRB.  When versioning for alias may check a
 206    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 207    the data dependence.  */
 208
 209 static bool
 210 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 211                                   loop_vec_info loop_vinfo, int *max_vf)
 212 {
 213   unsigned int i;
 214   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 215   struct data_reference *dra = DDR_A (ddr);
 216   struct data_reference *drb = DDR_B (ddr);
 217   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
 218   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
 219   lambda_vector dist_v;
 220   unsigned int loop_depth;
 221
 222   /* In loop analysis all data references should be vectorizable.  */
 223   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 224       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 225     gcc_unreachable ();
 226
 227   /* Independent data accesses.  */
 228   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 229     return false;
 230
 231   if (dra == drb
 232       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 233     return false;
 234
 235   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 236      least two scalar iterations, there is always also a true dependence.
 237      As the vectorizer does not re-order loads and stores we can ignore
 238      the anti-dependence if TBAA can disambiguate both DRs similar to the
 239      case with known negative distance anti-dependences (positive
 240      distance anti-dependences would violate TBAA constraints).  */
 241   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 242        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 243       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 244                                  get_alias_set (DR_REF (drb))))
 245     return false;
 246
 247   /* Unknown data dependence.  */
 248   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 249     {
 250       /* If user asserted safelen consecutive iterations can be
 251          executed concurrently, assume independence.  */
 252       if (loop->safelen >= 2)
 253         {
 254           if (loop->safelen < *max_vf)
 255             *max_vf = loop->safelen;
 256           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 257           return false;
 258         }
 259
 260       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 261           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 262         {
 263           if (dump_enabled_p ())
 264             {
 265               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 266                                "versioning for alias not supported for: "
 267                                "can't determine dependence between ");
 268               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 269                                  DR_REF (dra));
 270               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 271               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 272                                  DR_REF (drb));
 273               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 274             }
 275           return true;
 276         }
 277
 278       if (dump_enabled_p ())
 279         {
 280           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 281                            "versioning for alias required: "
 282                            "can't determine dependence between ");
 283           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 284                              DR_REF (dra));
 285           dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 286           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 287                              DR_REF (drb));
 288           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 289         }
 290
 291       /* Add to list of ddrs that need to be tested at run-time.  */
 292       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 293     }
 294
 295   /* Known data dependence.  */
 296   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 297     {
 298       /* If user asserted safelen consecutive iterations can be
 299          executed concurrently, assume independence.  */
 300       if (loop->safelen >= 2)
 301         {
 302           if (loop->safelen < *max_vf)
 303             *max_vf = loop->safelen;
 304           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 305           return false;
 306         }
 307
 308       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 309           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 310         {
 311           if (dump_enabled_p ())
 312             {
 313               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 314                                "versioning for alias not supported for: "
 315                                "bad dist vector for ");
 316               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 317                                  DR_REF (dra));
 318               dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 319               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 320                                  DR_REF (drb));
 321               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 322             }
 323           return true;
 324         }
 325
 326       if (dump_enabled_p ())
 327         {
 328           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 329                            "versioning for alias required: "
 330                            "bad dist vector for ");
 331           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 332           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 333           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 334           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 335         }
 336       /* Add to list of ddrs that need to be tested at run-time.  */
 337       return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 338     }
 339
 340   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 341   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 342     {
 343       int dist = dist_v[loop_depth];
 344
 345       if (dump_enabled_p ())
 346         dump_printf_loc (MSG_NOTE, vect_location,
 347                          "dependence distance  = %d.\n", dist);
 348
 349       if (dist == 0)
 350         {
 351           if (dump_enabled_p ())
 352             {
 353               dump_printf_loc (MSG_NOTE, vect_location,
 354                                "dependence distance == 0 between ");
 355               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 356               dump_printf (MSG_NOTE, " and ");
 357               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 358               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 359             }
 360
 361           /* When we perform grouped accesses and perform implicit CSE
 362              by detecting equal accesses and doing disambiguation with
 363              runtime alias tests like for
 364                 .. = a[i];
 365                 .. = a[i+1];
 366                 a[i] = ..;
 367                 a[i+1] = ..;
 368                 *p = ..;
 369                 .. = a[i];
 370                 .. = a[i+1];
 371              where we will end up loading { a[i], a[i+1] } once, make
 372              sure that inserting group loads before the first load and
 373              stores after the last store will do the right thing.
 374              Similar for groups like
 375                 a[i] = ...;
 376                 ... = a[i];
 377                 a[i+1] = ...;
 378              where loads from the group interleave with the store.  */
 379           if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 380               || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 381             {
 382               gimple *earlier_stmt;
 383               earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 384               if (DR_IS_WRITE
 385                     (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 386                 {
 387                   if (dump_enabled_p ())
 388                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 389                                      "READ_WRITE dependence in interleaving."
 390                                      "\n");
 391                   return true;
 392                 }
 393             }
 394
 395           continue;
 396         }
 397
 398       if (dist > 0 && DDR_REVERSED_P (ddr))
 399         {
 400           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 401              reversed (to make distance vector positive), and the actual
 402              distance is negative.  */
 403           if (dump_enabled_p ())
 404             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 405                              "dependence distance negative.\n");
 406           /* Record a negative dependence distance to later limit the
 407              amount of stmt copying / unrolling we can perform.
 408              Only need to handle read-after-write dependence.  */
 409           if (DR_IS_READ (drb)
 410               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 411                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 412             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 413           continue;
 414         }
 415
 416       if (abs (dist) >= 2
 417           && abs (dist) < *max_vf)
 418         {
 419           /* The dependence distance requires reduction of the maximal
 420              vectorization factor.  */
 421           *max_vf = abs (dist);
 422           if (dump_enabled_p ())
 423             dump_printf_loc (MSG_NOTE, vect_location,
 424                              "adjusting maximal vectorization factor to %i\n",
 425                              *max_vf);
 426         }
 427
 428       if (abs (dist) >= *max_vf)
 429         {
 430           /* Dependence distance does not create dependence, as far as
 431              vectorization is concerned, in this case.  */
 432           if (dump_enabled_p ())
 433             dump_printf_loc (MSG_NOTE, vect_location,
 434                              "dependence distance >= VF.\n");
 435           continue;
 436         }
 437
 438       if (dump_enabled_p ())
 439         {
 440           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 441                        "not vectorized, possible dependence "
 442                        "between data-refs ");
 443           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 444           dump_printf (MSG_NOTE,  " and ");
 445           dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 446           dump_printf (MSG_NOTE,  "\n");
 447         }
 448
 449       return true;
 450     }
 451
 452   return false;
 453 }
 454
 455 /* Function vect_analyze_data_ref_dependences.
 456
 457    Examine all the data references in the loop, and make sure there do not
 458    exist any data dependences between them.  Set *MAX_VF according to
 459    the maximum vectorization factor the data dependences allow.  */
 460
 461 bool
 462 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
 463 {
 464   unsigned int i;
 465   struct data_dependence_relation *ddr;
 466
 467   if (dump_enabled_p ())
 468     dump_printf_loc (MSG_NOTE, vect_location,
 469                      "=== vect_analyze_data_ref_dependences ===\n");
 470
 471   LOOP_VINFO_DDRS (loop_vinfo)
 472     .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 473              * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 474   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 475   if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 476                                 &LOOP_VINFO_DDRS (loop_vinfo),
 477                                 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
 478     return false;
 479
 480   FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 481     if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
 482       return false;
 483
 484   return true;
 485 }
 486
 487
 488 /* Function vect_slp_analyze_data_ref_dependence.
 489
 490    Return TRUE if there (might) exist a dependence between a memory-reference
 491    DRA and a memory-reference DRB.  When versioning for alias may check a
 492    dependence at run-time, return FALSE.  Adjust *MAX_VF according to
 493    the data dependence.  */
 494
 495 static bool
 496 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
 497 {
 498   struct data_reference *dra = DDR_A (ddr);
 499   struct data_reference *drb = DDR_B (ddr);
 500
 501   /* We need to check dependences of statements marked as unvectorizable
 502      as well, they still can prohibit vectorization.  */
 503
 504   /* Independent data accesses.  */
 505   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 506     return false;
 507
 508   if (dra == drb)
 509     return false;
 510
 511   /* Read-read is OK.  */
 512   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 513     return false;
 514
 515   /* If dra and drb are part of the same interleaving chain consider
 516      them independent.  */
 517   if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
 518       && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
 519           == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
 520     return false;
 521
 522   /* Unknown data dependence.  */
 523   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 524     {
 525       if  (dump_enabled_p ())
 526         {
 527           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 528                            "can't determine dependence between ");
 529           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
 530           dump_printf (MSG_MISSED_OPTIMIZATION,  " and ");
 531           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
 532           dump_printf (MSG_MISSED_OPTIMIZATION,  "\n");
 533         }
 534     }
 535   else if (dump_enabled_p ())
 536     {
 537       dump_printf_loc (MSG_NOTE, vect_location,
 538                        "determined dependence between ");
 539       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
 540       dump_printf (MSG_NOTE, " and ");
 541       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
 542       dump_printf (MSG_NOTE,  "\n");
 543     }
 544
 545   /* We do not vectorize basic blocks with write-write dependencies.  */
 546   if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
 547     return true;
 548
 549   /* If we have a read-write dependence check that the load is before the store.
 550      When we vectorize basic blocks, vector load can be only before
 551      corresponding scalar load, and vector store can be only after its
 552      corresponding scalar store.  So the order of the acceses is preserved in
 553      case the load is before the store.  */
 554   gimple *earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
 555   if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
 556     {
 557       /* That only holds for load-store pairs taking part in vectorization.  */
 558       if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
 559           && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
 560         return false;
 561     }
 562
 563   return true;
 564 }
 565
 566
 567 /* Function vect_analyze_data_ref_dependences.
 568
 569    Examine all the data references in the basic-block, and make sure there
 570    do not exist any data dependences between them.  Set *MAX_VF according to
 571    the maximum vectorization factor the data dependences allow.  */
 572
 573 bool
 574 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
 575 {
 576   struct data_dependence_relation *ddr;
 577   unsigned int i;
 578
 579   if (dump_enabled_p ())
 580     dump_printf_loc (MSG_NOTE, vect_location,
 581                      "=== vect_slp_analyze_data_ref_dependences ===\n");
 582
 583   if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
 584                                 &BB_VINFO_DDRS (bb_vinfo),
 585                                 vNULL, true))
 586     return false;
 587
 588   FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
 589     if (vect_slp_analyze_data_ref_dependence (ddr))
 590       return false;
 591
 592   return true;
 593 }
 594
 595
 596 /* Function vect_compute_data_ref_alignment
 597
 598    Compute the misalignment of the data reference DR.
 599
 600    Output:
 601    1. If during the misalignment computation it is found that the data reference
 602       cannot be vectorized then false is returned.
 603    2. DR_MISALIGNMENT (DR) is defined.
 604
 605    FOR NOW: No analysis is actually performed. Misalignment is calculated
 606    only for trivial cases. TODO.  */
 607
 608 static bool
 609 vect_compute_data_ref_alignment (struct data_reference *dr)
 610 {
 611   gimple *stmt = DR_STMT (dr);
 612   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 613   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 614   struct loop *loop = NULL;
 615   tree ref = DR_REF (dr);
 616   tree vectype;
 617   tree base, base_addr;
 618   tree misalign = NULL_TREE;
 619   tree aligned_to;
 620   unsigned HOST_WIDE_INT alignment;
 621
 622   if (dump_enabled_p ())
 623     dump_printf_loc (MSG_NOTE, vect_location,
 624                      "vect_compute_data_ref_alignment:\n");
 625
 626   if (loop_vinfo)
 627     loop = LOOP_VINFO_LOOP (loop_vinfo);
 628
 629   /* Initialize misalignment to unknown.  */
 630   SET_DR_MISALIGNMENT (dr, -1);
 631
 632   /* Strided accesses perform only component accesses, misalignment information
 633      is irrelevant for them.  */
 634   if (STMT_VINFO_STRIDED_P (stmt_info)
 635       && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
 636     return true;
 637
 638   if (tree_fits_shwi_p (DR_STEP (dr)))
 639     misalign = DR_INIT (dr);
 640   aligned_to = DR_ALIGNED_TO (dr);
 641   base_addr = DR_BASE_ADDRESS (dr);
 642   vectype = STMT_VINFO_VECTYPE (stmt_info);
 643
 644   /* In case the dataref is in an inner-loop of the loop that is being
 645      vectorized (LOOP), we use the base and misalignment information
 646      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 647      stays the same throughout the execution of the inner-loop, which is why
 648      we have to check that the stride of the dataref in the inner-loop evenly
 649      divides by the vector size.  */
 650   if (loop && nested_in_vect_loop_p (loop, stmt))
 651     {
 652       tree step = DR_STEP (dr);
 653
 654       if (tree_fits_shwi_p (step)
 655           && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
 656         {
 657           if (dump_enabled_p ())
 658             dump_printf_loc (MSG_NOTE, vect_location,
 659                              "inner step divides the vector-size.\n");
 660           misalign = STMT_VINFO_DR_INIT (stmt_info);
 661           aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
 662           base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
 663         }
 664       else
 665         {
 666           if (dump_enabled_p ())
 667             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 668                              "inner step doesn't divide the vector-size.\n");
 669           misalign = NULL_TREE;
 670         }
 671     }
 672
 673   /* Similarly we can only use base and misalignment information relative to
 674      an innermost loop if the misalignment stays the same throughout the
 675      execution of the loop.  As above, this is the case if the stride of
 676      the dataref evenly divides by the vector size.  */
 677   else
 678     {
 679       tree step = DR_STEP (dr);
 680       unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
 681
 682       if (tree_fits_shwi_p (step)
 683           && ((tree_to_shwi (step) * vf)
 684               % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
 685         {
 686           if (dump_enabled_p ())
 687             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 688                              "step doesn't divide the vector-size.\n");
 689           misalign = NULL_TREE;
 690         }
 691     }
 692
 693   /* To look at alignment of the base we have to preserve an inner MEM_REF
 694      as that carries alignment information of the actual access.  */
 695   base = ref;
 696   while (handled_component_p (base))
 697     base = TREE_OPERAND (base, 0);
 698   if (TREE_CODE (base) == MEM_REF)
 699     base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
 700                    build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
 701   unsigned int base_alignment = get_object_alignment (base);
 702
 703   if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
 704     DR_VECT_AUX (dr)->base_element_aligned = true;
 705
 706   alignment = TYPE_ALIGN_UNIT (vectype);
 707
 708   if ((compare_tree_int (aligned_to, alignment) < 0)
 709       || !misalign)
 710     {
 711       if (dump_enabled_p ())
 712         {
 713           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 714                            "Unknown alignment for access: ");
 715           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 716           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 717         }
 718       return true;
 719     }
 720
 721   if (base_alignment < TYPE_ALIGN (vectype))
 722     {
 723       /* Strip an inner MEM_REF to a bare decl if possible.  */
 724       if (TREE_CODE (base) == MEM_REF
 725           && integer_zerop (TREE_OPERAND (base, 1))
 726           && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
 727         base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
 728
 729       if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
 730         {
 731           if (dump_enabled_p ())
 732             {
 733               dump_printf_loc (MSG_NOTE, vect_location,
 734                                "can't force alignment of ref: ");
 735               dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 736               dump_printf (MSG_NOTE, "\n");
 737             }
 738           return true;
 739         }
 740
 741       /* Force the alignment of the decl.
 742          NOTE: This is the only change to the code we make during
 743          the analysis phase, before deciding to vectorize the loop.  */
 744       if (dump_enabled_p ())
 745         {
 746           dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
 747           dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
 748           dump_printf (MSG_NOTE, "\n");
 749         }
 750
 751       DR_VECT_AUX (dr)->base_decl = base;
 752       DR_VECT_AUX (dr)->base_misaligned = true;
 753       DR_VECT_AUX (dr)->base_element_aligned = true;
 754     }
 755
 756   /* If this is a backward running DR then first access in the larger
 757      vectype actually is N-1 elements before the address in the DR.
 758      Adjust misalign accordingly.  */
 759   if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
 760     {
 761       tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
 762       /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
 763          otherwise we wouldn't be here.  */
 764       offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
 765       /* PLUS because DR_STEP was negative.  */
 766       misalign = size_binop (PLUS_EXPR, misalign, offset);
 767     }
 768
 769   SET_DR_MISALIGNMENT (dr,
 770                        wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
 771
 772   if (dump_enabled_p ())
 773     {
 774       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 775                        "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
 776       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
 777       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 778     }
 779
 780   return true;
 781 }
 782
 783
 784 /* Function vect_compute_data_refs_alignment
 785
 786    Compute the misalignment of data references in the loop.
 787    Return FALSE if a data reference is found that cannot be vectorized.  */
 788
 789 static bool
 790 vect_compute_data_refs_alignment (vec_info *vinfo)
 791 {
 792   vec<data_reference_p> datarefs = vinfo->datarefs;
 793   struct data_reference *dr;
 794   unsigned int i;
 795
 796   FOR_EACH_VEC_ELT (datarefs, i, dr)
 797     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
 798         && !vect_compute_data_ref_alignment (dr))
 799       {
 800         if (is_a <bb_vec_info> (vinfo))
 801           {
 802             /* Mark unsupported statement as unvectorizable.  */
 803             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
 804             continue;
 805           }
 806         else
 807           return false;
 808       }
 809
 810   return true;
 811 }
 812
 813
 814 /* Function vect_update_misalignment_for_peel
 815
 816    DR - the data reference whose misalignment is to be adjusted.
 817    DR_PEEL - the data reference whose misalignment is being made
 818              zero in the vector loop by the peel.
 819    NPEEL - the number of iterations in the peel loop if the misalignment
 820            of DR_PEEL is known at compile time.  */
 821
 822 static void
 823 vect_update_misalignment_for_peel (struct data_reference *dr,
 824                                    struct data_reference *dr_peel, int npeel)
 825 {
 826   unsigned int i;
 827   vec<dr_p> same_align_drs;
 828   struct data_reference *current_dr;
 829   int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
 830   int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
 831   stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
 832   stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
 833
 834  /* For interleaved data accesses the step in the loop must be multiplied by
 835      the size of the interleaving group.  */
 836   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 837     dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
 838   if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
 839     dr_peel_size *= GROUP_SIZE (peel_stmt_info);
 840
 841   /* It can be assumed that the data refs with the same alignment as dr_peel
 842      are aligned in the vector loop.  */
 843   same_align_drs
 844     = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
 845   FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
 846     {
 847       if (current_dr != dr)
 848         continue;
 849       gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
 850                   DR_MISALIGNMENT (dr_peel) / dr_peel_size);
 851       SET_DR_MISALIGNMENT (dr, 0);
 852       return;
 853     }
 854
 855   if (known_alignment_for_access_p (dr)
 856       && known_alignment_for_access_p (dr_peel))
 857     {
 858       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
 859       int misal = DR_MISALIGNMENT (dr);
 860       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 861       misal += negative ? -npeel * dr_size : npeel * dr_size;
 862       misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
 863       SET_DR_MISALIGNMENT (dr, misal);
 864       return;
 865     }
 866
 867   if (dump_enabled_p ())
 868     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
 869   SET_DR_MISALIGNMENT (dr, -1);
 870 }
 871
 872
 873 /* Function vect_verify_datarefs_alignment
 874
 875    Return TRUE if all data references in the loop can be
 876    handled with respect to alignment.  */
 877
 878 bool
 879 vect_verify_datarefs_alignment (vec_info *vinfo)
 880 {
 881   vec<data_reference_p> datarefs = vinfo->datarefs;
 882   struct data_reference *dr;
 883   enum dr_alignment_support supportable_dr_alignment;
 884   unsigned int i;
 885
 886   FOR_EACH_VEC_ELT (datarefs, i, dr)
 887     {
 888       gimple *stmt = DR_STMT (dr);
 889       stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 890
 891       if (!STMT_VINFO_RELEVANT_P (stmt_info))
 892         continue;
 893
 894       /* For interleaving, only the alignment of the first access matters.
 895          Skip statements marked as not vectorizable.  */
 896       if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
 897            && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
 898           || !STMT_VINFO_VECTORIZABLE (stmt_info))
 899         continue;
 900
 901       /* Strided accesses perform only component accesses, alignment is
 902          irrelevant for them.  */
 903       if (STMT_VINFO_STRIDED_P (stmt_info)
 904           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
 905         continue;
 906
 907       supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
 908       if (!supportable_dr_alignment)
 909         {
 910           if (dump_enabled_p ())
 911             {
 912               if (DR_IS_READ (dr))
 913                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 914                                  "not vectorized: unsupported unaligned load.");
 915               else
 916                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 917                                  "not vectorized: unsupported unaligned "
 918                                  "store.");
 919
 920               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 921                                  DR_REF (dr));
 922               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 923             }
 924           return false;
 925         }
 926       if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
 927         dump_printf_loc (MSG_NOTE, vect_location,
 928                          "Vectorizing an unaligned access.\n");
 929     }
 930   return true;
 931 }
 932
 933 /* Given an memory reference EXP return whether its alignment is less
 934    than its size.  */
 935
 936 static bool
 937 not_size_aligned (tree exp)
 938 {
 939   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
 940     return true;
 941
 942   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
 943           > get_object_alignment (exp));
 944 }
 945
 946 /* Function vector_alignment_reachable_p
 947
 948    Return true if vector alignment for DR is reachable by peeling
 949    a few loop iterations.  Return false otherwise.  */
 950
 951 static bool
 952 vector_alignment_reachable_p (struct data_reference *dr)
 953 {
 954   gimple *stmt = DR_STMT (dr);
 955   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 956   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 957
 958   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
 959     {
 960       /* For interleaved access we peel only if number of iterations in
 961          the prolog loop ({VF - misalignment}), is a multiple of the
 962          number of the interleaved accesses.  */
 963       int elem_size, mis_in_elements;
 964       int nelements = TYPE_VECTOR_SUBPARTS (vectype);
 965
 966       /* FORNOW: handle only known alignment.  */
 967       if (!known_alignment_for_access_p (dr))
 968         return false;
 969
 970       elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
 971       mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
 972
 973       if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
 974         return false;
 975     }
 976
 977   /* If misalignment is known at the compile time then allow peeling
 978      only if natural alignment is reachable through peeling.  */
 979   if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
 980     {
 981       HOST_WIDE_INT elmsize =
 982                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
 983       if (dump_enabled_p ())
 984         {
 985           dump_printf_loc (MSG_NOTE, vect_location,
 986                            "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
 987           dump_printf (MSG_NOTE,
 988                        ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
 989         }
 990       if (DR_MISALIGNMENT (dr) % elmsize)
 991         {
 992           if (dump_enabled_p ())
 993             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 994                              "data size does not divide the misalignment.\n");
 995           return false;
 996         }
 997     }
 998
 999   if (!known_alignment_for_access_p (dr))
1000     {
1001       tree type = TREE_TYPE (DR_REF (dr));
1002       bool is_packed = not_size_aligned (DR_REF (dr));
1003       if (dump_enabled_p ())
1004         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1005                          "Unknown misalignment, is_packed = %d\n",is_packed);
1006       if ((TYPE_USER_ALIGN (type) && !is_packed)
1007           || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1008         return true;
1009       else
1010         return false;
1011     }
1012
1013   return true;
1014 }
1015
1016
1017 /* Calculate the cost of the memory access represented by DR.  */
1018
1019 static void
1020 vect_get_data_access_cost (struct data_reference *dr,
1021                            unsigned int *inside_cost,
1022                            unsigned int *outside_cost,
1023                            stmt_vector_for_cost *body_cost_vec)
1024 {
1025   gimple *stmt = DR_STMT (dr);
1026   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1027   int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1028   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1029   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1030   int ncopies = vf / nunits;
1031
1032   if (DR_IS_READ (dr))
1033     vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1034                         NULL, body_cost_vec, false);
1035   else
1036     vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1037
1038   if (dump_enabled_p ())
1039     dump_printf_loc (MSG_NOTE, vect_location,
1040                      "vect_get_data_access_cost: inside_cost = %d, "
1041                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1042 }
1043
1044
1045 typedef struct _vect_peel_info
1046 {
1047   int npeel;
1048   struct data_reference *dr;
1049   unsigned int count;
1050 } *vect_peel_info;
1051
1052 typedef struct _vect_peel_extended_info
1053 {
1054   struct _vect_peel_info peel_info;
1055   unsigned int inside_cost;
1056   unsigned int outside_cost;
1057   stmt_vector_for_cost body_cost_vec;
1058 } *vect_peel_extended_info;
1059
1060
1061 /* Peeling hashtable helpers.  */
1062
1063 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1064 {
1065   static inline hashval_t hash (const _vect_peel_info *);
1066   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1067 };
1068
1069 inline hashval_t
1070 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1071 {
1072   return (hashval_t) peel_info->npeel;
1073 }
1074
1075 inline bool
1076 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1077 {
1078   return (a->npeel == b->npeel);
1079 }
1080
1081
1082 /* Insert DR into peeling hash table with NPEEL as key.  */
1083
1084 static void
1085 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1086                           loop_vec_info loop_vinfo, struct data_reference *dr,
1087                           int npeel)
1088 {
1089   struct _vect_peel_info elem, *slot;
1090   _vect_peel_info **new_slot;
1091   bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1092
1093   elem.npeel = npeel;
1094   slot = peeling_htab->find (&elem);
1095   if (slot)
1096     slot->count++;
1097   else
1098     {
1099       slot = XNEW (struct _vect_peel_info);
1100       slot->npeel = npeel;
1101       slot->dr = dr;
1102       slot->count = 1;
1103       new_slot = peeling_htab->find_slot (slot, INSERT);
1104       *new_slot = slot;
1105     }
1106
1107   if (!supportable_dr_alignment
1108       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1109     slot->count += VECT_MAX_COST;
1110 }
1111
1112
1113 /* Traverse peeling hash table to find peeling option that aligns maximum
1114    number of data accesses.  */
1115
1116 int
1117 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1118                                      _vect_peel_extended_info *max)
1119 {
1120   vect_peel_info elem = *slot;
1121
1122   if (elem->count > max->peel_info.count
1123       || (elem->count == max->peel_info.count
1124           && max->peel_info.npeel > elem->npeel))
1125     {
1126       max->peel_info.npeel = elem->npeel;
1127       max->peel_info.count = elem->count;
1128       max->peel_info.dr = elem->dr;
1129     }
1130
1131   return 1;
1132 }
1133
1134
1135 /* Traverse peeling hash table and calculate cost for each peeling option.
1136    Find the one with the lowest cost.  */
1137
1138 int
1139 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1140                                    _vect_peel_extended_info *min)
1141 {
1142   vect_peel_info elem = *slot;
1143   int save_misalignment, dummy;
1144   unsigned int inside_cost = 0, outside_cost = 0, i;
1145   gimple *stmt = DR_STMT (elem->dr);
1146   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1147   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1148   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1149   struct data_reference *dr;
1150   stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1151
1152   prologue_cost_vec.create (2);
1153   body_cost_vec.create (2);
1154   epilogue_cost_vec.create (2);
1155
1156   FOR_EACH_VEC_ELT (datarefs, i, dr)
1157     {
1158       stmt = DR_STMT (dr);
1159       stmt_info = vinfo_for_stmt (stmt);
1160       /* For interleaving, only the alignment of the first access
1161          matters.  */
1162       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1163           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1164         continue;
1165
1166       save_misalignment = DR_MISALIGNMENT (dr);
1167       vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1168       vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1169                                  &body_cost_vec);
1170       SET_DR_MISALIGNMENT (dr, save_misalignment);
1171     }
1172
1173   outside_cost += vect_get_known_peeling_cost
1174     (loop_vinfo, elem->npeel, &dummy,
1175      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1176      &prologue_cost_vec, &epilogue_cost_vec);
1177
1178   /* Prologue and epilogue costs are added to the target model later.
1179      These costs depend only on the scalar iteration cost, the
1180      number of peeling iterations finally chosen, and the number of
1181      misaligned statements.  So discard the information found here.  */
1182   prologue_cost_vec.release ();
1183   epilogue_cost_vec.release ();
1184
1185   if (inside_cost < min->inside_cost
1186       || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1187     {
1188       min->inside_cost = inside_cost;
1189       min->outside_cost = outside_cost;
1190       min->body_cost_vec.release ();
1191       min->body_cost_vec = body_cost_vec;
1192       min->peel_info.dr = elem->dr;
1193       min->peel_info.npeel = elem->npeel;
1194     }
1195   else
1196     body_cost_vec.release ();
1197
1198   return 1;
1199 }
1200
1201
1202 /* Choose best peeling option by traversing peeling hash table and either
1203    choosing an option with the lowest cost (if cost model is enabled) or the
1204    option that aligns as many accesses as possible.  */
1205
1206 static struct data_reference *
1207 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1208                                        loop_vec_info loop_vinfo,
1209                                        unsigned int *npeel,
1210                                        stmt_vector_for_cost *body_cost_vec)
1211 {
1212    struct _vect_peel_extended_info res;
1213
1214    res.peel_info.dr = NULL;
1215    res.body_cost_vec = stmt_vector_for_cost ();
1216
1217    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1218      {
1219        res.inside_cost = INT_MAX;
1220        res.outside_cost = INT_MAX;
1221        peeling_htab->traverse <_vect_peel_extended_info *,
1222                                vect_peeling_hash_get_lowest_cost> (&res);
1223      }
1224    else
1225      {
1226        res.peel_info.count = 0;
1227        peeling_htab->traverse <_vect_peel_extended_info *,
1228                                vect_peeling_hash_get_most_frequent> (&res);
1229      }
1230
1231    *npeel = res.peel_info.npeel;
1232    *body_cost_vec = res.body_cost_vec;
1233    return res.peel_info.dr;
1234 }
1235
1236
1237 /* Function vect_enhance_data_refs_alignment
1238
1239    This pass will use loop versioning and loop peeling in order to enhance
1240    the alignment of data references in the loop.
1241
1242    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1243    original loop is to be vectorized.  Any other loops that are created by
1244    the transformations performed in this pass - are not supposed to be
1245    vectorized.  This restriction will be relaxed.
1246
1247    This pass will require a cost model to guide it whether to apply peeling
1248    or versioning or a combination of the two.  For example, the scheme that
1249    intel uses when given a loop with several memory accesses, is as follows:
1250    choose one memory access ('p') which alignment you want to force by doing
1251    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1252    other accesses are not necessarily aligned, or (2) use loop versioning to
1253    generate one loop in which all accesses are aligned, and another loop in
1254    which only 'p' is necessarily aligned.
1255
1256    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1257    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1258    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1259
1260    Devising a cost model is the most critical aspect of this work.  It will
1261    guide us on which access to peel for, whether to use loop versioning, how
1262    many versions to create, etc.  The cost model will probably consist of
1263    generic considerations as well as target specific considerations (on
1264    powerpc for example, misaligned stores are more painful than misaligned
1265    loads).
1266
1267    Here are the general steps involved in alignment enhancements:
1268
1269      -- original loop, before alignment analysis:
1270         for (i=0; i<N; i++){
1271           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1272           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1273         }
1274
1275      -- After vect_compute_data_refs_alignment:
1276         for (i=0; i<N; i++){
1277           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1278           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1279         }
1280
1281      -- Possibility 1: we do loop versioning:
1282      if (p is aligned) {
1283         for (i=0; i<N; i++){    # loop 1A
1284           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1285           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1286         }
1287      }
1288      else {
1289         for (i=0; i<N; i++){    # loop 1B
1290           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1291           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1292         }
1293      }
1294
1295      -- Possibility 2: we do loop peeling:
1296      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1297         x = q[i];
1298         p[i] = y;
1299      }
1300      for (i = 3; i < N; i++){   # loop 2A
1301         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1302         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1303      }
1304
1305      -- Possibility 3: combination of loop peeling and versioning:
1306      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1307         x = q[i];
1308         p[i] = y;
1309      }
1310      if (p is aligned) {
1311         for (i = 3; i<N; i++){  # loop 3A
1312           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1313           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1314         }
1315      }
1316      else {
1317         for (i = 3; i<N; i++){  # loop 3B
1318           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1319           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1320         }
1321      }
1322
1323      These loops are later passed to loop_transform to be vectorized.  The
1324      vectorizer will use the alignment information to guide the transformation
1325      (whether to generate regular loads/stores, or with special handling for
1326      misalignment).  */
1327
1328 bool
1329 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1330 {
1331   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1332   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1333   enum dr_alignment_support supportable_dr_alignment;
1334   struct data_reference *dr0 = NULL, *first_store = NULL;
1335   struct data_reference *dr;
1336   unsigned int i, j;
1337   bool do_peeling = false;
1338   bool do_versioning = false;
1339   bool stat;
1340   gimple *stmt;
1341   stmt_vec_info stmt_info;
1342   unsigned int npeel = 0;
1343   bool all_misalignments_unknown = true;
1344   unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1345   unsigned possible_npeel_number = 1;
1346   tree vectype;
1347   unsigned int nelements, mis, same_align_drs_max = 0;
1348   stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1349   hash_table<peel_info_hasher> peeling_htab (1);
1350
1351   if (dump_enabled_p ())
1352     dump_printf_loc (MSG_NOTE, vect_location,
1353                      "=== vect_enhance_data_refs_alignment ===\n");
1354
1355   /* Reset data so we can safely be called multiple times.  */
1356   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1357   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1358
1359   /* While cost model enhancements are expected in the future, the high level
1360      view of the code at this time is as follows:
1361
1362      A) If there is a misaligned access then see if peeling to align
1363         this access can make all data references satisfy
1364         vect_supportable_dr_alignment.  If so, update data structures
1365         as needed and return true.
1366
1367      B) If peeling wasn't possible and there is a data reference with an
1368         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1369         then see if loop versioning checks can be used to make all data
1370         references satisfy vect_supportable_dr_alignment.  If so, update
1371         data structures as needed and return true.
1372
1373      C) If neither peeling nor versioning were successful then return false if
1374         any data reference does not satisfy vect_supportable_dr_alignment.
1375
1376      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1377
1378      Note, Possibility 3 above (which is peeling and versioning together) is not
1379      being done at this time.  */
1380
1381   /* (1) Peeling to force alignment.  */
1382
1383   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1384      Considerations:
1385      + How many accesses will become aligned due to the peeling
1386      - How many accesses will become unaligned due to the peeling,
1387        and the cost of misaligned accesses.
1388      - The cost of peeling (the extra runtime checks, the increase
1389        in code size).  */
1390
1391   FOR_EACH_VEC_ELT (datarefs, i, dr)
1392     {
1393       stmt = DR_STMT (dr);
1394       stmt_info = vinfo_for_stmt (stmt);
1395
1396       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1397         continue;
1398
1399       /* For interleaving, only the alignment of the first access
1400          matters.  */
1401       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1402           && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1403         continue;
1404
1405       /* For invariant accesses there is nothing to enhance.  */
1406       if (integer_zerop (DR_STEP (dr)))
1407         continue;
1408
1409       /* Strided accesses perform only component accesses, alignment is
1410          irrelevant for them.  */
1411       if (STMT_VINFO_STRIDED_P (stmt_info)
1412           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1413         continue;
1414
1415       supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1416       do_peeling = vector_alignment_reachable_p (dr);
1417       if (do_peeling)
1418         {
1419           if (known_alignment_for_access_p (dr))
1420             {
1421               unsigned int npeel_tmp;
1422               bool negative = tree_int_cst_compare (DR_STEP (dr),
1423                                                     size_zero_node) < 0;
1424
1425               /* Save info about DR in the hash table.  */
1426               vectype = STMT_VINFO_VECTYPE (stmt_info);
1427               nelements = TYPE_VECTOR_SUBPARTS (vectype);
1428               mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1429                                                 TREE_TYPE (DR_REF (dr))));
1430               npeel_tmp = (negative
1431                            ? (mis - nelements) : (nelements - mis))
1432                   & (nelements - 1);
1433
1434               /* For multiple types, it is possible that the bigger type access
1435                  will have more than one peeling option.  E.g., a loop with two
1436                  types: one of size (vector size / 4), and the other one of
1437                  size (vector size / 8).  Vectorization factor will 8.  If both
1438                  access are misaligned by 3, the first one needs one scalar
1439                  iteration to be aligned, and the second one needs 5.  But the
1440                  the first one will be aligned also by peeling 5 scalar
1441                  iterations, and in that case both accesses will be aligned.
1442                  Hence, except for the immediate peeling amount, we also want
1443                  to try to add full vector size, while we don't exceed
1444                  vectorization factor.
1445                  We do this automtically for cost model, since we calculate cost
1446                  for every peeling option.  */
1447               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1448                 {
1449                   if (STMT_SLP_TYPE (stmt_info))
1450                     possible_npeel_number
1451                       = (vf * GROUP_SIZE (stmt_info)) / nelements;
1452                   else
1453                     possible_npeel_number = vf / nelements;
1454                 }
1455
1456               /* Handle the aligned case. We may decide to align some other
1457                  access, making DR unaligned.  */
1458               if (DR_MISALIGNMENT (dr) == 0)
1459                 {
1460                   npeel_tmp = 0;
1461                   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1462                     possible_npeel_number++;
1463                 }
1464
1465               for (j = 0; j < possible_npeel_number; j++)
1466                 {
1467                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1468                                             dr, npeel_tmp);
1469                   npeel_tmp += nelements;
1470                 }
1471
1472               all_misalignments_unknown = false;
1473               /* Data-ref that was chosen for the case that all the
1474                  misalignments are unknown is not relevant anymore, since we
1475                  have a data-ref with known alignment.  */
1476               dr0 = NULL;
1477             }
1478           else
1479             {
1480               /* If we don't know any misalignment values, we prefer
1481                  peeling for data-ref that has the maximum number of data-refs
1482                  with the same alignment, unless the target prefers to align
1483                  stores over load.  */
1484               if (all_misalignments_unknown)
1485                 {
1486                   unsigned same_align_drs
1487                     = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1488                   if (!dr0
1489                       || same_align_drs_max < same_align_drs)
1490                     {
1491                       same_align_drs_max = same_align_drs;
1492                       dr0 = dr;
1493                     }
1494                   /* For data-refs with the same number of related
1495                      accesses prefer the one where the misalign
1496                      computation will be invariant in the outermost loop.  */
1497                   else if (same_align_drs_max == same_align_drs)
1498                     {
1499                       struct loop *ivloop0, *ivloop;
1500                       ivloop0 = outermost_invariant_loop_for_expr
1501                           (loop, DR_BASE_ADDRESS (dr0));
1502                       ivloop = outermost_invariant_loop_for_expr
1503                           (loop, DR_BASE_ADDRESS (dr));
1504                       if ((ivloop && !ivloop0)
1505                           || (ivloop && ivloop0
1506                               && flow_loop_nested_p (ivloop, ivloop0)))
1507                         dr0 = dr;
1508                     }
1509
1510                   if (!first_store && DR_IS_WRITE (dr))
1511                     first_store = dr;
1512                 }
1513
1514               /* If there are both known and unknown misaligned accesses in the
1515                  loop, we choose peeling amount according to the known
1516                  accesses.  */
1517               if (!supportable_dr_alignment)
1518                 {
1519                   dr0 = dr;
1520                   if (!first_store && DR_IS_WRITE (dr))
1521                     first_store = dr;
1522                 }
1523             }
1524         }
1525       else
1526         {
1527           if (!aligned_access_p (dr))
1528             {
1529               if (dump_enabled_p ())
1530                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1531                                  "vector alignment may not be reachable\n");
1532               break;
1533             }
1534         }
1535     }
1536
1537   /* Check if we can possibly peel the loop.  */
1538   if (!vect_can_advance_ivs_p (loop_vinfo)
1539       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1540       || loop->inner)
1541     do_peeling = false;
1542
1543   if (do_peeling
1544       && all_misalignments_unknown
1545       && vect_supportable_dr_alignment (dr0, false))
1546     {
1547       /* Check if the target requires to prefer stores over loads, i.e., if
1548          misaligned stores are more expensive than misaligned loads (taking
1549          drs with same alignment into account).  */
1550       if (first_store && DR_IS_READ (dr0))
1551         {
1552           unsigned int load_inside_cost = 0, load_outside_cost = 0;
1553           unsigned int store_inside_cost = 0, store_outside_cost = 0;
1554           unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1555           unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1556           stmt_vector_for_cost dummy;
1557           dummy.create (2);
1558
1559           vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1560                                      &dummy);
1561           vect_get_data_access_cost (first_store, &store_inside_cost,
1562                                      &store_outside_cost, &dummy);
1563
1564           dummy.release ();
1565
1566           /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1567              aligning the load DR0).  */
1568           load_inside_penalty = store_inside_cost;
1569           load_outside_penalty = store_outside_cost;
1570           for (i = 0;
1571                STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1572                           DR_STMT (first_store))).iterate (i, &dr);
1573                i++)
1574             if (DR_IS_READ (dr))
1575               {
1576                 load_inside_penalty += load_inside_cost;
1577                 load_outside_penalty += load_outside_cost;
1578               }
1579             else
1580               {
1581                 load_inside_penalty += store_inside_cost;
1582                 load_outside_penalty += store_outside_cost;
1583               }
1584
1585           /* Calculate the penalty for leaving DR0 unaligned (by
1586              aligning the FIRST_STORE).  */
1587           store_inside_penalty = load_inside_cost;
1588           store_outside_penalty = load_outside_cost;
1589           for (i = 0;
1590                STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1591                       DR_STMT (dr0))).iterate (i, &dr);
1592                i++)
1593             if (DR_IS_READ (dr))
1594               {
1595                 store_inside_penalty += load_inside_cost;
1596                 store_outside_penalty += load_outside_cost;
1597               }
1598             else
1599               {
1600                 store_inside_penalty += store_inside_cost;
1601                 store_outside_penalty += store_outside_cost;
1602               }
1603
1604           if (load_inside_penalty > store_inside_penalty
1605               || (load_inside_penalty == store_inside_penalty
1606                   && load_outside_penalty > store_outside_penalty))
1607             dr0 = first_store;
1608         }
1609
1610       /* In case there are only loads with different unknown misalignments, use
1611          peeling only if it may help to align other accesses in the loop or
1612          if it may help improving load bandwith when we'd end up using
1613          unaligned loads.  */
1614       tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1615       if (!first_store
1616           && !STMT_VINFO_SAME_ALIGN_REFS (
1617                   vinfo_for_stmt (DR_STMT (dr0))).length ()
1618           && (vect_supportable_dr_alignment (dr0, false)
1619               != dr_unaligned_supported
1620               || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1621                   == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1622         do_peeling = false;
1623     }
1624
1625   if (do_peeling && !dr0)
1626     {
1627       /* Peeling is possible, but there is no data access that is not supported
1628          unless aligned. So we try to choose the best possible peeling.  */
1629
1630       /* We should get here only if there are drs with known misalignment.  */
1631       gcc_assert (!all_misalignments_unknown);
1632
1633       /* Choose the best peeling from the hash table.  */
1634       dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1635                                                    loop_vinfo, &npeel,
1636                                                    &body_cost_vec);
1637       if (!dr0 || !npeel)
1638         do_peeling = false;
1639     }
1640
1641   if (do_peeling)
1642     {
1643       stmt = DR_STMT (dr0);
1644       stmt_info = vinfo_for_stmt (stmt);
1645       vectype = STMT_VINFO_VECTYPE (stmt_info);
1646       nelements = TYPE_VECTOR_SUBPARTS (vectype);
1647
1648       if (known_alignment_for_access_p (dr0))
1649         {
1650           bool negative = tree_int_cst_compare (DR_STEP (dr0),
1651                                                 size_zero_node) < 0;
1652           if (!npeel)
1653             {
1654               /* Since it's known at compile time, compute the number of
1655                  iterations in the peeled loop (the peeling factor) for use in
1656                  updating DR_MISALIGNMENT values.  The peeling factor is the
1657                  vectorization factor minus the misalignment as an element
1658                  count.  */
1659               mis = DR_MISALIGNMENT (dr0);
1660               mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1661               npeel = ((negative ? mis - nelements : nelements - mis)
1662                        & (nelements - 1));
1663             }
1664
1665           /* For interleaved data access every iteration accesses all the
1666              members of the group, therefore we divide the number of iterations
1667              by the group size.  */
1668           stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1669           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1670             npeel /= GROUP_SIZE (stmt_info);
1671
1672           if (dump_enabled_p ())
1673             dump_printf_loc (MSG_NOTE, vect_location,
1674                              "Try peeling by %d\n", npeel);
1675         }
1676
1677       /* Ensure that all data refs can be vectorized after the peel.  */
1678       FOR_EACH_VEC_ELT (datarefs, i, dr)
1679         {
1680           int save_misalignment;
1681
1682           if (dr == dr0)
1683             continue;
1684
1685           stmt = DR_STMT (dr);
1686           stmt_info = vinfo_for_stmt (stmt);
1687           /* For interleaving, only the alignment of the first access
1688             matters.  */
1689           if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1690               && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1691             continue;
1692
1693           /* Strided accesses perform only component accesses, alignment is
1694              irrelevant for them.  */
1695           if (STMT_VINFO_STRIDED_P (stmt_info)
1696               && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1697             continue;
1698
1699           save_misalignment = DR_MISALIGNMENT (dr);
1700           vect_update_misalignment_for_peel (dr, dr0, npeel);
1701           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1702           SET_DR_MISALIGNMENT (dr, save_misalignment);
1703
1704           if (!supportable_dr_alignment)
1705             {
1706               do_peeling = false;
1707               break;
1708             }
1709         }
1710
1711       if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1712         {
1713           stat = vect_verify_datarefs_alignment (loop_vinfo);
1714           if (!stat)
1715             do_peeling = false;
1716           else
1717             {
1718               body_cost_vec.release ();
1719               return stat;
1720             }
1721         }
1722
1723       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
1724       if (do_peeling)
1725         {
1726           unsigned max_allowed_peel
1727             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1728           if (max_allowed_peel != (unsigned)-1)
1729             {
1730               unsigned max_peel = npeel;
1731               if (max_peel == 0)
1732                 {
1733                   gimple *dr_stmt = DR_STMT (dr0);
1734                   stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1735                   tree vtype = STMT_VINFO_VECTYPE (vinfo);
1736                   max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1737                 }
1738               if (max_peel > max_allowed_peel)
1739                 {
1740                   do_peeling = false;
1741                   if (dump_enabled_p ())
1742                     dump_printf_loc (MSG_NOTE, vect_location,
1743                         "Disable peeling, max peels reached: %d\n", max_peel);
1744                 }
1745             }
1746         }
1747
1748       /* Cost model #2 - if peeling may result in a remaining loop not
1749          iterating enough to be vectorized then do not peel.  */
1750       if (do_peeling
1751           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1752         {
1753           unsigned max_peel
1754             = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1755           if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1756               < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1757             do_peeling = false;
1758         }
1759
1760       if (do_peeling)
1761         {
1762           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1763              If the misalignment of DR_i is identical to that of dr0 then set
1764              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
1765              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1766              by the peeling factor times the element size of DR_i (MOD the
1767              vectorization factor times the size).  Otherwise, the
1768              misalignment of DR_i must be set to unknown.  */
1769           FOR_EACH_VEC_ELT (datarefs, i, dr)
1770             if (dr != dr0)
1771               vect_update_misalignment_for_peel (dr, dr0, npeel);
1772
1773           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1774           if (npeel)
1775             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1776           else
1777             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1778               = DR_MISALIGNMENT (dr0);
1779           SET_DR_MISALIGNMENT (dr0, 0);
1780           if (dump_enabled_p ())
1781             {
1782               dump_printf_loc (MSG_NOTE, vect_location,
1783                                "Alignment of access forced using peeling.\n");
1784               dump_printf_loc (MSG_NOTE, vect_location,
1785                                "Peeling for alignment will be applied.\n");
1786             }
1787           /* The inside-loop cost will be accounted for in vectorizable_load
1788              and vectorizable_store correctly with adjusted alignments.
1789              Drop the body_cst_vec on the floor here.  */
1790           body_cost_vec.release ();
1791
1792           stat = vect_verify_datarefs_alignment (loop_vinfo);
1793           gcc_assert (stat);
1794           return stat;
1795         }
1796     }
1797
1798   body_cost_vec.release ();
1799
1800   /* (2) Versioning to force alignment.  */
1801
1802   /* Try versioning if:
1803      1) optimize loop for speed
1804      2) there is at least one unsupported misaligned data ref with an unknown
1805         misalignment, and
1806      3) all misaligned data refs with a known misalignment are supported, and
1807      4) the number of runtime alignment checks is within reason.  */
1808
1809   do_versioning =
1810         optimize_loop_nest_for_speed_p (loop)
1811         && (!loop->inner); /* FORNOW */
1812
1813   if (do_versioning)
1814     {
1815       FOR_EACH_VEC_ELT (datarefs, i, dr)
1816         {
1817           stmt = DR_STMT (dr);
1818           stmt_info = vinfo_for_stmt (stmt);
1819
1820           /* For interleaving, only the alignment of the first access
1821              matters.  */
1822           if (aligned_access_p (dr)
1823               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1824                   && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1825             continue;
1826
1827           if (STMT_VINFO_STRIDED_P (stmt_info))
1828             {
1829               /* Strided loads perform only component accesses, alignment is
1830                  irrelevant for them.  */
1831               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1832                 continue;
1833               do_versioning = false;
1834               break;
1835             }
1836
1837           supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1838
1839           if (!supportable_dr_alignment)
1840             {
1841               gimple *stmt;
1842               int mask;
1843               tree vectype;
1844
1845               if (known_alignment_for_access_p (dr)
1846                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1847                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1848                 {
1849                   do_versioning = false;
1850                   break;
1851                 }
1852
1853               stmt = DR_STMT (dr);
1854               vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1855               gcc_assert (vectype);
1856
1857               /* The rightmost bits of an aligned address must be zeros.
1858                  Construct the mask needed for this test.  For example,
1859                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1860                  mask must be 15 = 0xf. */
1861               mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1862
1863               /* FORNOW: use the same mask to test all potentially unaligned
1864                  references in the loop.  The vectorizer currently supports
1865                  a single vector size, see the reference to
1866                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1867                  vectorization factor is computed.  */
1868               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1869                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1870               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1871               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1872                       DR_STMT (dr));
1873             }
1874         }
1875
1876       /* Versioning requires at least one misaligned data reference.  */
1877       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1878         do_versioning = false;
1879       else if (!do_versioning)
1880         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1881     }
1882
1883   if (do_versioning)
1884     {
1885       vec<gimple *> may_misalign_stmts
1886         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1887       gimple *stmt;
1888
1889       /* It can now be assumed that the data references in the statements
1890          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1891          of the loop being vectorized.  */
1892       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1893         {
1894           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1895           dr = STMT_VINFO_DATA_REF (stmt_info);
1896           SET_DR_MISALIGNMENT (dr, 0);
1897           if (dump_enabled_p ())
1898             dump_printf_loc (MSG_NOTE, vect_location,
1899                              "Alignment of access forced using versioning.\n");
1900         }
1901
1902       if (dump_enabled_p ())
1903         dump_printf_loc (MSG_NOTE, vect_location,
1904                          "Versioning for alignment will be applied.\n");
1905
1906       /* Peeling and versioning can't be done together at this time.  */
1907       gcc_assert (! (do_peeling && do_versioning));
1908
1909       stat = vect_verify_datarefs_alignment (loop_vinfo);
1910       gcc_assert (stat);
1911       return stat;
1912     }
1913
1914   /* This point is reached if neither peeling nor versioning is being done.  */
1915   gcc_assert (! (do_peeling || do_versioning));
1916
1917   stat = vect_verify_datarefs_alignment (loop_vinfo);
1918   return stat;
1919 }
1920
1921
1922 /* Function vect_find_same_alignment_drs.
1923
1924    Update group and alignment relations according to the chosen
1925    vectorization factor.  */
1926
1927 static void
1928 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1929                               loop_vec_info loop_vinfo)
1930 {
1931   unsigned int i;
1932   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1933   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1934   struct data_reference *dra = DDR_A (ddr);
1935   struct data_reference *drb = DDR_B (ddr);
1936   stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1937   stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1938   int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1939   int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1940   lambda_vector dist_v;
1941   unsigned int loop_depth;
1942
1943   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1944     return;
1945
1946   if (dra == drb)
1947     return;
1948
1949   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1950     return;
1951
1952   /* Loop-based vectorization and known data dependence.  */
1953   if (DDR_NUM_DIST_VECTS (ddr) == 0)
1954     return;
1955
1956   /* Data-dependence analysis reports a distance vector of zero
1957      for data-references that overlap only in the first iteration
1958      but have different sign step (see PR45764).
1959      So as a sanity check require equal DR_STEP.  */
1960   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1961     return;
1962
1963   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1964   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1965     {
1966       int dist = dist_v[loop_depth];
1967
1968       if (dump_enabled_p ())
1969         dump_printf_loc (MSG_NOTE, vect_location,
1970                          "dependence distance  = %d.\n", dist);
1971
1972       /* Same loop iteration.  */
1973       if (dist == 0
1974           || (dist % vectorization_factor == 0 && dra_size == drb_size))
1975         {
1976           /* Two references with distance zero have the same alignment.  */
1977           STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1978           STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1979           if (dump_enabled_p ())
1980             {
1981               dump_printf_loc (MSG_NOTE, vect_location,
1982                                "accesses have the same alignment.\n");
1983               dump_printf (MSG_NOTE,
1984                            "dependence distance modulo vf == 0 between ");
1985               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1986               dump_printf (MSG_NOTE,  " and ");
1987               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1988               dump_printf (MSG_NOTE, "\n");
1989             }
1990         }
1991     }
1992 }
1993
1994
1995 /* Function vect_analyze_data_refs_alignment
1996
1997    Analyze the alignment of the data-references in the loop.
1998    Return FALSE if a data reference is found that cannot be vectorized.  */
1999
2000 bool
2001 vect_analyze_data_refs_alignment (vec_info *vinfo)
2002 {
2003   if (dump_enabled_p ())
2004     dump_printf_loc (MSG_NOTE, vect_location,
2005                      "=== vect_analyze_data_refs_alignment ===\n");
2006
2007   /* Mark groups of data references with same alignment using
2008      data dependence information.  */
2009   if (is_a <loop_vec_info> (vinfo))
2010     {
2011       vec<ddr_p> ddrs = vinfo->ddrs;
2012       struct data_dependence_relation *ddr;
2013       unsigned int i;
2014
2015       FOR_EACH_VEC_ELT (ddrs, i, ddr)
2016         vect_find_same_alignment_drs (ddr, as_a <loop_vec_info> (vinfo));
2017     }
2018
2019   if (!vect_compute_data_refs_alignment (vinfo))
2020     {
2021       if (dump_enabled_p ())
2022         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2023                          "not vectorized: can't calculate alignment "
2024                          "for data ref.\n");
2025       return false;
2026     }
2027
2028   return true;
2029 }
2030
2031
2032 /* Analyze groups of accesses: check that DR belongs to a group of
2033    accesses of legal size, step, etc.  Detect gaps, single element
2034    interleaving, and other special cases. Set grouped access info.
2035    Collect groups of strided stores for further use in SLP analysis.
2036    Worker for vect_analyze_group_access.  */
2037
2038 static bool
2039 vect_analyze_group_access_1 (struct data_reference *dr)
2040 {
2041   tree step = DR_STEP (dr);
2042   tree scalar_type = TREE_TYPE (DR_REF (dr));
2043   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2044   gimple *stmt = DR_STMT (dr);
2045   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2046   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2047   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2048   HOST_WIDE_INT dr_step = -1;
2049   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2050   bool slp_impossible = false;
2051   struct loop *loop = NULL;
2052
2053   if (loop_vinfo)
2054     loop = LOOP_VINFO_LOOP (loop_vinfo);
2055
2056   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2057      size of the interleaving group (including gaps).  */
2058   if (tree_fits_shwi_p (step))
2059     {
2060       dr_step = tree_to_shwi (step);
2061       groupsize = absu_hwi (dr_step) / type_size;
2062     }
2063   else
2064     groupsize = 0;
2065
2066   /* Not consecutive access is possible only if it is a part of interleaving.  */
2067   if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2068     {
2069       /* Check if it this DR is a part of interleaving, and is a single
2070          element of the group that is accessed in the loop.  */
2071
2072       /* Gaps are supported only for loads. STEP must be a multiple of the type
2073          size.  The size of the group must be a power of 2.  */
2074       if (DR_IS_READ (dr)
2075           && (dr_step % type_size) == 0
2076           && groupsize > 0
2077           && exact_log2 (groupsize) != -1)
2078         {
2079           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2080           GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2081           if (dump_enabled_p ())
2082             {
2083               dump_printf_loc (MSG_NOTE, vect_location,
2084                                "Detected single element interleaving ");
2085               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2086               dump_printf (MSG_NOTE, " step ");
2087               dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2088               dump_printf (MSG_NOTE, "\n");
2089             }
2090
2091           if (loop_vinfo)
2092             {
2093               if (dump_enabled_p ())
2094                 dump_printf_loc (MSG_NOTE, vect_location,
2095                                  "Data access with gaps requires scalar "
2096                                  "epilogue loop\n");
2097               if (loop->inner)
2098                 {
2099                   if (dump_enabled_p ())
2100                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2101                                      "Peeling for outer loop is not"
2102                                      " supported\n");
2103                   return false;
2104                 }
2105
2106               LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2107             }
2108
2109           return true;
2110         }
2111
2112       if (dump_enabled_p ())
2113         {
2114           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2115                            "not consecutive access ");
2116           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2117         }
2118
2119       if (bb_vinfo)
2120         {
2121           /* Mark the statement as unvectorizable.  */
2122           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2123           return true;
2124         }
2125
2126       dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2127       STMT_VINFO_STRIDED_P (stmt_info) = true;
2128       return true;
2129     }
2130
2131   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2132     {
2133       /* First stmt in the interleaving chain. Check the chain.  */
2134       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2135       struct data_reference *data_ref = dr;
2136       unsigned int count = 1;
2137       tree prev_init = DR_INIT (data_ref);
2138       gimple *prev = stmt;
2139       HOST_WIDE_INT diff, gaps = 0;
2140
2141       while (next)
2142         {
2143           /* Skip same data-refs.  In case that two or more stmts share
2144              data-ref (supported only for loads), we vectorize only the first
2145              stmt, and the rest get their vectorized loads from the first
2146              one.  */
2147           if (!tree_int_cst_compare (DR_INIT (data_ref),
2148                                      DR_INIT (STMT_VINFO_DATA_REF (
2149                                                    vinfo_for_stmt (next)))))
2150             {
2151               if (DR_IS_WRITE (data_ref))
2152                 {
2153                   if (dump_enabled_p ())
2154                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2155                                      "Two store stmts share the same dr.\n");
2156                   return false;
2157                 }
2158
2159               if (dump_enabled_p ())
2160                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2161                                  "Two or more load stmts share the same dr.\n");
2162
2163               /* For load use the same data-ref load.  */
2164               GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2165
2166               prev = next;
2167               next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2168               continue;
2169             }
2170
2171           prev = next;
2172           data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2173
2174           /* All group members have the same STEP by construction.  */
2175           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2176
2177           /* Check that the distance between two accesses is equal to the type
2178              size. Otherwise, we have gaps.  */
2179           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2180                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2181           if (diff != 1)
2182             {
2183               /* FORNOW: SLP of accesses with gaps is not supported.  */
2184               slp_impossible = true;
2185               if (DR_IS_WRITE (data_ref))
2186                 {
2187                   if (dump_enabled_p ())
2188                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2189                                      "interleaved store with gaps\n");
2190                   return false;
2191                 }
2192
2193               gaps += diff - 1;
2194             }
2195
2196           last_accessed_element += diff;
2197
2198           /* Store the gap from the previous member of the group. If there is no
2199              gap in the access, GROUP_GAP is always 1.  */
2200           GROUP_GAP (vinfo_for_stmt (next)) = diff;
2201
2202           prev_init = DR_INIT (data_ref);
2203           next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2204           /* Count the number of data-refs in the chain.  */
2205           count++;
2206         }
2207
2208       if (groupsize == 0)
2209         groupsize = count + gaps;
2210
2211       if (groupsize > UINT_MAX)
2212         {
2213           if (dump_enabled_p ())
2214             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2215                              "group is too large\n");
2216           return false;
2217         }
2218
2219       /* Check that the size of the interleaving is equal to count for stores,
2220          i.e., that there are no gaps.  */
2221       if (groupsize != count
2222           && !DR_IS_READ (dr))
2223         {
2224           if (dump_enabled_p ())
2225             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2226                              "interleaved store with gaps\n");
2227           return false;
2228         }
2229
2230       /* If there is a gap after the last load in the group it is the
2231          difference between the groupsize and the last accessed
2232          element.
2233          When there is no gap, this difference should be 0.  */
2234       GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
2235
2236       GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2237       if (dump_enabled_p ())
2238         {
2239           dump_printf_loc (MSG_NOTE, vect_location,
2240                            "Detected interleaving ");
2241           if (DR_IS_READ (dr))
2242             dump_printf (MSG_NOTE, "load ");
2243           else
2244             dump_printf (MSG_NOTE, "store ");
2245           dump_printf (MSG_NOTE, "of size %u starting with ",
2246                        (unsigned)groupsize);
2247           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2248           if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2249             dump_printf_loc (MSG_NOTE, vect_location,
2250                              "There is a gap of %u elements after the group\n",
2251                              GROUP_GAP (vinfo_for_stmt (stmt)));
2252         }
2253
2254       /* SLP: create an SLP data structure for every interleaving group of
2255          stores for further analysis in vect_analyse_slp.  */
2256       if (DR_IS_WRITE (dr) && !slp_impossible)
2257         {
2258           if (loop_vinfo)
2259             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2260           if (bb_vinfo)
2261             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2262         }
2263
2264       /* If there is a gap in the end of the group or the group size cannot
2265          be made a multiple of the vector element count then we access excess
2266          elements in the last iteration and thus need to peel that off.  */
2267       if (loop_vinfo
2268           && (groupsize - last_accessed_element > 0
2269               || exact_log2 (groupsize) == -1))
2270
2271         {
2272           if (dump_enabled_p ())
2273             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2274                              "Data access with gaps requires scalar "
2275                              "epilogue loop\n");
2276           if (loop->inner)
2277             {
2278               if (dump_enabled_p ())
2279                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2280                                  "Peeling for outer loop is not supported\n");
2281               return false;
2282             }
2283
2284           LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2285         }
2286     }
2287
2288   return true;
2289 }
2290
2291 /* Analyze groups of accesses: check that DR belongs to a group of
2292    accesses of legal size, step, etc.  Detect gaps, single element
2293    interleaving, and other special cases. Set grouped access info.
2294    Collect groups of strided stores for further use in SLP analysis.  */
2295
2296 static bool
2297 vect_analyze_group_access (struct data_reference *dr)
2298 {
2299   if (!vect_analyze_group_access_1 (dr))
2300     {
2301       /* Dissolve the group if present.  */
2302       gimple *next;
2303       gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2304       while (stmt)
2305         {
2306           stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2307           next = GROUP_NEXT_ELEMENT (vinfo);
2308           GROUP_FIRST_ELEMENT (vinfo) = NULL;
2309           GROUP_NEXT_ELEMENT (vinfo) = NULL;
2310           stmt = next;
2311         }
2312       return false;
2313     }
2314   return true;
2315 }
2316
2317 /* Analyze the access pattern of the data-reference DR.
2318    In case of non-consecutive accesses call vect_analyze_group_access() to
2319    analyze groups of accesses.  */
2320
2321 static bool
2322 vect_analyze_data_ref_access (struct data_reference *dr)
2323 {
2324   tree step = DR_STEP (dr);
2325   tree scalar_type = TREE_TYPE (DR_REF (dr));
2326   gimple *stmt = DR_STMT (dr);
2327   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2328   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2329   struct loop *loop = NULL;
2330
2331   if (loop_vinfo)
2332     loop = LOOP_VINFO_LOOP (loop_vinfo);
2333
2334   if (loop_vinfo && !step)
2335     {
2336       if (dump_enabled_p ())
2337         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2338                          "bad data-ref access in loop\n");
2339       return false;
2340     }
2341
2342   /* Allow loads with zero step in inner-loop vectorization.  */
2343   if (loop_vinfo && integer_zerop (step))
2344     {
2345       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2346       if (!nested_in_vect_loop_p (loop, stmt))
2347         return DR_IS_READ (dr);
2348       /* Allow references with zero step for outer loops marked
2349          with pragma omp simd only - it guarantees absence of
2350          loop-carried dependencies between inner loop iterations.  */
2351       if (!loop->force_vectorize)
2352         {
2353           if (dump_enabled_p ())
2354             dump_printf_loc (MSG_NOTE, vect_location,
2355                              "zero step in inner loop of nest\n");
2356           return false;
2357         }
2358     }
2359
2360   if (loop && nested_in_vect_loop_p (loop, stmt))
2361     {
2362       /* Interleaved accesses are not yet supported within outer-loop
2363         vectorization for references in the inner-loop.  */
2364       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2365
2366       /* For the rest of the analysis we use the outer-loop step.  */
2367       step = STMT_VINFO_DR_STEP (stmt_info);
2368       if (integer_zerop (step))
2369         {
2370           if (dump_enabled_p ())
2371             dump_printf_loc (MSG_NOTE, vect_location,
2372                              "zero step in outer loop.\n");
2373           return DR_IS_READ (dr);
2374         }
2375     }
2376
2377   /* Consecutive?  */
2378   if (TREE_CODE (step) == INTEGER_CST)
2379     {
2380       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2381       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2382           || (dr_step < 0
2383               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2384         {
2385           /* Mark that it is not interleaving.  */
2386           GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2387           return true;
2388         }
2389     }
2390
2391   if (loop && nested_in_vect_loop_p (loop, stmt))
2392     {
2393       if (dump_enabled_p ())
2394         dump_printf_loc (MSG_NOTE, vect_location,
2395                          "grouped access in outer loop.\n");
2396       return false;
2397     }
2398
2399
2400   /* Assume this is a DR handled by non-constant strided load case.  */
2401   if (TREE_CODE (step) != INTEGER_CST)
2402     return (STMT_VINFO_STRIDED_P (stmt_info)
2403             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2404                 || vect_analyze_group_access (dr)));
2405
2406   /* Not consecutive access - check if it's a part of interleaving group.  */
2407   return vect_analyze_group_access (dr);
2408 }
2409
2410
2411
2412 /*  A helper function used in the comparator function to sort data
2413     references.  T1 and T2 are two data references to be compared.
2414     The function returns -1, 0, or 1.  */
2415
2416 static int
2417 compare_tree (tree t1, tree t2)
2418 {
2419   int i, cmp;
2420   enum tree_code code;
2421   char tclass;
2422
2423   if (t1 == t2)
2424     return 0;
2425   if (t1 == NULL)
2426     return -1;
2427   if (t2 == NULL)
2428     return 1;
2429
2430
2431   if (TREE_CODE (t1) != TREE_CODE (t2))
2432     return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2433
2434   code = TREE_CODE (t1);
2435   switch (code)
2436     {
2437     /* For const values, we can just use hash values for comparisons.  */
2438     case INTEGER_CST:
2439     case REAL_CST:
2440     case FIXED_CST:
2441     case STRING_CST:
2442     case COMPLEX_CST:
2443     case VECTOR_CST:
2444       {
2445         hashval_t h1 = iterative_hash_expr (t1, 0);
2446         hashval_t h2 = iterative_hash_expr (t2, 0);
2447         if (h1 != h2)
2448           return h1 < h2 ? -1 : 1;
2449         break;
2450       }
2451
2452     case SSA_NAME:
2453       cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2454       if (cmp != 0)
2455         return cmp;
2456
2457       if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2458         return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2459       break;
2460
2461     default:
2462       tclass = TREE_CODE_CLASS (code);
2463
2464       /* For var-decl, we could compare their UIDs.  */
2465       if (tclass == tcc_declaration)
2466         {
2467           if (DECL_UID (t1) != DECL_UID (t2))
2468             return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2469           break;
2470         }
2471
2472       /* For expressions with operands, compare their operands recursively.  */
2473       for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2474         {
2475           cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2476           if (cmp != 0)
2477             return cmp;
2478         }
2479     }
2480
2481   return 0;
2482 }
2483
2484
2485 /* Compare two data-references DRA and DRB to group them into chunks
2486    suitable for grouping.  */
2487
2488 static int
2489 dr_group_sort_cmp (const void *dra_, const void *drb_)
2490 {
2491   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2492   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2493   int cmp;
2494
2495   /* Stabilize sort.  */
2496   if (dra == drb)
2497     return 0;
2498
2499   /* Ordering of DRs according to base.  */
2500   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2501     {
2502       cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2503       if (cmp != 0)
2504         return cmp;
2505     }
2506
2507   /* And according to DR_OFFSET.  */
2508   if (!dr_equal_offsets_p (dra, drb))
2509     {
2510       cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2511       if (cmp != 0)
2512         return cmp;
2513     }
2514
2515   /* Put reads before writes.  */
2516   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2517     return DR_IS_READ (dra) ? -1 : 1;
2518
2519   /* Then sort after access size.  */
2520   if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2521                         TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2522     {
2523       cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2524                           TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2525       if (cmp != 0)
2526         return cmp;
2527     }
2528
2529   /* And after step.  */
2530   if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2531     {
2532       cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2533       if (cmp != 0)
2534         return cmp;
2535     }
2536
2537   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2538   cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2539   if (cmp == 0)
2540     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2541   return cmp;
2542 }
2543
2544 /* Function vect_analyze_data_ref_accesses.
2545
2546    Analyze the access pattern of all the data references in the loop.
2547
2548    FORNOW: the only access pattern that is considered vectorizable is a
2549            simple step 1 (consecutive) access.
2550
2551    FORNOW: handle only arrays and pointer accesses.  */
2552
2553 bool
2554 vect_analyze_data_ref_accesses (vec_info *vinfo)
2555 {
2556   unsigned int i;
2557   vec<data_reference_p> datarefs = vinfo->datarefs;
2558   struct data_reference *dr;
2559
2560   if (dump_enabled_p ())
2561     dump_printf_loc (MSG_NOTE, vect_location,
2562                      "=== vect_analyze_data_ref_accesses ===\n");
2563
2564   if (datarefs.is_empty ())
2565     return true;
2566
2567   /* Sort the array of datarefs to make building the interleaving chains
2568      linear.  Don't modify the original vector's order, it is needed for
2569      determining what dependencies are reversed.  */
2570   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2571   datarefs_copy.qsort (dr_group_sort_cmp);
2572
2573   /* Build the interleaving chains.  */
2574   for (i = 0; i < datarefs_copy.length () - 1;)
2575     {
2576       data_reference_p dra = datarefs_copy[i];
2577       stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2578       stmt_vec_info lastinfo = NULL;
2579       for (i = i + 1; i < datarefs_copy.length (); ++i)
2580         {
2581           data_reference_p drb = datarefs_copy[i];
2582           stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2583
2584           /* ???  Imperfect sorting (non-compatible types, non-modulo
2585              accesses, same accesses) can lead to a group to be artificially
2586              split here as we don't just skip over those.  If it really
2587              matters we can push those to a worklist and re-iterate
2588              over them.  The we can just skip ahead to the next DR here.  */
2589
2590           /* Check that the data-refs have same first location (except init)
2591              and they are both either store or load (not load and store,
2592              not masked loads or stores).  */
2593           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2594               || !operand_equal_p (DR_BASE_ADDRESS (dra),
2595                                    DR_BASE_ADDRESS (drb), 0)
2596               || !dr_equal_offsets_p (dra, drb)
2597               || !gimple_assign_single_p (DR_STMT (dra))
2598               || !gimple_assign_single_p (DR_STMT (drb)))
2599             break;
2600
2601           /* Check that the data-refs have the same constant size.  */
2602           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2603           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2604           if (!tree_fits_uhwi_p (sza)
2605               || !tree_fits_uhwi_p (szb)
2606               || !tree_int_cst_equal (sza, szb))
2607             break;
2608
2609           /* Check that the data-refs have the same step.  */
2610           if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2611             break;
2612
2613           /* Do not place the same access in the interleaving chain twice.  */
2614           if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2615             break;
2616
2617           /* Check the types are compatible.
2618              ???  We don't distinguish this during sorting.  */
2619           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2620                                    TREE_TYPE (DR_REF (drb))))
2621             break;
2622
2623           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
2624           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2625           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2626           gcc_assert (init_a < init_b);
2627
2628           /* If init_b == init_a + the size of the type * k, we have an
2629              interleaving, and DRA is accessed before DRB.  */
2630           HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2631           if ((init_b - init_a) % type_size_a != 0)
2632             break;
2633
2634           /* If we have a store, the accesses are adjacent.  This splits
2635              groups into chunks we support (we don't support vectorization
2636              of stores with gaps).  */
2637           if (!DR_IS_READ (dra)
2638               && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2639                                              (DR_INIT (datarefs_copy[i-1]))
2640                   != type_size_a))
2641             break;
2642
2643           /* If the step (if not zero or non-constant) is greater than the
2644              difference between data-refs' inits this splits groups into
2645              suitable sizes.  */
2646           if (tree_fits_shwi_p (DR_STEP (dra)))
2647             {
2648               HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2649               if (step != 0 && step <= (init_b - init_a))
2650                 break;
2651             }
2652
2653           if (dump_enabled_p ())
2654             {
2655               dump_printf_loc (MSG_NOTE, vect_location,
2656                                "Detected interleaving ");
2657               if (DR_IS_READ (dra))
2658                 dump_printf (MSG_NOTE, "load ");
2659               else
2660                 dump_printf (MSG_NOTE, "store ");
2661               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2662               dump_printf (MSG_NOTE,  " and ");
2663               dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2664               dump_printf (MSG_NOTE, "\n");
2665             }
2666
2667           /* Link the found element into the group list.  */
2668           if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2669             {
2670               GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2671               lastinfo = stmtinfo_a;
2672             }
2673           GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2674           GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2675           lastinfo = stmtinfo_b;
2676         }
2677     }
2678
2679   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2680     if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2681         && !vect_analyze_data_ref_access (dr))
2682       {
2683         if (dump_enabled_p ())
2684           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2685                            "not vectorized: complicated access pattern.\n");
2686
2687         if (is_a <bb_vec_info> (vinfo))
2688           {
2689             /* Mark the statement as not vectorizable.  */
2690             STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2691             continue;
2692           }
2693         else
2694           {
2695             datarefs_copy.release ();
2696             return false;
2697           }
2698       }
2699
2700   datarefs_copy.release ();
2701   return true;
2702 }
2703
2704
2705 /* Operator == between two dr_with_seg_len objects.
2706
2707    This equality operator is used to make sure two data refs
2708    are the same one so that we will consider to combine the
2709    aliasing checks of those two pairs of data dependent data
2710    refs.  */
2711
2712 static bool
2713 operator == (const dr_with_seg_len& d1,
2714              const dr_with_seg_len& d2)
2715 {
2716   return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2717                           DR_BASE_ADDRESS (d2.dr), 0)
2718            && compare_tree (d1.offset, d2.offset) == 0
2719            && compare_tree (d1.seg_len, d2.seg_len) == 0;
2720 }
2721
2722 /* Function comp_dr_with_seg_len_pair.
2723
2724    Comparison function for sorting objects of dr_with_seg_len_pair_t
2725    so that we can combine aliasing checks in one scan.  */
2726
2727 static int
2728 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2729 {
2730   const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2731   const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2732
2733   const dr_with_seg_len &p11 = p1->first,
2734                         &p12 = p1->second,
2735                         &p21 = p2->first,
2736                         &p22 = p2->second;
2737
2738   /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2739      if a and c have the same basic address snd step, and b and d have the same
2740      address and step.  Therefore, if any a&c or b&d don't have the same address
2741      and step, we don't care the order of those two pairs after sorting.  */
2742   int comp_res;
2743
2744   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2745                                 DR_BASE_ADDRESS (p21.dr))) != 0)
2746     return comp_res;
2747   if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2748                                 DR_BASE_ADDRESS (p22.dr))) != 0)
2749     return comp_res;
2750   if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2751     return comp_res;
2752   if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2753     return comp_res;
2754   if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2755     return comp_res;
2756   if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2757     return comp_res;
2758
2759   return 0;
2760 }
2761
2762 /* Function vect_vfa_segment_size.
2763
2764    Create an expression that computes the size of segment
2765    that will be accessed for a data reference.  The functions takes into
2766    account that realignment loads may access one more vector.
2767
2768    Input:
2769      DR: The data reference.
2770      LENGTH_FACTOR: segment length to consider.
2771
2772    Return an expression whose value is the size of segment which will be
2773    accessed by DR.  */
2774
2775 static tree
2776 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2777 {
2778   tree segment_length;
2779
2780   if (integer_zerop (DR_STEP (dr)))
2781     segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2782   else
2783     segment_length = size_binop (MULT_EXPR,
2784                                  fold_convert (sizetype, DR_STEP (dr)),
2785                                  fold_convert (sizetype, length_factor));
2786
2787   if (vect_supportable_dr_alignment (dr, false)
2788         == dr_explicit_realign_optimized)
2789     {
2790       tree vector_size = TYPE_SIZE_UNIT
2791                           (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2792
2793       segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2794     }
2795   return segment_length;
2796 }
2797
2798 /* Function vect_prune_runtime_alias_test_list.
2799
2800    Prune a list of ddrs to be tested at run-time by versioning for alias.
2801    Merge several alias checks into one if possible.
2802    Return FALSE if resulting list of ddrs is longer then allowed by
2803    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
2804
2805 bool
2806 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2807 {
2808   vec<ddr_p> may_alias_ddrs =
2809     LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2810   vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2811     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2812   int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2813   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2814
2815   ddr_p ddr;
2816   unsigned int i;
2817   tree length_factor;
2818
2819   if (dump_enabled_p ())
2820     dump_printf_loc (MSG_NOTE, vect_location,
2821                      "=== vect_prune_runtime_alias_test_list ===\n");
2822
2823   if (may_alias_ddrs.is_empty ())
2824     return true;
2825
2826   /* Basically, for each pair of dependent data refs store_ptr_0
2827      and load_ptr_0, we create an expression:
2828
2829      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2830      || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2831
2832      for aliasing checks.  However, in some cases we can decrease
2833      the number of checks by combining two checks into one.  For
2834      example, suppose we have another pair of data refs store_ptr_0
2835      and load_ptr_1, and if the following condition is satisfied:
2836
2837      load_ptr_0 < load_ptr_1  &&
2838      load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2839
2840      (this condition means, in each iteration of vectorized loop,
2841      the accessed memory of store_ptr_0 cannot be between the memory
2842      of load_ptr_0 and load_ptr_1.)
2843
2844      we then can use only the following expression to finish the
2845      alising checks between store_ptr_0 & load_ptr_0 and
2846      store_ptr_0 & load_ptr_1:
2847
2848      ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2849      || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2850
2851      Note that we only consider that load_ptr_0 and load_ptr_1 have the
2852      same basic address.  */
2853
2854   comp_alias_ddrs.create (may_alias_ddrs.length ());
2855
2856   /* First, we collect all data ref pairs for aliasing checks.  */
2857   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2858     {
2859       struct data_reference *dr_a, *dr_b;
2860       gimple *dr_group_first_a, *dr_group_first_b;
2861       tree segment_length_a, segment_length_b;
2862       gimple *stmt_a, *stmt_b;
2863
2864       dr_a = DDR_A (ddr);
2865       stmt_a = DR_STMT (DDR_A (ddr));
2866       dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2867       if (dr_group_first_a)
2868         {
2869           stmt_a = dr_group_first_a;
2870           dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2871         }
2872
2873       dr_b = DDR_B (ddr);
2874       stmt_b = DR_STMT (DDR_B (ddr));
2875       dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2876       if (dr_group_first_b)
2877         {
2878           stmt_b = dr_group_first_b;
2879           dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2880         }
2881
2882       if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2883         length_factor = scalar_loop_iters;
2884       else
2885         length_factor = size_int (vect_factor);
2886       segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2887       segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2888
2889       dr_with_seg_len_pair_t dr_with_seg_len_pair
2890           (dr_with_seg_len (dr_a, segment_length_a),
2891            dr_with_seg_len (dr_b, segment_length_b));
2892
2893       if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2894         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2895
2896       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2897     }
2898
2899   /* Second, we sort the collected data ref pairs so that we can scan
2900      them once to combine all possible aliasing checks.  */
2901   comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2902
2903   /* Third, we scan the sorted dr pairs and check if we can combine
2904      alias checks of two neighbouring dr pairs.  */
2905   for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2906     {
2907       /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2).  */
2908       dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2909                       *dr_b1 = &comp_alias_ddrs[i-1].second,
2910                       *dr_a2 = &comp_alias_ddrs[i].first,
2911                       *dr_b2 = &comp_alias_ddrs[i].second;
2912
2913       /* Remove duplicate data ref pairs.  */
2914       if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2915         {
2916           if (dump_enabled_p ())
2917             {
2918               dump_printf_loc (MSG_NOTE, vect_location,
2919                                "found equal ranges ");
2920               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2921                                  DR_REF (dr_a1->dr));
2922               dump_printf (MSG_NOTE,  ", ");
2923               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2924                                  DR_REF (dr_b1->dr));
2925               dump_printf (MSG_NOTE,  " and ");
2926               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2927                                  DR_REF (dr_a2->dr));
2928               dump_printf (MSG_NOTE,  ", ");
2929               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2930                                  DR_REF (dr_b2->dr));
2931               dump_printf (MSG_NOTE, "\n");
2932             }
2933
2934           comp_alias_ddrs.ordered_remove (i--);
2935           continue;
2936         }
2937
2938       if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2939         {
2940           /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2941              and DR_A1 and DR_A2 are two consecutive memrefs.  */
2942           if (*dr_a1 == *dr_a2)
2943             {
2944               std::swap (dr_a1, dr_b1);
2945               std::swap (dr_a2, dr_b2);
2946             }
2947
2948           if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2949                                 DR_BASE_ADDRESS (dr_a2->dr),
2950                                 0)
2951               || !tree_fits_shwi_p (dr_a1->offset)
2952               || !tree_fits_shwi_p (dr_a2->offset))
2953             continue;
2954
2955           HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2956                                 - tree_to_shwi (dr_a1->offset));
2957
2958
2959           /* Now we check if the following condition is satisfied:
2960
2961              DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2962
2963              where DIFF = DR_A2->OFFSET - DR_A1->OFFSET.  However,
2964              SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2965              have to make a best estimation.  We can get the minimum value
2966              of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2967              then either of the following two conditions can guarantee the
2968              one above:
2969
2970              1: DIFF <= MIN_SEG_LEN_B
2971              2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2972
2973              */
2974
2975           HOST_WIDE_INT  min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2976                                           ? tree_to_shwi (dr_b1->seg_len)
2977                                           : vect_factor);
2978
2979           if (diff <= min_seg_len_b
2980               || (tree_fits_shwi_p (dr_a1->seg_len)
2981                   && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2982             {
2983               if (dump_enabled_p ())
2984                 {
2985                   dump_printf_loc (MSG_NOTE, vect_location,
2986                                    "merging ranges for ");
2987                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2988                                      DR_REF (dr_a1->dr));
2989                   dump_printf (MSG_NOTE,  ", ");
2990                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2991                                      DR_REF (dr_b1->dr));
2992                   dump_printf (MSG_NOTE,  " and ");
2993                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2994                                      DR_REF (dr_a2->dr));
2995                   dump_printf (MSG_NOTE,  ", ");
2996                   dump_generic_expr (MSG_NOTE, TDF_SLIM,
2997                                      DR_REF (dr_b2->dr));
2998                   dump_printf (MSG_NOTE, "\n");
2999                 }
3000
3001               dr_a1->seg_len = size_binop (PLUS_EXPR,
3002                                            dr_a2->seg_len, size_int (diff));
3003               comp_alias_ddrs.ordered_remove (i--);
3004             }
3005         }
3006     }
3007
3008   dump_printf_loc (MSG_NOTE, vect_location,
3009                    "improved number of alias checks from %d to %d\n",
3010                    may_alias_ddrs.length (), comp_alias_ddrs.length ());
3011   if ((int) comp_alias_ddrs.length () >
3012       PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3013     return false;
3014
3015   return true;
3016 }
3017
3018 /* Check whether a non-affine read or write in stmt is suitable for gather load
3019    or scatter store and if so, return a builtin decl for that operation.  */
3020
3021 tree
3022 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
3023                            tree *offp, int *scalep)
3024 {
3025   HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3026   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3027   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3028   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3029   tree offtype = NULL_TREE;
3030   tree decl, base, off;
3031   machine_mode pmode;
3032   int punsignedp, pvolatilep;
3033
3034   base = DR_REF (dr);
3035   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3036      see if we can use the def stmt of the address.  */
3037   if (is_gimple_call (stmt)
3038       && gimple_call_internal_p (stmt)
3039       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3040           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3041       && TREE_CODE (base) == MEM_REF
3042       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3043       && integer_zerop (TREE_OPERAND (base, 1))
3044       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3045     {
3046       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3047       if (is_gimple_assign (def_stmt)
3048           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3049         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3050     }
3051
3052   /* The gather and scatter builtins need address of the form
3053      loop_invariant + vector * {1, 2, 4, 8}
3054      or
3055      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3056      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3057      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3058      multiplications and additions in it.  To get a vector, we need
3059      a single SSA_NAME that will be defined in the loop and will
3060      contain everything that is not loop invariant and that can be
3061      vectorized.  The following code attempts to find such a preexistng
3062      SSA_NAME OFF and put the loop invariants into a tree BASE
3063      that can be gimplified before the loop.  */
3064   base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3065                               &pmode, &punsignedp, &pvolatilep, false);
3066   gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3067
3068   if (TREE_CODE (base) == MEM_REF)
3069     {
3070       if (!integer_zerop (TREE_OPERAND (base, 1)))
3071         {
3072           if (off == NULL_TREE)
3073             {
3074               offset_int moff = mem_ref_offset (base);
3075               off = wide_int_to_tree (sizetype, moff);
3076             }
3077           else
3078             off = size_binop (PLUS_EXPR, off,
3079                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3080         }
3081       base = TREE_OPERAND (base, 0);
3082     }
3083   else
3084     base = build_fold_addr_expr (base);
3085
3086   if (off == NULL_TREE)
3087     off = size_zero_node;
3088
3089   /* If base is not loop invariant, either off is 0, then we start with just
3090      the constant offset in the loop invariant BASE and continue with base
3091      as OFF, otherwise give up.
3092      We could handle that case by gimplifying the addition of base + off
3093      into some SSA_NAME and use that as off, but for now punt.  */
3094   if (!expr_invariant_in_loop_p (loop, base))
3095     {
3096       if (!integer_zerop (off))
3097         return NULL_TREE;
3098       off = base;
3099       base = size_int (pbitpos / BITS_PER_UNIT);
3100     }
3101   /* Otherwise put base + constant offset into the loop invariant BASE
3102      and continue with OFF.  */
3103   else
3104     {
3105       base = fold_convert (sizetype, base);
3106       base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3107     }
3108
3109   /* OFF at this point may be either a SSA_NAME or some tree expression
3110      from get_inner_reference.  Try to peel off loop invariants from it
3111      into BASE as long as possible.  */
3112   STRIP_NOPS (off);
3113   while (offtype == NULL_TREE)
3114     {
3115       enum tree_code code;
3116       tree op0, op1, add = NULL_TREE;
3117
3118       if (TREE_CODE (off) == SSA_NAME)
3119         {
3120           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3121
3122           if (expr_invariant_in_loop_p (loop, off))
3123             return NULL_TREE;
3124
3125           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3126             break;
3127
3128           op0 = gimple_assign_rhs1 (def_stmt);
3129           code = gimple_assign_rhs_code (def_stmt);
3130           op1 = gimple_assign_rhs2 (def_stmt);
3131         }
3132       else
3133         {
3134           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3135             return NULL_TREE;
3136           code = TREE_CODE (off);
3137           extract_ops_from_tree (off, &code, &op0, &op1);
3138         }
3139       switch (code)
3140         {
3141         case POINTER_PLUS_EXPR:
3142         case PLUS_EXPR:
3143           if (expr_invariant_in_loop_p (loop, op0))
3144             {
3145               add = op0;
3146               off = op1;
3147             do_add:
3148               add = fold_convert (sizetype, add);
3149               if (scale != 1)
3150                 add = size_binop (MULT_EXPR, add, size_int (scale));
3151               base = size_binop (PLUS_EXPR, base, add);
3152               continue;
3153             }
3154           if (expr_invariant_in_loop_p (loop, op1))
3155             {
3156               add = op1;
3157               off = op0;
3158               goto do_add;
3159             }
3160           break;
3161         case MINUS_EXPR:
3162           if (expr_invariant_in_loop_p (loop, op1))
3163             {
3164               add = fold_convert (sizetype, op1);
3165               add = size_binop (MINUS_EXPR, size_zero_node, add);
3166               off = op0;
3167               goto do_add;
3168             }
3169           break;
3170         case MULT_EXPR:
3171           if (scale == 1 && tree_fits_shwi_p (op1))
3172             {
3173               scale = tree_to_shwi (op1);
3174               off = op0;
3175               continue;
3176             }
3177           break;
3178         case SSA_NAME:
3179           off = op0;
3180           continue;
3181         CASE_CONVERT:
3182           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3183               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3184             break;
3185           if (TYPE_PRECISION (TREE_TYPE (op0))
3186               == TYPE_PRECISION (TREE_TYPE (off)))
3187             {
3188               off = op0;
3189               continue;
3190             }
3191           if (TYPE_PRECISION (TREE_TYPE (op0))
3192               < TYPE_PRECISION (TREE_TYPE (off)))
3193             {
3194               off = op0;
3195               offtype = TREE_TYPE (off);
3196               STRIP_NOPS (off);
3197               continue;
3198             }
3199           break;
3200         default:
3201           break;
3202         }
3203       break;
3204     }
3205
3206   /* If at the end OFF still isn't a SSA_NAME or isn't
3207      defined in the loop, punt.  */
3208   if (TREE_CODE (off) != SSA_NAME
3209       || expr_invariant_in_loop_p (loop, off))
3210     return NULL_TREE;
3211
3212   if (offtype == NULL_TREE)
3213     offtype = TREE_TYPE (off);
3214
3215   if (DR_IS_READ (dr))
3216     decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3217                                              offtype, scale);
3218   else
3219     decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3220                                               offtype, scale);
3221
3222   if (decl == NULL_TREE)
3223     return NULL_TREE;
3224
3225   if (basep)
3226     *basep = base;
3227   if (offp)
3228     *offp = off;
3229   if (scalep)
3230     *scalep = scale;
3231   return decl;
3232 }
3233
3234 /* Function vect_analyze_data_refs.
3235
3236   Find all the data references in the loop or basic block.
3237
3238    The general structure of the analysis of data refs in the vectorizer is as
3239    follows:
3240    1- vect_analyze_data_refs(loop/bb): call
3241       compute_data_dependences_for_loop/bb to find and analyze all data-refs
3242       in the loop/bb and their dependences.
3243    2- vect_analyze_dependences(): apply dependence testing using ddrs.
3244    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3245    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3246
3247 */
3248
3249 bool
3250 vect_analyze_data_refs (vec_info *vinfo, int *min_vf, unsigned *n_stmts)
3251 {
3252   struct loop *loop = NULL;
3253   basic_block bb = NULL;
3254   unsigned int i;
3255   vec<data_reference_p> datarefs;
3256   struct data_reference *dr;
3257   tree scalar_type;
3258
3259   if (dump_enabled_p ())
3260     dump_printf_loc (MSG_NOTE, vect_location,
3261                      "=== vect_analyze_data_refs ===\n");
3262
3263   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3264     {
3265       basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3266
3267       loop = LOOP_VINFO_LOOP (loop_vinfo);
3268       datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3269       if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3270         {
3271           if (dump_enabled_p ())
3272             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3273                              "not vectorized: loop contains function calls"
3274                              " or data references that cannot be analyzed\n");
3275           return false;
3276         }
3277
3278       for (i = 0; i < loop->num_nodes; i++)
3279         {
3280           gimple_stmt_iterator gsi;
3281
3282           for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3283             {
3284               gimple *stmt = gsi_stmt (gsi);
3285               if (is_gimple_debug (stmt))
3286                 continue;
3287               ++*n_stmts;
3288               if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3289                 {
3290                   if (is_gimple_call (stmt) && loop->safelen)
3291                     {
3292                       tree fndecl = gimple_call_fndecl (stmt), op;
3293                       if (fndecl != NULL_TREE)
3294                         {
3295                           struct cgraph_node *node = cgraph_node::get (fndecl);
3296                           if (node != NULL && node->simd_clones != NULL)
3297                             {
3298                               unsigned int j, n = gimple_call_num_args (stmt);
3299                               for (j = 0; j < n; j++)
3300                                 {
3301                                   op = gimple_call_arg (stmt, j);
3302                                   if (DECL_P (op)
3303                                       || (REFERENCE_CLASS_P (op)
3304                                           && get_base_address (op)))
3305                                     break;
3306                                 }
3307                               op = gimple_call_lhs (stmt);
3308                               /* Ignore #pragma omp declare simd functions
3309                                  if they don't have data references in the
3310                                  call stmt itself.  */
3311                               if (j == n
3312                                   && !(op
3313                                        && (DECL_P (op)
3314                                            || (REFERENCE_CLASS_P (op)
3315                                                && get_base_address (op)))))
3316                                 continue;
3317                             }
3318                         }
3319                     }
3320                   LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3321                   if (dump_enabled_p ())
3322                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3323                                      "not vectorized: loop contains function "
3324                                      "calls or data references that cannot "
3325                                      "be analyzed\n");
3326                   return false;
3327                 }
3328             }
3329         }
3330
3331       LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3332     }
3333   else
3334     {
3335       bb_vec_info bb_vinfo = as_a <bb_vec_info> (vinfo);
3336       gimple_stmt_iterator gsi;
3337
3338       bb = BB_VINFO_BB (bb_vinfo);
3339       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3340         {
3341           gimple *stmt = gsi_stmt (gsi);
3342           if (is_gimple_debug (stmt))
3343             continue;
3344           ++*n_stmts;
3345           if (!find_data_references_in_stmt (NULL, stmt,
3346                                              &BB_VINFO_DATAREFS (bb_vinfo)))
3347             {
3348               /* Mark the rest of the basic-block as unvectorizable.  */
3349               for (; !gsi_end_p (gsi); gsi_next (&gsi))
3350                 {
3351                   stmt = gsi_stmt (gsi);
3352                   STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3353                 }
3354               break;
3355             }
3356         }
3357
3358       datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3359     }
3360
3361   /* Go through the data-refs, check that the analysis succeeded.  Update
3362      pointer from stmt_vec_info struct to DR and vectype.  */
3363
3364   FOR_EACH_VEC_ELT (datarefs, i, dr)
3365     {
3366       gimple *stmt;
3367       stmt_vec_info stmt_info;
3368       tree base, offset, init;
3369       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3370       bool simd_lane_access = false;
3371       int vf;
3372
3373 again:
3374       if (!dr || !DR_REF (dr))
3375         {
3376           if (dump_enabled_p ())
3377             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3378                              "not vectorized: unhandled data-ref\n");
3379           return false;
3380         }
3381
3382       stmt = DR_STMT (dr);
3383       stmt_info = vinfo_for_stmt (stmt);
3384
3385       /* Discard clobbers from the dataref vector.  We will remove
3386          clobber stmts during vectorization.  */
3387       if (gimple_clobber_p (stmt))
3388         {
3389           free_data_ref (dr);
3390           if (i == datarefs.length () - 1)
3391             {
3392               datarefs.pop ();
3393               break;
3394             }
3395           datarefs.ordered_remove (i);
3396           dr = datarefs[i];
3397           goto again;
3398         }
3399
3400       /* Check that analysis of the data-ref succeeded.  */
3401       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3402           || !DR_STEP (dr))
3403         {
3404           bool maybe_gather
3405             = DR_IS_READ (dr)
3406               && !TREE_THIS_VOLATILE (DR_REF (dr))
3407               && targetm.vectorize.builtin_gather != NULL;
3408           bool maybe_scatter
3409             = DR_IS_WRITE (dr)
3410               && !TREE_THIS_VOLATILE (DR_REF (dr))
3411               && targetm.vectorize.builtin_scatter != NULL;
3412           bool maybe_simd_lane_access
3413             = is_a <loop_vec_info> (vinfo) && loop->simduid;
3414
3415           /* If target supports vector gather loads or scatter stores, or if
3416              this might be a SIMD lane access, see if they can't be used.  */
3417           if (is_a <loop_vec_info> (vinfo)
3418               && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3419               && !nested_in_vect_loop_p (loop, stmt))
3420             {
3421               struct data_reference *newdr
3422                 = create_data_ref (NULL, loop_containing_stmt (stmt),
3423                                    DR_REF (dr), stmt, maybe_scatter ? false : true);
3424               gcc_assert (newdr != NULL && DR_REF (newdr));
3425               if (DR_BASE_ADDRESS (newdr)
3426                   && DR_OFFSET (newdr)
3427                   && DR_INIT (newdr)
3428                   && DR_STEP (newdr)
3429                   && integer_zerop (DR_STEP (newdr)))
3430                 {
3431                   if (maybe_simd_lane_access)
3432                     {
3433                       tree off = DR_OFFSET (newdr);
3434                       STRIP_NOPS (off);
3435                       if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3436                           && TREE_CODE (off) == MULT_EXPR
3437                           && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3438                         {
3439                           tree step = TREE_OPERAND (off, 1);
3440                           off = TREE_OPERAND (off, 0);
3441                           STRIP_NOPS (off);
3442                           if (CONVERT_EXPR_P (off)
3443                               && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3444                                                                           0)))
3445                                  < TYPE_PRECISION (TREE_TYPE (off)))
3446                             off = TREE_OPERAND (off, 0);
3447                           if (TREE_CODE (off) == SSA_NAME)
3448                             {
3449                               gimple *def = SSA_NAME_DEF_STMT (off);
3450                               tree reft = TREE_TYPE (DR_REF (newdr));
3451                               if (is_gimple_call (def)
3452                                   && gimple_call_internal_p (def)
3453                                   && (gimple_call_internal_fn (def)
3454                                       == IFN_GOMP_SIMD_LANE))
3455                                 {
3456                                   tree arg = gimple_call_arg (def, 0);
3457                                   gcc_assert (TREE_CODE (arg) == SSA_NAME);
3458                                   arg = SSA_NAME_VAR (arg);
3459                                   if (arg == loop->simduid
3460                                       /* For now.  */
3461                                       && tree_int_cst_equal
3462                                            (TYPE_SIZE_UNIT (reft),
3463                                             step))
3464                                     {
3465                                       DR_OFFSET (newdr) = ssize_int (0);
3466                                       DR_STEP (newdr) = step;
3467                                       DR_ALIGNED_TO (newdr)
3468                                         = size_int (BIGGEST_ALIGNMENT);
3469                                       dr = newdr;
3470                                       simd_lane_access = true;
3471                                     }
3472                                 }
3473                             }
3474                         }
3475                     }
3476                   if (!simd_lane_access && (maybe_gather || maybe_scatter))
3477                     {
3478                       dr = newdr;
3479                       if (maybe_gather)
3480                         gatherscatter = GATHER;
3481                       else
3482                         gatherscatter = SCATTER;
3483                     }
3484                 }
3485               if (gatherscatter == SG_NONE && !simd_lane_access)
3486                 free_data_ref (newdr);
3487             }
3488
3489           if (gatherscatter == SG_NONE && !simd_lane_access)
3490             {
3491               if (dump_enabled_p ())
3492                 {
3493                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3494                                    "not vectorized: data ref analysis "
3495                                    "failed ");
3496                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3497                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3498                 }
3499
3500               if (is_a <bb_vec_info> (vinfo))
3501                 break;
3502
3503               return false;
3504             }
3505         }
3506
3507       if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3508         {
3509           if (dump_enabled_p ())
3510             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3511                              "not vectorized: base addr of dr is a "
3512                              "constant\n");
3513
3514           if (is_a <bb_vec_info> (vinfo))
3515             break;
3516
3517           if (gatherscatter != SG_NONE || simd_lane_access)
3518             free_data_ref (dr);
3519           return false;
3520         }
3521
3522       if (TREE_THIS_VOLATILE (DR_REF (dr)))
3523         {
3524           if (dump_enabled_p ())
3525             {
3526               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3527                                "not vectorized: volatile type ");
3528               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3529               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3530             }
3531
3532           if (is_a <bb_vec_info> (vinfo))
3533             break;
3534
3535           return false;
3536         }
3537
3538       if (stmt_can_throw_internal (stmt))
3539         {
3540           if (dump_enabled_p ())
3541             {
3542               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3543                                "not vectorized: statement can throw an "
3544                                "exception ");
3545               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3546               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3547             }
3548
3549           if (is_a <bb_vec_info> (vinfo))
3550             break;
3551
3552           if (gatherscatter != SG_NONE || simd_lane_access)
3553             free_data_ref (dr);
3554           return false;
3555         }
3556
3557       if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3558           && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3559         {
3560           if (dump_enabled_p ())
3561             {
3562               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3563                                "not vectorized: statement is bitfield "
3564                                "access ");
3565               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3566               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3567             }
3568
3569           if (is_a <bb_vec_info> (vinfo))
3570             break;
3571
3572           if (gatherscatter != SG_NONE || simd_lane_access)
3573             free_data_ref (dr);
3574           return false;
3575         }
3576
3577       base = unshare_expr (DR_BASE_ADDRESS (dr));
3578       offset = unshare_expr (DR_OFFSET (dr));
3579       init = unshare_expr (DR_INIT (dr));
3580
3581       if (is_gimple_call (stmt)
3582           && (!gimple_call_internal_p (stmt)
3583               || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3584                   && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3585         {
3586           if (dump_enabled_p ())
3587             {
3588               dump_printf_loc (MSG_MISSED_OPTIMIZATION,  vect_location,
3589                                "not vectorized: dr in a call ");
3590               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3591               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3592             }
3593
3594           if (is_a <bb_vec_info> (vinfo))
3595             break;
3596
3597           if (gatherscatter != SG_NONE || simd_lane_access)
3598             free_data_ref (dr);
3599           return false;
3600         }
3601
3602       /* Update DR field in stmt_vec_info struct.  */
3603
3604       /* If the dataref is in an inner-loop of the loop that is considered for
3605          for vectorization, we also want to analyze the access relative to
3606          the outer-loop (DR contains information only relative to the
3607          inner-most enclosing loop).  We do that by building a reference to the
3608          first location accessed by the inner-loop, and analyze it relative to
3609          the outer-loop.  */
3610       if (loop && nested_in_vect_loop_p (loop, stmt))
3611         {
3612           tree outer_step, outer_base, outer_init;
3613           HOST_WIDE_INT pbitsize, pbitpos;
3614           tree poffset;
3615           machine_mode pmode;
3616           int punsignedp, pvolatilep;
3617           affine_iv base_iv, offset_iv;
3618           tree dinit;
3619
3620           /* Build a reference to the first location accessed by the
3621              inner-loop: *(BASE+INIT).  (The first location is actually
3622              BASE+INIT+OFFSET, but we add OFFSET separately later).  */
3623           tree inner_base = build_fold_indirect_ref
3624                                 (fold_build_pointer_plus (base, init));
3625
3626           if (dump_enabled_p ())
3627             {
3628               dump_printf_loc (MSG_NOTE, vect_location,
3629                                "analyze in outer-loop: ");
3630               dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3631               dump_printf (MSG_NOTE, "\n");
3632             }
3633
3634           outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3635                           &poffset, &pmode, &punsignedp, &pvolatilep, false);
3636           gcc_assert (outer_base != NULL_TREE);
3637
3638           if (pbitpos % BITS_PER_UNIT != 0)
3639             {
3640               if (dump_enabled_p ())
3641                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3642                                  "failed: bit offset alignment.\n");
3643               return false;
3644             }
3645
3646           outer_base = build_fold_addr_expr (outer_base);
3647           if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3648                           &base_iv, false))
3649             {
3650               if (dump_enabled_p ())
3651                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3652                                  "failed: evolution of base is not affine.\n");
3653               return false;
3654             }
3655
3656           if (offset)
3657             {
3658               if (poffset)
3659                 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3660                                        poffset);
3661               else
3662                 poffset = offset;
3663             }
3664
3665           if (!poffset)
3666             {
3667               offset_iv.base = ssize_int (0);
3668               offset_iv.step = ssize_int (0);
3669             }
3670           else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3671                                &offset_iv, false))
3672             {
3673               if (dump_enabled_p ())
3674                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3675                                  "evolution of offset is not affine.\n");
3676               return false;
3677             }
3678
3679           outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3680           split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3681           outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3682           split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3683           outer_init =  size_binop (PLUS_EXPR, outer_init, dinit);
3684
3685           outer_step = size_binop (PLUS_EXPR,
3686                                 fold_convert (ssizetype, base_iv.step),
3687                                 fold_convert (ssizetype, offset_iv.step));
3688
3689           STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3690           /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3691           STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3692           STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3693           STMT_VINFO_DR_OFFSET (stmt_info) =
3694                                 fold_convert (ssizetype, offset_iv.base);
3695           STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3696                                 size_int (highest_pow2_factor (offset_iv.base));
3697
3698           if (dump_enabled_p ())
3699             {
3700               dump_printf_loc (MSG_NOTE, vect_location,
3701                                "\touter base_address: ");
3702               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3703                                  STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3704               dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3705               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3706                                  STMT_VINFO_DR_OFFSET (stmt_info));
3707               dump_printf (MSG_NOTE,
3708                            "\n\touter constant offset from base address: ");
3709               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3710                                  STMT_VINFO_DR_INIT (stmt_info));
3711               dump_printf (MSG_NOTE, "\n\touter step: ");
3712               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3713                                  STMT_VINFO_DR_STEP (stmt_info));
3714               dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3715               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3716                                  STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3717               dump_printf (MSG_NOTE, "\n");
3718             }
3719         }
3720
3721       if (STMT_VINFO_DATA_REF (stmt_info))
3722         {
3723           if (dump_enabled_p ())
3724             {
3725               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3726                                "not vectorized: more than one data ref "
3727                                "in stmt: ");
3728               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3729               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3730             }
3731
3732           if (is_a <bb_vec_info> (vinfo))
3733             break;
3734
3735           if (gatherscatter != SG_NONE || simd_lane_access)
3736             free_data_ref (dr);
3737           return false;
3738         }
3739
3740       STMT_VINFO_DATA_REF (stmt_info) = dr;
3741       if (simd_lane_access)
3742         {
3743           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3744           free_data_ref (datarefs[i]);
3745           datarefs[i] = dr;
3746         }
3747
3748       /* Set vectype for STMT.  */
3749       scalar_type = TREE_TYPE (DR_REF (dr));
3750       STMT_VINFO_VECTYPE (stmt_info)
3751         = get_vectype_for_scalar_type (scalar_type);
3752       if (!STMT_VINFO_VECTYPE (stmt_info))
3753         {
3754           if (dump_enabled_p ())
3755             {
3756               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3757                                "not vectorized: no vectype for stmt: ");
3758               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3759               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3760               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3761                                  scalar_type);
3762               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3763             }
3764
3765           if (is_a <bb_vec_info> (vinfo))
3766             break;
3767
3768           if (gatherscatter != SG_NONE || simd_lane_access)
3769             {
3770               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3771               if (gatherscatter != SG_NONE)
3772                 free_data_ref (dr);
3773             }
3774           return false;
3775         }
3776       else
3777         {
3778           if (dump_enabled_p ())
3779             {
3780               dump_printf_loc (MSG_NOTE, vect_location,
3781                                "got vectype for stmt: ");
3782               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3783               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3784                                  STMT_VINFO_VECTYPE (stmt_info));
3785               dump_printf (MSG_NOTE, "\n");
3786             }
3787         }
3788
3789       /* Adjust the minimal vectorization factor according to the
3790          vector type.  */
3791       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3792       if (vf > *min_vf)
3793         *min_vf = vf;
3794
3795       if (gatherscatter != SG_NONE)
3796         {
3797           tree off;
3798           if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3799                                           NULL, &off, NULL)
3800               || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3801             {
3802               STMT_VINFO_DATA_REF (stmt_info) = NULL;
3803               free_data_ref (dr);
3804               if (dump_enabled_p ())
3805                 {
3806                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3807                                    (gatherscatter == GATHER) ?
3808                                    "not vectorized: not suitable for gather "
3809                                    "load " :
3810                                    "not vectorized: not suitable for scatter "
3811                                    "store ");
3812                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3813                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3814                 }
3815               return false;
3816             }
3817
3818           datarefs[i] = dr;
3819           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3820         }
3821
3822       else if (is_a <loop_vec_info> (vinfo)
3823                && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3824         {
3825           if (nested_in_vect_loop_p (loop, stmt))
3826             {
3827               if (dump_enabled_p ())
3828                 {
3829                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3830                                    "not vectorized: not suitable for strided "
3831                                    "load ");
3832                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3833                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3834                 }
3835               return false;
3836             }
3837           STMT_VINFO_STRIDED_P (stmt_info) = true;
3838         }
3839     }
3840
3841   /* If we stopped analysis at the first dataref we could not analyze
3842      when trying to vectorize a basic-block mark the rest of the datarefs
3843      as not vectorizable and truncate the vector of datarefs.  That
3844      avoids spending useless time in analyzing their dependence.  */
3845   if (i != datarefs.length ())
3846     {
3847       gcc_assert (is_a <bb_vec_info> (vinfo));
3848       for (unsigned j = i; j < datarefs.length (); ++j)
3849         {
3850           data_reference_p dr = datarefs[j];
3851           STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3852           free_data_ref (dr);
3853         }
3854       datarefs.truncate (i);
3855     }
3856
3857   return true;
3858 }
3859
3860
3861 /* Function vect_get_new_vect_var.
3862
3863    Returns a name for a new variable.  The current naming scheme appends the
3864    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3865    the name of vectorizer generated variables, and appends that to NAME if
3866    provided.  */
3867
3868 tree
3869 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3870 {
3871   const char *prefix;
3872   tree new_vect_var;
3873
3874   switch (var_kind)
3875   {
3876   case vect_simple_var:
3877     prefix = "vect";
3878     break;
3879   case vect_scalar_var:
3880     prefix = "stmp";
3881     break;
3882   case vect_pointer_var:
3883     prefix = "vectp";
3884     break;
3885   default:
3886     gcc_unreachable ();
3887   }
3888
3889   if (name)
3890     {
3891       char* tmp = concat (prefix, "_", name, NULL);
3892       new_vect_var = create_tmp_reg (type, tmp);
3893       free (tmp);
3894     }
3895   else
3896     new_vect_var = create_tmp_reg (type, prefix);
3897
3898   return new_vect_var;
3899 }
3900
3901 /* Like vect_get_new_vect_var but return an SSA name.  */
3902
3903 tree
3904 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3905 {
3906   const char *prefix;
3907   tree new_vect_var;
3908
3909   switch (var_kind)
3910   {
3911   case vect_simple_var:
3912     prefix = "vect";
3913     break;
3914   case vect_scalar_var:
3915     prefix = "stmp";
3916     break;
3917   case vect_pointer_var:
3918     prefix = "vectp";
3919     break;
3920   default:
3921     gcc_unreachable ();
3922   }
3923
3924   if (name)
3925     {
3926       char* tmp = concat (prefix, "_", name, NULL);
3927       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3928       free (tmp);
3929     }
3930   else
3931     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3932
3933   return new_vect_var;
3934 }
3935
3936 /* Duplicate ptr info and set alignment/misaligment on NAME from DR.  */
3937
3938 static void
3939 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3940                                   stmt_vec_info stmt_info)
3941 {
3942   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3943   unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3944   int misalign = DR_MISALIGNMENT (dr);
3945   if (misalign == -1)
3946     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3947   else
3948     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3949 }
3950
3951 /* Function vect_create_addr_base_for_vector_ref.
3952
3953    Create an expression that computes the address of the first memory location
3954    that will be accessed for a data reference.
3955
3956    Input:
3957    STMT: The statement containing the data reference.
3958    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3959    OFFSET: Optional. If supplied, it is be added to the initial address.
3960    LOOP:    Specify relative to which loop-nest should the address be computed.
3961             For example, when the dataref is in an inner-loop nested in an
3962             outer-loop that is now being vectorized, LOOP can be either the
3963             outer-loop, or the inner-loop.  The first memory location accessed
3964             by the following dataref ('in' points to short):
3965
3966                 for (i=0; i<N; i++)
3967                    for (j=0; j<M; j++)
3968                      s += in[i+j]
3969
3970             is as follows:
3971             if LOOP=i_loop:     &in             (relative to i_loop)
3972             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
3973    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
3974             initial address.  Unlike OFFSET, which is number of elements to
3975             be added, BYTE_OFFSET is measured in bytes.
3976
3977    Output:
3978    1. Return an SSA_NAME whose value is the address of the memory location of
3979       the first vector of the data reference.
3980    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3981       these statement(s) which define the returned SSA_NAME.
3982
3983    FORNOW: We are only handling array accesses with step 1.  */
3984
3985 tree
3986 vect_create_addr_base_for_vector_ref (gimple *stmt,
3987                                       gimple_seq *new_stmt_list,
3988                                       tree offset,
3989                                       struct loop *loop,
3990                                       tree byte_offset)
3991 {
3992   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3993   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3994   tree data_ref_base;
3995   const char *base_name;
3996   tree addr_base;
3997   tree dest;
3998   gimple_seq seq = NULL;
3999   tree base_offset;
4000   tree init;
4001   tree vect_ptr_type;
4002   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4003   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4004
4005   if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
4006     {
4007       struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
4008
4009       gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
4010
4011       data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
4012       base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
4013       init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
4014     }
4015   else
4016     {
4017       data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4018       base_offset = unshare_expr (DR_OFFSET (dr));
4019       init = unshare_expr (DR_INIT (dr));
4020     }
4021
4022   if (loop_vinfo)
4023     base_name = get_name (data_ref_base);
4024   else
4025     {
4026       base_offset = ssize_int (0);
4027       init = ssize_int (0);
4028       base_name = get_name (DR_REF (dr));
4029     }
4030
4031   /* Create base_offset */
4032   base_offset = size_binop (PLUS_EXPR,
4033                             fold_convert (sizetype, base_offset),
4034                             fold_convert (sizetype, init));
4035
4036   if (offset)
4037     {
4038       offset = fold_build2 (MULT_EXPR, sizetype,
4039                             fold_convert (sizetype, offset), step);
4040       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4041                                  base_offset, offset);
4042     }
4043   if (byte_offset)
4044     {
4045       byte_offset = fold_convert (sizetype, byte_offset);
4046       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4047                                  base_offset, byte_offset);
4048     }
4049
4050   /* base + base_offset */
4051   if (loop_vinfo)
4052     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4053   else
4054     {
4055       addr_base = build1 (ADDR_EXPR,
4056                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4057                           unshare_expr (DR_REF (dr)));
4058     }
4059
4060   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4061   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4062   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4063   gimple_seq_add_seq (new_stmt_list, seq);
4064
4065   if (DR_PTR_INFO (dr)
4066       && TREE_CODE (addr_base) == SSA_NAME
4067       && !SSA_NAME_PTR_INFO (addr_base))
4068     {
4069       vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4070       if (offset || byte_offset)
4071         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4072     }
4073
4074   if (dump_enabled_p ())
4075     {
4076       dump_printf_loc (MSG_NOTE, vect_location, "created ");
4077       dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4078       dump_printf (MSG_NOTE, "\n");
4079     }
4080
4081   return addr_base;
4082 }
4083
4084
4085 /* Function vect_create_data_ref_ptr.
4086
4087    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4088    location accessed in the loop by STMT, along with the def-use update
4089    chain to appropriately advance the pointer through the loop iterations.
4090    Also set aliasing information for the pointer.  This pointer is used by
4091    the callers to this function to create a memory reference expression for
4092    vector load/store access.
4093
4094    Input:
4095    1. STMT: a stmt that references memory. Expected to be of the form
4096          GIMPLE_ASSIGN <name, data-ref> or
4097          GIMPLE_ASSIGN <data-ref, name>.
4098    2. AGGR_TYPE: the type of the reference, which should be either a vector
4099         or an array.
4100    3. AT_LOOP: the loop where the vector memref is to be created.
4101    4. OFFSET (optional): an offset to be added to the initial address accessed
4102         by the data-ref in STMT.
4103    5. BSI: location where the new stmts are to be placed if there is no loop
4104    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4105         pointing to the initial address.
4106    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4107         to the initial address accessed by the data-ref in STMT.  This is
4108         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4109         in bytes.
4110
4111    Output:
4112    1. Declare a new ptr to vector_type, and have it point to the base of the
4113       data reference (initial addressed accessed by the data reference).
4114       For example, for vector of type V8HI, the following code is generated:
4115
4116       v8hi *ap;
4117       ap = (v8hi *)initial_address;
4118
4119       if OFFSET is not supplied:
4120          initial_address = &a[init];
4121       if OFFSET is supplied:
4122          initial_address = &a[init + OFFSET];
4123       if BYTE_OFFSET is supplied:
4124          initial_address = &a[init] + BYTE_OFFSET;
4125
4126       Return the initial_address in INITIAL_ADDRESS.
4127
4128    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4129       update the pointer in each iteration of the loop.
4130
4131       Return the increment stmt that updates the pointer in PTR_INCR.
4132
4133    3. Set INV_P to true if the access pattern of the data reference in the
4134       vectorized loop is invariant.  Set it to false otherwise.
4135
4136    4. Return the pointer.  */
4137
4138 tree
4139 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4140                           tree offset, tree *initial_address,
4141                           gimple_stmt_iterator *gsi, gimple **ptr_incr,
4142                           bool only_init, bool *inv_p, tree byte_offset)
4143 {
4144   const char *base_name;
4145   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4146   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4147   struct loop *loop = NULL;
4148   bool nested_in_vect_loop = false;
4149   struct loop *containing_loop = NULL;
4150   tree aggr_ptr_type;
4151   tree aggr_ptr;
4152   tree new_temp;
4153   gimple_seq new_stmt_list = NULL;
4154   edge pe = NULL;
4155   basic_block new_bb;
4156   tree aggr_ptr_init;
4157   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4158   tree aptr;
4159   gimple_stmt_iterator incr_gsi;
4160   bool insert_after;
4161   tree indx_before_incr, indx_after_incr;
4162   gimple *incr;
4163   tree step;
4164   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4165
4166   gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4167               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4168
4169   if (loop_vinfo)
4170     {
4171       loop = LOOP_VINFO_LOOP (loop_vinfo);
4172       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4173       containing_loop = (gimple_bb (stmt))->loop_father;
4174       pe = loop_preheader_edge (loop);
4175     }
4176   else
4177     {
4178       gcc_assert (bb_vinfo);
4179       only_init = true;
4180       *ptr_incr = NULL;
4181     }
4182
4183   /* Check the step (evolution) of the load in LOOP, and record
4184      whether it's invariant.  */
4185   if (nested_in_vect_loop)
4186     step = STMT_VINFO_DR_STEP (stmt_info);
4187   else
4188     step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4189
4190   if (integer_zerop (step))
4191     *inv_p = true;
4192   else
4193     *inv_p = false;
4194
4195   /* Create an expression for the first address accessed by this load
4196      in LOOP.  */
4197   base_name = get_name (DR_BASE_ADDRESS (dr));
4198
4199   if (dump_enabled_p ())
4200     {
4201       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4202       dump_printf_loc (MSG_NOTE, vect_location,
4203                        "create %s-pointer variable to type: ",
4204                        get_tree_code_name (TREE_CODE (aggr_type)));
4205       dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4206       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4207         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4208       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4209         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4210       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4211         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4212       else
4213         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4214       dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4215       dump_printf (MSG_NOTE, "\n");
4216     }
4217
4218   /* (1) Create the new aggregate-pointer variable.
4219      Vector and array types inherit the alias set of their component
4220      type by default so we need to use a ref-all pointer if the data
4221      reference does not conflict with the created aggregated data
4222      reference because it is not addressable.  */
4223   bool need_ref_all = false;
4224   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4225                               get_alias_set (DR_REF (dr))))
4226     need_ref_all = true;
4227   /* Likewise for any of the data references in the stmt group.  */
4228   else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4229     {
4230       gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4231       do
4232         {
4233           stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4234           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4235           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4236                                       get_alias_set (DR_REF (sdr))))
4237             {
4238               need_ref_all = true;
4239               break;
4240             }
4241           orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4242         }
4243       while (orig_stmt);
4244     }
4245   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4246                                                need_ref_all);
4247   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4248
4249
4250   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4251      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4252      def-use update cycles for the pointer: one relative to the outer-loop
4253      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4254      to the inner-loop (which is the inner-most loop containing the dataref),
4255      and this is done be step (5) below.
4256
4257      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4258      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4259      redundant.  Steps (3),(4) create the following:
4260
4261         vp0 = &base_addr;
4262         LOOP:   vp1 = phi(vp0,vp2)
4263                 ...
4264                 ...
4265                 vp2 = vp1 + step
4266                 goto LOOP
4267
4268      If there is an inner-loop nested in loop, then step (5) will also be
4269      applied, and an additional update in the inner-loop will be created:
4270
4271         vp0 = &base_addr;
4272         LOOP:   vp1 = phi(vp0,vp2)
4273                 ...
4274         inner:     vp3 = phi(vp1,vp4)
4275                    vp4 = vp3 + inner_step
4276                    if () goto inner
4277                 ...
4278                 vp2 = vp1 + step
4279                 if () goto LOOP   */
4280
4281   /* (2) Calculate the initial address of the aggregate-pointer, and set
4282      the aggregate-pointer to point to it before the loop.  */
4283
4284   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4285
4286   new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4287                                                    offset, loop, byte_offset);
4288   if (new_stmt_list)
4289     {
4290       if (pe)
4291         {
4292           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4293           gcc_assert (!new_bb);
4294         }
4295       else
4296         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4297     }
4298
4299   *initial_address = new_temp;
4300   aggr_ptr_init = new_temp;
4301
4302   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4303      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4304      inner-loop nested in LOOP (during outer-loop vectorization).  */
4305
4306   /* No update in loop is required.  */
4307   if (only_init && (!loop_vinfo || at_loop == loop))
4308     aptr = aggr_ptr_init;
4309   else
4310     {
4311       /* The step of the aggregate pointer is the type size.  */
4312       tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4313       /* One exception to the above is when the scalar step of the load in
4314          LOOP is zero. In this case the step here is also zero.  */
4315       if (*inv_p)
4316         iv_step = size_zero_node;
4317       else if (tree_int_cst_sgn (step) == -1)
4318         iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4319
4320       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4321
4322       create_iv (aggr_ptr_init,
4323                  fold_convert (aggr_ptr_type, iv_step),
4324                  aggr_ptr, loop, &incr_gsi, insert_after,
4325                  &indx_before_incr, &indx_after_incr);
4326       incr = gsi_stmt (incr_gsi);
4327       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4328
4329       /* Copy the points-to information if it exists. */
4330       if (DR_PTR_INFO (dr))
4331         {
4332           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4333           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4334         }
4335       if (ptr_incr)
4336         *ptr_incr = incr;
4337
4338       aptr = indx_before_incr;
4339     }
4340
4341   if (!nested_in_vect_loop || only_init)
4342     return aptr;
4343
4344
4345   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4346      nested in LOOP, if exists.  */
4347
4348   gcc_assert (nested_in_vect_loop);
4349   if (!only_init)
4350     {
4351       standard_iv_increment_position (containing_loop, &incr_gsi,
4352                                       &insert_after);
4353       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4354                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4355                  &indx_after_incr);
4356       incr = gsi_stmt (incr_gsi);
4357       set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4358
4359       /* Copy the points-to information if it exists. */
4360       if (DR_PTR_INFO (dr))
4361         {
4362           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4363           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4364         }
4365       if (ptr_incr)
4366         *ptr_incr = incr;
4367
4368       return indx_before_incr;
4369     }
4370   else
4371     gcc_unreachable ();
4372 }
4373
4374
4375 /* Function bump_vector_ptr
4376
4377    Increment a pointer (to a vector type) by vector-size. If requested,
4378    i.e. if PTR-INCR is given, then also connect the new increment stmt
4379    to the existing def-use update-chain of the pointer, by modifying
4380    the PTR_INCR as illustrated below:
4381
4382    The pointer def-use update-chain before this function:
4383                         DATAREF_PTR = phi (p_0, p_2)
4384                         ....
4385         PTR_INCR:       p_2 = DATAREF_PTR + step
4386
4387    The pointer def-use update-chain after this function:
4388                         DATAREF_PTR = phi (p_0, p_2)
4389                         ....
4390                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4391                         ....
4392         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4393
4394    Input:
4395    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4396                  in the loop.
4397    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4398               the loop.  The increment amount across iterations is expected
4399               to be vector_size.
4400    BSI - location where the new update stmt is to be placed.
4401    STMT - the original scalar memory-access stmt that is being vectorized.
4402    BUMP - optional. The offset by which to bump the pointer. If not given,
4403           the offset is assumed to be vector_size.
4404
4405    Output: Return NEW_DATAREF_PTR as illustrated above.
4406
4407 */
4408
4409 tree
4410 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4411                  gimple *stmt, tree bump)
4412 {
4413   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4414   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4415   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4416   tree update = TYPE_SIZE_UNIT (vectype);
4417   gassign *incr_stmt;
4418   ssa_op_iter iter;
4419   use_operand_p use_p;
4420   tree new_dataref_ptr;
4421
4422   if (bump)
4423     update = bump;
4424
4425   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4426     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4427   else
4428     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4429   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4430                                    dataref_ptr, update);
4431   vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4432
4433   /* Copy the points-to information if it exists. */
4434   if (DR_PTR_INFO (dr))
4435     {
4436       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4437       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4438     }
4439
4440   if (!ptr_incr)
4441     return new_dataref_ptr;
4442
4443   /* Update the vector-pointer's cross-iteration increment.  */
4444   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4445     {
4446       tree use = USE_FROM_PTR (use_p);
4447
4448       if (use == dataref_ptr)
4449         SET_USE (use_p, new_dataref_ptr);
4450       else
4451         gcc_assert (tree_int_cst_compare (use, update) == 0);
4452     }
4453
4454   return new_dataref_ptr;
4455 }
4456
4457
4458 /* Function vect_create_destination_var.
4459
4460    Create a new temporary of type VECTYPE.  */
4461
4462 tree
4463 vect_create_destination_var (tree scalar_dest, tree vectype)
4464 {
4465   tree vec_dest;
4466   const char *name;
4467   char *new_name;
4468   tree type;
4469   enum vect_var_kind kind;
4470
4471   kind = vectype ? vect_simple_var : vect_scalar_var;
4472   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4473
4474   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4475
4476   name = get_name (scalar_dest);
4477   if (name)
4478     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4479   else
4480     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4481   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4482   free (new_name);
4483
4484   return vec_dest;
4485 }
4486
4487 /* Function vect_grouped_store_supported.
4488
4489    Returns TRUE if interleave high and interleave low permutations
4490    are supported, and FALSE otherwise.  */
4491
4492 bool
4493 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4494 {
4495   machine_mode mode = TYPE_MODE (vectype);
4496
4497   /* vect_permute_store_chain requires the group size to be equal to 3 or
4498      be a power of two.  */
4499   if (count != 3 && exact_log2 (count) == -1)
4500     {
4501       if (dump_enabled_p ())
4502         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4503                          "the size of the group of accesses"
4504                          " is not a power of 2 or not eqaul to 3\n");
4505       return false;
4506     }
4507
4508   /* Check that the permutation is supported.  */
4509   if (VECTOR_MODE_P (mode))
4510     {
4511       unsigned int i, nelt = GET_MODE_NUNITS (mode);
4512       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4513
4514       if (count == 3)
4515         {
4516           unsigned int j0 = 0, j1 = 0, j2 = 0;
4517           unsigned int i, j;
4518
4519           for (j = 0; j < 3; j++)
4520             {
4521               int nelt0 = ((3 - j) * nelt) % 3;
4522               int nelt1 = ((3 - j) * nelt + 1) % 3;
4523               int nelt2 = ((3 - j) * nelt + 2) % 3;
4524               for (i = 0; i < nelt; i++)
4525                 {
4526                   if (3 * i + nelt0 < nelt)
4527                     sel[3 * i + nelt0] = j0++;
4528                   if (3 * i + nelt1 < nelt)
4529                     sel[3 * i + nelt1] = nelt + j1++;
4530                   if (3 * i + nelt2 < nelt)
4531                     sel[3 * i + nelt2] = 0;
4532                 }
4533               if (!can_vec_perm_p (mode, false, sel))
4534                 {
4535                   if (dump_enabled_p ())
4536                     dump_printf (MSG_MISSED_OPTIMIZATION,
4537                                  "permutaion op not supported by target.\n");
4538                   return false;
4539                 }
4540
4541               for (i = 0; i < nelt; i++)
4542                 {
4543                   if (3 * i + nelt0 < nelt)
4544                     sel[3 * i + nelt0] = 3 * i + nelt0;
4545                   if (3 * i + nelt1 < nelt)
4546                     sel[3 * i + nelt1] = 3 * i + nelt1;
4547                   if (3 * i + nelt2 < nelt)
4548                     sel[3 * i + nelt2] = nelt + j2++;
4549                 }
4550               if (!can_vec_perm_p (mode, false, sel))
4551                 {
4552                   if (dump_enabled_p ())
4553                     dump_printf (MSG_MISSED_OPTIMIZATION,
4554                                  "permutaion op not supported by target.\n");
4555                   return false;
4556                 }
4557             }
4558           return true;
4559         }
4560       else
4561         {
4562           /* If length is not equal to 3 then only power of 2 is supported.  */
4563           gcc_assert (exact_log2 (count) != -1);
4564
4565           for (i = 0; i < nelt / 2; i++)
4566             {
4567               sel[i * 2] = i;
4568               sel[i * 2 + 1] = i + nelt;
4569             }
4570             if (can_vec_perm_p (mode, false, sel))
4571               {
4572                 for (i = 0; i < nelt; i++)
4573                   sel[i] += nelt / 2;
4574                 if (can_vec_perm_p (mode, false, sel))
4575                   return true;
4576               }
4577         }
4578     }
4579
4580   if (dump_enabled_p ())
4581     dump_printf (MSG_MISSED_OPTIMIZATION,
4582                  "permutaion op not supported by target.\n");
4583   return false;
4584 }
4585
4586
4587 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4588    type VECTYPE.  */
4589
4590 bool
4591 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4592 {
4593   return vect_lanes_optab_supported_p ("vec_store_lanes",
4594                                        vec_store_lanes_optab,
4595                                        vectype, count);
4596 }
4597
4598
4599 /* Function vect_permute_store_chain.
4600
4601    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4602    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4603    the data correctly for the stores.  Return the final references for stores
4604    in RESULT_CHAIN.
4605
4606    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4607    The input is 4 vectors each containing 8 elements.  We assign a number to
4608    each element, the input sequence is:
4609
4610    1st vec:   0  1  2  3  4  5  6  7
4611    2nd vec:   8  9 10 11 12 13 14 15
4612    3rd vec:  16 17 18 19 20 21 22 23
4613    4th vec:  24 25 26 27 28 29 30 31
4614
4615    The output sequence should be:
4616
4617    1st vec:  0  8 16 24  1  9 17 25
4618    2nd vec:  2 10 18 26  3 11 19 27
4619    3rd vec:  4 12 20 28  5 13 21 30
4620    4th vec:  6 14 22 30  7 15 23 31
4621
4622    i.e., we interleave the contents of the four vectors in their order.
4623
4624    We use interleave_high/low instructions to create such output.  The input of
4625    each interleave_high/low operation is two vectors:
4626    1st vec    2nd vec
4627    0 1 2 3    4 5 6 7
4628    the even elements of the result vector are obtained left-to-right from the
4629    high/low elements of the first vector.  The odd elements of the result are
4630    obtained left-to-right from the high/low elements of the second vector.
4631    The output of interleave_high will be:   0 4 1 5
4632    and of interleave_low:                   2 6 3 7
4633
4634
4635    The permutation is done in log LENGTH stages.  In each stage interleave_high
4636    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4637    where the first argument is taken from the first half of DR_CHAIN and the
4638    second argument from it's second half.
4639    In our example,
4640
4641    I1: interleave_high (1st vec, 3rd vec)
4642    I2: interleave_low (1st vec, 3rd vec)
4643    I3: interleave_high (2nd vec, 4th vec)
4644    I4: interleave_low (2nd vec, 4th vec)
4645
4646    The output for the first stage is:
4647
4648    I1:  0 16  1 17  2 18  3 19
4649    I2:  4 20  5 21  6 22  7 23
4650    I3:  8 24  9 25 10 26 11 27
4651    I4: 12 28 13 29 14 30 15 31
4652
4653    The output of the second stage, i.e. the final result is:
4654
4655    I1:  0  8 16 24  1  9 17 25
4656    I2:  2 10 18 26  3 11 19 27
4657    I3:  4 12 20 28  5 13 21 30
4658    I4:  6 14 22 30  7 15 23 31.  */
4659
4660 void
4661 vect_permute_store_chain (vec<tree> dr_chain,
4662                           unsigned int length,
4663                           gimple *stmt,
4664                           gimple_stmt_iterator *gsi,
4665                           vec<tree> *result_chain)
4666 {
4667   tree vect1, vect2, high, low;
4668   gimple *perm_stmt;
4669   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4670   tree perm_mask_low, perm_mask_high;
4671   tree data_ref;
4672   tree perm3_mask_low, perm3_mask_high;
4673   unsigned int i, n, log_length = exact_log2 (length);
4674   unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4675   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4676
4677   result_chain->quick_grow (length);
4678   memcpy (result_chain->address (), dr_chain.address (),
4679           length * sizeof (tree));
4680
4681   if (length == 3)
4682     {
4683       unsigned int j0 = 0, j1 = 0, j2 = 0;
4684
4685       for (j = 0; j < 3; j++)
4686         {
4687           int nelt0 = ((3 - j) * nelt) % 3;
4688           int nelt1 = ((3 - j) * nelt + 1) % 3;
4689           int nelt2 = ((3 - j) * nelt + 2) % 3;
4690
4691           for (i = 0; i < nelt; i++)
4692             {
4693               if (3 * i + nelt0 < nelt)
4694                 sel[3 * i + nelt0] = j0++;
4695               if (3 * i + nelt1 < nelt)
4696                 sel[3 * i + nelt1] = nelt + j1++;
4697               if (3 * i + nelt2 < nelt)
4698                 sel[3 * i + nelt2] = 0;
4699             }
4700           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4701
4702           for (i = 0; i < nelt; i++)
4703             {
4704               if (3 * i + nelt0 < nelt)
4705                 sel[3 * i + nelt0] = 3 * i + nelt0;
4706               if (3 * i + nelt1 < nelt)
4707                 sel[3 * i + nelt1] = 3 * i + nelt1;
4708               if (3 * i + nelt2 < nelt)
4709                 sel[3 * i + nelt2] = nelt + j2++;
4710             }
4711           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4712
4713           vect1 = dr_chain[0];
4714           vect2 = dr_chain[1];
4715
4716           /* Create interleaving stmt:
4717              low = VEC_PERM_EXPR <vect1, vect2,
4718                                   {j, nelt, *, j + 1, nelt + j + 1, *,
4719                                    j + 2, nelt + j + 2, *, ...}>  */
4720           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4721           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4722                                            vect2, perm3_mask_low);
4723           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4724
4725           vect1 = data_ref;
4726           vect2 = dr_chain[2];
4727           /* Create interleaving stmt:
4728              low = VEC_PERM_EXPR <vect1, vect2,
4729                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
4730                                    6, 7, nelt + j + 2, ...}>  */
4731           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4732           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4733                                            vect2, perm3_mask_high);
4734           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4735           (*result_chain)[j] = data_ref;
4736         }
4737     }
4738   else
4739     {
4740       /* If length is not equal to 3 then only power of 2 is supported.  */
4741       gcc_assert (exact_log2 (length) != -1);
4742
4743       for (i = 0, n = nelt / 2; i < n; i++)
4744         {
4745           sel[i * 2] = i;
4746           sel[i * 2 + 1] = i + nelt;
4747         }
4748         perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4749
4750         for (i = 0; i < nelt; i++)
4751           sel[i] += nelt / 2;
4752         perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4753
4754         for (i = 0, n = log_length; i < n; i++)
4755           {
4756             for (j = 0; j < length/2; j++)
4757               {
4758                 vect1 = dr_chain[j];
4759                 vect2 = dr_chain[j+length/2];
4760
4761                 /* Create interleaving stmt:
4762                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4763                                                         ...}>  */
4764                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4765                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4766                                                  vect2, perm_mask_high);
4767                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4768                 (*result_chain)[2*j] = high;
4769
4770                 /* Create interleaving stmt:
4771                    low = VEC_PERM_EXPR <vect1, vect2,
4772                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4773                                          ...}>  */
4774                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4775                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4776                                                  vect2, perm_mask_low);
4777                 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4778                 (*result_chain)[2*j+1] = low;
4779               }
4780             memcpy (dr_chain.address (), result_chain->address (),
4781                     length * sizeof (tree));
4782           }
4783     }
4784 }
4785
4786 /* Function vect_setup_realignment
4787
4788    This function is called when vectorizing an unaligned load using
4789    the dr_explicit_realign[_optimized] scheme.
4790    This function generates the following code at the loop prolog:
4791
4792       p = initial_addr;
4793    x  msq_init = *(floor(p));   # prolog load
4794       realignment_token = call target_builtin;
4795     loop:
4796    x  msq = phi (msq_init, ---)
4797
4798    The stmts marked with x are generated only for the case of
4799    dr_explicit_realign_optimized.
4800
4801    The code above sets up a new (vector) pointer, pointing to the first
4802    location accessed by STMT, and a "floor-aligned" load using that pointer.
4803    It also generates code to compute the "realignment-token" (if the relevant
4804    target hook was defined), and creates a phi-node at the loop-header bb
4805    whose arguments are the result of the prolog-load (created by this
4806    function) and the result of a load that takes place in the loop (to be
4807    created by the caller to this function).
4808
4809    For the case of dr_explicit_realign_optimized:
4810    The caller to this function uses the phi-result (msq) to create the
4811    realignment code inside the loop, and sets up the missing phi argument,
4812    as follows:
4813     loop:
4814       msq = phi (msq_init, lsq)
4815       lsq = *(floor(p'));        # load in loop
4816       result = realign_load (msq, lsq, realignment_token);
4817
4818    For the case of dr_explicit_realign:
4819     loop:
4820       msq = *(floor(p));        # load in loop
4821       p' = p + (VS-1);
4822       lsq = *(floor(p'));       # load in loop
4823       result = realign_load (msq, lsq, realignment_token);
4824
4825    Input:
4826    STMT - (scalar) load stmt to be vectorized. This load accesses
4827           a memory location that may be unaligned.
4828    BSI - place where new code is to be inserted.
4829    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4830                               is used.
4831
4832    Output:
4833    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4834                        target hook, if defined.
4835    Return value - the result of the loop-header phi node.  */
4836
4837 tree
4838 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4839                         tree *realignment_token,
4840                         enum dr_alignment_support alignment_support_scheme,
4841                         tree init_addr,
4842                         struct loop **at_loop)
4843 {
4844   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4845   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4846   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4847   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4848   struct loop *loop = NULL;
4849   edge pe = NULL;
4850   tree scalar_dest = gimple_assign_lhs (stmt);
4851   tree vec_dest;
4852   gimple *inc;
4853   tree ptr;
4854   tree data_ref;
4855   basic_block new_bb;
4856   tree msq_init = NULL_TREE;
4857   tree new_temp;
4858   gphi *phi_stmt;
4859   tree msq = NULL_TREE;
4860   gimple_seq stmts = NULL;
4861   bool inv_p;
4862   bool compute_in_loop = false;
4863   bool nested_in_vect_loop = false;
4864   struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4865   struct loop *loop_for_initial_load = NULL;
4866
4867   if (loop_vinfo)
4868     {
4869       loop = LOOP_VINFO_LOOP (loop_vinfo);
4870       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4871     }
4872
4873   gcc_assert (alignment_support_scheme == dr_explicit_realign
4874               || alignment_support_scheme == dr_explicit_realign_optimized);
4875
4876   /* We need to generate three things:
4877      1. the misalignment computation
4878      2. the extra vector load (for the optimized realignment scheme).
4879      3. the phi node for the two vectors from which the realignment is
4880       done (for the optimized realignment scheme).  */
4881
4882   /* 1. Determine where to generate the misalignment computation.
4883
4884      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4885      calculation will be generated by this function, outside the loop (in the
4886      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
4887      caller, inside the loop.
4888
4889      Background: If the misalignment remains fixed throughout the iterations of
4890      the loop, then both realignment schemes are applicable, and also the
4891      misalignment computation can be done outside LOOP.  This is because we are
4892      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4893      are a multiple of VS (the Vector Size), and therefore the misalignment in
4894      different vectorized LOOP iterations is always the same.
4895      The problem arises only if the memory access is in an inner-loop nested
4896      inside LOOP, which is now being vectorized using outer-loop vectorization.
4897      This is the only case when the misalignment of the memory access may not
4898      remain fixed throughout the iterations of the inner-loop (as explained in
4899      detail in vect_supportable_dr_alignment).  In this case, not only is the
4900      optimized realignment scheme not applicable, but also the misalignment
4901      computation (and generation of the realignment token that is passed to
4902      REALIGN_LOAD) have to be done inside the loop.
4903
4904      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4905      or not, which in turn determines if the misalignment is computed inside
4906      the inner-loop, or outside LOOP.  */
4907
4908   if (init_addr != NULL_TREE || !loop_vinfo)
4909     {
4910       compute_in_loop = true;
4911       gcc_assert (alignment_support_scheme == dr_explicit_realign);
4912     }
4913
4914
4915   /* 2. Determine where to generate the extra vector load.
4916
4917      For the optimized realignment scheme, instead of generating two vector
4918      loads in each iteration, we generate a single extra vector load in the
4919      preheader of the loop, and in each iteration reuse the result of the
4920      vector load from the previous iteration.  In case the memory access is in
4921      an inner-loop nested inside LOOP, which is now being vectorized using
4922      outer-loop vectorization, we need to determine whether this initial vector
4923      load should be generated at the preheader of the inner-loop, or can be
4924      generated at the preheader of LOOP.  If the memory access has no evolution
4925      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4926      to be generated inside LOOP (in the preheader of the inner-loop).  */
4927
4928   if (nested_in_vect_loop)
4929     {
4930       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4931       bool invariant_in_outerloop =
4932             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4933       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4934     }
4935   else
4936     loop_for_initial_load = loop;
4937   if (at_loop)
4938     *at_loop = loop_for_initial_load;
4939
4940   if (loop_for_initial_load)
4941     pe = loop_preheader_edge (loop_for_initial_load);
4942
4943   /* 3. For the case of the optimized realignment, create the first vector
4944       load at the loop preheader.  */
4945
4946   if (alignment_support_scheme == dr_explicit_realign_optimized)
4947     {
4948       /* Create msq_init = *(floor(p1)) in the loop preheader  */
4949       gassign *new_stmt;
4950
4951       gcc_assert (!compute_in_loop);
4952       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4953       ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4954                                       NULL_TREE, &init_addr, NULL, &inc,
4955                                       true, &inv_p);
4956       if (TREE_CODE (ptr) == SSA_NAME)
4957         new_temp = copy_ssa_name (ptr);
4958       else
4959         new_temp = make_ssa_name (TREE_TYPE (ptr));
4960       new_stmt = gimple_build_assign
4961                    (new_temp, BIT_AND_EXPR, ptr,
4962                     build_int_cst (TREE_TYPE (ptr),
4963                                    -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4964       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4965       gcc_assert (!new_bb);
4966       data_ref
4967         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4968                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4969       new_stmt = gimple_build_assign (vec_dest, data_ref);
4970       new_temp = make_ssa_name (vec_dest, new_stmt);
4971       gimple_assign_set_lhs (new_stmt, new_temp);
4972       if (pe)
4973         {
4974           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4975           gcc_assert (!new_bb);
4976         }
4977       else
4978          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4979
4980       msq_init = gimple_assign_lhs (new_stmt);
4981     }
4982
4983   /* 4. Create realignment token using a target builtin, if available.
4984       It is done either inside the containing loop, or before LOOP (as
4985       determined above).  */
4986
4987   if (targetm.vectorize.builtin_mask_for_load)
4988     {
4989       gcall *new_stmt;
4990       tree builtin_decl;
4991
4992       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
4993       if (!init_addr)
4994         {
4995           /* Generate the INIT_ADDR computation outside LOOP.  */
4996           init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4997                                                         NULL_TREE, loop);
4998           if (loop)
4999             {
5000               pe = loop_preheader_edge (loop);
5001               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5002               gcc_assert (!new_bb);
5003             }
5004           else
5005              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5006         }
5007
5008       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5009       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5010       vec_dest =
5011         vect_create_destination_var (scalar_dest,
5012                                      gimple_call_return_type (new_stmt));
5013       new_temp = make_ssa_name (vec_dest, new_stmt);
5014       gimple_call_set_lhs (new_stmt, new_temp);
5015
5016       if (compute_in_loop)
5017         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5018       else
5019         {
5020           /* Generate the misalignment computation outside LOOP.  */
5021           pe = loop_preheader_edge (loop);
5022           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5023           gcc_assert (!new_bb);
5024         }
5025
5026       *realignment_token = gimple_call_lhs (new_stmt);
5027
5028       /* The result of the CALL_EXPR to this builtin is determined from
5029          the value of the parameter and no global variables are touched
5030          which makes the builtin a "const" function.  Requiring the
5031          builtin to have the "const" attribute makes it unnecessary
5032          to call mark_call_clobbered.  */
5033       gcc_assert (TREE_READONLY (builtin_decl));
5034     }
5035
5036   if (alignment_support_scheme == dr_explicit_realign)
5037     return msq;
5038
5039   gcc_assert (!compute_in_loop);
5040   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5041
5042
5043   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5044
5045   pe = loop_preheader_edge (containing_loop);
5046   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5047   msq = make_ssa_name (vec_dest);
5048   phi_stmt = create_phi_node (msq, containing_loop->header);
5049   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5050
5051   return msq;
5052 }
5053
5054
5055 /* Function vect_grouped_load_supported.
5056
5057    Returns TRUE if even and odd permutations are supported,
5058    and FALSE otherwise.  */
5059
5060 bool
5061 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5062 {
5063   machine_mode mode = TYPE_MODE (vectype);
5064
5065   /* vect_permute_load_chain requires the group size to be equal to 3 or
5066      be a power of two.  */
5067   if (count != 3 && exact_log2 (count) == -1)
5068     {
5069       if (dump_enabled_p ())
5070         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5071                          "the size of the group of accesses"
5072                          " is not a power of 2 or not equal to 3\n");
5073       return false;
5074     }
5075
5076   /* Check that the permutation is supported.  */
5077   if (VECTOR_MODE_P (mode))
5078     {
5079       unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5080       unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5081
5082       if (count == 3)
5083         {
5084           unsigned int k;
5085           for (k = 0; k < 3; k++)
5086             {
5087               for (i = 0; i < nelt; i++)
5088                 if (3 * i + k < 2 * nelt)
5089                   sel[i] = 3 * i + k;
5090                 else
5091                   sel[i] = 0;
5092               if (!can_vec_perm_p (mode, false, sel))
5093                 {
5094                   if (dump_enabled_p ())
5095                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5096                                      "shuffle of 3 loads is not supported by"
5097                                      " target\n");
5098                   return false;
5099                 }
5100               for (i = 0, j = 0; i < nelt; i++)
5101                 if (3 * i + k < 2 * nelt)
5102                   sel[i] = i;
5103                 else
5104                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5105               if (!can_vec_perm_p (mode, false, sel))
5106                 {
5107                   if (dump_enabled_p ())
5108                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5109                                      "shuffle of 3 loads is not supported by"
5110                                      " target\n");
5111                   return false;
5112                 }
5113             }
5114           return true;
5115         }
5116       else
5117         {
5118           /* If length is not equal to 3 then only power of 2 is supported.  */
5119           gcc_assert (exact_log2 (count) != -1);
5120           for (i = 0; i < nelt; i++)
5121             sel[i] = i * 2;
5122           if (can_vec_perm_p (mode, false, sel))
5123             {
5124               for (i = 0; i < nelt; i++)
5125                 sel[i] = i * 2 + 1;
5126               if (can_vec_perm_p (mode, false, sel))
5127                 return true;
5128             }
5129         }
5130     }
5131
5132   if (dump_enabled_p ())
5133     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5134                      "extract even/odd not supported by target\n");
5135   return false;
5136 }
5137
5138 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5139    type VECTYPE.  */
5140
5141 bool
5142 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5143 {
5144   return vect_lanes_optab_supported_p ("vec_load_lanes",
5145                                        vec_load_lanes_optab,
5146                                        vectype, count);
5147 }
5148
5149 /* Function vect_permute_load_chain.
5150
5151    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5152    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5153    the input data correctly.  Return the final references for loads in
5154    RESULT_CHAIN.
5155
5156    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5157    The input is 4 vectors each containing 8 elements. We assign a number to each
5158    element, the input sequence is:
5159
5160    1st vec:   0  1  2  3  4  5  6  7
5161    2nd vec:   8  9 10 11 12 13 14 15
5162    3rd vec:  16 17 18 19 20 21 22 23
5163    4th vec:  24 25 26 27 28 29 30 31
5164
5165    The output sequence should be:
5166
5167    1st vec:  0 4  8 12 16 20 24 28
5168    2nd vec:  1 5  9 13 17 21 25 29
5169    3rd vec:  2 6 10 14 18 22 26 30
5170    4th vec:  3 7 11 15 19 23 27 31
5171
5172    i.e., the first output vector should contain the first elements of each
5173    interleaving group, etc.
5174
5175    We use extract_even/odd instructions to create such output.  The input of
5176    each extract_even/odd operation is two vectors
5177    1st vec    2nd vec
5178    0 1 2 3    4 5 6 7
5179
5180    and the output is the vector of extracted even/odd elements.  The output of
5181    extract_even will be:   0 2 4 6
5182    and of extract_odd:     1 3 5 7
5183
5184
5185    The permutation is done in log LENGTH stages.  In each stage extract_even
5186    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5187    their order.  In our example,
5188
5189    E1: extract_even (1st vec, 2nd vec)
5190    E2: extract_odd (1st vec, 2nd vec)
5191    E3: extract_even (3rd vec, 4th vec)
5192    E4: extract_odd (3rd vec, 4th vec)
5193
5194    The output for the first stage will be:
5195
5196    E1:  0  2  4  6  8 10 12 14
5197    E2:  1  3  5  7  9 11 13 15
5198    E3: 16 18 20 22 24 26 28 30
5199    E4: 17 19 21 23 25 27 29 31
5200
5201    In order to proceed and create the correct sequence for the next stage (or
5202    for the correct output, if the second stage is the last one, as in our
5203    example), we first put the output of extract_even operation and then the
5204    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5205    The input for the second stage is:
5206
5207    1st vec (E1):  0  2  4  6  8 10 12 14
5208    2nd vec (E3): 16 18 20 22 24 26 28 30
5209    3rd vec (E2):  1  3  5  7  9 11 13 15
5210    4th vec (E4): 17 19 21 23 25 27 29 31
5211
5212    The output of the second stage:
5213
5214    E1: 0 4  8 12 16 20 24 28
5215    E2: 2 6 10 14 18 22 26 30
5216    E3: 1 5  9 13 17 21 25 29
5217    E4: 3 7 11 15 19 23 27 31
5218
5219    And RESULT_CHAIN after reordering:
5220
5221    1st vec (E1):  0 4  8 12 16 20 24 28
5222    2nd vec (E3):  1 5  9 13 17 21 25 29
5223    3rd vec (E2):  2 6 10 14 18 22 26 30
5224    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5225
5226 static void
5227 vect_permute_load_chain (vec<tree> dr_chain,
5228                          unsigned int length,
5229                          gimple *stmt,
5230                          gimple_stmt_iterator *gsi,
5231                          vec<tree> *result_chain)
5232 {
5233   tree data_ref, first_vect, second_vect;
5234   tree perm_mask_even, perm_mask_odd;
5235   tree perm3_mask_low, perm3_mask_high;
5236   gimple *perm_stmt;
5237   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5238   unsigned int i, j, log_length = exact_log2 (length);
5239   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5240   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5241
5242   result_chain->quick_grow (length);
5243   memcpy (result_chain->address (), dr_chain.address (),
5244           length * sizeof (tree));
5245
5246   if (length == 3)
5247     {
5248       unsigned int k;
5249
5250       for (k = 0; k < 3; k++)
5251         {
5252           for (i = 0; i < nelt; i++)
5253             if (3 * i + k < 2 * nelt)
5254               sel[i] = 3 * i + k;
5255             else
5256               sel[i] = 0;
5257           perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5258
5259           for (i = 0, j = 0; i < nelt; i++)
5260             if (3 * i + k < 2 * nelt)
5261               sel[i] = i;
5262             else
5263               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5264
5265           perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5266
5267           first_vect = dr_chain[0];
5268           second_vect = dr_chain[1];
5269
5270           /* Create interleaving stmt (low part of):
5271              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5272                                                              ...}>  */
5273           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5274           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5275                                            second_vect, perm3_mask_low);
5276           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5277
5278           /* Create interleaving stmt (high part of):
5279              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5280                                                               ...}>  */
5281           first_vect = data_ref;
5282           second_vect = dr_chain[2];
5283           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5284           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5285                                            second_vect, perm3_mask_high);
5286           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5287           (*result_chain)[k] = data_ref;
5288         }
5289     }
5290   else
5291     {
5292       /* If length is not equal to 3 then only power of 2 is supported.  */
5293       gcc_assert (exact_log2 (length) != -1);
5294
5295       for (i = 0; i < nelt; ++i)
5296         sel[i] = i * 2;
5297       perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5298
5299       for (i = 0; i < nelt; ++i)
5300         sel[i] = i * 2 + 1;
5301       perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5302
5303       for (i = 0; i < log_length; i++)
5304         {
5305           for (j = 0; j < length; j += 2)
5306             {
5307               first_vect = dr_chain[j];
5308               second_vect = dr_chain[j+1];
5309
5310               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5311               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5312               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5313                                                first_vect, second_vect,
5314                                                perm_mask_even);
5315               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5316               (*result_chain)[j/2] = data_ref;
5317
5318               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5319               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5320               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5321                                                first_vect, second_vect,
5322                                                perm_mask_odd);
5323               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5324               (*result_chain)[j/2+length/2] = data_ref;
5325             }
5326           memcpy (dr_chain.address (), result_chain->address (),
5327                   length * sizeof (tree));
5328         }
5329     }
5330 }
5331
5332 /* Function vect_shift_permute_load_chain.
5333
5334    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5335    sequence of stmts to reorder the input data accordingly.
5336    Return the final references for loads in RESULT_CHAIN.
5337    Return true if successed, false otherwise.
5338
5339    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5340    The input is 3 vectors each containing 8 elements.  We assign a
5341    number to each element, the input sequence is:
5342
5343    1st vec:   0  1  2  3  4  5  6  7
5344    2nd vec:   8  9 10 11 12 13 14 15
5345    3rd vec:  16 17 18 19 20 21 22 23
5346
5347    The output sequence should be:
5348
5349    1st vec:  0 3 6  9 12 15 18 21
5350    2nd vec:  1 4 7 10 13 16 19 22
5351    3rd vec:  2 5 8 11 14 17 20 23
5352
5353    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5354
5355    First we shuffle all 3 vectors to get correct elements order:
5356
5357    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5358    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5359    3rd vec:  (16 19 22) (17 20 23) (18 21)
5360
5361    Next we unite and shift vector 3 times:
5362
5363    1st step:
5364      shift right by 6 the concatenation of:
5365      "1st vec" and  "2nd vec"
5366        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5367      "2nd vec" and  "3rd vec"
5368        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5369      "3rd vec" and  "1st vec"
5370        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5371                              | New vectors                   |
5372
5373      So that now new vectors are:
5374
5375      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5376      2nd vec:  (10 13) (16 19 22) (17 20 23)
5377      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5378
5379    2nd step:
5380      shift right by 5 the concatenation of:
5381      "1st vec" and  "3rd vec"
5382        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5383      "2nd vec" and  "1st vec"
5384        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5385      "3rd vec" and  "2nd vec"
5386        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5387                           | New vectors                   |
5388
5389      So that now new vectors are:
5390
5391      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5392      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
5393      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
5394
5395    3rd step:
5396      shift right by 5 the concatenation of:
5397      "1st vec" and  "1st vec"
5398        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
5399      shift right by 3 the concatenation of:
5400      "2nd vec" and  "2nd vec"
5401                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
5402                           | New vectors                   |
5403
5404      So that now all vectors are READY:
5405      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
5406      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
5407      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
5408
5409    This algorithm is faster than one in vect_permute_load_chain if:
5410      1.  "shift of a concatination" is faster than general permutation.
5411          This is usually so.
5412      2.  The TARGET machine can't execute vector instructions in parallel.
5413          This is because each step of the algorithm depends on previous.
5414          The algorithm in vect_permute_load_chain is much more parallel.
5415
5416    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5417 */
5418
5419 static bool
5420 vect_shift_permute_load_chain (vec<tree> dr_chain,
5421                                unsigned int length,
5422                                gimple *stmt,
5423                                gimple_stmt_iterator *gsi,
5424                                vec<tree> *result_chain)
5425 {
5426   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5427   tree perm2_mask1, perm2_mask2, perm3_mask;
5428   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5429   gimple *perm_stmt;
5430
5431   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5432   unsigned int i;
5433   unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5434   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5435   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5436   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5437
5438   result_chain->quick_grow (length);
5439   memcpy (result_chain->address (), dr_chain.address (),
5440           length * sizeof (tree));
5441
5442   if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5443     {
5444       unsigned int j, log_length = exact_log2 (length);
5445       for (i = 0; i < nelt / 2; ++i)
5446         sel[i] = i * 2;
5447       for (i = 0; i < nelt / 2; ++i)
5448         sel[nelt / 2 + i] = i * 2 + 1;
5449       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5450         {
5451           if (dump_enabled_p ())
5452             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5453                              "shuffle of 2 fields structure is not \
5454                               supported by target\n");
5455           return false;
5456         }
5457       perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5458
5459       for (i = 0; i < nelt / 2; ++i)
5460         sel[i] = i * 2 + 1;
5461       for (i = 0; i < nelt / 2; ++i)
5462         sel[nelt / 2 + i] = i * 2;
5463       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5464         {
5465           if (dump_enabled_p ())
5466             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5467                              "shuffle of 2 fields structure is not \
5468                               supported by target\n");
5469           return false;
5470         }
5471       perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5472
5473       /* Generating permutation constant to shift all elements.
5474          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
5475       for (i = 0; i < nelt; i++)
5476         sel[i] = nelt / 2 + i;
5477       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5478         {
5479           if (dump_enabled_p ())
5480             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5481                              "shift permutation is not supported by target\n");
5482           return false;
5483         }
5484       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5485
5486       /* Generating permutation constant to select vector from 2.
5487          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
5488       for (i = 0; i < nelt / 2; i++)
5489         sel[i] = i;
5490       for (i = nelt / 2; i < nelt; i++)
5491         sel[i] = nelt + i;
5492       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5493         {
5494           if (dump_enabled_p ())
5495             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5496                              "select is not supported by target\n");
5497           return false;
5498         }
5499       select_mask = vect_gen_perm_mask_checked (vectype, sel);
5500
5501       for (i = 0; i < log_length; i++)
5502         {
5503           for (j = 0; j < length; j += 2)
5504             {
5505               first_vect = dr_chain[j];
5506               second_vect = dr_chain[j + 1];
5507
5508               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5509               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5510                                                first_vect, first_vect,
5511                                                perm2_mask1);
5512               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5513               vect[0] = data_ref;
5514
5515               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5516               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5517                                                second_vect, second_vect,
5518                                                perm2_mask2);
5519               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5520               vect[1] = data_ref;
5521
5522               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5523               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5524                                                vect[0], vect[1], shift1_mask);
5525               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5526               (*result_chain)[j/2 + length/2] = data_ref;
5527
5528               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5529               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5530                                                vect[0], vect[1], select_mask);
5531               vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5532               (*result_chain)[j/2] = data_ref;
5533             }
5534           memcpy (dr_chain.address (), result_chain->address (),
5535                   length * sizeof (tree));
5536         }
5537       return true;
5538     }
5539   if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5540     {
5541       unsigned int k = 0, l = 0;
5542
5543       /* Generating permutation constant to get all elements in rigth order.
5544          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
5545       for (i = 0; i < nelt; i++)
5546         {
5547           if (3 * k + (l % 3) >= nelt)
5548             {
5549               k = 0;
5550               l += (3 - (nelt % 3));
5551             }
5552           sel[i] = 3 * k + (l % 3);
5553           k++;
5554         }
5555       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5556         {
5557           if (dump_enabled_p ())
5558             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5559                              "shuffle of 3 fields structure is not \
5560                               supported by target\n");
5561           return false;
5562         }
5563       perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5564
5565       /* Generating permutation constant to shift all elements.
5566          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
5567       for (i = 0; i < nelt; i++)
5568         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5569       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5570         {
5571           if (dump_enabled_p ())
5572             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5573                              "shift permutation is not supported by target\n");
5574           return false;
5575         }
5576       shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5577
5578       /* Generating permutation constant to shift all elements.
5579          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5580       for (i = 0; i < nelt; i++)
5581         sel[i] = 2 * (nelt / 3) + 1 + i;
5582       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5583         {
5584           if (dump_enabled_p ())
5585             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5586                              "shift permutation is not supported by target\n");
5587           return false;
5588         }
5589       shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5590
5591       /* Generating permutation constant to shift all elements.
5592          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
5593       for (i = 0; i < nelt; i++)
5594         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5595       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5596         {
5597           if (dump_enabled_p ())
5598             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5599                              "shift permutation is not supported by target\n");
5600           return false;
5601         }
5602       shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5603
5604       /* Generating permutation constant to shift all elements.
5605          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
5606       for (i = 0; i < nelt; i++)
5607         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5608       if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5609         {
5610           if (dump_enabled_p ())
5611             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5612                              "shift permutation is not supported by target\n");
5613           return false;
5614         }
5615       shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5616
5617       for (k = 0; k < 3; k++)
5618         {
5619           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5620           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5621                                            dr_chain[k], dr_chain[k],
5622                                            perm3_mask);
5623           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5624           vect[k] = data_ref;
5625         }
5626
5627       for (k = 0; k < 3; k++)
5628         {
5629           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5630           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5631                                            vect[k % 3], vect[(k + 1) % 3],
5632                                            shift1_mask);
5633           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5634           vect_shift[k] = data_ref;
5635         }
5636
5637       for (k = 0; k < 3; k++)
5638         {
5639           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5640           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5641                                            vect_shift[(4 - k) % 3],
5642                                            vect_shift[(3 - k) % 3],
5643                                            shift2_mask);
5644           vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5645           vect[k] = data_ref;
5646         }
5647
5648       (*result_chain)[3 - (nelt % 3)] = vect[2];
5649
5650       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5651       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5652                                        vect[0], shift3_mask);
5653       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5654       (*result_chain)[nelt % 3] = data_ref;
5655
5656       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5657       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5658                                        vect[1], shift4_mask);
5659       vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5660       (*result_chain)[0] = data_ref;
5661       return true;
5662     }
5663   return false;
5664 }
5665
5666 /* Function vect_transform_grouped_load.
5667
5668    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5669    to perform their permutation and ascribe the result vectorized statements to
5670    the scalar statements.
5671 */
5672
5673 void
5674 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5675                              gimple_stmt_iterator *gsi)
5676 {
5677   machine_mode mode;
5678   vec<tree> result_chain = vNULL;
5679
5680   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5681      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5682      vectors, that are ready for vector computation.  */
5683   result_chain.create (size);
5684
5685   /* If reassociation width for vector type is 2 or greater target machine can
5686      execute 2 or more vector instructions in parallel.  Otherwise try to
5687      get chain for loads group using vect_shift_permute_load_chain.  */
5688   mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5689   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5690       || exact_log2 (size) != -1
5691       || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5692                                          gsi, &result_chain))
5693     vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5694   vect_record_grouped_load_vectors (stmt, result_chain);
5695   result_chain.release ();
5696 }
5697
5698 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5699    generated as part of the vectorization of STMT.  Assign the statement
5700    for each vector to the associated scalar statement.  */
5701
5702 void
5703 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5704 {
5705   gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5706   gimple *next_stmt, *new_stmt;
5707   unsigned int i, gap_count;
5708   tree tmp_data_ref;
5709
5710   /* Put a permuted data-ref in the VECTORIZED_STMT field.
5711      Since we scan the chain starting from it's first node, their order
5712      corresponds the order of data-refs in RESULT_CHAIN.  */
5713   next_stmt = first_stmt;
5714   gap_count = 1;
5715   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5716     {
5717       if (!next_stmt)
5718         break;
5719
5720       /* Skip the gaps.  Loads created for the gaps will be removed by dead
5721        code elimination pass later.  No need to check for the first stmt in
5722        the group, since it always exists.
5723        GROUP_GAP is the number of steps in elements from the previous
5724        access (if there is no gap GROUP_GAP is 1).  We skip loads that
5725        correspond to the gaps.  */
5726       if (next_stmt != first_stmt
5727           && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5728       {
5729         gap_count++;
5730         continue;
5731       }
5732
5733       while (next_stmt)
5734         {
5735           new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5736           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5737              copies, and we put the new vector statement in the first available
5738              RELATED_STMT.  */
5739           if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5740             STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5741           else
5742             {
5743               if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5744                 {
5745                   gimple *prev_stmt =
5746                     STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5747                   gimple *rel_stmt =
5748                     STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5749                   while (rel_stmt)
5750                     {
5751                       prev_stmt = rel_stmt;
5752                       rel_stmt =
5753                         STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5754                     }
5755
5756                   STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5757                     new_stmt;
5758                 }
5759             }
5760
5761           next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5762           gap_count = 1;
5763           /* If NEXT_STMT accesses the same DR as the previous statement,
5764              put the same TMP_DATA_REF as its vectorized statement; otherwise
5765              get the next data-ref from RESULT_CHAIN.  */
5766           if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5767             break;
5768         }
5769     }
5770 }
5771
5772 /* Function vect_force_dr_alignment_p.
5773
5774    Returns whether the alignment of a DECL can be forced to be aligned
5775    on ALIGNMENT bit boundary.  */
5776
5777 bool
5778 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5779 {
5780   if (TREE_CODE (decl) != VAR_DECL)
5781     return false;
5782
5783   if (decl_in_symtab_p (decl)
5784       && !symtab_node::get (decl)->can_increase_alignment_p ())
5785     return false;
5786
5787   if (TREE_STATIC (decl))
5788     return (alignment <= MAX_OFILE_ALIGNMENT);
5789   else
5790     return (alignment <= MAX_STACK_ALIGNMENT);
5791 }
5792
5793
5794 /* Return whether the data reference DR is supported with respect to its
5795    alignment.
5796    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5797    it is aligned, i.e., check if it is possible to vectorize it with different
5798    alignment.  */
5799
5800 enum dr_alignment_support
5801 vect_supportable_dr_alignment (struct data_reference *dr,
5802                                bool check_aligned_accesses)
5803 {
5804   gimple *stmt = DR_STMT (dr);
5805   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5806   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5807   machine_mode mode = TYPE_MODE (vectype);
5808   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5809   struct loop *vect_loop = NULL;
5810   bool nested_in_vect_loop = false;
5811
5812   if (aligned_access_p (dr) && !check_aligned_accesses)
5813     return dr_aligned;
5814
5815   /* For now assume all conditional loads/stores support unaligned
5816      access without any special code.  */
5817   if (is_gimple_call (stmt)
5818       && gimple_call_internal_p (stmt)
5819       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5820           || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5821     return dr_unaligned_supported;
5822
5823   if (loop_vinfo)
5824     {
5825       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5826       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5827     }
5828
5829   /* Possibly unaligned access.  */
5830
5831   /* We can choose between using the implicit realignment scheme (generating
5832      a misaligned_move stmt) and the explicit realignment scheme (generating
5833      aligned loads with a REALIGN_LOAD).  There are two variants to the
5834      explicit realignment scheme: optimized, and unoptimized.
5835      We can optimize the realignment only if the step between consecutive
5836      vector loads is equal to the vector size.  Since the vector memory
5837      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5838      is guaranteed that the misalignment amount remains the same throughout the
5839      execution of the vectorized loop.  Therefore, we can create the
5840      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5841      at the loop preheader.
5842
5843      However, in the case of outer-loop vectorization, when vectorizing a
5844      memory access in the inner-loop nested within the LOOP that is now being
5845      vectorized, while it is guaranteed that the misalignment of the
5846      vectorized memory access will remain the same in different outer-loop
5847      iterations, it is *not* guaranteed that is will remain the same throughout
5848      the execution of the inner-loop.  This is because the inner-loop advances
5849      with the original scalar step (and not in steps of VS).  If the inner-loop
5850      step happens to be a multiple of VS, then the misalignment remains fixed
5851      and we can use the optimized realignment scheme.  For example:
5852
5853       for (i=0; i<N; i++)
5854         for (j=0; j<M; j++)
5855           s += a[i+j];
5856
5857      When vectorizing the i-loop in the above example, the step between
5858      consecutive vector loads is 1, and so the misalignment does not remain
5859      fixed across the execution of the inner-loop, and the realignment cannot
5860      be optimized (as illustrated in the following pseudo vectorized loop):
5861
5862       for (i=0; i<N; i+=4)
5863         for (j=0; j<M; j++){
5864           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5865                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
5866                          // (assuming that we start from an aligned address).
5867           }
5868
5869      We therefore have to use the unoptimized realignment scheme:
5870
5871       for (i=0; i<N; i+=4)
5872           for (j=k; j<M; j+=4)
5873           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5874                            // that the misalignment of the initial address is
5875                            // 0).
5876
5877      The loop can then be vectorized as follows:
5878
5879       for (k=0; k<4; k++){
5880         rt = get_realignment_token (&vp[k]);
5881         for (i=0; i<N; i+=4){
5882           v1 = vp[i+k];
5883           for (j=k; j<M; j+=4){
5884             v2 = vp[i+j+VS-1];
5885             va = REALIGN_LOAD <v1,v2,rt>;
5886             vs += va;
5887             v1 = v2;
5888           }
5889         }
5890     } */
5891
5892   if (DR_IS_READ (dr))
5893     {
5894       bool is_packed = false;
5895       tree type = (TREE_TYPE (DR_REF (dr)));
5896
5897       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5898           && (!targetm.vectorize.builtin_mask_for_load
5899               || targetm.vectorize.builtin_mask_for_load ()))
5900         {
5901           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5902           if ((nested_in_vect_loop
5903                && (TREE_INT_CST_LOW (DR_STEP (dr))
5904                    != GET_MODE_SIZE (TYPE_MODE (vectype))))
5905               || !loop_vinfo)
5906             return dr_explicit_realign;
5907           else
5908             return dr_explicit_realign_optimized;
5909         }
5910       if (!known_alignment_for_access_p (dr))
5911         is_packed = not_size_aligned (DR_REF (dr));
5912
5913       if ((TYPE_USER_ALIGN (type) && !is_packed)
5914           || targetm.vectorize.
5915                support_vector_misalignment (mode, type,
5916                                             DR_MISALIGNMENT (dr), is_packed))
5917         /* Can't software pipeline the loads, but can at least do them.  */
5918         return dr_unaligned_supported;
5919     }
5920   else
5921     {
5922       bool is_packed = false;
5923       tree type = (TREE_TYPE (DR_REF (dr)));
5924
5925       if (!known_alignment_for_access_p (dr))
5926         is_packed = not_size_aligned (DR_REF (dr));
5927
5928      if ((TYPE_USER_ALIGN (type) && !is_packed)
5929          || targetm.vectorize.
5930               support_vector_misalignment (mode, type,
5931                                            DR_MISALIGNMENT (dr), is_packed))
5932        return dr_unaligned_supported;
5933     }
5934
5935   /* Unsupported.  */
5936   return dr_unaligned_unsupported;
5937 }