re PR tree-optimization/56812 (Simple loop is not SLP-vectorized after r196872)
[gcc.git] / gcc / tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-flow.h"
34 #include "dumpfile.h"
35 #include "cfgloop.h"
36 #include "tree-chrec.h"
37 #include "tree-scalar-evolution.h"
38 #include "tree-vectorizer.h"
39 #include "diagnostic-core.h"
40
41 /* Need to include rtl.h, expr.h, etc. for optabs. */
42 #include "expr.h"
43 #include "optabs.h"
44
45 /* Return true if load- or store-lanes optab OPTAB is implemented for
46 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
47
48 static bool
49 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
50 tree vectype, unsigned HOST_WIDE_INT count)
51 {
52 enum machine_mode mode, array_mode;
53 bool limit_p;
54
55 mode = TYPE_MODE (vectype);
56 limit_p = !targetm.array_mode_supported_p (mode, count);
57 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
58 MODE_INT, limit_p);
59
60 if (array_mode == BLKmode)
61 {
62 if (dump_enabled_p ())
63 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
64 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]",
65 GET_MODE_NAME (mode), count);
66 return false;
67 }
68
69 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
70 {
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73 "cannot use %s<%s><%s>", name,
74 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
75 return false;
76 }
77
78 if (dump_enabled_p ())
79 dump_printf_loc (MSG_NOTE, vect_location,
80 "can use %s<%s><%s>", name, GET_MODE_NAME (array_mode),
81 GET_MODE_NAME (mode));
82
83 return true;
84 }
85
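/* Illustrative walk-through (all numbers assumed, no particular target
   implied): for

       vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
                                     <V4SI vectype>, 3)

   the mode size is 128 bits, so the code above asks for an integer mode of
   3 * 128 = 384 bits.  If no such mode exists (or the size limit applies
   because the target declined array_mode_supported_p), mode_for_size
   returns BLKmode and the function reports that vec_load_lanes cannot be
   used; otherwise it still requires a handler for (array_mode, V4SImode).  */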
86
87 /* Return the smallest scalar part of STMT.
88 This is used to determine the vectype of the stmt. We generally set the
89 vectype according to the type of the result (lhs). For stmts whose
 90 result-type is different from the type of the arguments (e.g., demotion,
 91 promotion), vectype will be reset appropriately (later). Note that we have
 92 to visit the smallest datatype in this function, because that determines the
 93 VF. If the smallest datatype in the loop is present only as the rhs of a
 94 promotion operation, we'd miss it.
95 Such a case, where a variable of this datatype does not appear in the lhs
96 anywhere in the loop, can only occur if it's an invariant: e.g.:
97 'int_x = (int) short_inv', which we'd expect to have been optimized away by
98 invariant motion. However, we cannot rely on invariant motion to always
99 take invariants out of the loop, and so in the case of promotion we also
100 have to check the rhs.
101 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
102 types. */
103
104 tree
105 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
106 HOST_WIDE_INT *rhs_size_unit)
107 {
108 tree scalar_type = gimple_expr_type (stmt);
109 HOST_WIDE_INT lhs, rhs;
110
111 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
112
113 if (is_gimple_assign (stmt)
114 && (gimple_assign_cast_p (stmt)
115 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
116 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
117 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
118 {
119 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
120
121 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
122 if (rhs < lhs)
123 scalar_type = rhs_type;
124 }
125
126 *lhs_size_unit = lhs;
127 *rhs_size_unit = rhs;
128 return scalar_type;
129 }
130
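/* Worked example (types assumed for illustration): for the widening
   statement

       int_x = (int) short_y;     <-- lhs is 4 bytes, rhs is 2 bytes

   the code above sets *LHS_SIZE_UNIT = 4 and *RHS_SIZE_UNIT = 2 and
   returns the 2-byte short type, so the vectorization factor is later
   derived from the smaller element.  */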
131
132 /* Find the place of the data-ref in STMT in the interleaving chain that starts
133 from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */
134
135 int
136 vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
137 {
138 gimple next_stmt = first_stmt;
139 int result = 0;
140
141 if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
142 return -1;
143
144 while (next_stmt && next_stmt != stmt)
145 {
146 result++;
147 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
148 }
149
150 if (next_stmt)
151 return result;
152 else
153 return -1;
154 }
155
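/* For instance (illustrative chain): if FIRST_STMT heads the interleaving
   chain  a[0] -> a[1] -> a[2]  and STMT is the access to a[2], the walk
   above returns 2; a statement that is not linked into this chain yields
   -1.  */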
156
 157 /* Check whether the data references pointed to by DR_I and DR_J are the
 158 same or belong to the same interleaving group. Return FALSE if they are
 159 different, otherwise return TRUE. */
160
161 static bool
162 vect_same_range_drs (data_reference_p dr_i, data_reference_p dr_j)
163 {
164 gimple stmt_i = DR_STMT (dr_i);
165 gimple stmt_j = DR_STMT (dr_j);
166
167 if (operand_equal_p (DR_REF (dr_i), DR_REF (dr_j), 0)
168 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
169 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j))
170 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
171 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)))))
172 return true;
173 else
174 return false;
175 }
176
177 /* If address ranges represented by DDR_I and DDR_J are equal,
178 return TRUE, otherwise return FALSE. */
179
180 static bool
181 vect_vfa_range_equal (ddr_p ddr_i, ddr_p ddr_j)
182 {
183 if ((vect_same_range_drs (DDR_A (ddr_i), DDR_A (ddr_j))
184 && vect_same_range_drs (DDR_B (ddr_i), DDR_B (ddr_j)))
185 || (vect_same_range_drs (DDR_A (ddr_i), DDR_B (ddr_j))
186 && vect_same_range_drs (DDR_B (ddr_i), DDR_A (ddr_j))))
187 return true;
188 else
189 return false;
190 }
191
192 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
193 tested at run-time. Return TRUE if DDR was successfully inserted.
 194 Return FALSE if versioning is not supported. */
195
196 static bool
197 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
198 {
199 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
200
201 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
202 return false;
203
204 if (dump_enabled_p ())
205 {
206 dump_printf_loc (MSG_NOTE, vect_location,
207 "mark for run-time aliasing test between ");
208 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
209 dump_printf (MSG_NOTE, " and ");
210 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
211 }
212
213 if (optimize_loop_nest_for_size_p (loop))
214 {
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
217 "versioning not supported when optimizing for size.");
218 return false;
219 }
220
221 /* FORNOW: We don't support versioning with outer-loop vectorization. */
222 if (loop->inner)
223 {
224 if (dump_enabled_p ())
225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
226 "versioning not yet supported for outer-loops.");
227 return false;
228 }
229
230 /* FORNOW: We don't support creating runtime alias tests for non-constant
231 step. */
232 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
233 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
234 {
235 if (dump_enabled_p ())
236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
237 "versioning not yet supported for non-constant "
238 "step");
239 return false;
240 }
241
242 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
243 return true;
244 }
245
246
247 /* Function vect_analyze_data_ref_dependence.
248
249 Return TRUE if there (might) exist a dependence between a memory-reference
 250 DRA and a memory-reference DRB.  Return FALSE when the dependence can
 251 instead be checked at run time by versioning for alias.  Adjust *MAX_VF
 252 according to the data dependence. */
253
254 static bool
255 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
256 loop_vec_info loop_vinfo, int *max_vf)
257 {
258 unsigned int i;
259 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
260 struct data_reference *dra = DDR_A (ddr);
261 struct data_reference *drb = DDR_B (ddr);
262 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
263 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
264 lambda_vector dist_v;
265 unsigned int loop_depth;
266
267 /* In loop analysis all data references should be vectorizable. */
268 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
269 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
270 gcc_unreachable ();
271
272 /* Independent data accesses. */
273 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
274 return false;
275
276 if (dra == drb
277 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
278 return false;
279
280 /* Unknown data dependence. */
281 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
282 {
283 if (STMT_VINFO_GATHER_P (stmtinfo_a)
284 || STMT_VINFO_GATHER_P (stmtinfo_b))
285 {
286 if (dump_enabled_p ())
287 {
288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
289 "versioning for alias not supported for: "
290 "can't determine dependence between ");
291 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
292 DR_REF (dra));
293 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
294 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
295 DR_REF (drb));
296 }
297 return false;
298 }
299
300 if (dump_enabled_p ())
301 {
302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
303 "versioning for alias required: "
304 "can't determine dependence between ");
305 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
306 DR_REF (dra));
307 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
308 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
309 DR_REF (drb));
310 }
311
312 /* Add to list of ddrs that need to be tested at run-time. */
313 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
314 }
315
316 /* Known data dependence. */
317 if (DDR_NUM_DIST_VECTS (ddr) == 0)
318 {
319 if (STMT_VINFO_GATHER_P (stmtinfo_a)
320 || STMT_VINFO_GATHER_P (stmtinfo_b))
321 {
322 if (dump_enabled_p ())
323 {
324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
325 "versioning for alias not supported for: "
326 "bad dist vector for ");
327 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
328 DR_REF (dra));
329 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
330 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
331 DR_REF (drb));
332 }
333 return false;
334 }
335
336 if (dump_enabled_p ())
337 {
338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
339 "versioning for alias required: "
340 "bad dist vector for ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
342 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
343 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
344 }
345 /* Add to list of ddrs that need to be tested at run-time. */
346 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
347 }
348
349 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
350 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
351 {
352 int dist = dist_v[loop_depth];
353
354 if (dump_enabled_p ())
355 dump_printf_loc (MSG_NOTE, vect_location,
356 "dependence distance = %d.", dist);
357
358 if (dist == 0)
359 {
360 if (dump_enabled_p ())
361 {
362 dump_printf_loc (MSG_NOTE, vect_location,
363 "dependence distance == 0 between ");
364 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
365 dump_printf (MSG_NOTE, " and ");
366 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
367 }
368
 369 /* For interleaving, mark that there is a read-write dependency if
 370 necessary. We have already checked that one of the data-refs is a store. */
371 if (DR_IS_READ (dra))
372 GROUP_READ_WRITE_DEPENDENCE (stmtinfo_a) = true;
373 else
374 {
375 if (DR_IS_READ (drb))
376 GROUP_READ_WRITE_DEPENDENCE (stmtinfo_b) = true;
377 }
378
379 continue;
380 }
381
382 if (dist > 0 && DDR_REVERSED_P (ddr))
383 {
384 /* If DDR_REVERSED_P the order of the data-refs in DDR was
385 reversed (to make distance vector positive), and the actual
386 distance is negative. */
387 if (dump_enabled_p ())
388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
389 "dependence distance negative.");
390 continue;
391 }
392
393 if (abs (dist) >= 2
394 && abs (dist) < *max_vf)
395 {
396 /* The dependence distance requires reduction of the maximal
397 vectorization factor. */
398 *max_vf = abs (dist);
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location,
401 "adjusting maximal vectorization factor to %i",
402 *max_vf);
403 }
404
405 if (abs (dist) >= *max_vf)
406 {
407 /* Dependence distance does not create dependence, as far as
408 vectorization is concerned, in this case. */
409 if (dump_enabled_p ())
410 dump_printf_loc (MSG_NOTE, vect_location,
411 "dependence distance >= VF.");
412 continue;
413 }
414
415 if (dump_enabled_p ())
416 {
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "not vectorized, possible dependence "
419 "between data-refs ");
420 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
421 dump_printf (MSG_NOTE, " and ");
422 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
423 }
424
425 return true;
426 }
427
428 return false;
429 }
430
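/* Worked example for the distance handling above (loop and numbers are
   hypothetical):

       for (i = 0; i < n; i++)
         a[i + 3] = a[i] + 1;

   yields a dependence distance of 3 between the read a[i] and the write
   a[i + 3].  Starting from *MAX_VF = 8, the "abs (dist) >= 2" branch
   lowers *MAX_VF to 3, after which "abs (dist) >= *max_vf" holds and the
   dependence no longer prevents vectorization with a factor of at most 3.  */
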
431 /* Function vect_analyze_data_ref_dependences.
432
433 Examine all the data references in the loop, and make sure there do not
434 exist any data dependences between them. Set *MAX_VF according to
435 the maximum vectorization factor the data dependences allow. */
436
437 bool
438 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
439 {
440 unsigned int i;
441 struct data_dependence_relation *ddr;
442
443 if (dump_enabled_p ())
444 dump_printf_loc (MSG_NOTE, vect_location,
445 "=== vect_analyze_data_ref_dependences ===");
446
447 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
448 &LOOP_VINFO_DDRS (loop_vinfo),
449 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
450 return false;
451
452 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
453 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
454 return false;
455
456 return true;
457 }
458
459
460 /* Function vect_slp_analyze_data_ref_dependence.
461
 462 Return TRUE if there (might) exist a dependence between a memory-reference
 463 DRA and a memory-reference DRB that prohibits basic-block (SLP)
 464 vectorization.  Unlike the loop analysis above, there is no run-time
 465 versioning for alias and no *MAX_VF adjustment here. */
466
467 static bool
468 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
469 {
470 struct data_reference *dra = DDR_A (ddr);
471 struct data_reference *drb = DDR_B (ddr);
472
473 /* We need to check dependences of statements marked as unvectorizable
 474 as well; they can still prohibit vectorization. */
475
476 /* Independent data accesses. */
477 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
478 return false;
479
480 if (dra == drb)
481 return false;
482
483 /* Read-read is OK. */
484 if (DR_IS_READ (dra) && DR_IS_READ (drb))
485 return false;
486
487 /* If dra and drb are part of the same interleaving chain consider
488 them independent. */
489 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
490 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
491 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
492 return false;
493
494 /* Unknown data dependence. */
495 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
496 {
497 gimple earlier_stmt;
498
499 if (dump_enabled_p ())
500 {
501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
502 "can't determine dependence between ");
503 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
504 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
505 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
506 }
507
508 /* We do not vectorize basic blocks with write-write dependencies. */
509 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
510 return true;
511
512 /* Check that it's not a load-after-store dependence. */
513 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
514 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
515 return true;
516
517 return false;
518 }
519
520 if (dump_enabled_p ())
521 {
522 dump_printf_loc (MSG_NOTE, vect_location,
523 "determined dependence between ");
524 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
525 dump_printf (MSG_NOTE, " and ");
526 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
527 }
528
529 /* Do not vectorize basic blocks with write-write dependences. */
530 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
531 return true;
532
533 /* Check dependence between DRA and DRB for basic block vectorization.
 534 If the accesses share the same base and offset, we can compare their initial
 535 constant offsets to decide whether they differ or not. In case of a read-
536 write dependence we check that the load is before the store to ensure that
537 vectorization will not change the order of the accesses. */
538
539 HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b;
540 gimple earlier_stmt;
541
542 /* Check that the data-refs have same bases and offsets. If not, we can't
543 determine if they are dependent. */
544 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
545 || !dr_equal_offsets_p (dra, drb))
546 return true;
547
548 /* Check the types. */
549 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
550 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
551
552 if (type_size_a != type_size_b
553 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
554 TREE_TYPE (DR_REF (drb))))
555 return true;
556
557 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
558 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
559
560 /* Two different locations - no dependence. */
561 if (init_a != init_b)
562 return false;
563
564 /* We have a read-write dependence. Check that the load is before the store.
 565 When we vectorize basic blocks, a vector load can only be placed before the
 566 corresponding scalar load, and a vector store can only be placed after its
 567 corresponding scalar store. So the order of the accesses is preserved in
 568 case the load is before the store. */
569 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
570 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
571 return false;
572
573 return true;
574 }
575
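/* Example for the constant-offset comparison above (statements assumed to
   sit in one basic block, 4-byte elements):

       x = a[0];     <-- load,  DR_INIT = 0
       a[1] = y;     <-- store, DR_INIT = 4

   Same base, compatible types, different DR_INIT values: the accesses are
   distinct and no dependence is recorded.  If both referred to a[0]
   instead, only the load-before-store order would be accepted.  */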
576
 577 /* Function vect_slp_analyze_data_ref_dependences.
 578 
 579 Examine all the data references in the basic-block, and make sure there
 580 do not exist any data dependences between them. Return FALSE as soon as
 581 a dependence that prohibits SLP vectorization is found. */
582
583 bool
584 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
585 {
586 struct data_dependence_relation *ddr;
587 unsigned int i;
588
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "=== vect_slp_analyze_data_ref_dependences ===");
592
593 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
594 &BB_VINFO_DDRS (bb_vinfo),
595 vNULL, true))
596 return false;
597
598 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
599 if (vect_slp_analyze_data_ref_dependence (ddr))
600 return false;
601
602 return true;
603 }
604
605
606 /* Function vect_compute_data_ref_alignment
607
608 Compute the misalignment of the data reference DR.
609
610 Output:
611 1. If during the misalignment computation it is found that the data reference
612 cannot be vectorized then false is returned.
613 2. DR_MISALIGNMENT (DR) is defined.
614
615 FOR NOW: No analysis is actually performed. Misalignment is calculated
616 only for trivial cases. TODO. */
617
618 static bool
619 vect_compute_data_ref_alignment (struct data_reference *dr)
620 {
621 gimple stmt = DR_STMT (dr);
622 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
623 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
624 struct loop *loop = NULL;
625 tree ref = DR_REF (dr);
626 tree vectype;
627 tree base, base_addr;
628 bool base_aligned;
629 tree misalign;
630 tree aligned_to, alignment;
631
632 if (dump_enabled_p ())
633 dump_printf_loc (MSG_NOTE, vect_location,
634 "vect_compute_data_ref_alignment:");
635
636 if (loop_vinfo)
637 loop = LOOP_VINFO_LOOP (loop_vinfo);
638
639 /* Initialize misalignment to unknown. */
640 SET_DR_MISALIGNMENT (dr, -1);
641
642 /* Strided loads perform only component accesses, misalignment information
643 is irrelevant for them. */
644 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
645 return true;
646
647 misalign = DR_INIT (dr);
648 aligned_to = DR_ALIGNED_TO (dr);
649 base_addr = DR_BASE_ADDRESS (dr);
650 vectype = STMT_VINFO_VECTYPE (stmt_info);
651
652 /* In case the dataref is in an inner-loop of the loop that is being
653 vectorized (LOOP), we use the base and misalignment information
654 relative to the outer-loop (LOOP). This is ok only if the misalignment
655 stays the same throughout the execution of the inner-loop, which is why
 656 we have to check that the stride of the dataref in the inner-loop is a
 657 multiple of the vector size. */
658 if (loop && nested_in_vect_loop_p (loop, stmt))
659 {
660 tree step = DR_STEP (dr);
661 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
662
663 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
664 {
665 if (dump_enabled_p ())
666 dump_printf_loc (MSG_NOTE, vect_location,
667 "inner step divides the vector-size.");
668 misalign = STMT_VINFO_DR_INIT (stmt_info);
669 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
670 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
671 }
672 else
673 {
674 if (dump_enabled_p ())
675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
676 "inner step doesn't divide the vector-size.");
677 misalign = NULL_TREE;
678 }
679 }
680
681 /* Similarly, if we're doing basic-block vectorization, we can only use
682 base and misalignment information relative to an innermost loop if the
683 misalignment stays the same throughout the execution of the loop.
 684 As above, this is the case if the stride of the dataref is a multiple of
 685 the vector size. */
686 if (!loop)
687 {
688 tree step = DR_STEP (dr);
689 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
690
691 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
692 {
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
695 "SLP: step doesn't divide the vector-size.");
696 misalign = NULL_TREE;
697 }
698 }
699
700 base = build_fold_indirect_ref (base_addr);
701 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
702
703 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
704 || !misalign)
705 {
706 if (dump_enabled_p ())
707 {
708 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
709 "Unknown alignment for access: ");
710 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
711 }
712 return true;
713 }
714
715 if ((DECL_P (base)
716 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
717 alignment) >= 0)
718 || (TREE_CODE (base_addr) == SSA_NAME
719 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
720 TREE_TYPE (base_addr)))),
721 alignment) >= 0)
722 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
723 base_aligned = true;
724 else
725 base_aligned = false;
726
727 if (!base_aligned)
728 {
729 /* Do not change the alignment of global variables here if
730 flag_section_anchors is enabled as we already generated
731 RTL for other functions. Most global variables should
732 have been aligned during the IPA increase_alignment pass. */
733 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
734 || (TREE_STATIC (base) && flag_section_anchors))
735 {
736 if (dump_enabled_p ())
737 {
738 dump_printf_loc (MSG_NOTE, vect_location,
739 "can't force alignment of ref: ");
740 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
741 }
742 return true;
743 }
744
745 /* Force the alignment of the decl.
746 NOTE: This is the only change to the code we make during
747 the analysis phase, before deciding to vectorize the loop. */
748 if (dump_enabled_p ())
749 {
750 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
751 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
752 }
753
754 DECL_ALIGN (base) = TYPE_ALIGN (vectype);
755 DECL_USER_ALIGN (base) = 1;
756 }
757
758 /* At this point we assume that the base is aligned. */
759 gcc_assert (base_aligned
760 || (TREE_CODE (base) == VAR_DECL
761 && DECL_ALIGN (base) >= TYPE_ALIGN (vectype)));
762
 763 /* If this is a backward running DR then the first access in the larger
 764 vectype actually is N-1 elements before the address in the DR.
765 Adjust misalign accordingly. */
766 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
767 {
768 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
769 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
770 otherwise we wouldn't be here. */
771 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
772 /* PLUS because DR_STEP was negative. */
773 misalign = size_binop (PLUS_EXPR, misalign, offset);
774 }
775
776 /* Modulo alignment. */
777 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
778
779 if (!host_integerp (misalign, 1))
780 {
781 /* Negative or overflowed misalignment value. */
782 if (dump_enabled_p ())
783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
784 "unexpected misalign value");
785 return false;
786 }
787
788 SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
789
790 if (dump_enabled_p ())
791 {
792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
793 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
794 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
795 }
796
797 return true;
798 }
799
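/* Numerical sketch of the computation above (all values assumed): with a
   V4SI vectype (TYPE_ALIGN = 128 bits, i.e. ALIGNMENT = 16 bytes), an
   aligned base and DR_INIT = 20, the final FLOOR_MOD_EXPR yields a
   misalignment of 20 % 16 = 4 bytes.  For a backward-running reference
   with step -4 the code first adds (4 - 1) * -4 = -12 to the misalignment,
   because the first vector element accessed lies three elements before
   DR_INIT.  */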
800
801 /* Function vect_compute_data_refs_alignment
802
803 Compute the misalignment of data references in the loop.
804 Return FALSE if a data reference is found that cannot be vectorized. */
805
806 static bool
807 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
808 bb_vec_info bb_vinfo)
809 {
810 vec<data_reference_p> datarefs;
811 struct data_reference *dr;
812 unsigned int i;
813
814 if (loop_vinfo)
815 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
816 else
817 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
818
819 FOR_EACH_VEC_ELT (datarefs, i, dr)
820 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
821 && !vect_compute_data_ref_alignment (dr))
822 {
823 if (bb_vinfo)
824 {
825 /* Mark unsupported statement as unvectorizable. */
826 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
827 continue;
828 }
829 else
830 return false;
831 }
832
833 return true;
834 }
835
836
837 /* Function vect_update_misalignment_for_peel
838
839 DR - the data reference whose misalignment is to be adjusted.
840 DR_PEEL - the data reference whose misalignment is being made
841 zero in the vector loop by the peel.
842 NPEEL - the number of iterations in the peel loop if the misalignment
843 of DR_PEEL is known at compile time. */
844
845 static void
846 vect_update_misalignment_for_peel (struct data_reference *dr,
847 struct data_reference *dr_peel, int npeel)
848 {
849 unsigned int i;
850 vec<dr_p> same_align_drs;
851 struct data_reference *current_dr;
852 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
853 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
854 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
855 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
856
857 /* For interleaved data accesses the step in the loop must be multiplied by
858 the size of the interleaving group. */
859 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
860 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
861 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
862 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
863
864 /* It can be assumed that the data refs with the same alignment as dr_peel
865 are aligned in the vector loop. */
866 same_align_drs
867 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
868 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
869 {
870 if (current_dr != dr)
871 continue;
872 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
873 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
874 SET_DR_MISALIGNMENT (dr, 0);
875 return;
876 }
877
878 if (known_alignment_for_access_p (dr)
879 && known_alignment_for_access_p (dr_peel))
880 {
881 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
882 int misal = DR_MISALIGNMENT (dr);
883 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
884 misal += negative ? -npeel * dr_size : npeel * dr_size;
885 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
886 SET_DR_MISALIGNMENT (dr, misal);
887 return;
888 }
889
890 if (dump_enabled_p ())
891 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.");
892 SET_DR_MISALIGNMENT (dr, -1);
893 }
894
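/* Example of the known-alignment update above (figures assumed): peeling
   NPEEL = 3 iterations for a forward 4-byte access whose misalignment was
   4 gives 4 + 3 * 4 = 16, which masked with (TYPE_ALIGN / BITS_PER_UNIT)
   - 1 = 15 becomes 0, i.e. the reference is aligned in the vector loop.  */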
895
896 /* Function vect_verify_datarefs_alignment
897
898 Return TRUE if all data references in the loop can be
899 handled with respect to alignment. */
900
901 bool
902 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
903 {
904 vec<data_reference_p> datarefs;
905 struct data_reference *dr;
906 enum dr_alignment_support supportable_dr_alignment;
907 unsigned int i;
908
909 if (loop_vinfo)
910 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
911 else
912 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
913
914 FOR_EACH_VEC_ELT (datarefs, i, dr)
915 {
916 gimple stmt = DR_STMT (dr);
917 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
918
919 if (!STMT_VINFO_RELEVANT_P (stmt_info))
920 continue;
921
922 /* For interleaving, only the alignment of the first access matters.
923 Skip statements marked as not vectorizable. */
924 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
925 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
926 || !STMT_VINFO_VECTORIZABLE (stmt_info))
927 continue;
928
929 /* Strided loads perform only component accesses, alignment is
930 irrelevant for them. */
931 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
932 continue;
933
934 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
935 if (!supportable_dr_alignment)
936 {
937 if (dump_enabled_p ())
938 {
939 if (DR_IS_READ (dr))
940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
941 "not vectorized: unsupported unaligned load.");
942 else
943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
944 "not vectorized: unsupported unaligned "
945 "store.");
946
947 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
948 DR_REF (dr));
949 }
950 return false;
951 }
952 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
953 dump_printf_loc (MSG_NOTE, vect_location,
954 "Vectorizing an unaligned access.");
955 }
956 return true;
957 }
958
 959 /* Given a memory reference EXP, return whether its alignment is less
960 than its size. */
961
962 static bool
963 not_size_aligned (tree exp)
964 {
965 if (!host_integerp (TYPE_SIZE (TREE_TYPE (exp)), 1))
966 return true;
967
968 return (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (exp)))
969 > get_object_alignment (exp));
970 }
971
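/* E.g. (illustrative): a 4-byte int member of a packed struct has
   TYPE_SIZE = 32 bits while get_object_alignment may report only 8 bits,
   so the reference counts as not size-aligned ("packed") above.  */
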
972 /* Function vector_alignment_reachable_p
973
974 Return true if vector alignment for DR is reachable by peeling
975 a few loop iterations. Return false otherwise. */
976
977 static bool
978 vector_alignment_reachable_p (struct data_reference *dr)
979 {
980 gimple stmt = DR_STMT (dr);
981 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
982 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
983
984 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
985 {
 986 /* For interleaved accesses we peel only if the number of iterations in
 987 the prolog loop ({VF - misalignment}) is a multiple of the
 988 number of interleaved accesses. */
989 int elem_size, mis_in_elements;
990 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
991
992 /* FORNOW: handle only known alignment. */
993 if (!known_alignment_for_access_p (dr))
994 return false;
995
996 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
997 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
998
999 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1000 return false;
1001 }
1002
1003 /* If the misalignment is known at compile time then allow peeling
1004 only if natural alignment is reachable through peeling. */
1005 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1006 {
1007 HOST_WIDE_INT elmsize =
1008 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1009 if (dump_enabled_p ())
1010 {
1011 dump_printf_loc (MSG_NOTE, vect_location,
1012 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1013 dump_printf (MSG_NOTE,
1014 ". misalignment = %d. ", DR_MISALIGNMENT (dr));
1015 }
1016 if (DR_MISALIGNMENT (dr) % elmsize)
1017 {
1018 if (dump_enabled_p ())
1019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1020 "data size does not divide the misalignment.\n");
1021 return false;
1022 }
1023 }
1024
1025 if (!known_alignment_for_access_p (dr))
1026 {
1027 tree type = TREE_TYPE (DR_REF (dr));
1028 bool is_packed = not_size_aligned (DR_REF (dr));
1029 if (dump_enabled_p ())
1030 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1031 "Unknown misalignment, is_packed = %d",is_packed);
1032 if (targetm.vectorize.vector_alignment_reachable (type, is_packed))
1033 return true;
1034 else
1035 return false;
1036 }
1037
1038 return true;
1039 }
1040
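/* Interleaving example for the check above (figures assumed): with a V8HI
   vector (nelements = 8), a misalignment of 3 elements and GROUP_SIZE = 2,
   the prologue would need 8 - 3 = 5 peeled iterations; 5 is not a multiple
   of 2, so vector alignment is not considered reachable for that group.  */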
1041
1042 /* Calculate the cost of the memory access represented by DR. */
1043
1044 static void
1045 vect_get_data_access_cost (struct data_reference *dr,
1046 unsigned int *inside_cost,
1047 unsigned int *outside_cost,
1048 stmt_vector_for_cost *body_cost_vec)
1049 {
1050 gimple stmt = DR_STMT (dr);
1051 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1052 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1053 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1054 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1055 int ncopies = vf / nunits;
1056
1057 if (DR_IS_READ (dr))
1058 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1059 NULL, body_cost_vec, false);
1060 else
1061 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1062
1063 if (dump_enabled_p ())
1064 dump_printf_loc (MSG_NOTE, vect_location,
1065 "vect_get_data_access_cost: inside_cost = %d, "
1066 "outside_cost = %d.", *inside_cost, *outside_cost);
1067 }
1068
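/* For example (numbers assumed): with a vectorization factor of 8 and a
   V4SI data reference (nunits = 4), ncopies above is 8 / 4 = 2, so the
   load or store cost is accumulated for two vector accesses per vectorized
   iteration.  */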
1069
1070 static hashval_t
1071 vect_peeling_hash (const void *elem)
1072 {
1073 const struct _vect_peel_info *peel_info;
1074
1075 peel_info = (const struct _vect_peel_info *) elem;
1076 return (hashval_t) peel_info->npeel;
1077 }
1078
1079
1080 static int
1081 vect_peeling_hash_eq (const void *elem1, const void *elem2)
1082 {
1083 const struct _vect_peel_info *a, *b;
1084
1085 a = (const struct _vect_peel_info *) elem1;
1086 b = (const struct _vect_peel_info *) elem2;
1087 return (a->npeel == b->npeel);
1088 }
1089
1090
1091 /* Insert DR into peeling hash table with NPEEL as key. */
1092
1093 static void
1094 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1095 int npeel)
1096 {
1097 struct _vect_peel_info elem, *slot;
1098 void **new_slot;
1099 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1100
1101 elem.npeel = npeel;
1102 slot = (vect_peel_info) htab_find (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1103 &elem);
1104 if (slot)
1105 slot->count++;
1106 else
1107 {
1108 slot = XNEW (struct _vect_peel_info);
1109 slot->npeel = npeel;
1110 slot->dr = dr;
1111 slot->count = 1;
1112 new_slot = htab_find_slot (LOOP_VINFO_PEELING_HTAB (loop_vinfo), slot,
1113 INSERT);
1114 *new_slot = slot;
1115 }
1116
1117 if (!supportable_dr_alignment && !flag_vect_cost_model)
1118 slot->count += VECT_MAX_COST;
1119 }
1120
1121
1122 /* Traverse the peeling hash table to find the peeling option that aligns
1123 the maximum number of data accesses. */
1124
1125 static int
1126 vect_peeling_hash_get_most_frequent (void **slot, void *data)
1127 {
1128 vect_peel_info elem = (vect_peel_info) *slot;
1129 vect_peel_extended_info max = (vect_peel_extended_info) data;
1130
1131 if (elem->count > max->peel_info.count
1132 || (elem->count == max->peel_info.count
1133 && max->peel_info.npeel > elem->npeel))
1134 {
1135 max->peel_info.npeel = elem->npeel;
1136 max->peel_info.count = elem->count;
1137 max->peel_info.dr = elem->dr;
1138 }
1139
1140 return 1;
1141 }
1142
1143
1144 /* Traverse the peeling hash table and calculate the cost of each peeling
1145 option. Find the one with the lowest cost. */
1146
1147 static int
1148 vect_peeling_hash_get_lowest_cost (void **slot, void *data)
1149 {
1150 vect_peel_info elem = (vect_peel_info) *slot;
1151 vect_peel_extended_info min = (vect_peel_extended_info) data;
1152 int save_misalignment, dummy;
1153 unsigned int inside_cost = 0, outside_cost = 0, i;
1154 gimple stmt = DR_STMT (elem->dr);
1155 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1156 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1157 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1158 struct data_reference *dr;
1159 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1160 int single_iter_cost;
1161
1162 prologue_cost_vec.create (2);
1163 body_cost_vec.create (2);
1164 epilogue_cost_vec.create (2);
1165
1166 FOR_EACH_VEC_ELT (datarefs, i, dr)
1167 {
1168 stmt = DR_STMT (dr);
1169 stmt_info = vinfo_for_stmt (stmt);
1170 /* For interleaving, only the alignment of the first access
1171 matters. */
1172 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1173 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1174 continue;
1175
1176 save_misalignment = DR_MISALIGNMENT (dr);
1177 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1178 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1179 &body_cost_vec);
1180 SET_DR_MISALIGNMENT (dr, save_misalignment);
1181 }
1182
1183 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1184 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1185 &dummy, single_iter_cost,
1186 &prologue_cost_vec,
1187 &epilogue_cost_vec);
1188
1189 /* Prologue and epilogue costs are added to the target model later.
1190 These costs depend only on the scalar iteration cost, the
1191 number of peeling iterations finally chosen, and the number of
1192 misaligned statements. So discard the information found here. */
1193 prologue_cost_vec.release ();
1194 epilogue_cost_vec.release ();
1195
1196 if (inside_cost < min->inside_cost
1197 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1198 {
1199 min->inside_cost = inside_cost;
1200 min->outside_cost = outside_cost;
1201 min->body_cost_vec.release ();
1202 min->body_cost_vec = body_cost_vec;
1203 min->peel_info.dr = elem->dr;
1204 min->peel_info.npeel = elem->npeel;
1205 }
1206 else
1207 body_cost_vec.release ();
1208
1209 return 1;
1210 }
1211
1212
1213 /* Choose the best peeling option by traversing the peeling hash table,
1214 picking either the option with the lowest cost (if the cost model is
1215 enabled) or the option that aligns as many accesses as possible. */
1216
1217 static struct data_reference *
1218 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1219 unsigned int *npeel,
1220 stmt_vector_for_cost *body_cost_vec)
1221 {
1222 struct _vect_peel_extended_info res;
1223
1224 res.peel_info.dr = NULL;
1225 res.body_cost_vec = stmt_vector_for_cost();
1226
1227 if (flag_vect_cost_model)
1228 {
1229 res.inside_cost = INT_MAX;
1230 res.outside_cost = INT_MAX;
1231 htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1232 vect_peeling_hash_get_lowest_cost, &res);
1233 }
1234 else
1235 {
1236 res.peel_info.count = 0;
1237 htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1238 vect_peeling_hash_get_most_frequent, &res);
1239 }
1240
1241 *npeel = res.peel_info.npeel;
1242 *body_cost_vec = res.body_cost_vec;
1243 return res.peel_info.dr;
1244 }
1245
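/* Selection sketch (hypothetical table contents): if the hash table holds
   the options {npeel = 1, count = 3} and {npeel = 5, count = 3}, the
   frequency-based traversal picks npeel = 1 (equal counts, smaller peel),
   whereas with the cost model enabled the option with the lowest
   accumulated inside/outside cost wins instead.  */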
1246
1247 /* Function vect_enhance_data_refs_alignment
1248
1249 This pass will use loop versioning and loop peeling in order to enhance
1250 the alignment of data references in the loop.
1251
1252 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1253 original loop is to be vectorized. Any other loops that are created by
1254 the transformations performed in this pass are not supposed to be
1255 vectorized. This restriction will be relaxed.
1256
1257 This pass will require a cost model to guide it whether to apply peeling
1258 or versioning or a combination of the two. For example, the scheme that
1259 Intel uses when given a loop with several memory accesses, is as follows:
1260 choose one memory access ('p') whose alignment you want to force by doing
1261 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1262 other accesses are not necessarily aligned, or (2) use loop versioning to
1263 generate one loop in which all accesses are aligned, and another loop in
1264 which only 'p' is necessarily aligned.
1265
1266 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1267 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1268 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1269
1270 Devising a cost model is the most critical aspect of this work. It will
1271 guide us on which access to peel for, whether to use loop versioning, how
1272 many versions to create, etc. The cost model will probably consist of
1273 generic considerations as well as target specific considerations (on
1274 powerpc for example, misaligned stores are more painful than misaligned
1275 loads).
1276
1277 Here are the general steps involved in alignment enhancements:
1278
1279 -- original loop, before alignment analysis:
1280 for (i=0; i<N; i++){
1281 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1282 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1283 }
1284
1285 -- After vect_compute_data_refs_alignment:
1286 for (i=0; i<N; i++){
1287 x = q[i]; # DR_MISALIGNMENT(q) = 3
1288 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1289 }
1290
1291 -- Possibility 1: we do loop versioning:
1292 if (p is aligned) {
1293 for (i=0; i<N; i++){ # loop 1A
1294 x = q[i]; # DR_MISALIGNMENT(q) = 3
1295 p[i] = y; # DR_MISALIGNMENT(p) = 0
1296 }
1297 }
1298 else {
1299 for (i=0; i<N; i++){ # loop 1B
1300 x = q[i]; # DR_MISALIGNMENT(q) = 3
1301 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1302 }
1303 }
1304
1305 -- Possibility 2: we do loop peeling:
1306 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1307 x = q[i];
1308 p[i] = y;
1309 }
1310 for (i = 3; i < N; i++){ # loop 2A
1311 x = q[i]; # DR_MISALIGNMENT(q) = 0
1312 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1313 }
1314
1315 -- Possibility 3: combination of loop peeling and versioning:
1316 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1317 x = q[i];
1318 p[i] = y;
1319 }
1320 if (p is aligned) {
1321 for (i = 3; i<N; i++){ # loop 3A
1322 x = q[i]; # DR_MISALIGNMENT(q) = 0
1323 p[i] = y; # DR_MISALIGNMENT(p) = 0
1324 }
1325 }
1326 else {
1327 for (i = 3; i<N; i++){ # loop 3B
1328 x = q[i]; # DR_MISALIGNMENT(q) = 0
1329 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1330 }
1331 }
1332
1333 These loops are later passed to loop_transform to be vectorized. The
1334 vectorizer will use the alignment information to guide the transformation
1335 (whether to generate regular loads/stores, or with special handling for
1336 misalignment). */
1337
1338 bool
1339 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1340 {
1341 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1342 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1343 enum dr_alignment_support supportable_dr_alignment;
1344 struct data_reference *dr0 = NULL, *first_store = NULL;
1345 struct data_reference *dr;
1346 unsigned int i, j;
1347 bool do_peeling = false;
1348 bool do_versioning = false;
1349 bool stat;
1350 gimple stmt;
1351 stmt_vec_info stmt_info;
1352 int vect_versioning_for_alias_required;
1353 unsigned int npeel = 0;
1354 bool all_misalignments_unknown = true;
1355 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1356 unsigned possible_npeel_number = 1;
1357 tree vectype;
1358 unsigned int nelements, mis, same_align_drs_max = 0;
1359 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost();
1360
1361 if (dump_enabled_p ())
1362 dump_printf_loc (MSG_NOTE, vect_location,
1363 "=== vect_enhance_data_refs_alignment ===");
1364
1365 /* While cost model enhancements are expected in the future, the high level
1366 view of the code at this time is as follows:
1367
1368 A) If there is a misaligned access then see if peeling to align
1369 this access can make all data references satisfy
1370 vect_supportable_dr_alignment. If so, update data structures
1371 as needed and return true.
1372
1373 B) If peeling wasn't possible and there is a data reference with an
1374 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1375 then see if loop versioning checks can be used to make all data
1376 references satisfy vect_supportable_dr_alignment. If so, update
1377 data structures as needed and return true.
1378
1379 C) If neither peeling nor versioning were successful then return false if
1380 any data reference does not satisfy vect_supportable_dr_alignment.
1381
1382 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1383
1384 Note, Possibility 3 above (which is peeling and versioning together) is not
1385 being done at this time. */
1386
1387 /* (1) Peeling to force alignment. */
1388
1389 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1390 Considerations:
1391 + How many accesses will become aligned due to the peeling
1392 - How many accesses will become unaligned due to the peeling,
1393 and the cost of misaligned accesses.
1394 - The cost of peeling (the extra runtime checks, the increase
1395 in code size). */
1396
1397 FOR_EACH_VEC_ELT (datarefs, i, dr)
1398 {
1399 stmt = DR_STMT (dr);
1400 stmt_info = vinfo_for_stmt (stmt);
1401
1402 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1403 continue;
1404
1405 /* For interleaving, only the alignment of the first access
1406 matters. */
1407 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1408 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1409 continue;
1410
1411 /* For invariant accesses there is nothing to enhance. */
1412 if (integer_zerop (DR_STEP (dr)))
1413 continue;
1414
1415 /* Strided loads perform only component accesses, alignment is
1416 irrelevant for them. */
1417 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1418 continue;
1419
1420 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1421 do_peeling = vector_alignment_reachable_p (dr);
1422 if (do_peeling)
1423 {
1424 if (known_alignment_for_access_p (dr))
1425 {
1426 unsigned int npeel_tmp;
1427 bool negative = tree_int_cst_compare (DR_STEP (dr),
1428 size_zero_node) < 0;
1429
1430 /* Save info about DR in the hash table. */
1431 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1432 LOOP_VINFO_PEELING_HTAB (loop_vinfo) =
1433 htab_create (1, vect_peeling_hash,
1434 vect_peeling_hash_eq, free);
1435
1436 vectype = STMT_VINFO_VECTYPE (stmt_info);
1437 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1438 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1439 TREE_TYPE (DR_REF (dr))));
1440 npeel_tmp = (negative
1441 ? (mis - nelements) : (nelements - mis))
1442 & (nelements - 1);
1443
1444 /* For multiple types, it is possible that the bigger type access
1445 will have more than one peeling option. E.g., a loop with two
1446 types: one of size (vector size / 4), and the other one of
1447 size (vector size / 8). The vectorization factor will be 8. If both
1448 accesses are misaligned by 3, the first one needs one scalar
1449 iteration to be aligned, and the second one needs 5. But the
1450 first one will also be aligned by peeling 5 scalar
1451 iterations, and in that case both accesses will be aligned.
1452 Hence, except for the immediate peeling amount, we also want
1453 to try adding a full vector size, as long as we don't exceed the
1454 vectorization factor.
1455 We do this automatically for the cost model, since we calculate the cost
1456 for every peeling option. */
1457 if (!flag_vect_cost_model)
1458 possible_npeel_number = vf /nelements;
1459
1460 /* Handle the aligned case. We may decide to align some other
1461 access, making DR unaligned. */
1462 if (DR_MISALIGNMENT (dr) == 0)
1463 {
1464 npeel_tmp = 0;
1465 if (!flag_vect_cost_model)
1466 possible_npeel_number++;
1467 }
1468
1469 for (j = 0; j < possible_npeel_number; j++)
1470 {
1471 gcc_assert (npeel_tmp <= vf);
1472 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1473 npeel_tmp += nelements;
1474 }
1475
1476 all_misalignments_unknown = false;
1477 /* Data-ref that was chosen for the case that all the
1478 misalignments are unknown is not relevant anymore, since we
1479 have a data-ref with known alignment. */
1480 dr0 = NULL;
1481 }
1482 else
1483 {
1484 /* If we don't know all the misalignment values, we prefer
1485 peeling for the data-ref that has the maximum number of data-refs
1486 with the same alignment, unless the target prefers to align
1487 stores over loads. */
1488 if (all_misalignments_unknown)
1489 {
1490 if (same_align_drs_max
1491 < STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ()
1492 || !dr0)
1493 {
1494 same_align_drs_max
1495 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1496 dr0 = dr;
1497 }
1498
1499 if (!first_store && DR_IS_WRITE (dr))
1500 first_store = dr;
1501 }
1502
1503 /* If there are both known and unknown misaligned accesses in the
1504 loop, we choose peeling amount according to the known
1505 accesses. */
1506
1507
1508 if (!supportable_dr_alignment)
1509 {
1510 dr0 = dr;
1511 if (!first_store && DR_IS_WRITE (dr))
1512 first_store = dr;
1513 }
1514 }
1515 }
1516 else
1517 {
1518 if (!aligned_access_p (dr))
1519 {
1520 if (dump_enabled_p ())
1521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1522 "vector alignment may not be reachable");
1523 break;
1524 }
1525 }
1526 }
1527
1528 vect_versioning_for_alias_required
1529 = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
1530
1531 /* Temporarily, if versioning for alias is required, we disable peeling
1532 until we support peeling and versioning. Often peeling for alignment
1533 will require peeling for loop-bound, which in turn requires that we
1534 know how to adjust the loop ivs after the loop. */
1535 if (vect_versioning_for_alias_required
1536 || !vect_can_advance_ivs_p (loop_vinfo)
1537 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1538 do_peeling = false;
1539
1540 if (do_peeling && all_misalignments_unknown
1541 && vect_supportable_dr_alignment (dr0, false))
1542 {
1543
1544 /* Check whether the target prefers to align stores over loads, i.e., if
1545 misaligned stores are more expensive than misaligned loads (taking
1546 drs with the same alignment into account). */
1547 if (first_store && DR_IS_READ (dr0))
1548 {
1549 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1550 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1551 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1552 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1553 stmt_vector_for_cost dummy;
1554 dummy.create (2);
1555
1556 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1557 &dummy);
1558 vect_get_data_access_cost (first_store, &store_inside_cost,
1559 &store_outside_cost, &dummy);
1560
1561 dummy.release ();
1562
1563 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1564 aligning the load DR0). */
1565 load_inside_penalty = store_inside_cost;
1566 load_outside_penalty = store_outside_cost;
1567 for (i = 0;
1568 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1569 DR_STMT (first_store))).iterate (i, &dr);
1570 i++)
1571 if (DR_IS_READ (dr))
1572 {
1573 load_inside_penalty += load_inside_cost;
1574 load_outside_penalty += load_outside_cost;
1575 }
1576 else
1577 {
1578 load_inside_penalty += store_inside_cost;
1579 load_outside_penalty += store_outside_cost;
1580 }
1581
1582 /* Calculate the penalty for leaving DR0 unaligned (by
1583 aligning the FIRST_STORE). */
1584 store_inside_penalty = load_inside_cost;
1585 store_outside_penalty = load_outside_cost;
1586 for (i = 0;
1587 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1588 DR_STMT (dr0))).iterate (i, &dr);
1589 i++)
1590 if (DR_IS_READ (dr))
1591 {
1592 store_inside_penalty += load_inside_cost;
1593 store_outside_penalty += load_outside_cost;
1594 }
1595 else
1596 {
1597 store_inside_penalty += store_inside_cost;
1598 store_outside_penalty += store_outside_cost;
1599 }
1600
1601 if (load_inside_penalty > store_inside_penalty
1602 || (load_inside_penalty == store_inside_penalty
1603 && load_outside_penalty > store_outside_penalty))
1604 dr0 = first_store;
1605 }
1606
1607 /* In case there are only loads with different unknown misalignments, use
1608 peeling only if it may help to align other accesses in the loop. */
1609 if (!first_store
1610 && !STMT_VINFO_SAME_ALIGN_REFS (
1611 vinfo_for_stmt (DR_STMT (dr0))).length ()
1612 && vect_supportable_dr_alignment (dr0, false)
1613 != dr_unaligned_supported)
1614 do_peeling = false;
1615 }
1616
1617 if (do_peeling && !dr0)
1618 {
1619 /* Peeling is possible, but no data access strictly requires alignment in
1620 order to be vectorized. So we try to choose the best possible peeling. */
1621
1622 /* We should get here only if there are drs with known misalignment. */
1623 gcc_assert (!all_misalignments_unknown);
1624
1625 /* Choose the best peeling from the hash table. */
1626 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1627 &body_cost_vec);
1628 if (!dr0 || !npeel)
1629 do_peeling = false;
1630 }
1631
1632 if (do_peeling)
1633 {
1634 stmt = DR_STMT (dr0);
1635 stmt_info = vinfo_for_stmt (stmt);
1636 vectype = STMT_VINFO_VECTYPE (stmt_info);
1637 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1638
1639 if (known_alignment_for_access_p (dr0))
1640 {
1641 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1642 size_zero_node) < 0;
1643 if (!npeel)
1644 {
1645 /* Since it's known at compile time, compute the number of
1646 iterations in the peeled loop (the peeling factor) for use in
1647 updating DR_MISALIGNMENT values. The peeling factor is the
1648 vectorization factor minus the misalignment as an element
1649 count. */
1650 mis = DR_MISALIGNMENT (dr0);
1651 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1652 npeel = ((negative ? mis - nelements : nelements - mis)
1653 & (nelements - 1));
1654 }
1655
1656 /* For interleaved data access every iteration accesses all the
1657 members of the group, therefore we divide the number of iterations
1658 by the group size. */
1659 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1660 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1661 npeel /= GROUP_SIZE (stmt_info);
1662
1663 if (dump_enabled_p ())
1664 dump_printf_loc (MSG_NOTE, vect_location,
1665 "Try peeling by %d", npeel);
1666 }
1667
1668 /* Ensure that all data refs can be vectorized after the peel. */
1669 FOR_EACH_VEC_ELT (datarefs, i, dr)
1670 {
1671 int save_misalignment;
1672
1673 if (dr == dr0)
1674 continue;
1675
1676 stmt = DR_STMT (dr);
1677 stmt_info = vinfo_for_stmt (stmt);
1678 /* For interleaving, only the alignment of the first access
1679 matters. */
1680 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1681 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1682 continue;
1683
1684 /* Strided loads perform only component accesses, alignment is
1685 irrelevant for them. */
1686 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1687 continue;
1688
1689 save_misalignment = DR_MISALIGNMENT (dr);
1690 vect_update_misalignment_for_peel (dr, dr0, npeel);
1691 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1692 SET_DR_MISALIGNMENT (dr, save_misalignment);
1693
1694 if (!supportable_dr_alignment)
1695 {
1696 do_peeling = false;
1697 break;
1698 }
1699 }
1700
1701 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1702 {
1703 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1704 if (!stat)
1705 do_peeling = false;
1706 else
1707 {
1708 body_cost_vec.release ();
1709 return stat;
1710 }
1711 }
1712
1713 if (do_peeling)
1714 {
1715 stmt_info_for_cost *si;
1716 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1717
1718 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1719 If the misalignment of DR_i is identical to that of dr0 then set
1720 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1721 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1722 by the peeling factor times the element size of DR_i (MOD the
1723 vectorization factor times the size). Otherwise, the
1724 misalignment of DR_i must be set to unknown. */
1725 FOR_EACH_VEC_ELT (datarefs, i, dr)
1726 if (dr != dr0)
1727 vect_update_misalignment_for_peel (dr, dr0, npeel);
1728
1729 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1730 if (npeel)
1731 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1732 else
1733 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
1734 SET_DR_MISALIGNMENT (dr0, 0);
1735 if (dump_enabled_p ())
1736 {
1737 dump_printf_loc (MSG_NOTE, vect_location,
1738 "Alignment of access forced using peeling.");
1739 dump_printf_loc (MSG_NOTE, vect_location,
1740 "Peeling for alignment will be applied.");
1741 }
1742 /* We've delayed passing the inside-loop peeling costs to the
1743 target cost model until we were sure peeling would happen.
1744 Do so now. */
1745 if (body_cost_vec.exists ())
1746 {
1747 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1748 {
1749 struct _stmt_vec_info *stmt_info
1750 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1751 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1752 si->misalign, vect_body);
1753 }
1754 body_cost_vec.release ();
1755 }
1756
1757 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1758 gcc_assert (stat);
1759 return stat;
1760 }
1761 }
1762
1763 body_cost_vec.release ();
1764
1765 /* (2) Versioning to force alignment. */
1766
1767 /* Try versioning if:
1768 1) flag_tree_vect_loop_version is TRUE
1769 2) optimize loop for speed
1770 3) there is at least one unsupported misaligned data ref with an unknown
1771 misalignment, and
1772 4) all misaligned data refs with a known misalignment are supported, and
1773 5) the number of runtime alignment checks is within reason. */
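/* As an illustrative sketch (not the exact generated IL), the runtime
   check emitted for versioning with LOOP_VINFO_PTR_MASK == 15 tests
   something like

       if (((addr_a | addr_b) & 15) == 0)
         ... vectorized loop ...
       else
         ... scalar loop ...

   i.e. the ORed addresses of all may-misalign refs must have their low
   bits clear.  */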
1774
1775 do_versioning =
1776 flag_tree_vect_loop_version
1777 && optimize_loop_nest_for_speed_p (loop)
1778 && (!loop->inner); /* FORNOW */
1779
1780 if (do_versioning)
1781 {
1782 FOR_EACH_VEC_ELT (datarefs, i, dr)
1783 {
1784 stmt = DR_STMT (dr);
1785 stmt_info = vinfo_for_stmt (stmt);
1786
1787 /* For interleaving, only the alignment of the first access
1788 matters. */
1789 if (aligned_access_p (dr)
1790 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1791 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1792 continue;
1793
1794 /* Strided loads perform only component accesses; alignment is
1795 irrelevant for them. */
1796 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1797 continue;
1798
1799 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1800
1801 if (!supportable_dr_alignment)
1802 {
1803 gimple stmt;
1804 int mask;
1805 tree vectype;
1806
1807 if (known_alignment_for_access_p (dr)
1808 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1809 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1810 {
1811 do_versioning = false;
1812 break;
1813 }
1814
1815 stmt = DR_STMT (dr);
1816 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1817 gcc_assert (vectype);
1818
1819 /* The rightmost bits of an aligned address must be zeros.
1820 Construct the mask needed for this test. For example,
1821 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1822 mask must be 15 = 0xf. */
1823 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1824
1825 /* FORNOW: use the same mask to test all potentially unaligned
1826 references in the loop. The vectorizer currently supports
1827 a single vector size, see the reference to
1828 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1829 vectorization factor is computed. */
1830 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1831 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1832 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1833 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1834 DR_STMT (dr));
1835 }
1836 }
1837
1838 /* Versioning requires at least one misaligned data reference. */
1839 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1840 do_versioning = false;
1841 else if (!do_versioning)
1842 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1843 }
1844
1845 if (do_versioning)
1846 {
1847 vec<gimple> may_misalign_stmts
1848 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1849 gimple stmt;
1850
1851 /* It can now be assumed that the data references in the statements
1852 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1853 of the loop being vectorized. */
1854 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1855 {
1856 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1857 dr = STMT_VINFO_DATA_REF (stmt_info);
1858 SET_DR_MISALIGNMENT (dr, 0);
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_NOTE, vect_location,
1861 "Alignment of access forced using versioning.");
1862 }
1863
1864 if (dump_enabled_p ())
1865 dump_printf_loc (MSG_NOTE, vect_location,
1866 "Versioning for alignment will be applied.");
1867
1868 /* Peeling and versioning can't be done together at this time. */
1869 gcc_assert (! (do_peeling && do_versioning));
1870
1871 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1872 gcc_assert (stat);
1873 return stat;
1874 }
1875
1876 /* This point is reached if neither peeling nor versioning is being done. */
1877 gcc_assert (! (do_peeling || do_versioning));
1878
1879 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1880 return stat;
1881 }
1882
1883
1884 /* Function vect_find_same_alignment_drs.
1885
1886 Update group and alignment relations according to the chosen
1887 vectorization factor. */
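/* For example (hypothetical): with a vectorization factor of 4 and
   4-byte elements, accesses a[i] and a[i+4] have a dependence distance
   of 4, which is 0 modulo the VF, so they are recorded as having the
   same alignment.  */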
1888
1889 static void
1890 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1891 loop_vec_info loop_vinfo)
1892 {
1893 unsigned int i;
1894 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1895 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1896 struct data_reference *dra = DDR_A (ddr);
1897 struct data_reference *drb = DDR_B (ddr);
1898 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1899 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1900 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1901 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1902 lambda_vector dist_v;
1903 unsigned int loop_depth;
1904
1905 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1906 return;
1907
1908 if (dra == drb)
1909 return;
1910
1911 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1912 return;
1913
1914 /* Loop-based vectorization and known data dependence. */
1915 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1916 return;
1917
1918 /* Data-dependence analysis reports a distance vector of zero
1919 for data-references that overlap only in the first iteration
1920 but have different sign step (see PR45764).
1921 So as a sanity check require equal DR_STEP. */
1922 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1923 return;
1924
1925 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1926 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1927 {
1928 int dist = dist_v[loop_depth];
1929
1930 if (dump_enabled_p ())
1931 dump_printf_loc (MSG_NOTE, vect_location,
1932 "dependence distance = %d.", dist);
1933
1934 /* Same loop iteration. */
1935 if (dist == 0
1936 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1937 {
1938 /* Two references with distance zero have the same alignment. */
1939 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1940 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1941 if (dump_enabled_p ())
1942 {
1943 dump_printf_loc (MSG_NOTE, vect_location,
1944 "accesses have the same alignment.");
1945 dump_printf (MSG_NOTE,
1946 "dependence distance modulo vf == 0 between ");
1947 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1948 dump_printf (MSG_NOTE, " and ");
1949 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1950 }
1951 }
1952 }
1953 }
1954
1955
1956 /* Function vect_analyze_data_refs_alignment
1957
1958 Analyze the alignment of the data-references in the loop.
1959 Return FALSE if a data reference is found that cannot be vectorized. */
1960
1961 bool
1962 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1963 bb_vec_info bb_vinfo)
1964 {
1965 if (dump_enabled_p ())
1966 dump_printf_loc (MSG_NOTE, vect_location,
1967 "=== vect_analyze_data_refs_alignment ===");
1968
1969 /* Mark groups of data references with same alignment using
1970 data dependence information. */
1971 if (loop_vinfo)
1972 {
1973 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1974 struct data_dependence_relation *ddr;
1975 unsigned int i;
1976
1977 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1978 vect_find_same_alignment_drs (ddr, loop_vinfo);
1979 }
1980
1981 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1982 {
1983 if (dump_enabled_p ())
1984 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1985 "not vectorized: can't calculate alignment "
1986 "for data ref.");
1987 return false;
1988 }
1989
1990 return true;
1991 }
1992
1993
1994 /* Analyze groups of accesses: check that DR belongs to a group of
1995 accesses of legal size, step, etc. Detect gaps, single element
1996 interleaving, and other special cases. Set grouped access info.
1997 Collect groups of strided stores for further use in SLP analysis. */
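/* As a hypothetical illustration, for the loop

     for (i = 0; i < n; i++)
       {
         a[3*i]     = x;
         a[3*i + 1] = y;
         a[3*i + 2] = z;
       }

   each store has DR_STEP == 3 * sizeof (a[0]), so the three stores form
   a single interleaving group with GROUP_SIZE == 3 and no gaps.  */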
1998
1999 static bool
2000 vect_analyze_group_access (struct data_reference *dr)
2001 {
2002 tree step = DR_STEP (dr);
2003 tree scalar_type = TREE_TYPE (DR_REF (dr));
2004 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2005 gimple stmt = DR_STMT (dr);
2006 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2007 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2008 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2009 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2010 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2011 bool slp_impossible = false;
2012 struct loop *loop = NULL;
2013
2014 if (loop_vinfo)
2015 loop = LOOP_VINFO_LOOP (loop_vinfo);
2016
2017 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2018 size of the interleaving group (including gaps). */
2019 groupsize = dr_step / type_size;
2020
2021 /* A non-consecutive access is possible only if it is part of interleaving. */
2022 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2023 {
2024 /* Check whether this DR is part of an interleaving group, and is the
2025 single element of the group that is accessed in the loop. */
2026
2027 /* Gaps are supported only for loads. STEP must be a multiple of the type
2028 size. The size of the group must be a power of 2. */
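 	  /* E.g. (hypothetical): a load "... = a[2*i]" touching only every
 	     other element has DR_STEP == 2 * sizeof (a[0]) and no other member
 	     in its group, so it is detected as single element interleaving
 	     with GROUP_SIZE == 2 and a gap of one element.  */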
2029 if (DR_IS_READ (dr)
2030 && (dr_step % type_size) == 0
2031 && groupsize > 0
2032 && exact_log2 (groupsize) != -1)
2033 {
2034 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2035 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2036 if (dump_enabled_p ())
2037 {
2038 dump_printf_loc (MSG_NOTE, vect_location,
2039 "Detected single element interleaving ");
2040 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2041 dump_printf (MSG_NOTE, " step ");
2042 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2043 }
2044
2045 if (loop_vinfo)
2046 {
2047 if (dump_enabled_p ())
2048 dump_printf_loc (MSG_NOTE, vect_location,
2049 "Data access with gaps requires scalar "
2050 "epilogue loop");
2051 if (loop->inner)
2052 {
2053 if (dump_enabled_p ())
2054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2055 "Peeling for outer loop is not"
2056 " supported");
2057 return false;
2058 }
2059
2060 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2061 }
2062
2063 return true;
2064 }
2065
2066 if (dump_enabled_p ())
2067 {
2068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2069 "not consecutive access ");
2070 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2071 }
2072
2073 if (bb_vinfo)
2074 {
2075 /* Mark the statement as unvectorizable. */
2076 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2077 return true;
2078 }
2079
2080 return false;
2081 }
2082
2083 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2084 {
2085 /* First stmt in the interleaving chain. Check the chain. */
2086 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2087 struct data_reference *data_ref = dr;
2088 unsigned int count = 1;
2089 tree next_step;
2090 tree prev_init = DR_INIT (data_ref);
2091 gimple prev = stmt;
2092 HOST_WIDE_INT diff, count_in_bytes, gaps = 0;
2093
2094 while (next)
2095 {
2096 /* Skip same data-refs. In case two or more stmts share a
2097 data-ref (supported only for loads), we vectorize only the first
2098 stmt, and the rest get their vectorized loads from the first
2099 one. */
2100 if (!tree_int_cst_compare (DR_INIT (data_ref),
2101 DR_INIT (STMT_VINFO_DATA_REF (
2102 vinfo_for_stmt (next)))))
2103 {
2104 if (DR_IS_WRITE (data_ref))
2105 {
2106 if (dump_enabled_p ())
2107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2108 "Two store stmts share the same dr.");
2109 return false;
2110 }
2111
2112 /* Check that there are no load-store dependences for these loads,
2113 to prevent a load-store-load sequence to the same location. */
2114 if (GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (next))
2115 || GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (prev)))
2116 {
2117 if (dump_enabled_p ())
2118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2119 "READ_WRITE dependence in interleaving.");
2120 return false;
2121 }
2122
2123 /* For load use the same data-ref load. */
2124 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2125
2126 prev = next;
2127 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2128 continue;
2129 }
2130
2131 prev = next;
2132
2133 /* Check that all the accesses have the same STEP. */
2134 next_step = DR_STEP (STMT_VINFO_DATA_REF (vinfo_for_stmt (next)));
2135 if (tree_int_cst_compare (step, next_step))
2136 {
2137 if (dump_enabled_p ())
2138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2139 "not consecutive access in interleaving");
2140 return false;
2141 }
2142
2143 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2144 /* Check that the distance between two accesses is equal to the type
2145 size. Otherwise, we have gaps. */
2146 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2147 - TREE_INT_CST_LOW (prev_init)) / type_size;
2148 if (diff != 1)
2149 {
2150 /* FORNOW: SLP of accesses with gaps is not supported. */
2151 slp_impossible = true;
2152 if (DR_IS_WRITE (data_ref))
2153 {
2154 if (dump_enabled_p ())
2155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2156 "interleaved store with gaps");
2157 return false;
2158 }
2159
2160 gaps += diff - 1;
2161 }
2162
2163 last_accessed_element += diff;
2164
2165 /* Store the gap from the previous member of the group. If there is no
2166 gap in the access, GROUP_GAP is always 1. */
2167 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2168
2169 prev_init = DR_INIT (data_ref);
2170 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2171 /* Count the number of data-refs in the chain. */
2172 count++;
2173 }
2174
2175 /* COUNT is the number of accesses found; we multiply it by the size of
2176 the type to get COUNT_IN_BYTES. */
2177 count_in_bytes = type_size * count;
2178
2179 /* Check that the size of the interleaving (including gaps) is not
2180 greater than STEP. */
2181 if (dr_step && dr_step < count_in_bytes + gaps * type_size)
2182 {
2183 if (dump_enabled_p ())
2184 {
2185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2186 "interleaving size is greater than step for ");
2187 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dr));
2188 }
2189 return false;
2190 }
2191
2192 /* Check that the size of the interleaving is equal to STEP for stores,
2193 i.e., that there are no gaps. */
2194 if (dr_step && dr_step != count_in_bytes)
2195 {
2196 if (DR_IS_READ (dr))
2197 {
2198 slp_impossible = true;
2199 /* There is a gap after the last load in the group. This gap is a
2200 difference between the groupsize and the number of elements.
2201 When there is no gap, this difference should be 0. */
2202 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2203 }
2204 else
2205 {
2206 if (dump_enabled_p ())
2207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2208 "interleaved store with gaps");
2209 return false;
2210 }
2211 }
2212
2213 /* Check that STEP is a multiple of type size. */
2214 if (dr_step && (dr_step % type_size) != 0)
2215 {
2216 if (dump_enabled_p ())
2217 {
2218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2219 "step is not a multiple of type size: step ");
2220 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2221 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2222 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2223 TYPE_SIZE_UNIT (scalar_type));
2224 }
2225 return false;
2226 }
2227
2228 if (groupsize == 0)
2229 groupsize = count;
2230
2231 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2232 if (dump_enabled_p ())
2233 dump_printf_loc (MSG_NOTE, vect_location,
2234 "Detected interleaving of size %d", (int)groupsize);
2235
2236 /* SLP: create an SLP data structure for every interleaving group of
2237 stores for further analysis in vect_analyze_slp. */
2238 if (DR_IS_WRITE (dr) && !slp_impossible)
2239 {
2240 if (loop_vinfo)
2241 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2242 if (bb_vinfo)
2243 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2244 }
2245
2246 /* There is a gap at the end of the group. */
2247 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2248 {
2249 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2251 "Data access with gaps requires scalar "
2252 "epilogue loop");
2253 if (loop->inner)
2254 {
2255 if (dump_enabled_p ())
2256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2257 "Peeling for outer loop is not supported");
2258 return false;
2259 }
2260
2261 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2262 }
2263 }
2264
2265 return true;
2266 }
2267
2268
2269 /* Analyze the access pattern of the data-reference DR.
2270 In case of non-consecutive accesses call vect_analyze_group_access() to
2271 analyze groups of accesses. */
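/* As a rough (hypothetical) illustration: with 4-byte elements, a[i] has
   DR_STEP == 4 and is consecutive; a[n-i] has DR_STEP == -4 and is
   treated as a reverse consecutive access; a[2*i] has DR_STEP == 8 and
   is handed to vect_analyze_group_access.  */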
2272
2273 static bool
2274 vect_analyze_data_ref_access (struct data_reference *dr)
2275 {
2276 tree step = DR_STEP (dr);
2277 tree scalar_type = TREE_TYPE (DR_REF (dr));
2278 gimple stmt = DR_STMT (dr);
2279 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2280 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2281 struct loop *loop = NULL;
2282
2283 if (loop_vinfo)
2284 loop = LOOP_VINFO_LOOP (loop_vinfo);
2285
2286 if (loop_vinfo && !step)
2287 {
2288 if (dump_enabled_p ())
2289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2290 "bad data-ref access in loop");
2291 return false;
2292 }
2293
2294 /* Allow invariant loads in loops. */
2295 if (loop_vinfo && integer_zerop (step))
2296 {
2297 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2298 return DR_IS_READ (dr);
2299 }
2300
2301 if (loop && nested_in_vect_loop_p (loop, stmt))
2302 {
2303 /* Interleaved accesses are not yet supported within outer-loop
2304 vectorization for references in the inner-loop. */
2305 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2306
2307 /* For the rest of the analysis we use the outer-loop step. */
2308 step = STMT_VINFO_DR_STEP (stmt_info);
2309 if (integer_zerop (step))
2310 {
2311 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_NOTE, vect_location,
2313 "zero step in outer loop.");
2314 if (DR_IS_READ (dr))
2315 return true;
2316 else
2317 return false;
2318 }
2319 }
2320
2321 /* Consecutive? */
2322 if (TREE_CODE (step) == INTEGER_CST)
2323 {
2324 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2325 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2326 || (dr_step < 0
2327 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2328 {
2329 /* Mark that it is not interleaving. */
2330 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2331 return true;
2332 }
2333 }
2334
2335 if (loop && nested_in_vect_loop_p (loop, stmt))
2336 {
2337 if (dump_enabled_p ())
2338 dump_printf_loc (MSG_NOTE, vect_location,
2339 "grouped access in outer loop.");
2340 return false;
2341 }
2342
2343 /* Assume this is a DR handled by the non-constant strided load case. */
2344 if (TREE_CODE (step) != INTEGER_CST)
2345 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2346
2347 /* Not consecutive access - check if it's a part of interleaving group. */
2348 return vect_analyze_group_access (dr);
2349 }
2350
2351 /* qsort comparison function for two data-references DRA and DRB, used to
2352 order them into chunks suitable for grouping. */
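/* For instance (a hypothetical case): data-refs a[i] and a[i+1] normally
   hash to the same base and offset, so they end up adjacent and are then
   ordered by DR_INIT (a[i] before a[i+1]), while references to a
   different base such as b[i] usually sort into a separate run.  */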
2353
2354 static int
2355 dr_group_sort_cmp (const void *dra_, const void *drb_)
2356 {
2357 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2358 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2359 hashval_t h1, h2;
2360 int cmp;
2361
2362 /* Stabilize sort. */
2363 if (dra == drb)
2364 return 0;
2365
2366 /* Ordering of DRs according to base. */
2367 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2368 {
2369 h1 = iterative_hash_expr (DR_BASE_ADDRESS (dra), 0);
2370 h2 = iterative_hash_expr (DR_BASE_ADDRESS (drb), 0);
2371 if (h1 != h2)
2372 return h1 < h2 ? -1 : 1;
2373 }
2374
2375 /* And according to DR_OFFSET. */
2376 if (!dr_equal_offsets_p (dra, drb))
2377 {
2378 h1 = iterative_hash_expr (DR_OFFSET (dra), 0);
2379 h2 = iterative_hash_expr (DR_OFFSET (drb), 0);
2380 if (h1 != h2)
2381 return h1 < h2 ? -1 : 1;
2382 }
2383
2384 /* Put reads before writes. */
2385 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2386 return DR_IS_READ (dra) ? -1 : 1;
2387
2388 /* Then sort after access size. */
2389 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2390 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2391 {
2392 h1 = iterative_hash_expr (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 0);
2393 h2 = iterative_hash_expr (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0);
2394 if (h1 != h2)
2395 return h1 < h2 ? -1 : 1;
2396 }
2397
2398 /* And after step. */
2399 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2400 {
2401 h1 = iterative_hash_expr (DR_STEP (dra), 0);
2402 h2 = iterative_hash_expr (DR_STEP (drb), 0);
2403 if (h1 != h2)
2404 return h1 < h2 ? -1 : 1;
2405 }
2406
2407 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2408 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2409 if (cmp == 0)
2410 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2411 return cmp;
2412 }
2413
2414 /* Function vect_analyze_data_ref_accesses.
2415
2416 Analyze the access pattern of all the data references in the loop.
2417
2418 FORNOW: the only access pattern that is considered vectorizable is a
2419 simple step 1 (consecutive) access.
2420
2421 FORNOW: handle only arrays and pointer accesses. */
2422
2423 bool
2424 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2425 {
2426 unsigned int i;
2427 vec<data_reference_p> datarefs;
2428 struct data_reference *dr;
2429
2430 if (dump_enabled_p ())
2431 dump_printf_loc (MSG_NOTE, vect_location,
2432 "=== vect_analyze_data_ref_accesses ===");
2433
2434 if (loop_vinfo)
2435 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2436 else
2437 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2438
2439 if (datarefs.is_empty ())
2440 return true;
2441
2442 /* Sort the array of datarefs to make building the interleaving chains
2443 linear. */
2444 qsort (datarefs.address(), datarefs.length (),
2445 sizeof (data_reference_p), dr_group_sort_cmp);
2446
2447 /* Build the interleaving chains. */
2448 for (i = 0; i < datarefs.length () - 1;)
2449 {
2450 data_reference_p dra = datarefs[i];
2451 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2452 stmt_vec_info lastinfo = NULL;
2453 for (i = i + 1; i < datarefs.length (); ++i)
2454 {
2455 data_reference_p drb = datarefs[i];
2456 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2457
2458 /* ??? Imperfect sorting (non-compatible types, non-modulo
2459 accesses, same accesses) can lead to a group being artificially
2460 split here as we don't just skip over those. If it really
2461 matters we can push those to a worklist and re-iterate
2462 over them. Then we can just skip ahead to the next DR here. */
2463
2464 /* Check that the data-refs have the same first location (except init)
2465 and that they are both either stores or loads (not a load and a store). */
2466 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2467 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2468 DR_BASE_ADDRESS (drb), 0)
2469 || !dr_equal_offsets_p (dra, drb))
2470 break;
2471
2472 /* Check that the data-refs have the same constant size and step. */
2473 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2474 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2475 if (!host_integerp (sza, 1)
2476 || !host_integerp (szb, 1)
2477 || !tree_int_cst_equal (sza, szb)
2478 || !host_integerp (DR_STEP (dra), 0)
2479 || !host_integerp (DR_STEP (drb), 0)
2480 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2481 break;
2482
2483 /* Do not place the same access in the interleaving chain twice. */
2484 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2485 break;
2486
2487 /* Check the types are compatible.
2488 ??? We don't distinguish this during sorting. */
2489 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2490 TREE_TYPE (DR_REF (drb))))
2491 break;
2492
2493 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2494 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2495 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2496 gcc_assert (init_a < init_b);
2497
2498 /* If init_b == init_a + the size of the type * k, we have an
2499 interleaving, and DRA is accessed before DRB. */
2500 HOST_WIDE_INT type_size_a = TREE_INT_CST_LOW (sza);
2501 if ((init_b - init_a) % type_size_a != 0)
2502 break;
2503
2504 /* To stay in one group, the step (if not zero) must be greater than the
2505 difference between the data-refs' inits; this splits groups into suitable sizes. */
2506 HOST_WIDE_INT step = TREE_INT_CST_LOW (DR_STEP (dra));
2507 if (step != 0 && step <= (init_b - init_a))
2508 break;
2509
2510 if (dump_enabled_p ())
2511 {
2512 dump_printf_loc (MSG_NOTE, vect_location,
2513 "Detected interleaving ");
2514 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2515 dump_printf (MSG_NOTE, " and ");
2516 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2517 }
2518
2519 /* Link the found element into the group list. */
2520 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2521 {
2522 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2523 lastinfo = stmtinfo_a;
2524 }
2525 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2526 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2527 lastinfo = stmtinfo_b;
2528 }
2529 }
2530
2531 FOR_EACH_VEC_ELT (datarefs, i, dr)
2532 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2533 && !vect_analyze_data_ref_access (dr))
2534 {
2535 if (dump_enabled_p ())
2536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2537 "not vectorized: complicated access pattern.");
2538
2539 if (bb_vinfo)
2540 {
2541 /* Mark the statement as not vectorizable. */
2542 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2543 continue;
2544 }
2545 else
2546 return false;
2547 }
2548
2549 return true;
2550 }
2551
2552 /* Function vect_prune_runtime_alias_test_list.
2553
2554 Prune a list of ddrs to be tested at run-time by versioning for alias.
2555 Return FALSE if the resulting list of ddrs is longer than allowed by
2556 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2557
2558 bool
2559 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2560 {
2561 vec<ddr_p> ddrs =
2562 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2563 unsigned i, j;
2564
2565 if (dump_enabled_p ())
2566 dump_printf_loc (MSG_NOTE, vect_location,
2567 "=== vect_prune_runtime_alias_test_list ===");
2568
2569 for (i = 0; i < ddrs.length (); )
2570 {
2571 bool found;
2572 ddr_p ddr_i;
2573
2574 ddr_i = ddrs[i];
2575 found = false;
2576
2577 for (j = 0; j < i; j++)
2578 {
2579 ddr_p ddr_j = ddrs[j];
2580
2581 if (vect_vfa_range_equal (ddr_i, ddr_j))
2582 {
2583 if (dump_enabled_p ())
2584 {
2585 dump_printf_loc (MSG_NOTE, vect_location,
2586 "found equal ranges ");
2587 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_i)));
2588 dump_printf (MSG_NOTE, ", ");
2589 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_i)));
2590 dump_printf (MSG_NOTE, " and ");
2591 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_j)));
2592 dump_printf (MSG_NOTE, ", ");
2593 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_j)));
2594 }
2595 found = true;
2596 break;
2597 }
2598 }
2599
2600 if (found)
2601 {
2602 ddrs.ordered_remove (i);
2603 continue;
2604 }
2605 i++;
2606 }
2607
2608 if (ddrs.length () >
2609 (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2610 {
2611 if (dump_enabled_p ())
2612 {
2613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2614 "disable versioning for alias - max number of "
2615 "generated checks exceeded.");
2616 }
2617
2618 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
2619
2620 return false;
2621 }
2622
2623 return true;
2624 }
2625
2626 /* Check whether a non-affine read in stmt is suitable for gather load
2627 and if so, return a builtin decl for that operation. */
2628
2629 tree
2630 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2631 tree *offp, int *scalep)
2632 {
2633 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2634 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2635 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2636 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2637 tree offtype = NULL_TREE;
2638 tree decl, base, off;
2639 enum machine_mode pmode;
2640 int punsignedp, pvolatilep;
2641
2642 /* The gather builtins need an address of the form
2643 loop_invariant + vector * {1, 2, 4, 8}
2644 or
2645 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2646 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2647 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2648 multiplications and additions in it. To get a vector, we need
2649 a single SSA_NAME that will be defined in the loop and will
2650 contain everything that is not loop invariant and that can be
2651 vectorized. The following code attempts to find such a preexisting
2652 SSA_NAME OFF and put the loop invariants into a tree BASE
2653 that can be gimplified before the loop. */
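  /* E.g. (hypothetical): for a read "x = f[idx[i]]" with "float f[]", the
     loop-invariant BASE ends up as &f plus any constant offset, OFF as
     the SSA_NAME holding the (possibly sign-extended) idx[i] value, and
     SCALE as 4, i.e. sizeof (float).  */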
2654 base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
2655 &pmode, &punsignedp, &pvolatilep, false);
2656 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2657
2658 if (TREE_CODE (base) == MEM_REF)
2659 {
2660 if (!integer_zerop (TREE_OPERAND (base, 1)))
2661 {
2662 if (off == NULL_TREE)
2663 {
2664 double_int moff = mem_ref_offset (base);
2665 off = double_int_to_tree (sizetype, moff);
2666 }
2667 else
2668 off = size_binop (PLUS_EXPR, off,
2669 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2670 }
2671 base = TREE_OPERAND (base, 0);
2672 }
2673 else
2674 base = build_fold_addr_expr (base);
2675
2676 if (off == NULL_TREE)
2677 off = size_zero_node;
2678
2679 /* If BASE is not loop invariant and OFF is 0, then we start with just
2680 the constant offset in the loop-invariant BASE and continue with base
2681 as OFF; otherwise give up.
2682 We could handle that case by gimplifying the addition of BASE + OFF
2683 into some SSA_NAME and using that as OFF, but for now punt. */
2684 if (!expr_invariant_in_loop_p (loop, base))
2685 {
2686 if (!integer_zerop (off))
2687 return NULL_TREE;
2688 off = base;
2689 base = size_int (pbitpos / BITS_PER_UNIT);
2690 }
2691 /* Otherwise put base + constant offset into the loop invariant BASE
2692 and continue with OFF. */
2693 else
2694 {
2695 base = fold_convert (sizetype, base);
2696 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
2697 }
2698
2699 /* OFF at this point may be either a SSA_NAME or some tree expression
2700 from get_inner_reference. Try to peel off loop invariants from it
2701 into BASE as long as possible. */
2702 STRIP_NOPS (off);
2703 while (offtype == NULL_TREE)
2704 {
2705 enum tree_code code;
2706 tree op0, op1, add = NULL_TREE;
2707
2708 if (TREE_CODE (off) == SSA_NAME)
2709 {
2710 gimple def_stmt = SSA_NAME_DEF_STMT (off);
2711
2712 if (expr_invariant_in_loop_p (loop, off))
2713 return NULL_TREE;
2714
2715 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
2716 break;
2717
2718 op0 = gimple_assign_rhs1 (def_stmt);
2719 code = gimple_assign_rhs_code (def_stmt);
2720 op1 = gimple_assign_rhs2 (def_stmt);
2721 }
2722 else
2723 {
2724 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
2725 return NULL_TREE;
2726 code = TREE_CODE (off);
2727 extract_ops_from_tree (off, &code, &op0, &op1);
2728 }
2729 switch (code)
2730 {
2731 case POINTER_PLUS_EXPR:
2732 case PLUS_EXPR:
2733 if (expr_invariant_in_loop_p (loop, op0))
2734 {
2735 add = op0;
2736 off = op1;
2737 do_add:
2738 add = fold_convert (sizetype, add);
2739 if (scale != 1)
2740 add = size_binop (MULT_EXPR, add, size_int (scale));
2741 base = size_binop (PLUS_EXPR, base, add);
2742 continue;
2743 }
2744 if (expr_invariant_in_loop_p (loop, op1))
2745 {
2746 add = op1;
2747 off = op0;
2748 goto do_add;
2749 }
2750 break;
2751 case MINUS_EXPR:
2752 if (expr_invariant_in_loop_p (loop, op1))
2753 {
2754 add = fold_convert (sizetype, op1);
2755 add = size_binop (MINUS_EXPR, size_zero_node, add);
2756 off = op0;
2757 goto do_add;
2758 }
2759 break;
2760 case MULT_EXPR:
2761 if (scale == 1 && host_integerp (op1, 0))
2762 {
2763 scale = tree_low_cst (op1, 0);
2764 off = op0;
2765 continue;
2766 }
2767 break;
2768 case SSA_NAME:
2769 off = op0;
2770 continue;
2771 CASE_CONVERT:
2772 if (!POINTER_TYPE_P (TREE_TYPE (op0))
2773 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2774 break;
2775 if (TYPE_PRECISION (TREE_TYPE (op0))
2776 == TYPE_PRECISION (TREE_TYPE (off)))
2777 {
2778 off = op0;
2779 continue;
2780 }
2781 if (TYPE_PRECISION (TREE_TYPE (op0))
2782 < TYPE_PRECISION (TREE_TYPE (off)))
2783 {
2784 off = op0;
2785 offtype = TREE_TYPE (off);
2786 STRIP_NOPS (off);
2787 continue;
2788 }
2789 break;
2790 default:
2791 break;
2792 }
2793 break;
2794 }
2795
2796 /* If at the end OFF still isn't a SSA_NAME or isn't
2797 defined in the loop, punt. */
2798 if (TREE_CODE (off) != SSA_NAME
2799 || expr_invariant_in_loop_p (loop, off))
2800 return NULL_TREE;
2801
2802 if (offtype == NULL_TREE)
2803 offtype = TREE_TYPE (off);
2804
2805 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
2806 offtype, scale);
2807 if (decl == NULL_TREE)
2808 return NULL_TREE;
2809
2810 if (basep)
2811 *basep = base;
2812 if (offp)
2813 *offp = off;
2814 if (scalep)
2815 *scalep = scale;
2816 return decl;
2817 }
2818
2819 /* Check whether a non-affine load in STMT (being in the loop referred to
2820 in LOOP_VINFO) is suitable for handling as a strided load, that is, if
2821 its address is a simple induction variable: the accessed object is loop
2822 invariant and the index (or base pointer) is a simple affine induction
2823 variable of the containing loop. Return TRUE if so, FALSE otherwise.
2824
2825 This handles ARRAY_REFs (with variant index) and MEM_REFs (with variant
2826 base pointer) only. */
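/* A hypothetical example of such an access, with STRIDE loop invariant
   but not a compile-time constant:

     for (i = 0; i < n; i++)
       {
         ... = a[j];
         j += stride;
       }

   The index j is a simple induction variable with a loop-invariant step,
   so the load can be handled as a strided load.  */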
2827
2828 static bool
2829 vect_check_strided_load (gimple stmt, loop_vec_info loop_vinfo)
2830 {
2831 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2832 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2833 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2834 tree base, off;
2835 affine_iv iv;
2836
2837 if (!DR_IS_READ (dr))
2838 return false;
2839
2840 base = DR_REF (dr);
2841
2842 if (TREE_CODE (base) == REALPART_EXPR
2843 || TREE_CODE (base) == IMAGPART_EXPR)
2844 base = TREE_OPERAND (base, 0);
2845
2846 if (TREE_CODE (base) == ARRAY_REF)
2847 {
2848 off = TREE_OPERAND (base, 1);
2849 base = TREE_OPERAND (base, 0);
2850 }
2851 else if (TREE_CODE (base) == MEM_REF)
2852 {
2853 off = TREE_OPERAND (base, 0);
2854 base = TREE_OPERAND (base, 1);
2855 }
2856 else
2857 return false;
2858
2859 if (TREE_CODE (off) != SSA_NAME)
2860 return false;
2861
2862 if (!expr_invariant_in_loop_p (loop, base)
2863 || !simple_iv (loop, loop_containing_stmt (stmt), off, &iv, true))
2864 return false;
2865
2866 return true;
2867 }
2868
2869 /* Function vect_analyze_data_refs.
2870
2871 Find all the data references in the loop or basic block.
2872
2873 The general structure of the analysis of data refs in the vectorizer is as
2874 follows:
2875 1- vect_analyze_data_refs(loop/bb): call
2876 compute_data_dependences_for_loop/bb to find and analyze all data-refs
2877 in the loop/bb and their dependences.
2878 2- vect_analyze_dependences(): apply dependence testing using ddrs.
2879 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
2880 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
2881
2882 */
2883
2884 bool
2885 vect_analyze_data_refs (loop_vec_info loop_vinfo,
2886 bb_vec_info bb_vinfo,
2887 int *min_vf)
2888 {
2889 struct loop *loop = NULL;
2890 basic_block bb = NULL;
2891 unsigned int i;
2892 vec<data_reference_p> datarefs;
2893 struct data_reference *dr;
2894 tree scalar_type;
2895
2896 if (dump_enabled_p ())
2897 dump_printf_loc (MSG_NOTE, vect_location,
2898 "=== vect_analyze_data_refs ===\n");
2899
2900 if (loop_vinfo)
2901 {
2902 loop = LOOP_VINFO_LOOP (loop_vinfo);
2903 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo))
2904 || find_data_references_in_loop
2905 (loop, &LOOP_VINFO_DATAREFS (loop_vinfo)))
2906 {
2907 if (dump_enabled_p ())
2908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2909 "not vectorized: loop contains function calls"
2910 " or data references that cannot be analyzed");
2911 return false;
2912 }
2913
2914 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2915 }
2916 else
2917 {
2918 gimple_stmt_iterator gsi;
2919
2920 bb = BB_VINFO_BB (bb_vinfo);
2921 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2922 {
2923 gimple stmt = gsi_stmt (gsi);
2924 if (!find_data_references_in_stmt (NULL, stmt,
2925 &BB_VINFO_DATAREFS (bb_vinfo)))
2926 {
2927 /* Mark the rest of the basic-block as unvectorizable. */
2928 for (; !gsi_end_p (gsi); gsi_next (&gsi))
2929 {
2930 stmt = gsi_stmt (gsi);
2931 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
2932 }
2933 break;
2934 }
2935 }
2936
2937 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2938 }
2939
2940 /* Go through the data-refs, check that the analysis succeeded. Update
2941 pointer from stmt_vec_info struct to DR and vectype. */
2942
2943 FOR_EACH_VEC_ELT (datarefs, i, dr)
2944 {
2945 gimple stmt;
2946 stmt_vec_info stmt_info;
2947 tree base, offset, init;
2948 bool gather = false;
2949 int vf;
2950
2951 if (!dr || !DR_REF (dr))
2952 {
2953 if (dump_enabled_p ())
2954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2955 "not vectorized: unhandled data-ref ");
2956 return false;
2957 }
2958
2959 stmt = DR_STMT (dr);
2960 stmt_info = vinfo_for_stmt (stmt);
2961
2962 /* Check that analysis of the data-ref succeeded. */
2963 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
2964 || !DR_STEP (dr))
2965 {
2966 /* If the target supports vector gather loads, see if they can
2967 be used. */
2968 if (loop_vinfo
2969 && DR_IS_READ (dr)
2970 && !TREE_THIS_VOLATILE (DR_REF (dr))
2971 && targetm.vectorize.builtin_gather != NULL
2972 && !nested_in_vect_loop_p (loop, stmt))
2973 {
2974 struct data_reference *newdr
2975 = create_data_ref (NULL, loop_containing_stmt (stmt),
2976 DR_REF (dr), stmt, true);
2977 gcc_assert (newdr != NULL && DR_REF (newdr));
2978 if (DR_BASE_ADDRESS (newdr)
2979 && DR_OFFSET (newdr)
2980 && DR_INIT (newdr)
2981 && DR_STEP (newdr)
2982 && integer_zerop (DR_STEP (newdr)))
2983 {
2984 dr = newdr;
2985 gather = true;
2986 }
2987 else
2988 free_data_ref (newdr);
2989 }
2990
2991 if (!gather)
2992 {
2993 if (dump_enabled_p ())
2994 {
2995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2996 "not vectorized: data ref analysis "
2997 "failed ");
2998 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2999 }
3000
3001 if (bb_vinfo)
3002 break;
3003
3004 return false;
3005 }
3006 }
3007
3008 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3009 {
3010 if (dump_enabled_p ())
3011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3012 "not vectorized: base addr of dr is a "
3013 "constant");
3014
3015 if (bb_vinfo)
3016 break;
3017
3018 if (gather)
3019 free_data_ref (dr);
3020 return false;
3021 }
3022
3023 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3024 {
3025 if (dump_enabled_p ())
3026 {
3027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3028 "not vectorized: volatile type ");
3029 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3030 }
3031
3032 if (bb_vinfo)
3033 break;
3034
3035 return false;
3036 }
3037
3038 if (stmt_can_throw_internal (stmt))
3039 {
3040 if (dump_enabled_p ())
3041 {
3042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3043 "not vectorized: statement can throw an "
3044 "exception ");
3045 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3046 }
3047
3048 if (bb_vinfo)
3049 break;
3050
3051 if (gather)
3052 free_data_ref (dr);
3053 return false;
3054 }
3055
3056 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3057 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3058 {
3059 if (dump_enabled_p ())
3060 {
3061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3062 "not vectorized: statement is bitfield "
3063 "access ");
3064 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3065 }
3066
3067 if (bb_vinfo)
3068 break;
3069
3070 if (gather)
3071 free_data_ref (dr);
3072 return false;
3073 }
3074
3075 base = unshare_expr (DR_BASE_ADDRESS (dr));
3076 offset = unshare_expr (DR_OFFSET (dr));
3077 init = unshare_expr (DR_INIT (dr));
3078
3079 if (is_gimple_call (stmt))
3080 {
3081 if (dump_enabled_p ())
3082 {
3083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3084 "not vectorized: dr in a call ");
3085 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3086 }
3087
3088 if (bb_vinfo)
3089 break;
3090
3091 if (gather)
3092 free_data_ref (dr);
3093 return false;
3094 }
3095
3096 /* Update DR field in stmt_vec_info struct. */
3097
3098 /* If the dataref is in an inner-loop of the loop that is considered
3099 for vectorization, we also want to analyze the access relative to
3100 the outer-loop (DR contains information only relative to the
3101 inner-most enclosing loop). We do that by building a reference to the
3102 first location accessed by the inner-loop, and analyze it relative to
3103 the outer-loop. */
3104 if (loop && nested_in_vect_loop_p (loop, stmt))
3105 {
3106 tree outer_step, outer_base, outer_init;
3107 HOST_WIDE_INT pbitsize, pbitpos;
3108 tree poffset;
3109 enum machine_mode pmode;
3110 int punsignedp, pvolatilep;
3111 affine_iv base_iv, offset_iv;
3112 tree dinit;
3113
3114 /* Build a reference to the first location accessed by the
3115 inner-loop: *(BASE+INIT). (The first location is actually
3116 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3117 tree inner_base = build_fold_indirect_ref
3118 (fold_build_pointer_plus (base, init));
3119
3120 if (dump_enabled_p ())
3121 {
3122 dump_printf_loc (MSG_NOTE, vect_location,
3123 "analyze in outer-loop: ");
3124 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3125 }
3126
3127 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3128 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3129 gcc_assert (outer_base != NULL_TREE);
3130
3131 if (pbitpos % BITS_PER_UNIT != 0)
3132 {
3133 if (dump_enabled_p ())
3134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3135 "failed: bit offset alignment.\n");
3136 return false;
3137 }
3138
3139 outer_base = build_fold_addr_expr (outer_base);
3140 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3141 &base_iv, false))
3142 {
3143 if (dump_enabled_p ())
3144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3145 "failed: evolution of base is not affine.\n");
3146 return false;
3147 }
3148
3149 if (offset)
3150 {
3151 if (poffset)
3152 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3153 poffset);
3154 else
3155 poffset = offset;
3156 }
3157
3158 if (!poffset)
3159 {
3160 offset_iv.base = ssize_int (0);
3161 offset_iv.step = ssize_int (0);
3162 }
3163 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3164 &offset_iv, false))
3165 {
3166 if (dump_enabled_p ())
3167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3168 "evolution of offset is not affine.\n");
3169 return false;
3170 }
3171
3172 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3173 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3174 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3175 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3176 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3177
3178 outer_step = size_binop (PLUS_EXPR,
3179 fold_convert (ssizetype, base_iv.step),
3180 fold_convert (ssizetype, offset_iv.step));
3181
3182 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3183 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3184 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3185 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3186 STMT_VINFO_DR_OFFSET (stmt_info) =
3187 fold_convert (ssizetype, offset_iv.base);
3188 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3189 size_int (highest_pow2_factor (offset_iv.base));
3190
3191 if (dump_enabled_p ())
3192 {
3193 dump_printf_loc (MSG_NOTE, vect_location,
3194 "\touter base_address: ");
3195 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3196 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3197 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3198 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3199 STMT_VINFO_DR_OFFSET (stmt_info));
3200 dump_printf (MSG_NOTE,
3201 "\n\touter constant offset from base address: ");
3202 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3203 STMT_VINFO_DR_INIT (stmt_info));
3204 dump_printf (MSG_NOTE, "\n\touter step: ");
3205 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3206 STMT_VINFO_DR_STEP (stmt_info));
3207 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3208 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3209 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3210 }
3211 }
3212
3213 if (STMT_VINFO_DATA_REF (stmt_info))
3214 {
3215 if (dump_enabled_p ())
3216 {
3217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3218 "not vectorized: more than one data ref "
3219 "in stmt: ");
3220 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3221 }
3222
3223 if (bb_vinfo)
3224 break;
3225
3226 if (gather)
3227 free_data_ref (dr);
3228 return false;
3229 }
3230
3231 STMT_VINFO_DATA_REF (stmt_info) = dr;
3232
3233 /* Set vectype for STMT. */
3234 scalar_type = TREE_TYPE (DR_REF (dr));
3235 STMT_VINFO_VECTYPE (stmt_info) =
3236 get_vectype_for_scalar_type (scalar_type);
3237 if (!STMT_VINFO_VECTYPE (stmt_info))
3238 {
3239 if (dump_enabled_p ())
3240 {
3241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3242 "not vectorized: no vectype for stmt: ");
3243 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3244 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3245 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3246 scalar_type);
3247 }
3248
3249 if (bb_vinfo)
3250 break;
3251
3252 if (gather)
3253 {
3254 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3255 free_data_ref (dr);
3256 }
3257 return false;
3258 }
3259
3260 /* Adjust the minimal vectorization factor according to the
3261 vector type. */
3262 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3263 if (vf > *min_vf)
3264 *min_vf = vf;
3265
3266 if (gather)
3267 {
3268 tree off;
3269
3270 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3271 if (gather
3272 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3273 gather = false;
3274 if (!gather)
3275 {
3276 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3277 free_data_ref (dr);
3278 if (dump_enabled_p ())
3279 {
3280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3281 "not vectorized: not suitable for gather "
3282 "load ");
3283 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3284 }
3285 return false;
3286 }
3287
3288 datarefs[i] = dr;
3289 STMT_VINFO_GATHER_P (stmt_info) = true;
3290 }
3291 else if (loop_vinfo
3292 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3293 {
3294 bool strided_load = false;
3295 if (!nested_in_vect_loop_p (loop, stmt))
3296 strided_load = vect_check_strided_load (stmt, loop_vinfo);
3297 if (!strided_load)
3298 {
3299 if (dump_enabled_p ())
3300 {
3301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3302 "not vectorized: not suitable for strided "
3303 "load ");
3304 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3305 }
3306 return false;
3307 }
3308 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3309 }
3310 }
3311
3312 /* If, when trying to vectorize a basic-block, we stopped analysis at the
3313 first dataref we could not analyze, mark the rest of the datarefs as
3314 not vectorizable and truncate the vector of datarefs. That avoids
3315 spending useless time analyzing their dependences. */
3316 if (i != datarefs.length ())
3317 {
3318 gcc_assert (bb_vinfo != NULL);
3319 for (unsigned j = i; j < datarefs.length (); ++j)
3320 {
3321 data_reference_p dr = datarefs[j];
3322 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3323 free_data_ref (dr);
3324 }
3325 datarefs.truncate (i);
3326 }
3327
3328 return true;
3329 }
3330
3331
3332 /* Function vect_get_new_vect_var.
3333
3334 Returns a name for a new variable. The current naming scheme prepends
3335 the prefix "vect_", "stmp_" or "vect_p" (depending on the value of
3336 VAR_KIND) to the name of the vectorizer generated variable, using NAME
3337 as the suffix if provided. */
3338
3339 tree
3340 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3341 {
3342 const char *prefix;
3343 tree new_vect_var;
3344
3345 switch (var_kind)
3346 {
3347 case vect_simple_var:
3348 prefix = "vect_";
3349 break;
3350 case vect_scalar_var:
3351 prefix = "stmp_";
3352 break;
3353 case vect_pointer_var:
3354 prefix = "vect_p";
3355 break;
3356 default:
3357 gcc_unreachable ();
3358 }
3359
3360 if (name)
3361 {
3362 char* tmp = concat (prefix, name, NULL);
3363 new_vect_var = create_tmp_reg (type, tmp);
3364 free (tmp);
3365 }
3366 else
3367 new_vect_var = create_tmp_reg (type, prefix);
3368
3369 return new_vect_var;
3370 }
3371
3372
3373 /* Function vect_create_addr_base_for_vector_ref.
3374
3375 Create an expression that computes the address of the first memory location
3376 that will be accessed for a data reference.
3377
3378 Input:
3379 STMT: The statement containing the data reference.
3380 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3381 OFFSET: Optional. If supplied, it is added to the initial address.
3382 LOOP: Specify relative to which loop-nest should the address be computed.
3383 For example, when the dataref is in an inner-loop nested in an
3384 outer-loop that is now being vectorized, LOOP can be either the
3385 outer-loop, or the inner-loop. The first memory location accessed
3386 by the following dataref ('in' points to short):
3387
3388 for (i=0; i<N; i++)
3389 for (j=0; j<M; j++)
3390 s += in[i+j]
3391
3392 is as follows:
3393 if LOOP=i_loop: &in (relative to i_loop)
3394 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3395
3396 Output:
3397 1. Return an SSA_NAME whose value is the address of the memory location of
3398 the first vector of the data reference.
3399 2. If NEW_STMT_LIST is not NULL_TREE after return, then the caller must
3400 insert these statements, which define the returned SSA_NAME.
3401
3402 FORNOW: We are only handling array accesses with step 1. */
3403
3404 tree
3405 vect_create_addr_base_for_vector_ref (gimple stmt,
3406 gimple_seq *new_stmt_list,
3407 tree offset,
3408 struct loop *loop)
3409 {
3410 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3411 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3412 tree data_ref_base;
3413 const char *base_name;
3414 tree addr_base;
3415 tree dest;
3416 gimple_seq seq = NULL;
3417 tree base_offset;
3418 tree init;
3419 tree vect_ptr_type;
3420 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3421 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3422
3423 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3424 {
3425 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3426
3427 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3428
3429 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3430 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3431 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3432 }
3433 else
3434 {
3435 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3436 base_offset = unshare_expr (DR_OFFSET (dr));
3437 init = unshare_expr (DR_INIT (dr));
3438 }
3439
3440 if (loop_vinfo)
3441 base_name = get_name (data_ref_base);
3442 else
3443 {
3444 base_offset = ssize_int (0);
3445 init = ssize_int (0);
3446 base_name = get_name (DR_REF (dr));
3447 }
3448
3449 /* Create base_offset */
3450 base_offset = size_binop (PLUS_EXPR,
3451 fold_convert (sizetype, base_offset),
3452 fold_convert (sizetype, init));
3453
3454 if (offset)
3455 {
3456 offset = fold_build2 (MULT_EXPR, sizetype,
3457 fold_convert (sizetype, offset), step);
3458 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3459 base_offset, offset);
3460 }
3461
3462 /* base + base_offset */
3463 if (loop_vinfo)
3464 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3465 else
3466 {
3467 addr_base = build1 (ADDR_EXPR,
3468 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3469 unshare_expr (DR_REF (dr)));
3470 }
3471
3472 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3473 addr_base = fold_convert (vect_ptr_type, addr_base);
3474 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3475 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3476 gimple_seq_add_seq (new_stmt_list, seq);
3477
3478 if (DR_PTR_INFO (dr)
3479 && TREE_CODE (addr_base) == SSA_NAME)
3480 {
3481 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3482 if (offset)
3483 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3484 }
3485
3486 if (dump_enabled_p ())
3487 {
3488 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3489 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3490 }
3491
3492 return addr_base;
3493 }
3494
3495
3496 /* Function vect_create_data_ref_ptr.
3497
3498 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3499 location accessed in the loop by STMT, along with the def-use update
3500 chain to appropriately advance the pointer through the loop iterations.
3501 Also set aliasing information for the pointer. This pointer is used by
3502 the callers to this function to create a memory reference expression for
3503 vector load/store access.
3504
3505 Input:
3506 1. STMT: a stmt that references memory. Expected to be of the form
3507 GIMPLE_ASSIGN <name, data-ref> or
3508 GIMPLE_ASSIGN <data-ref, name>.
3509 2. AGGR_TYPE: the type of the reference, which should be either a vector
3510 or an array.
3511 3. AT_LOOP: the loop where the vector memref is to be created.
3512 4. OFFSET (optional): an offset to be added to the initial address accessed
3513 by the data-ref in STMT.
3514 5. GSI: location where the new stmts are to be placed if there is no loop
3515 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3516 pointing to the initial address.
3517
3518 Output:
3519 1. Declare a new ptr to vector_type, and have it point to the base of the
3520 data reference (the initial address accessed by the data reference).
3521 For example, for vector of type V8HI, the following code is generated:
3522
3523 v8hi *ap;
3524 ap = (v8hi *)initial_address;
3525
3526 if OFFSET is not supplied:
3527 initial_address = &a[init];
3528 if OFFSET is supplied:
3529 initial_address = &a[init + OFFSET];
3530
3531 Return the initial_address in INITIAL_ADDRESS.
3532
3533 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3534 update the pointer in each iteration of the loop.
3535
3536 Return the increment stmt that updates the pointer in PTR_INCR.
3537
3538 3. Set INV_P to true if the access pattern of the data reference in the
3539 vectorized loop is invariant. Set it to false otherwise.
3540
3541 4. Return the pointer. */
3542
3543 tree
3544 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3545 tree offset, tree *initial_address,
3546 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3547 bool only_init, bool *inv_p)
3548 {
3549 const char *base_name;
3550 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3551 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3552 struct loop *loop = NULL;
3553 bool nested_in_vect_loop = false;
3554 struct loop *containing_loop = NULL;
3555 tree aggr_ptr_type;
3556 tree aggr_ptr;
3557 tree new_temp;
3558 gimple vec_stmt;
3559 gimple_seq new_stmt_list = NULL;
3560 edge pe = NULL;
3561 basic_block new_bb;
3562 tree aggr_ptr_init;
3563 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3564 tree aptr;
3565 gimple_stmt_iterator incr_gsi;
3566 bool insert_after;
3567 bool negative;
3568 tree indx_before_incr, indx_after_incr;
3569 gimple incr;
3570 tree step;
3571 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3572
3573 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3574 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3575
3576 if (loop_vinfo)
3577 {
3578 loop = LOOP_VINFO_LOOP (loop_vinfo);
3579 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3580 containing_loop = (gimple_bb (stmt))->loop_father;
3581 pe = loop_preheader_edge (loop);
3582 }
3583 else
3584 {
3585 gcc_assert (bb_vinfo);
3586 only_init = true;
3587 *ptr_incr = NULL;
3588 }
3589
3590 /* Check the step (evolution) of the load in LOOP, and record
3591 whether it's invariant. */
3592 if (nested_in_vect_loop)
3593 step = STMT_VINFO_DR_STEP (stmt_info);
3594 else
3595 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
3596
3597 if (tree_int_cst_compare (step, size_zero_node) == 0)
3598 *inv_p = true;
3599 else
3600 *inv_p = false;
3601 negative = tree_int_cst_compare (step, size_zero_node) < 0;
3602
3603 /* Create an expression for the first address accessed by this load
3604 in LOOP. */
3605 base_name = get_name (DR_BASE_ADDRESS (dr));
3606
3607 if (dump_enabled_p ())
3608 {
3609 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
3610 dump_printf_loc (MSG_NOTE, vect_location,
3611 "create %s-pointer variable to type: ",
3612 tree_code_name[(int) TREE_CODE (aggr_type)]);
3613 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
3614 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
3615 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
3616 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
3617 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
3618 else
3619 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
3620 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
3621 }
3622
3623 /* (1) Create the new aggregate-pointer variable.
3624 Vector and array types inherit the alias set of their component
3625 type by default so we need to use a ref-all pointer if the data
3626 reference does not conflict with the created aggregated data
3627 reference because it is not addressable. */
3628 bool need_ref_all = false;
3629 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3630 get_alias_set (DR_REF (dr))))
3631 need_ref_all = true;
3632 /* Likewise for any of the data references in the stmt group. */
3633 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
3634 {
3635 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
3636 do
3637 {
3638 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
3639 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
3640 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3641 get_alias_set (DR_REF (sdr))))
3642 {
3643 need_ref_all = true;
3644 break;
3645 }
3646 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
3647 }
3648 while (orig_stmt);
3649 }
3650 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
3651 need_ref_all);
3652 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
3653
3654
3655 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
3656 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
3657 def-use update cycles for the pointer: one relative to the outer-loop
3658 (LOOP), which is what steps (3) and (4) below do. The other is relative
3659 to the inner-loop (which is the inner-most loop containing the dataref),
3660 and this is done by step (5) below.
3661
3662 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
3663 inner-most loop, and so steps (3),(4) work the same, and step (5) is
3664 redundant. Steps (3),(4) create the following:
3665
3666 vp0 = &base_addr;
3667 LOOP: vp1 = phi(vp0,vp2)
3668 ...
3669 ...
3670 vp2 = vp1 + step
3671 goto LOOP
3672
3673 If there is an inner-loop nested in loop, then step (5) will also be
3674 applied, and an additional update in the inner-loop will be created:
3675
3676 vp0 = &base_addr;
3677 LOOP: vp1 = phi(vp0,vp2)
3678 ...
3679 inner: vp3 = phi(vp1,vp4)
3680 vp4 = vp3 + inner_step
3681 if () goto inner
3682 ...
3683 vp2 = vp1 + step
3684 if () goto LOOP */
3685
3686 /* (2) Calculate the initial address of the aggregate-pointer, and set
3687 the aggregate-pointer to point to it before the loop. */
3688
3689 /* Create: &(base[init_val+offset]) in the loop preheader. */
3690
3691 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
3692 offset, loop);
3693 if (new_stmt_list)
3694 {
3695 if (pe)
3696 {
3697 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
3698 gcc_assert (!new_bb);
3699 }
3700 else
3701 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
3702 }
3703
3704 *initial_address = new_temp;
3705
3706 /* Create: p = (aggr_type *) initial_base */
3707 if (TREE_CODE (new_temp) != SSA_NAME
3708 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
3709 {
3710 vec_stmt = gimple_build_assign (aggr_ptr,
3711 fold_convert (aggr_ptr_type, new_temp));
3712 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
3713 /* Copy the points-to information if it exists. */
3714 if (DR_PTR_INFO (dr))
3715 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
3716 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
3717 if (pe)
3718 {
3719 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
3720 gcc_assert (!new_bb);
3721 }
3722 else
3723 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
3724 }
3725 else
3726 aggr_ptr_init = new_temp;
3727
3728 /* (3) Handle the updating of the aggregate-pointer inside the loop.
3729 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
3730 inner-loop nested in LOOP (during outer-loop vectorization). */
3731
3732 /* No update in loop is required. */
3733 if (only_init && (!loop_vinfo || at_loop == loop))
3734 aptr = aggr_ptr_init;
3735 else
3736 {
3737 /* The step of the aggregate pointer is the type size. */
3738 tree step = TYPE_SIZE_UNIT (aggr_type);
3739 /* One exception to the above is when the scalar step of the load in
3740 LOOP is zero. In this case the step here is also zero. */
3741 if (*inv_p)
3742 step = size_zero_node;
3743 else if (negative)
3744 step = fold_build1 (NEGATE_EXPR, TREE_TYPE (step), step);
3745
3746 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
3747
3748 create_iv (aggr_ptr_init,
3749 fold_convert (aggr_ptr_type, step),
3750 aggr_ptr, loop, &incr_gsi, insert_after,
3751 &indx_before_incr, &indx_after_incr);
3752 incr = gsi_stmt (incr_gsi);
3753 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3754
3755 /* Copy the points-to information if it exists. */
3756 if (DR_PTR_INFO (dr))
3757 {
3758 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3759 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3760 }
3761 if (ptr_incr)
3762 *ptr_incr = incr;
3763
3764 aptr = indx_before_incr;
3765 }
3766
3767 if (!nested_in_vect_loop || only_init)
3768 return aptr;
3769
3770
3771 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
3772 nested in LOOP, if it exists. */
3773
3774 gcc_assert (nested_in_vect_loop);
3775 if (!only_init)
3776 {
3777 standard_iv_increment_position (containing_loop, &incr_gsi,
3778 &insert_after);
3779 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
3780 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
3781 &indx_after_incr);
3782 incr = gsi_stmt (incr_gsi);
3783 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3784
3785 /* Copy the points-to information if it exists. */
3786 if (DR_PTR_INFO (dr))
3787 {
3788 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3789 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3790 }
3791 if (ptr_incr)
3792 *ptr_incr = incr;
3793
3794 return indx_before_incr;
3795 }
3796 else
3797 gcc_unreachable ();
3798 }
3799
3800
3801 /* Function bump_vector_ptr.
3802
3803 Increment a pointer (to a vector type) by vector-size. If requested,
3804 i.e. if PTR_INCR is given, then also connect the new increment stmt
3805 to the existing def-use update-chain of the pointer, by modifying
3806 the PTR_INCR as illustrated below:
3807
3808 The pointer def-use update-chain before this function:
3809 DATAREF_PTR = phi (p_0, p_2)
3810 ....
3811 PTR_INCR: p_2 = DATAREF_PTR + step
3812
3813 The pointer def-use update-chain after this function:
3814 DATAREF_PTR = phi (p_0, p_2)
3815 ....
3816 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
3817 ....
3818 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
3819
3820 Input:
3821 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
3822 in the loop.
3823 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
3824 the loop. The increment amount across iterations is expected
3825 to be vector_size.
3826 BSI - location where the new update stmt is to be placed.
3827 STMT - the original scalar memory-access stmt that is being vectorized.
3828 BUMP - optional. The offset by which to bump the pointer. If not given,
3829 the offset is assumed to be vector_size.
3830
3831 Output: Return NEW_DATAREF_PTR as illustrated above.
3832
3833 */
3834
3835 tree
3836 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
3837 gimple stmt, tree bump)
3838 {
3839 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3840 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3841 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3842 tree update = TYPE_SIZE_UNIT (vectype);
3843 gimple incr_stmt;
3844 ssa_op_iter iter;
3845 use_operand_p use_p;
3846 tree new_dataref_ptr;
3847
3848 if (bump)
3849 update = bump;
3850
3851 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
3852 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
3853 dataref_ptr, update);
3854 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
3855
3856 /* Copy the points-to information if it exists. */
3857 if (DR_PTR_INFO (dr))
3858 {
3859 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
3860 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
3861 }
3862
3863 if (!ptr_incr)
3864 return new_dataref_ptr;
3865
3866 /* Update the vector-pointer's cross-iteration increment. */
3867 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
3868 {
3869 tree use = USE_FROM_PTR (use_p);
3870
3871 if (use == dataref_ptr)
3872 SET_USE (use_p, new_dataref_ptr);
3873 else
3874 gcc_assert (tree_int_cst_compare (use, update) == 0);
3875 }
3876
3877 return new_dataref_ptr;
3878 }
3879
3880
3881 /* Function vect_create_destination_var.
3882
3883 Create a new temporary of type VECTYPE. */
3884
3885 tree
3886 vect_create_destination_var (tree scalar_dest, tree vectype)
3887 {
3888 tree vec_dest;
3889 const char *new_name;
3890 tree type;
3891 enum vect_var_kind kind;
3892
3893 kind = vectype ? vect_simple_var : vect_scalar_var;
3894 type = vectype ? vectype : TREE_TYPE (scalar_dest);
3895
3896 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
3897
3898 new_name = get_name (scalar_dest);
3899 if (!new_name)
3900 new_name = "var_";
3901 vec_dest = vect_get_new_vect_var (type, kind, new_name);
3902
3903 return vec_dest;
3904 }
3905
3906 /* Function vect_grouped_store_supported.
3907
3908 Returns TRUE if interleave high and interleave low permutations
3909 are supported, and FALSE otherwise. */
3910
3911 bool
3912 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
3913 {
3914 enum machine_mode mode = TYPE_MODE (vectype);
3915
3916 /* vect_permute_store_chain requires the group size to be a power of two. */
3917 if (exact_log2 (count) == -1)
3918 {
3919 if (dump_enabled_p ())
3920 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3921 "the size of the group of accesses"
3922 " is not a power of 2");
3923 return false;
3924 }
3925
3926 /* Check that the permutation is supported. */
3927 if (VECTOR_MODE_P (mode))
3928 {
3929 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3930 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3931 for (i = 0; i < nelt / 2; i++)
3932 {
3933 sel[i * 2] = i;
3934 sel[i * 2 + 1] = i + nelt;
3935 }
3936 if (can_vec_perm_p (mode, false, sel))
3937 {
3938 for (i = 0; i < nelt; i++)
3939 sel[i] += nelt / 2;
3940 if (can_vec_perm_p (mode, false, sel))
3941 return true;
3942 }
3943 }
3944
3945 if (dump_enabled_p ())
3946 dump_printf (MSG_MISSED_OPTIMIZATION,
3947 "interleave op not supported by target.");
3948 return false;
3949 }
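
/* Illustrative sketch only (not part of GCC): the two selectors the check
   above builds for a hypothetical 8-element vector.  Selector values below
   nelt pick from the first input vector, values nelt..2*nelt-1 from the
   second.  */
#if 0
#include <stdio.h>

int
main (void)
{
  unsigned int nelt = 8, sel[8], i;

  for (i = 0; i < nelt / 2; i++)
    {
      sel[i * 2] = i;             /* element i of the first vector */
      sel[i * 2 + 1] = i + nelt;  /* element i of the second vector */
    }
  for (i = 0; i < nelt; i++)      /* "interleave high": 0 8 1 9 2 10 3 11 */
    printf ("%u ", sel[i]);
  printf ("\n");

  for (i = 0; i < nelt; i++)
    sel[i] += nelt / 2;
  for (i = 0; i < nelt; i++)      /* "interleave low": 4 12 5 13 6 14 7 15 */
    printf ("%u ", sel[i]);
  printf ("\n");
  return 0;
}
#endif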
3950
3951
3952 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
3953 type VECTYPE. */
3954
3955 bool
3956 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
3957 {
3958 return vect_lanes_optab_supported_p ("vec_store_lanes",
3959 vec_store_lanes_optab,
3960 vectype, count);
3961 }
3962
3963
3964 /* Function vect_permute_store_chain.
3965
3966 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
3967 a power of 2, generate interleave_high/low stmts to reorder the data
3968 correctly for the stores. Return the final references for stores in
3969 RESULT_CHAIN.
3970
3971 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3972 The input is 4 vectors each containing 8 elements. We assign a number to
3973 each element, the input sequence is:
3974
3975 1st vec: 0 1 2 3 4 5 6 7
3976 2nd vec: 8 9 10 11 12 13 14 15
3977 3rd vec: 16 17 18 19 20 21 22 23
3978 4th vec: 24 25 26 27 28 29 30 31
3979
3980 The output sequence should be:
3981
3982 1st vec: 0 8 16 24 1 9 17 25
3983 2nd vec: 2 10 18 26 3 11 19 27
3984 3rd vec: 4 12 20 28 5 13 21 29
3985 4th vec: 6 14 22 30 7 15 23 31
3986
3987 i.e., we interleave the contents of the four vectors in their order.
3988
3989 We use interleave_high/low instructions to create such output. The input of
3990 each interleave_high/low operation is two vectors:
3991 1st vec 2nd vec
3992 0 1 2 3 4 5 6 7
3993 the even elements of the result vector are obtained left-to-right from the
3994 high/low elements of the first vector. The odd elements of the result are
3995 obtained left-to-right from the high/low elements of the second vector.
3996 The output of interleave_high will be: 0 4 1 5
3997 and of interleave_low: 2 6 3 7
3998
3999
4000 The permutation is done in log LENGTH stages. In each stage interleave_high
4001 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4002 where the first argument is taken from the first half of DR_CHAIN and the
4003 second argument from its second half.
4004 In our example,
4005
4006 I1: interleave_high (1st vec, 3rd vec)
4007 I2: interleave_low (1st vec, 3rd vec)
4008 I3: interleave_high (2nd vec, 4th vec)
4009 I4: interleave_low (2nd vec, 4th vec)
4010
4011 The output for the first stage is:
4012
4013 I1: 0 16 1 17 2 18 3 19
4014 I2: 4 20 5 21 6 22 7 23
4015 I3: 8 24 9 25 10 26 11 27
4016 I4: 12 28 13 29 14 30 15 31
4017
4018 The output of the second stage, i.e. the final result is:
4019
4020 I1: 0 8 16 24 1 9 17 25
4021 I2: 2 10 18 26 3 11 19 27
4022 I3: 4 12 20 28 5 13 21 29
4023 I4: 6 14 22 30 7 15 23 31. */
4024
4025 void
4026 vect_permute_store_chain (vec<tree> dr_chain,
4027 unsigned int length,
4028 gimple stmt,
4029 gimple_stmt_iterator *gsi,
4030 vec<tree> *result_chain)
4031 {
4032 tree vect1, vect2, high, low;
4033 gimple perm_stmt;
4034 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4035 tree perm_mask_low, perm_mask_high;
4036 unsigned int i, n;
4037 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4038 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4039
4040 result_chain->quick_grow (length);
4041 memcpy (result_chain->address (), dr_chain.address (),
4042 length * sizeof (tree));
4043
4044 for (i = 0, n = nelt / 2; i < n; i++)
4045 {
4046 sel[i * 2] = i;
4047 sel[i * 2 + 1] = i + nelt;
4048 }
4049 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4050 gcc_assert (perm_mask_high != NULL);
4051
4052 for (i = 0; i < nelt; i++)
4053 sel[i] += nelt / 2;
4054 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4055 gcc_assert (perm_mask_low != NULL);
4056
4057 for (i = 0, n = exact_log2 (length); i < n; i++)
4058 {
4059 for (j = 0; j < length/2; j++)
4060 {
4061 vect1 = dr_chain[j];
4062 vect2 = dr_chain[j+length/2];
4063
4064 /* Create interleaving stmt:
4065 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4066 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4067 perm_stmt
4068 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4069 vect1, vect2, perm_mask_high);
4070 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4071 (*result_chain)[2*j] = high;
4072
4073 /* Create interleaving stmt:
4074 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4075 nelt*3/2+1, ...}> */
4076 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4077 perm_stmt
4078 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4079 vect1, vect2, perm_mask_low);
4080 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4081 (*result_chain)[2*j+1] = low;
4082 }
4083 memcpy (dr_chain.address (), result_chain->address (),
4084 length * sizeof (tree));
4085 }
4086 }
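
/* Illustrative sketch only (not part of GCC): a scalar model of the single
   interleave step each VEC_PERM_EXPR above performs.  NELT and the element
   type are arbitrary choices for the example.  */
#if 0
#define NELT 8

static void
example_interleave_step (const short *vect1, const short *vect2,
                         short *high, short *low)
{
  unsigned int i;
  for (i = 0; i < NELT / 2; i++)
    {
      /* perm_mask_high = {0, NELT, 1, NELT+1, ...}  */
      high[2 * i] = vect1[i];
      high[2 * i + 1] = vect2[i];
      /* perm_mask_low = {NELT/2, NELT*3/2, NELT/2+1, ...}  */
      low[2 * i] = vect1[i + NELT / 2];
      low[2 * i + 1] = vect2[i + NELT / 2];
    }
}
#endif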
4087
4088 /* Function vect_setup_realignment.
4089
4090 This function is called when vectorizing an unaligned load using
4091 the dr_explicit_realign[_optimized] scheme.
4092 This function generates the following code at the loop prolog:
4093
4094 p = initial_addr;
4095 x msq_init = *(floor(p)); # prolog load
4096 realignment_token = call target_builtin;
4097 loop:
4098 x msq = phi (msq_init, ---)
4099
4100 The stmts marked with x are generated only for the case of
4101 dr_explicit_realign_optimized.
4102
4103 The code above sets up a new (vector) pointer, pointing to the first
4104 location accessed by STMT, and a "floor-aligned" load using that pointer.
4105 It also generates code to compute the "realignment-token" (if the relevant
4106 target hook was defined), and creates a phi-node at the loop-header bb
4107 whose arguments are the result of the prolog-load (created by this
4108 function) and the result of a load that takes place in the loop (to be
4109 created by the caller to this function).
4110
4111 For the case of dr_explicit_realign_optimized:
4112 The caller to this function uses the phi-result (msq) to create the
4113 realignment code inside the loop, and sets up the missing phi argument,
4114 as follows:
4115 loop:
4116 msq = phi (msq_init, lsq)
4117 lsq = *(floor(p')); # load in loop
4118 result = realign_load (msq, lsq, realignment_token);
4119
4120 For the case of dr_explicit_realign:
4121 loop:
4122 msq = *(floor(p)); # load in loop
4123 p' = p + (VS-1);
4124 lsq = *(floor(p')); # load in loop
4125 result = realign_load (msq, lsq, realignment_token);
4126
4127 Input:
4128 STMT - (scalar) load stmt to be vectorized. This load accesses
4129 a memory location that may be unaligned.
4130 BSI - place where new code is to be inserted.
4131 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4132 is used.
4133
4134 Output:
4135 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4136 target hook, if defined.
4137 Return value - the result of the loop-header phi node. */
4138
4139 tree
4140 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4141 tree *realignment_token,
4142 enum dr_alignment_support alignment_support_scheme,
4143 tree init_addr,
4144 struct loop **at_loop)
4145 {
4146 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4147 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4148 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4149 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4150 struct loop *loop = NULL;
4151 edge pe = NULL;
4152 tree scalar_dest = gimple_assign_lhs (stmt);
4153 tree vec_dest;
4154 gimple inc;
4155 tree ptr;
4156 tree data_ref;
4157 gimple new_stmt;
4158 basic_block new_bb;
4159 tree msq_init = NULL_TREE;
4160 tree new_temp;
4161 gimple phi_stmt;
4162 tree msq = NULL_TREE;
4163 gimple_seq stmts = NULL;
4164 bool inv_p;
4165 bool compute_in_loop = false;
4166 bool nested_in_vect_loop = false;
4167 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4168 struct loop *loop_for_initial_load = NULL;
4169
4170 if (loop_vinfo)
4171 {
4172 loop = LOOP_VINFO_LOOP (loop_vinfo);
4173 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4174 }
4175
4176 gcc_assert (alignment_support_scheme == dr_explicit_realign
4177 || alignment_support_scheme == dr_explicit_realign_optimized);
4178
4179 /* We need to generate three things:
4180 1. the misalignment computation
4181 2. the extra vector load (for the optimized realignment scheme).
4182 3. the phi node for the two vectors from which the realignment is
4183 done (for the optimized realignment scheme). */
4184
4185 /* 1. Determine where to generate the misalignment computation.
4186
4187 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4188 calculation will be generated by this function, outside the loop (in the
4189 preheader). Otherwise, INIT_ADDR has already been computed for us by the
4190 caller, inside the loop.
4191
4192 Background: If the misalignment remains fixed throughout the iterations of
4193 the loop, then both realignment schemes are applicable, and also the
4194 misalignment computation can be done outside LOOP. This is because we are
4195 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4196 are a multiple of VS (the Vector Size), and therefore the misalignment in
4197 different vectorized LOOP iterations is always the same.
4198 The problem arises only if the memory access is in an inner-loop nested
4199 inside LOOP, which is now being vectorized using outer-loop vectorization.
4200 This is the only case when the misalignment of the memory access may not
4201 remain fixed throughout the iterations of the inner-loop (as explained in
4202 detail in vect_supportable_dr_alignment). In this case, not only is the
4203 optimized realignment scheme not applicable, but also the misalignment
4204 computation (and generation of the realignment token that is passed to
4205 REALIGN_LOAD) have to be done inside the loop.
4206
4207 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4208 or not, which in turn determines if the misalignment is computed inside
4209 the inner-loop, or outside LOOP. */
4210
4211 if (init_addr != NULL_TREE || !loop_vinfo)
4212 {
4213 compute_in_loop = true;
4214 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4215 }
4216
4217
4218 /* 2. Determine where to generate the extra vector load.
4219
4220 For the optimized realignment scheme, instead of generating two vector
4221 loads in each iteration, we generate a single extra vector load in the
4222 preheader of the loop, and in each iteration reuse the result of the
4223 vector load from the previous iteration. In case the memory access is in
4224 an inner-loop nested inside LOOP, which is now being vectorized using
4225 outer-loop vectorization, we need to determine whether this initial vector
4226 load should be generated at the preheader of the inner-loop, or can be
4227 generated at the preheader of LOOP. If the memory access has no evolution
4228 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4229 to be generated inside LOOP (in the preheader of the inner-loop). */
4230
4231 if (nested_in_vect_loop)
4232 {
4233 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4234 bool invariant_in_outerloop =
4235 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4236 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4237 }
4238 else
4239 loop_for_initial_load = loop;
4240 if (at_loop)
4241 *at_loop = loop_for_initial_load;
4242
4243 if (loop_for_initial_load)
4244 pe = loop_preheader_edge (loop_for_initial_load);
4245
4246 /* 3. For the case of the optimized realignment, create the first vector
4247 load at the loop preheader. */
4248
4249 if (alignment_support_scheme == dr_explicit_realign_optimized)
4250 {
4251 /* Create msq_init = *(floor(p1)) in the loop preheader */
4252
4253 gcc_assert (!compute_in_loop);
4254 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4255 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4256 NULL_TREE, &init_addr, NULL, &inc,
4257 true, &inv_p);
4258 new_temp = copy_ssa_name (ptr, NULL);
4259 new_stmt = gimple_build_assign_with_ops
4260 (BIT_AND_EXPR, new_temp, ptr,
4261 build_int_cst (TREE_TYPE (ptr),
4262 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4263 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4264 gcc_assert (!new_bb);
4265 data_ref
4266 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4267 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4268 new_stmt = gimple_build_assign (vec_dest, data_ref);
4269 new_temp = make_ssa_name (vec_dest, new_stmt);
4270 gimple_assign_set_lhs (new_stmt, new_temp);
4271 if (pe)
4272 {
4273 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4274 gcc_assert (!new_bb);
4275 }
4276 else
4277 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4278
4279 msq_init = gimple_assign_lhs (new_stmt);
4280 }
4281
4282 /* 4. Create realignment token using a target builtin, if available.
4283 It is done either inside the containing loop, or before LOOP (as
4284 determined above). */
4285
4286 if (targetm.vectorize.builtin_mask_for_load)
4287 {
4288 tree builtin_decl;
4289
4290 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4291 if (!init_addr)
4292 {
4293 /* Generate the INIT_ADDR computation outside LOOP. */
4294 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4295 NULL_TREE, loop);
4296 if (loop)
4297 {
4298 pe = loop_preheader_edge (loop);
4299 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4300 gcc_assert (!new_bb);
4301 }
4302 else
4303 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4304 }
4305
4306 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4307 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4308 vec_dest =
4309 vect_create_destination_var (scalar_dest,
4310 gimple_call_return_type (new_stmt));
4311 new_temp = make_ssa_name (vec_dest, new_stmt);
4312 gimple_call_set_lhs (new_stmt, new_temp);
4313
4314 if (compute_in_loop)
4315 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4316 else
4317 {
4318 /* Generate the misalignment computation outside LOOP. */
4319 pe = loop_preheader_edge (loop);
4320 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4321 gcc_assert (!new_bb);
4322 }
4323
4324 *realignment_token = gimple_call_lhs (new_stmt);
4325
4326 /* The result of the CALL_EXPR to this builtin is determined from
4327 the value of the parameter, and no global variables are touched,
4328 which makes the builtin a "const" function. Requiring the
4329 builtin to have the "const" attribute makes it unnecessary
4330 to call mark_call_clobbered. */
4331 gcc_assert (TREE_READONLY (builtin_decl));
4332 }
4333
4334 if (alignment_support_scheme == dr_explicit_realign)
4335 return msq;
4336
4337 gcc_assert (!compute_in_loop);
4338 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4339
4340
4341 /* 5. Create msq = phi <msq_init, lsq> in loop */
4342
4343 pe = loop_preheader_edge (containing_loop);
4344 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4345 msq = make_ssa_name (vec_dest, NULL);
4346 phi_stmt = create_phi_node (msq, containing_loop->header);
4347 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4348
4349 return msq;
4350 }
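
/* Illustrative sketch only (not part of GCC): a byte-level model of what
   REALIGN_LOAD <msq, lsq, realignment_token> computes on typical targets -
   the two floor-aligned loads are concatenated and the window starting at
   the original misalignment is extracted.  The exact encoding of the token
   is target-specific; VS and the function name are assumptions made for
   this example only.  */
#if 0
#define VS 16  /* vector size in bytes */

static void
example_realign_load (const unsigned char *msq, const unsigned char *lsq,
                      unsigned int misalign /* 0 .. VS-1 */,
                      unsigned char *result)
{
  unsigned int i;
  for (i = 0; i < VS; i++)
    result[i] = (misalign + i < VS
                 ? msq[misalign + i]
                 : lsq[misalign + i - VS]);
}
#endif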
4351
4352
4353 /* Function vect_grouped_load_supported.
4354
4355 Returns TRUE if even and odd permutations are supported,
4356 and FALSE otherwise. */
4357
4358 bool
4359 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4360 {
4361 enum machine_mode mode = TYPE_MODE (vectype);
4362
4363 /* vect_permute_load_chain requires the group size to be a power of two. */
4364 if (exact_log2 (count) == -1)
4365 {
4366 if (dump_enabled_p ())
4367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4368 "the size of the group of accesses"
4369 " is not a power of 2");
4370 return false;
4371 }
4372
4373 /* Check that the permutation is supported. */
4374 if (VECTOR_MODE_P (mode))
4375 {
4376 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4377 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4378
4379 for (i = 0; i < nelt; i++)
4380 sel[i] = i * 2;
4381 if (can_vec_perm_p (mode, false, sel))
4382 {
4383 for (i = 0; i < nelt; i++)
4384 sel[i] = i * 2 + 1;
4385 if (can_vec_perm_p (mode, false, sel))
4386 return true;
4387 }
4388 }
4389
4390 if (dump_enabled_p ())
4391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4392 "extract even/odd not supported by target");
4393 return false;
4394 }
4395
4396 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4397 type VECTYPE. */
4398
4399 bool
4400 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4401 {
4402 return vect_lanes_optab_supported_p ("vec_load_lanes",
4403 vec_load_lanes_optab,
4404 vectype, count);
4405 }
4406
4407 /* Function vect_permute_load_chain.
4408
4409 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4410 a power of 2, generate extract_even/odd stmts to reorder the input data
4411 correctly. Return the final references for loads in RESULT_CHAIN.
4412
4413 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4414 The input is 4 vectors each containing 8 elements. We assign a number to each
4415 element, the input sequence is:
4416
4417 1st vec: 0 1 2 3 4 5 6 7
4418 2nd vec: 8 9 10 11 12 13 14 15
4419 3rd vec: 16 17 18 19 20 21 22 23
4420 4th vec: 24 25 26 27 28 29 30 31
4421
4422 The output sequence should be:
4423
4424 1st vec: 0 4 8 12 16 20 24 28
4425 2nd vec: 1 5 9 13 17 21 25 29
4426 3rd vec: 2 6 10 14 18 22 26 30
4427 4th vec: 3 7 11 15 19 23 27 31
4428
4429 i.e., the first output vector should contain the first elements of each
4430 interleaving group, etc.
4431
4432 We use extract_even/odd instructions to create such output. The input of
4433 each extract_even/odd operation is two vectors
4434 1st vec 2nd vec
4435 0 1 2 3 4 5 6 7
4436
4437 and the output is the vector of extracted even/odd elements. The output of
4438 extract_even will be: 0 2 4 6
4439 and of extract_odd: 1 3 5 7
4440
4441
4442 The permutation is done in log LENGTH stages. In each stage extract_even
4443 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4444 their order. In our example,
4445
4446 E1: extract_even (1st vec, 2nd vec)
4447 E2: extract_odd (1st vec, 2nd vec)
4448 E3: extract_even (3rd vec, 4th vec)
4449 E4: extract_odd (3rd vec, 4th vec)
4450
4451 The output for the first stage will be:
4452
4453 E1: 0 2 4 6 8 10 12 14
4454 E2: 1 3 5 7 9 11 13 15
4455 E3: 16 18 20 22 24 26 28 30
4456 E4: 17 19 21 23 25 27 29 31
4457
4458 In order to proceed and create the correct sequence for the next stage (or
4459 for the correct output, if the second stage is the last one, as in our
4460 example), we first put the output of the extract_even operation and then the
4461 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4462 The input for the second stage is:
4463
4464 1st vec (E1): 0 2 4 6 8 10 12 14
4465 2nd vec (E3): 16 18 20 22 24 26 28 30
4466 3rd vec (E2): 1 3 5 7 9 11 13 15
4467 4th vec (E4): 17 19 21 23 25 27 29 31
4468
4469 The output of the second stage:
4470
4471 E1: 0 4 8 12 16 20 24 28
4472 E2: 2 6 10 14 18 22 26 30
4473 E3: 1 5 9 13 17 21 25 29
4474 E4: 3 7 11 15 19 23 27 31
4475
4476 And RESULT_CHAIN after reordering:
4477
4478 1st vec (E1): 0 4 8 12 16 20 24 28
4479 2nd vec (E3): 1 5 9 13 17 21 25 29
4480 3rd vec (E2): 2 6 10 14 18 22 26 30
4481 4th vec (E4): 3 7 11 15 19 23 27 31. */
4482
4483 static void
4484 vect_permute_load_chain (vec<tree> dr_chain,
4485 unsigned int length,
4486 gimple stmt,
4487 gimple_stmt_iterator *gsi,
4488 vec<tree> *result_chain)
4489 {
4490 tree data_ref, first_vect, second_vect;
4491 tree perm_mask_even, perm_mask_odd;
4492 gimple perm_stmt;
4493 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4494 unsigned int i, j, log_length = exact_log2 (length);
4495 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4496 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4497
4498 result_chain->quick_grow (length);
4499 memcpy (result_chain->address (), dr_chain.address (),
4500 length * sizeof (tree));
4501
4502 for (i = 0; i < nelt; ++i)
4503 sel[i] = i * 2;
4504 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4505 gcc_assert (perm_mask_even != NULL);
4506
4507 for (i = 0; i < nelt; ++i)
4508 sel[i] = i * 2 + 1;
4509 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4510 gcc_assert (perm_mask_odd != NULL);
4511
4512 for (i = 0; i < log_length; i++)
4513 {
4514 for (j = 0; j < length; j += 2)
4515 {
4516 first_vect = dr_chain[j];
4517 second_vect = dr_chain[j+1];
4518
4519 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4520 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4521 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4522 first_vect, second_vect,
4523 perm_mask_even);
4524 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4525 (*result_chain)[j/2] = data_ref;
4526
4527 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4528 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4529 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4530 first_vect, second_vect,
4531 perm_mask_odd);
4532 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4533 (*result_chain)[j/2+length/2] = data_ref;
4534 }
4535 memcpy (dr_chain.address (), result_chain->address (),
4536 length * sizeof (tree));
4537 }
4538 }
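
/* Illustrative sketch only (not part of GCC): a scalar model of the single
   extract-even / extract-odd step each VEC_PERM_EXPR above performs.  NELT
   and the element type are arbitrary choices for the example.  */
#if 0
#define NELT 8

static void
example_extract_even_odd (const short *first_vect, const short *second_vect,
                          short *even, short *odd)
{
  unsigned int i;
  for (i = 0; i < NELT; i++)
    {
      /* perm_mask_even = {0, 2, 4, ...}, perm_mask_odd = {1, 3, 5, ...};
         indices >= NELT select from the second input vector.  */
      unsigned int e = 2 * i, o = 2 * i + 1;
      even[i] = e < NELT ? first_vect[e] : second_vect[e - NELT];
      odd[i] = o < NELT ? first_vect[o] : second_vect[o - NELT];
    }
}
#endif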
4539
4540
4541 /* Function vect_transform_grouped_load.
4542
4543 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4544 to perform their permutation and ascribe the resulting vectorized statements to
4545 the scalar statements.
4546 */
4547
4548 void
4549 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4550 gimple_stmt_iterator *gsi)
4551 {
4552 vec<tree> result_chain = vNULL;
4553
4554 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4555 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
4556 vectors that are ready for vector computation. */
4557 result_chain.create (size);
4558 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4559 vect_record_grouped_load_vectors (stmt, result_chain);
4560 result_chain.release ();
4561 }
4562
4563 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4564 generated as part of the vectorization of STMT. Assign the statement
4565 for each vector to the associated scalar statement. */
4566
4567 void
4568 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4569 {
4570 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4571 gimple next_stmt, new_stmt;
4572 unsigned int i, gap_count;
4573 tree tmp_data_ref;
4574
4575 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4576 Since we scan the chain starting from its first node, their order
4577 corresponds to the order of data-refs in RESULT_CHAIN. */
4578 next_stmt = first_stmt;
4579 gap_count = 1;
4580 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
4581 {
4582 if (!next_stmt)
4583 break;
4584
4585 /* Skip the gaps. Loads created for the gaps will be removed by the dead
4586 code elimination pass later. No need to check for the first stmt in
4587 the group, since it always exists.
4588 GROUP_GAP is the number of steps in elements from the previous
4589 access (if there is no gap GROUP_GAP is 1). We skip loads that
4590 correspond to the gaps. */
4591 if (next_stmt != first_stmt
4592 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
4593 {
4594 gap_count++;
4595 continue;
4596 }
4597
4598 while (next_stmt)
4599 {
4600 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4601 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4602 copies, and we put the new vector statement in the first available
4603 RELATED_STMT. */
4604 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4605 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4606 else
4607 {
4608 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4609 {
4610 gimple prev_stmt =
4611 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4612 gimple rel_stmt =
4613 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
4614 while (rel_stmt)
4615 {
4616 prev_stmt = rel_stmt;
4617 rel_stmt =
4618 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
4619 }
4620
4621 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
4622 new_stmt;
4623 }
4624 }
4625
4626 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
4627 gap_count = 1;
4628 /* If NEXT_STMT accesses the same DR as the previous statement,
4629 put the same TMP_DATA_REF as its vectorized statement; otherwise
4630 get the next data-ref from RESULT_CHAIN. */
4631 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4632 break;
4633 }
4634 }
4635 }
4636
4637 /* Function vect_can_force_dr_alignment_p.
4638
4639 Returns whether the alignment of a DECL can be forced to be aligned
4640 on ALIGNMENT bit boundary. */
4641
4642 bool
4643 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
4644 {
4645 if (TREE_CODE (decl) != VAR_DECL)
4646 return false;
4647
4648 /* We cannot change alignment of common or external symbols as another
4649 translation unit may contain a definition with lower alignment.
4650 The rules of common symbol linking mean that the definition
4651 will override the common symbol. The same is true for constant
4652 pool entries which may be shared and are not properly merged
4653 by LTO. */
4654 if (DECL_EXTERNAL (decl)
4655 || DECL_COMMON (decl)
4656 || DECL_IN_CONSTANT_POOL (decl))
4657 return false;
4658
4659 if (TREE_ASM_WRITTEN (decl))
4660 return false;
4661
4662 /* Do not override the alignment as specified by the ABI when the used
4663 attribute is set. */
4664 if (DECL_PRESERVE_P (decl))
4665 return false;
4666
4667 /* Do not override explicit alignment set by the user when an explicit
4668 section name is also used. This is a common idiom used by many
4669 software projects. */
4670 if (DECL_SECTION_NAME (decl) != NULL_TREE
4671 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
4672 return false;
4673
4674 if (TREE_STATIC (decl))
4675 return (alignment <= MAX_OFILE_ALIGNMENT);
4676 else
4677 return (alignment <= MAX_STACK_ALIGNMENT);
4678 }
4679
4680
4681 /* Return whether the data reference DR is supported with respect to its
4682 alignment.
4683 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
4684 it is aligned, i.e., check if it is possible to vectorize it with different
4685 alignment. */
4686
4687 enum dr_alignment_support
4688 vect_supportable_dr_alignment (struct data_reference *dr,
4689 bool check_aligned_accesses)
4690 {
4691 gimple stmt = DR_STMT (dr);
4692 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4693 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4694 enum machine_mode mode = TYPE_MODE (vectype);
4695 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4696 struct loop *vect_loop = NULL;
4697 bool nested_in_vect_loop = false;
4698
4699 if (aligned_access_p (dr) && !check_aligned_accesses)
4700 return dr_aligned;
4701
4702 if (loop_vinfo)
4703 {
4704 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
4705 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
4706 }
4707
4708 /* Possibly unaligned access. */
4709
4710 /* We can choose between using the implicit realignment scheme (generating
4711 a misaligned_move stmt) and the explicit realignment scheme (generating
4712 aligned loads with a REALIGN_LOAD). There are two variants to the
4713 explicit realignment scheme: optimized, and unoptimized.
4714 We can optimize the realignment only if the step between consecutive
4715 vector loads is equal to the vector size. Since the vector memory
4716 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
4717 is guaranteed that the misalignment amount remains the same throughout the
4718 execution of the vectorized loop. Therefore, we can create the
4719 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
4720 at the loop preheader.
4721
4722 However, in the case of outer-loop vectorization, when vectorizing a
4723 memory access in the inner-loop nested within the LOOP that is now being
4724 vectorized, while it is guaranteed that the misalignment of the
4725 vectorized memory access will remain the same in different outer-loop
4726 iterations, it is *not* guaranteed that it will remain the same throughout
4727 the execution of the inner-loop. This is because the inner-loop advances
4728 with the original scalar step (and not in steps of VS). If the inner-loop
4729 step happens to be a multiple of VS, then the misalignment remains fixed
4730 and we can use the optimized realignment scheme. For example:
4731
4732 for (i=0; i<N; i++)
4733 for (j=0; j<M; j++)
4734 s += a[i+j];
4735
4736 When vectorizing the i-loop in the above example, the step between
4737 consecutive vector loads is 1, and so the misalignment does not remain
4738 fixed across the execution of the inner-loop, and the realignment cannot
4739 be optimized (as illustrated in the following pseudo vectorized loop):
4740
4741 for (i=0; i<N; i+=4)
4742 for (j=0; j<M; j++){
4743 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
4744 // when j is {0,1,2,3,4,5,6,7,...} respectively.
4745 // (assuming that we start from an aligned address).
4746 }
4747
4748 We therefore have to use the unoptimized realignment scheme:
4749
4750 for (i=0; i<N; i+=4)
4751 for (j=k; j<M; j+=4)
4752 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
4753 // that the misalignment of the initial address is
4754 // 0).
4755
4756 The loop can then be vectorized as follows:
4757
4758 for (k=0; k<4; k++){
4759 rt = get_realignment_token (&vp[k]);
4760 for (i=0; i<N; i+=4){
4761 v1 = vp[i+k];
4762 for (j=k; j<M; j+=4){
4763 v2 = vp[i+j+VS-1];
4764 va = REALIGN_LOAD <v1,v2,rt>;
4765 vs += va;
4766 v1 = v2;
4767 }
4768 }
4769 } */
4770
4771 if (DR_IS_READ (dr))
4772 {
4773 bool is_packed = false;
4774 tree type = (TREE_TYPE (DR_REF (dr)));
4775
4776 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
4777 && (!targetm.vectorize.builtin_mask_for_load
4778 || targetm.vectorize.builtin_mask_for_load ()))
4779 {
4780 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4781 if ((nested_in_vect_loop
4782 && (TREE_INT_CST_LOW (DR_STEP (dr))
4783 != GET_MODE_SIZE (TYPE_MODE (vectype))))
4784 || !loop_vinfo)
4785 return dr_explicit_realign;
4786 else
4787 return dr_explicit_realign_optimized;
4788 }
4789 if (!known_alignment_for_access_p (dr))
4790 is_packed = not_size_aligned (DR_REF (dr));
4791
4792 if (targetm.vectorize.
4793 support_vector_misalignment (mode, type,
4794 DR_MISALIGNMENT (dr), is_packed))
4795 /* Can't software pipeline the loads, but can at least do them. */
4796 return dr_unaligned_supported;
4797 }
4798 else
4799 {
4800 bool is_packed = false;
4801 tree type = (TREE_TYPE (DR_REF (dr)));
4802
4803 if (!known_alignment_for_access_p (dr))
4804 is_packed = not_size_aligned (DR_REF (dr));
4805
4806 if (targetm.vectorize.
4807 support_vector_misalignment (mode, type,
4808 DR_MISALIGNMENT (dr), is_packed))
4809 return dr_unaligned_supported;
4810 }
4811
4812 /* Unsupported. */
4813 return dr_unaligned_unsupported;
4814 }
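
/* Illustrative sketch only (not part of GCC): how a caller might branch on
   the value returned above.  The wrapper name is hypothetical; the enum
   values are the ones used in this function.  */
#if 0
static bool
example_dr_alignment_ok (struct data_reference *dr)
{
  switch (vect_supportable_dr_alignment (dr, false))
    {
    case dr_aligned:
    case dr_unaligned_supported:
    case dr_explicit_realign:
    case dr_explicit_realign_optimized:
      return true;   /* some (re)alignment strategy is available */
    case dr_unaligned_unsupported:
    default:
      return false;  /* the access cannot be vectorized with this alignment */
    }
}
#endif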