gcc/tree-vect-data-refs.c

   1 /* Data References Analysis and Manipulation Utilities for Vectorization.
   2    Copyright (C) 2003-2019 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "predict.h"
  31 #include "memmodel.h"
  32 #include "tm_p.h"
  33 #include "ssa.h"
  34 #include "optabs-tree.h"
  35 #include "cgraph.h"
  36 #include "dumpfile.h"
  37 #include "alias.h"
  38 #include "fold-const.h"
  39 #include "stor-layout.h"
  40 #include "tree-eh.h"
  41 #include "gimplify.h"
  42 #include "gimple-iterator.h"
  43 #include "gimplify-me.h"
  44 #include "tree-ssa-loop-ivopts.h"
  45 #include "tree-ssa-loop-manip.h"
  46 #include "tree-ssa-loop.h"
  47 #include "cfgloop.h"
  48 #include "tree-scalar-evolution.h"
  49 #include "tree-vectorizer.h"
  50 #include "expr.h"
  51 #include "builtins.h"
  52 #include "params.h"
  53 #include "tree-cfg.h"
  54 #include "tree-hash-traits.h"
  55 #include "vec-perm-indices.h"
  56 #include "internal-fn.h"
  57
  58 /* Return true if load- or store-lanes optab OPTAB is implemented for
  59    COUNT vectors of type VECTYPE.  NAME is the name of OPTAB.  */
  60
  61 static bool
  62 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
  63                               tree vectype, unsigned HOST_WIDE_INT count)
  64 {
  65   machine_mode mode, array_mode;
  66   bool limit_p;
  67
  68   mode = TYPE_MODE (vectype);
  69   if (!targetm.array_mode (mode, count).exists (&array_mode))
  70     {
  71       poly_uint64 bits = count * GET_MODE_BITSIZE (mode);
  72       limit_p = !targetm.array_mode_supported_p (mode, count);
  73       if (!int_mode_for_size (bits, limit_p).exists (&array_mode))
  74         {
  75           if (dump_enabled_p ())
  76             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  77                              "no array mode for %s[%wu]\n",
  78                              GET_MODE_NAME (mode), count);
  79           return false;
  80         }
  81     }
  82
  83   if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
  84     {
  85       if (dump_enabled_p ())
  86         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  87                          "cannot use %s<%s><%s>\n", name,
  88                          GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
  89       return false;
  90     }
  91
  92   if (dump_enabled_p ())
  93     dump_printf_loc (MSG_NOTE, vect_location,
  94                      "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
  95                      GET_MODE_NAME (mode));
  96
  97   return true;
  98 }
  99
 100
 101 /* Return the smallest scalar part of STMT_INFO.
 102    This is used to determine the vectype of the stmt.  We generally set the
 103    vectype according to the type of the result (lhs).  For stmts whose
 104    result-type is different than the type of the arguments (e.g., demotion,
 105    promotion), vectype will be reset appropriately (later).  Note that we have
 106    to visit the smallest datatype in this function, because that determines the
 107    VF.  If the smallest datatype in the loop is present only as the rhs of a
 108    promotion operation - we'd miss it.
 109    Such a case, where a variable of this datatype does not appear in the lhs
 110    anywhere in the loop, can only occur if it's an invariant: e.g.:
 111    'int_x = (int) short_inv', which we'd expect to have been optimized away by
 112    invariant motion.  However, we cannot rely on invariant motion to always
 113    take invariants out of the loop, and so in the case of promotion we also
 114    have to check the rhs.
 115    LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
 116    types.  */
 117
 118 tree
 119 vect_get_smallest_scalar_type (stmt_vec_info stmt_info,
 120                                HOST_WIDE_INT *lhs_size_unit,
 121                                HOST_WIDE_INT *rhs_size_unit)
 122 {
 123   tree scalar_type = gimple_expr_type (stmt_info->stmt);
 124   HOST_WIDE_INT lhs, rhs;
 125
 126   /* During the analysis phase, this function is called on arbitrary
 127      statements that might not have scalar results.  */
 128   if (!tree_fits_uhwi_p (TYPE_SIZE_UNIT (scalar_type)))
 129     return scalar_type;
 130
 131   lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
 132
 133   gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
 134   if (assign
 135       && (gimple_assign_cast_p (assign)
 136           || gimple_assign_rhs_code (assign) == DOT_PROD_EXPR
 137           || gimple_assign_rhs_code (assign) == WIDEN_SUM_EXPR
 138           || gimple_assign_rhs_code (assign) == WIDEN_MULT_EXPR
 139           || gimple_assign_rhs_code (assign) == WIDEN_LSHIFT_EXPR
 140           || gimple_assign_rhs_code (assign) == FLOAT_EXPR))
 141     {
 142       tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
 143
 144       rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 145       if (rhs < lhs)
 146         scalar_type = rhs_type;
 147     }
 148   else if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
 149     {
 150       unsigned int i = 0;
 151       if (gimple_call_internal_p (call))
 152         {
 153           internal_fn ifn = gimple_call_internal_fn (call);
 154           if (internal_load_fn_p (ifn) || internal_store_fn_p (ifn))
 155             /* gimple_expr_type already picked the type of the loaded
 156                or stored data.  */
 157             i = ~0U;
 158           else if (internal_fn_mask_index (ifn) == 0)
 159             i = 1;
 160         }
 161       if (i < gimple_call_num_args (call))
 162         {
 163           tree rhs_type = TREE_TYPE (gimple_call_arg (call, i));
 164           if (tree_fits_uhwi_p (TYPE_SIZE_UNIT (rhs_type)))
 165             {
 166               rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
 167               if (rhs < lhs)
 168                 scalar_type = rhs_type;
 169             }
 170         }
 171     }
 172
 173   *lhs_size_unit = lhs;
 174   *rhs_size_unit = rhs;
 175   return scalar_type;
 176 }
 177
 178
 179 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
 180    tested at run-time.  Return TRUE if DDR was successfully inserted.
 181    Return false if versioning is not supported.  */
 182
 183 static opt_result
 184 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
 185 {
 186   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 187
 188   if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
 189     return opt_result::failure_at (vect_location,
 190                                    "will not create alias checks, as"
 191                                    " --param vect-max-version-for-alias-checks"
 192                                    " == 0\n");
 193
 194   opt_result res
 195     = runtime_alias_check_p (ddr, loop,
 196                              optimize_loop_nest_for_speed_p (loop));
 197   if (!res)
 198     return res;
 199
 200   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
 201   return opt_result::success ();
 202 }
 203
 204 /* Record that loop LOOP_VINFO needs to check that VALUE is nonzero.  */
 205
 206 static void
 207 vect_check_nonzero_value (loop_vec_info loop_vinfo, tree value)
 208 {
 209   vec<tree> checks = LOOP_VINFO_CHECK_NONZERO (loop_vinfo);
 210   for (unsigned int i = 0; i < checks.length(); ++i)
 211     if (checks[i] == value)
 212       return;
 213
 214   if (dump_enabled_p ())
 215     dump_printf_loc (MSG_NOTE, vect_location,
 216                      "need run-time check that %T is nonzero\n",
 217                      value);
 218   LOOP_VINFO_CHECK_NONZERO (loop_vinfo).safe_push (value);
 219 }
 220
 221 /* Return true if we know that the order of vectorized DR_INFO_A and
 222    vectorized DR_INFO_B will be the same as the order of DR_INFO_A and
 223    DR_INFO_B.  At least one of the accesses is a write.  */
 224
 225 static bool
 226 vect_preserves_scalar_order_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b)
 227 {
 228   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 229   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 230
 231   /* Single statements are always kept in their original order.  */
 232   if (!STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
 233       && !STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
 234     return true;
 235
 236   /* STMT_A and STMT_B belong to overlapping groups.  All loads in a
 237      SLP group are emitted at the position of the last scalar load and
 238      all loads in an interleaving group are emitted at the position
 239      of the first scalar load.
 240      Stores in a group are emitted at the position of the last scalar store.
 241      Compute that position and check whether the resulting order matches
 242      the current one.
 243      We have not yet decided between SLP and interleaving so we have
 244      to conservatively assume both.  */
 245   stmt_vec_info il_a;
 246   stmt_vec_info last_a = il_a = DR_GROUP_FIRST_ELEMENT (stmtinfo_a);
 247   if (last_a)
 248     {
 249       for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_a); s;
 250            s = DR_GROUP_NEXT_ELEMENT (s))
 251         last_a = get_later_stmt (last_a, s);
 252       if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_a)))
 253         {
 254           for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_a); s;
 255                s = DR_GROUP_NEXT_ELEMENT (s))
 256             if (get_later_stmt (il_a, s) == il_a)
 257               il_a = s;
 258         }
 259       else
 260         il_a = last_a;
 261     }
 262   else
 263     last_a = il_a = stmtinfo_a;
 264   stmt_vec_info il_b;
 265   stmt_vec_info last_b = il_b = DR_GROUP_FIRST_ELEMENT (stmtinfo_b);
 266   if (last_b)
 267     {
 268       for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (last_b); s;
 269            s = DR_GROUP_NEXT_ELEMENT (s))
 270         last_b = get_later_stmt (last_b, s);
 271       if (!DR_IS_WRITE (STMT_VINFO_DATA_REF (stmtinfo_b)))
 272         {
 273           for (stmt_vec_info s = DR_GROUP_NEXT_ELEMENT (il_b); s;
 274                s = DR_GROUP_NEXT_ELEMENT (s))
 275             if (get_later_stmt (il_b, s) == il_b)
 276               il_b = s;
 277         }
 278       else
 279         il_b = last_b;
 280     }
 281   else
 282     last_b = il_b = stmtinfo_b;
 283   bool a_after_b = (get_later_stmt (stmtinfo_a, stmtinfo_b) == stmtinfo_a);
 284   return (/* SLP */
 285           (get_later_stmt (last_a, last_b) == last_a) == a_after_b
 286           /* Interleaving */
 287           && (get_later_stmt (il_a, il_b) == il_a) == a_after_b
 288           /* Mixed */
 289           && (get_later_stmt (il_a, last_b) == il_a) == a_after_b
 290           && (get_later_stmt (last_a, il_b) == last_a) == a_after_b);
 291 }
 292
 293 /* A subroutine of vect_analyze_data_ref_dependence.  Handle
 294    DDR_COULD_BE_INDEPENDENT_P ddr DDR that has a known set of dependence
 295    distances.  These distances are conservatively correct but they don't
 296    reflect a guaranteed dependence.
 297
 298    Return true if this function does all the work necessary to avoid
 299    an alias or false if the caller should use the dependence distances
 300    to limit the vectorization factor in the usual way.  LOOP_DEPTH is
 301    the depth of the loop described by LOOP_VINFO and the other arguments
 302    are as for vect_analyze_data_ref_dependence.  */
 303
 304 static bool
 305 vect_analyze_possibly_independent_ddr (data_dependence_relation *ddr,
 306                                        loop_vec_info loop_vinfo,
 307                                        int loop_depth, unsigned int *max_vf)
 308 {
 309   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 310   lambda_vector dist_v;
 311   unsigned int i;
 312   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 313     {
 314       int dist = dist_v[loop_depth];
 315       if (dist != 0 && !(dist > 0 && DDR_REVERSED_P (ddr)))
 316         {
 317           /* If the user asserted safelen >= DIST consecutive iterations
 318              can be executed concurrently, assume independence.
 319
 320              ??? An alternative would be to add the alias check even
 321              in this case, and vectorize the fallback loop with the
 322              maximum VF set to safelen.  However, if the user has
 323              explicitly given a length, it's less likely that that
 324              would be a win.  */
 325           if (loop->safelen >= 2 && abs_hwi (dist) <= loop->safelen)
 326             {
 327               if ((unsigned int) loop->safelen < *max_vf)
 328                 *max_vf = loop->safelen;
 329               LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 330               continue;
 331             }
 332
 333           /* For dependence distances of 2 or more, we have the option
 334              of limiting VF or checking for an alias at runtime.
 335              Prefer to check at runtime if we can, to avoid limiting
 336              the VF unnecessarily when the bases are in fact independent.
 337
 338              Note that the alias checks will be removed if the VF ends up
 339              being small enough.  */
 340           dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
 341           dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
 342           return (!STMT_VINFO_GATHER_SCATTER_P (dr_info_a->stmt)
 343                   && !STMT_VINFO_GATHER_SCATTER_P (dr_info_b->stmt)
 344                   && vect_mark_for_runtime_alias_test (ddr, loop_vinfo));
 345         }
 346     }
 347   return true;
 348 }
 349
 350
 351 /* Function vect_analyze_data_ref_dependence.
 352
 353    FIXME: I needed to change the sense of the returned flag.
 354
 355    Return FALSE if there (might) exist a dependence between a memory-reference
 356    DRA and a memory-reference DRB.  When versioning for alias may check a
 357    dependence at run-time, return TRUE.  Adjust *MAX_VF according to
 358    the data dependence.  */
 359
 360 static opt_result
 361 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
 362                                   loop_vec_info loop_vinfo,
 363                                   unsigned int *max_vf)
 364 {
 365   unsigned int i;
 366   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 367   struct data_reference *dra = DDR_A (ddr);
 368   struct data_reference *drb = DDR_B (ddr);
 369   dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (dra);
 370   dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (drb);
 371   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
 372   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
 373   lambda_vector dist_v;
 374   unsigned int loop_depth;
 375
 376   /* In loop analysis all data references should be vectorizable.  */
 377   if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
 378       || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
 379     gcc_unreachable ();
 380
 381   /* Independent data accesses.  */
 382   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 383     return opt_result::success ();
 384
 385   if (dra == drb
 386       || (DR_IS_READ (dra) && DR_IS_READ (drb)))
 387     return opt_result::success ();
 388
 389   /* We do not have to consider dependences between accesses that belong
 390      to the same group, unless the stride could be smaller than the
 391      group size.  */
 392   if (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 393       && (DR_GROUP_FIRST_ELEMENT (stmtinfo_a)
 394           == DR_GROUP_FIRST_ELEMENT (stmtinfo_b))
 395       && !STMT_VINFO_STRIDED_P (stmtinfo_a))
 396     return opt_result::success ();
 397
 398   /* Even if we have an anti-dependence then, as the vectorized loop covers at
 399      least two scalar iterations, there is always also a true dependence.
 400      As the vectorizer does not re-order loads and stores we can ignore
 401      the anti-dependence if TBAA can disambiguate both DRs similar to the
 402      case with known negative distance anti-dependences (positive
 403      distance anti-dependences would violate TBAA constraints).  */
 404   if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
 405        || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
 406       && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
 407                                  get_alias_set (DR_REF (drb))))
 408     return opt_result::success ();
 409
 410   /* Unknown data dependence.  */
 411   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 412     {
 413       /* If user asserted safelen consecutive iterations can be
 414          executed concurrently, assume independence.  */
 415       if (loop->safelen >= 2)
 416         {
 417           if ((unsigned int) loop->safelen < *max_vf)
 418             *max_vf = loop->safelen;
 419           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 420           return opt_result::success ();
 421         }
 422
 423       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 424           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 425         return opt_result::failure_at
 426           (stmtinfo_a->stmt,
 427            "versioning for alias not supported for: "
 428            "can't determine dependence between %T and %T\n",
 429            DR_REF (dra), DR_REF (drb));
 430
 431       if (dump_enabled_p ())
 432         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
 433                          "versioning for alias required: "
 434                          "can't determine dependence between %T and %T\n",
 435                          DR_REF (dra), DR_REF (drb));
 436
 437       /* Add to list of ddrs that need to be tested at run-time.  */
 438       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 439     }
 440
 441   /* Known data dependence.  */
 442   if (DDR_NUM_DIST_VECTS (ddr) == 0)
 443     {
 444       /* If user asserted safelen consecutive iterations can be
 445          executed concurrently, assume independence.  */
 446       if (loop->safelen >= 2)
 447         {
 448           if ((unsigned int) loop->safelen < *max_vf)
 449             *max_vf = loop->safelen;
 450           LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
 451           return opt_result::success ();
 452         }
 453
 454       if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
 455           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
 456         return opt_result::failure_at
 457           (stmtinfo_a->stmt,
 458            "versioning for alias not supported for: "
 459            "bad dist vector for %T and %T\n",
 460            DR_REF (dra), DR_REF (drb));
 461
 462       if (dump_enabled_p ())
 463         dump_printf_loc (MSG_MISSED_OPTIMIZATION, stmtinfo_a->stmt,
 464                          "versioning for alias required: "
 465                          "bad dist vector for %T and %T\n",
 466                          DR_REF (dra), DR_REF (drb));
 467       /* Add to list of ddrs that need to be tested at run-time.  */
 468       return vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
 469     }
 470
 471   loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
 472
 473   if (DDR_COULD_BE_INDEPENDENT_P (ddr)
 474       && vect_analyze_possibly_independent_ddr (ddr, loop_vinfo,
 475                                                 loop_depth, max_vf))
 476     return opt_result::success ();
 477
 478   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
 479     {
 480       int dist = dist_v[loop_depth];
 481
 482       if (dump_enabled_p ())
 483         dump_printf_loc (MSG_NOTE, vect_location,
 484                          "dependence distance  = %d.\n", dist);
 485
 486       if (dist == 0)
 487         {
 488           if (dump_enabled_p ())
 489             dump_printf_loc (MSG_NOTE, vect_location,
 490                              "dependence distance == 0 between %T and %T\n",
 491                              DR_REF (dra), DR_REF (drb));
 492
 493           /* When we perform grouped accesses and perform implicit CSE
 494              by detecting equal accesses and doing disambiguation with
 495              runtime alias tests like for
 496                 .. = a[i];
 497                 .. = a[i+1];
 498                 a[i] = ..;
 499                 a[i+1] = ..;
 500                 *p = ..;
 501                 .. = a[i];
 502                 .. = a[i+1];
 503              where we will end up loading { a[i], a[i+1] } once, make
 504              sure that inserting group loads before the first load and
 505              stores after the last store will do the right thing.
 506              Similar for groups like
 507                 a[i] = ...;
 508                 ... = a[i];
 509                 a[i+1] = ...;
 510              where loads from the group interleave with the store.  */
 511           if (!vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
 512             return opt_result::failure_at (stmtinfo_a->stmt,
 513                                            "READ_WRITE dependence"
 514                                            " in interleaving.\n");
 515
 516           if (loop->safelen < 2)
 517             {
 518               tree indicator = dr_zero_step_indicator (dra);
 519               if (!indicator || integer_zerop (indicator))
 520                 return opt_result::failure_at (stmtinfo_a->stmt,
 521                                                "access also has a zero step\n");
 522               else if (TREE_CODE (indicator) != INTEGER_CST)
 523                 vect_check_nonzero_value (loop_vinfo, indicator);
 524             }
 525           continue;
 526         }
 527
 528       if (dist > 0 && DDR_REVERSED_P (ddr))
 529         {
 530           /* If DDR_REVERSED_P the order of the data-refs in DDR was
 531              reversed (to make distance vector positive), and the actual
 532              distance is negative.  */
 533           if (dump_enabled_p ())
 534             dump_printf_loc (MSG_NOTE, vect_location,
 535                              "dependence distance negative.\n");
 536           /* When doing outer loop vectorization, we need to check if there is
 537              a backward dependence at the inner loop level if the dependence
 538              at the outer loop is reversed.  See PR81740.  */
 539           if (nested_in_vect_loop_p (loop, stmtinfo_a)
 540               || nested_in_vect_loop_p (loop, stmtinfo_b))
 541             {
 542               unsigned inner_depth = index_in_loop_nest (loop->inner->num,
 543                                                          DDR_LOOP_NEST (ddr));
 544               if (dist_v[inner_depth] < 0)
 545                 return opt_result::failure_at (stmtinfo_a->stmt,
 546                                                "not vectorized, dependence "
 547                                                "between data-refs %T and %T\n",
 548                                                DR_REF (dra), DR_REF (drb));
 549             }
 550           /* Record a negative dependence distance to later limit the
 551              amount of stmt copying / unrolling we can perform.
 552              Only need to handle read-after-write dependence.  */
 553           if (DR_IS_READ (drb)
 554               && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
 555                   || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
 556             STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
 557           continue;
 558         }
 559
 560       unsigned int abs_dist = abs (dist);
 561       if (abs_dist >= 2 && abs_dist < *max_vf)
 562         {
 563           /* The dependence distance requires reduction of the maximal
 564              vectorization factor.  */
 565           *max_vf = abs_dist;
 566           if (dump_enabled_p ())
 567             dump_printf_loc (MSG_NOTE, vect_location,
 568                              "adjusting maximal vectorization factor to %i\n",
 569                              *max_vf);
 570         }
 571
 572       if (abs_dist >= *max_vf)
 573         {
 574           /* Dependence distance does not create dependence, as far as
 575              vectorization is concerned, in this case.  */
 576           if (dump_enabled_p ())
 577             dump_printf_loc (MSG_NOTE, vect_location,
 578                              "dependence distance >= VF.\n");
 579           continue;
 580         }
 581
 582       return opt_result::failure_at (stmtinfo_a->stmt,
 583                                      "not vectorized, possible dependence "
 584                                      "between data-refs %T and %T\n",
 585                                      DR_REF (dra), DR_REF (drb));
 586     }
 587
 588   return opt_result::success ();
 589 }
 590
 591 /* Function vect_analyze_data_ref_dependences.
 592
 593    Examine all the data references in the loop, and make sure there do not
 594    exist any data dependences between them.  Set *MAX_VF according to
 595    the maximum vectorization factor the data dependences allow.  */
 596
 597 opt_result
 598 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
 599                                    unsigned int *max_vf)
 600 {
 601   unsigned int i;
 602   struct data_dependence_relation *ddr;
 603
 604   DUMP_VECT_SCOPE ("vect_analyze_data_ref_dependences");
 605
 606   if (!LOOP_VINFO_DDRS (loop_vinfo).exists ())
 607     {
 608       LOOP_VINFO_DDRS (loop_vinfo)
 609         .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
 610                  * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
 611       /* We need read-read dependences to compute
 612          STMT_VINFO_SAME_ALIGN_REFS.  */
 613       bool res = compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
 614                                           &LOOP_VINFO_DDRS (loop_vinfo),
 615                                           LOOP_VINFO_LOOP_NEST (loop_vinfo),
 616                                           true);
 617       gcc_assert (res);
 618     }
 619
 620   LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
 621
 622   /* For epilogues we either have no aliases or alias versioning
 623      was applied to original loop.  Therefore we may just get max_vf
 624      using VF of original loop.  */
 625   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
 626     *max_vf = LOOP_VINFO_ORIG_MAX_VECT_FACTOR (loop_vinfo);
 627   else
 628     FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
 629       {
 630         opt_result res
 631           = vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf);
 632         if (!res)
 633           return res;
 634       }
 635
 636   return opt_result::success ();
 637 }
 638
 639
 640 /* Function vect_slp_analyze_data_ref_dependence.
 641
 642    Return TRUE if there (might) exist a dependence between a memory-reference
 643    DRA and a memory-reference DRB for VINFO.  When versioning for alias
 644    may check a dependence at run-time, return FALSE.  Adjust *MAX_VF
 645    according to the data dependence.  */
 646
 647 static bool
 648 vect_slp_analyze_data_ref_dependence (vec_info *vinfo,
 649                                       struct data_dependence_relation *ddr)
 650 {
 651   struct data_reference *dra = DDR_A (ddr);
 652   struct data_reference *drb = DDR_B (ddr);
 653   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
 654   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
 655
 656   /* We need to check dependences of statements marked as unvectorizable
 657      as well, they still can prohibit vectorization.  */
 658
 659   /* Independent data accesses.  */
 660   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
 661     return false;
 662
 663   if (dra == drb)
 664     return false;
 665
 666   /* Read-read is OK.  */
 667   if (DR_IS_READ (dra) && DR_IS_READ (drb))
 668     return false;
 669
 670   /* If dra and drb are part of the same interleaving chain consider
 671      them independent.  */
 672   if (STMT_VINFO_GROUPED_ACCESS (dr_info_a->stmt)
 673       && (DR_GROUP_FIRST_ELEMENT (dr_info_a->stmt)
 674           == DR_GROUP_FIRST_ELEMENT (dr_info_b->stmt)))
 675     return false;
 676
 677   /* Unknown data dependence.  */
 678   if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
 679     {
 680       if  (dump_enabled_p ())
 681         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 682                          "can't determine dependence between %T and %T\n",
 683                          DR_REF (dra), DR_REF (drb));
 684     }
 685   else if (dump_enabled_p ())
 686     dump_printf_loc (MSG_NOTE, vect_location,
 687                      "determined dependence between %T and %T\n",
 688                      DR_REF (dra), DR_REF (drb));
 689
 690   return true;
 691 }
 692
 693
 694 /* Analyze dependences involved in the transform of SLP NODE.  STORES
 695    contain the vector of scalar stores of this instance if we are
 696    disambiguating the loads.  */
 697
 698 static bool
 699 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node,
 700                                    vec<stmt_vec_info> stores,
 701                                    stmt_vec_info last_store_info)
 702 {
 703   /* This walks over all stmts involved in the SLP load/store done
 704      in NODE verifying we can sink them up to the last stmt in the
 705      group.  */
 706   stmt_vec_info last_access_info = vect_find_last_scalar_stmt_in_slp (node);
 707   vec_info *vinfo = last_access_info->vinfo;
 708   for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 709     {
 710       stmt_vec_info access_info = SLP_TREE_SCALAR_STMTS (node)[k];
 711       if (access_info == last_access_info)
 712         continue;
 713       data_reference *dr_a = STMT_VINFO_DATA_REF (access_info);
 714       ao_ref ref;
 715       bool ref_initialized_p = false;
 716       for (gimple_stmt_iterator gsi = gsi_for_stmt (access_info->stmt);
 717            gsi_stmt (gsi) != last_access_info->stmt; gsi_next (&gsi))
 718         {
 719           gimple *stmt = gsi_stmt (gsi);
 720           if (! gimple_vuse (stmt)
 721               || (DR_IS_READ (dr_a) && ! gimple_vdef (stmt)))
 722             continue;
 723
 724           /* If we couldn't record a (single) data reference for this
 725              stmt we have to resort to the alias oracle.  */
 726           stmt_vec_info stmt_info = vinfo->lookup_stmt (stmt);
 727           data_reference *dr_b = STMT_VINFO_DATA_REF (stmt_info);
 728           if (!dr_b)
 729             {
 730               /* We are moving a store or sinking a load - this means
 731                  we cannot use TBAA for disambiguation.  */
 732               if (!ref_initialized_p)
 733                 ao_ref_init (&ref, DR_REF (dr_a));
 734               if (stmt_may_clobber_ref_p_1 (stmt, &ref, false)
 735                   || ref_maybe_used_by_stmt_p (stmt, &ref, false))
 736                 return false;
 737               continue;
 738             }
 739
 740           bool dependent = false;
 741           /* If we run into a store of this same instance (we've just
 742              marked those) then delay dependence checking until we run
 743              into the last store because this is where it will have
 744              been sunk to (and we verify if we can do that as well).  */
 745           if (gimple_visited_p (stmt))
 746             {
 747               if (stmt_info != last_store_info)
 748                 continue;
 749               unsigned i;
 750               stmt_vec_info store_info;
 751               FOR_EACH_VEC_ELT (stores, i, store_info)
 752                 {
 753                   data_reference *store_dr = STMT_VINFO_DATA_REF (store_info);
 754                   ddr_p ddr = initialize_data_dependence_relation
 755                                 (dr_a, store_dr, vNULL);
 756                   dependent
 757                     = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 758                   free_dependence_relation (ddr);
 759                   if (dependent)
 760                     break;
 761                 }
 762             }
 763           else
 764             {
 765               ddr_p ddr = initialize_data_dependence_relation (dr_a,
 766                                                                dr_b, vNULL);
 767               dependent = vect_slp_analyze_data_ref_dependence (vinfo, ddr);
 768               free_dependence_relation (ddr);
 769             }
 770           if (dependent)
 771             return false;
 772         }
 773     }
 774   return true;
 775 }
 776
 777
 778 /* Function vect_analyze_data_ref_dependences.
 779
 780    Examine all the data references in the basic-block, and make sure there
 781    do not exist any data dependences between them.  Set *MAX_VF according to
 782    the maximum vectorization factor the data dependences allow.  */
 783
 784 bool
 785 vect_slp_analyze_instance_dependence (slp_instance instance)
 786 {
 787   DUMP_VECT_SCOPE ("vect_slp_analyze_instance_dependence");
 788
 789   /* The stores of this instance are at the root of the SLP tree.  */
 790   slp_tree store = SLP_INSTANCE_TREE (instance);
 791   if (! STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (store)[0]))
 792     store = NULL;
 793
 794   /* Verify we can sink stores to the vectorized stmt insert location.  */
 795   stmt_vec_info last_store_info = NULL;
 796   if (store)
 797     {
 798       if (! vect_slp_analyze_node_dependences (instance, store, vNULL, NULL))
 799         return false;
 800
 801       /* Mark stores in this instance and remember the last one.  */
 802       last_store_info = vect_find_last_scalar_stmt_in_slp (store);
 803       for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 804         gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, true);
 805     }
 806
 807   bool res = true;
 808
 809   /* Verify we can sink loads to the vectorized stmt insert location,
 810      special-casing stores of this instance.  */
 811   slp_tree load;
 812   unsigned int i;
 813   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load)
 814     if (! vect_slp_analyze_node_dependences (instance, load,
 815                                              store
 816                                              ? SLP_TREE_SCALAR_STMTS (store)
 817                                              : vNULL, last_store_info))
 818       {
 819         res = false;
 820         break;
 821       }
 822
 823   /* Unset the visited flag.  */
 824   if (store)
 825     for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
 826       gimple_set_visited (SLP_TREE_SCALAR_STMTS (store)[k]->stmt, false);
 827
 828   return res;
 829 }
 830
 831 /* Record the base alignment guarantee given by DRB, which occurs
 832    in STMT_INFO.  */
 833
 834 static void
 835 vect_record_base_alignment (stmt_vec_info stmt_info,
 836                             innermost_loop_behavior *drb)
 837 {
 838   vec_info *vinfo = stmt_info->vinfo;
 839   bool existed;
 840   innermost_loop_behavior *&entry
 841     = vinfo->base_alignments.get_or_insert (drb->base_address, &existed);
 842   if (!existed || entry->base_alignment < drb->base_alignment)
 843     {
 844       entry = drb;
 845       if (dump_enabled_p ())
 846         dump_printf_loc (MSG_NOTE, vect_location,
 847                          "recording new base alignment for %T\n"
 848                          "  alignment:    %d\n"
 849                          "  misalignment: %d\n"
 850                          "  based on:     %G",
 851                          drb->base_address,
 852                          drb->base_alignment,
 853                          drb->base_misalignment,
 854                          stmt_info->stmt);
 855     }
 856 }
 857
 858 /* If the region we're going to vectorize is reached, all unconditional
 859    data references occur at least once.  We can therefore pool the base
 860    alignment guarantees from each unconditional reference.  Do this by
 861    going through all the data references in VINFO and checking whether
 862    the containing statement makes the reference unconditionally.  If so,
 863    record the alignment of the base address in VINFO so that it can be
 864    used for all other references with the same base.  */
 865
 866 void
 867 vect_record_base_alignments (vec_info *vinfo)
 868 {
 869   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
 870   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
 871   data_reference *dr;
 872   unsigned int i;
 873   FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
 874     {
 875       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
 876       stmt_vec_info stmt_info = dr_info->stmt;
 877       if (!DR_IS_CONDITIONAL_IN_STMT (dr)
 878           && STMT_VINFO_VECTORIZABLE (stmt_info)
 879           && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
 880         {
 881           vect_record_base_alignment (stmt_info, &DR_INNERMOST (dr));
 882
 883           /* If DR is nested in the loop that is being vectorized, we can also
 884              record the alignment of the base wrt the outer loop.  */
 885           if (loop && nested_in_vect_loop_p (loop, stmt_info))
 886             vect_record_base_alignment
 887               (stmt_info, &STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info));
 888         }
 889     }
 890 }
 891
 892 /* Return the target alignment for the vectorized form of DR_INFO.  */
 893
 894 static poly_uint64
 895 vect_calculate_target_alignment (dr_vec_info *dr_info)
 896 {
 897   tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
 898   return targetm.vectorize.preferred_vector_alignment (vectype);
 899 }
 900
 901 /* Function vect_compute_data_ref_alignment
 902
 903    Compute the misalignment of the data reference DR_INFO.
 904
 905    Output:
 906    1. DR_MISALIGNMENT (DR_INFO) is defined.
 907
 908    FOR NOW: No analysis is actually performed. Misalignment is calculated
 909    only for trivial cases. TODO.  */
 910
 911 static void
 912 vect_compute_data_ref_alignment (dr_vec_info *dr_info)
 913 {
 914   stmt_vec_info stmt_info = dr_info->stmt;
 915   vec_base_alignments *base_alignments = &stmt_info->vinfo->base_alignments;
 916   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
 917   struct loop *loop = NULL;
 918   tree ref = DR_REF (dr_info->dr);
 919   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
 920
 921   if (dump_enabled_p ())
 922     dump_printf_loc (MSG_NOTE, vect_location,
 923                      "vect_compute_data_ref_alignment:\n");
 924
 925   if (loop_vinfo)
 926     loop = LOOP_VINFO_LOOP (loop_vinfo);
 927
 928   /* Initialize misalignment to unknown.  */
 929   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
 930
 931   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
 932     return;
 933
 934   innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
 935   bool step_preserves_misalignment_p;
 936
 937   poly_uint64 vector_alignment
 938     = exact_div (vect_calculate_target_alignment (dr_info), BITS_PER_UNIT);
 939   DR_TARGET_ALIGNMENT (dr_info) = vector_alignment;
 940
 941   unsigned HOST_WIDE_INT vect_align_c;
 942   if (!vector_alignment.is_constant (&vect_align_c))
 943     return;
 944
 945   /* No step for BB vectorization.  */
 946   if (!loop)
 947     {
 948       gcc_assert (integer_zerop (drb->step));
 949       step_preserves_misalignment_p = true;
 950     }
 951
 952   /* In case the dataref is in an inner-loop of the loop that is being
 953      vectorized (LOOP), we use the base and misalignment information
 954      relative to the outer-loop (LOOP).  This is ok only if the misalignment
 955      stays the same throughout the execution of the inner-loop, which is why
 956      we have to check that the stride of the dataref in the inner-loop evenly
 957      divides by the vector alignment.  */
 958   else if (nested_in_vect_loop_p (loop, stmt_info))
 959     {
 960       step_preserves_misalignment_p
 961         = (DR_STEP_ALIGNMENT (dr_info->dr) % vect_align_c) == 0;
 962
 963       if (dump_enabled_p ())
 964         {
 965           if (step_preserves_misalignment_p)
 966             dump_printf_loc (MSG_NOTE, vect_location,
 967                              "inner step divides the vector alignment.\n");
 968           else
 969             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 970                              "inner step doesn't divide the vector"
 971                              " alignment.\n");
 972         }
 973     }
 974
 975   /* Similarly we can only use base and misalignment information relative to
 976      an innermost loop if the misalignment stays the same throughout the
 977      execution of the loop.  As above, this is the case if the stride of
 978      the dataref evenly divides by the alignment.  */
 979   else
 980     {
 981       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 982       step_preserves_misalignment_p
 983         = multiple_p (DR_STEP_ALIGNMENT (dr_info->dr) * vf, vect_align_c);
 984
 985       if (!step_preserves_misalignment_p && dump_enabled_p ())
 986         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 987                          "step doesn't divide the vector alignment.\n");
 988     }
 989
 990   unsigned int base_alignment = drb->base_alignment;
 991   unsigned int base_misalignment = drb->base_misalignment;
 992
 993   /* Calculate the maximum of the pooled base address alignment and the
 994      alignment that we can compute for DR itself.  */
 995   innermost_loop_behavior **entry = base_alignments->get (drb->base_address);
 996   if (entry && base_alignment < (*entry)->base_alignment)
 997     {
 998       base_alignment = (*entry)->base_alignment;
 999       base_misalignment = (*entry)->base_misalignment;
1000     }
1001
1002   if (drb->offset_alignment < vect_align_c
1003       || !step_preserves_misalignment_p
1004       /* We need to know whether the step wrt the vectorized loop is
1005          negative when computing the starting misalignment below.  */
1006       || TREE_CODE (drb->step) != INTEGER_CST)
1007     {
1008       if (dump_enabled_p ())
1009         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1010                          "Unknown alignment for access: %T\n", ref);
1011       return;
1012     }
1013
1014   if (base_alignment < vect_align_c)
1015     {
1016       unsigned int max_alignment;
1017       tree base = get_base_for_alignment (drb->base_address, &max_alignment);
1018       if (max_alignment < vect_align_c
1019           || !vect_can_force_dr_alignment_p (base,
1020                                              vect_align_c * BITS_PER_UNIT))
1021         {
1022           if (dump_enabled_p ())
1023             dump_printf_loc (MSG_NOTE, vect_location,
1024                              "can't force alignment of ref: %T\n", ref);
1025           return;
1026         }
1027
1028       /* Force the alignment of the decl.
1029          NOTE: This is the only change to the code we make during
1030          the analysis phase, before deciding to vectorize the loop.  */
1031       if (dump_enabled_p ())
1032         dump_printf_loc (MSG_NOTE, vect_location,
1033                          "force alignment of %T\n", ref);
1034
1035       dr_info->base_decl = base;
1036       dr_info->base_misaligned = true;
1037       base_misalignment = 0;
1038     }
1039   poly_int64 misalignment
1040     = base_misalignment + wi::to_poly_offset (drb->init).force_shwi ();
1041
1042   /* If this is a backward running DR then first access in the larger
1043      vectype actually is N-1 elements before the address in the DR.
1044      Adjust misalign accordingly.  */
1045   if (tree_int_cst_sgn (drb->step) < 0)
1046     /* PLUS because STEP is negative.  */
1047     misalignment += ((TYPE_VECTOR_SUBPARTS (vectype) - 1)
1048                      * TREE_INT_CST_LOW (drb->step));
1049
1050   unsigned int const_misalignment;
1051   if (!known_misalignment (misalignment, vect_align_c, &const_misalignment))
1052     {
1053       if (dump_enabled_p ())
1054         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055                          "Non-constant misalignment for access: %T\n", ref);
1056       return;
1057     }
1058
1059   SET_DR_MISALIGNMENT (dr_info, const_misalignment);
1060
1061   if (dump_enabled_p ())
1062     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1063                      "misalign = %d bytes of ref %T\n",
1064                      DR_MISALIGNMENT (dr_info), ref);
1065
1066   return;
1067 }
1068
1069 /* Function vect_update_misalignment_for_peel.
1070    Sets DR_INFO's misalignment
1071    - to 0 if it has the same alignment as DR_PEEL_INFO,
1072    - to the misalignment computed using NPEEL if DR_INFO's salignment is known,
1073    - to -1 (unknown) otherwise.
1074
1075    DR_INFO - the data reference whose misalignment is to be adjusted.
1076    DR_PEEL_INFO - the data reference whose misalignment is being made
1077                   zero in the vector loop by the peel.
1078    NPEEL - the number of iterations in the peel loop if the misalignment
1079            of DR_PEEL_INFO is known at compile time.  */
1080
1081 static void
1082 vect_update_misalignment_for_peel (dr_vec_info *dr_info,
1083                                    dr_vec_info *dr_peel_info, int npeel)
1084 {
1085   unsigned int i;
1086   vec<dr_p> same_aligned_drs;
1087   struct data_reference *current_dr;
1088   stmt_vec_info peel_stmt_info = dr_peel_info->stmt;
1089
1090   /* It can be assumed that if dr_info has the same alignment as dr_peel,
1091      it is aligned in the vector loop.  */
1092   same_aligned_drs = STMT_VINFO_SAME_ALIGN_REFS (peel_stmt_info);
1093   FOR_EACH_VEC_ELT (same_aligned_drs, i, current_dr)
1094     {
1095       if (current_dr != dr_info->dr)
1096         continue;
1097       gcc_assert (!known_alignment_for_access_p (dr_info)
1098                   || !known_alignment_for_access_p (dr_peel_info)
1099                   || (DR_MISALIGNMENT (dr_info)
1100                       == DR_MISALIGNMENT (dr_peel_info)));
1101       SET_DR_MISALIGNMENT (dr_info, 0);
1102       return;
1103     }
1104
1105   unsigned HOST_WIDE_INT alignment;
1106   if (DR_TARGET_ALIGNMENT (dr_info).is_constant (&alignment)
1107       && known_alignment_for_access_p (dr_info)
1108       && known_alignment_for_access_p (dr_peel_info))
1109     {
1110       int misal = DR_MISALIGNMENT (dr_info);
1111       misal += npeel * TREE_INT_CST_LOW (DR_STEP (dr_info->dr));
1112       misal &= alignment - 1;
1113       SET_DR_MISALIGNMENT (dr_info, misal);
1114       return;
1115     }
1116
1117   if (dump_enabled_p ())
1118     dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment " \
1119                      "to unknown (-1).\n");
1120   SET_DR_MISALIGNMENT (dr_info, DR_MISALIGNMENT_UNKNOWN);
1121 }
1122
1123
1124 /* Function verify_data_ref_alignment
1125
1126    Return TRUE if DR_INFO can be handled with respect to alignment.  */
1127
1128 static opt_result
1129 verify_data_ref_alignment (dr_vec_info *dr_info)
1130 {
1131   enum dr_alignment_support supportable_dr_alignment
1132     = vect_supportable_dr_alignment (dr_info, false);
1133   if (!supportable_dr_alignment)
1134     return opt_result::failure_at
1135       (dr_info->stmt->stmt,
1136        DR_IS_READ (dr_info->dr)
1137         ? "not vectorized: unsupported unaligned load: %T\n"
1138         : "not vectorized: unsupported unaligned store: %T\n",
1139        DR_REF (dr_info->dr));
1140
1141   if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1142     dump_printf_loc (MSG_NOTE, vect_location,
1143                      "Vectorizing an unaligned access.\n");
1144
1145   return opt_result::success ();
1146 }
1147
1148 /* Function vect_verify_datarefs_alignment
1149
1150    Return TRUE if all data references in the loop can be
1151    handled with respect to alignment.  */
1152
1153 opt_result
1154 vect_verify_datarefs_alignment (loop_vec_info vinfo)
1155 {
1156   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
1157   struct data_reference *dr;
1158   unsigned int i;
1159
1160   FOR_EACH_VEC_ELT (datarefs, i, dr)
1161     {
1162       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
1163       stmt_vec_info stmt_info = dr_info->stmt;
1164
1165       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1166         continue;
1167
1168       /* For interleaving, only the alignment of the first access matters.   */
1169       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1170           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1171         continue;
1172
1173       /* Strided accesses perform only component accesses, alignment is
1174          irrelevant for them.  */
1175       if (STMT_VINFO_STRIDED_P (stmt_info)
1176           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1177         continue;
1178
1179       opt_result res = verify_data_ref_alignment (dr_info);
1180       if (!res)
1181         return res;
1182     }
1183
1184   return opt_result::success ();
1185 }
1186
1187 /* Given an memory reference EXP return whether its alignment is less
1188    than its size.  */
1189
1190 static bool
1191 not_size_aligned (tree exp)
1192 {
1193   if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1194     return true;
1195
1196   return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1197           > get_object_alignment (exp));
1198 }
1199
1200 /* Function vector_alignment_reachable_p
1201
1202    Return true if vector alignment for DR_INFO is reachable by peeling
1203    a few loop iterations.  Return false otherwise.  */
1204
1205 static bool
1206 vector_alignment_reachable_p (dr_vec_info *dr_info)
1207 {
1208   stmt_vec_info stmt_info = dr_info->stmt;
1209   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1210
1211   if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1212     {
1213       /* For interleaved access we peel only if number of iterations in
1214          the prolog loop ({VF - misalignment}), is a multiple of the
1215          number of the interleaved accesses.  */
1216       int elem_size, mis_in_elements;
1217
1218       /* FORNOW: handle only known alignment.  */
1219       if (!known_alignment_for_access_p (dr_info))
1220         return false;
1221
1222       poly_uint64 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1223       poly_uint64 vector_size = GET_MODE_SIZE (TYPE_MODE (vectype));
1224       elem_size = vector_element_size (vector_size, nelements);
1225       mis_in_elements = DR_MISALIGNMENT (dr_info) / elem_size;
1226
1227       if (!multiple_p (nelements - mis_in_elements, DR_GROUP_SIZE (stmt_info)))
1228         return false;
1229     }
1230
1231   /* If misalignment is known at the compile time then allow peeling
1232      only if natural alignment is reachable through peeling.  */
1233   if (known_alignment_for_access_p (dr_info) && !aligned_access_p (dr_info))
1234     {
1235       HOST_WIDE_INT elmsize =
1236                 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1237       if (dump_enabled_p ())
1238         {
1239           dump_printf_loc (MSG_NOTE, vect_location,
1240                            "data size = %wd. misalignment = %d.\n", elmsize,
1241                            DR_MISALIGNMENT (dr_info));
1242         }
1243       if (DR_MISALIGNMENT (dr_info) % elmsize)
1244         {
1245           if (dump_enabled_p ())
1246             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1247                              "data size does not divide the misalignment.\n");
1248           return false;
1249         }
1250     }
1251
1252   if (!known_alignment_for_access_p (dr_info))
1253     {
1254       tree type = TREE_TYPE (DR_REF (dr_info->dr));
1255       bool is_packed = not_size_aligned (DR_REF (dr_info->dr));
1256       if (dump_enabled_p ())
1257         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258                          "Unknown misalignment, %snaturally aligned\n",
1259                          is_packed ? "not " : "");
1260       return targetm.vectorize.vector_alignment_reachable (type, is_packed);
1261     }
1262
1263   return true;
1264 }
1265
1266
1267 /* Calculate the cost of the memory access represented by DR_INFO.  */
1268
1269 static void
1270 vect_get_data_access_cost (dr_vec_info *dr_info,
1271                            unsigned int *inside_cost,
1272                            unsigned int *outside_cost,
1273                            stmt_vector_for_cost *body_cost_vec,
1274                            stmt_vector_for_cost *prologue_cost_vec)
1275 {
1276   stmt_vec_info stmt_info = dr_info->stmt;
1277   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1278   int ncopies;
1279
1280   if (PURE_SLP_STMT (stmt_info))
1281     ncopies = 1;
1282   else
1283     ncopies = vect_get_num_copies (loop_vinfo, STMT_VINFO_VECTYPE (stmt_info));
1284
1285   if (DR_IS_READ (dr_info->dr))
1286     vect_get_load_cost (stmt_info, ncopies, true, inside_cost, outside_cost,
1287                         prologue_cost_vec, body_cost_vec, false);
1288   else
1289     vect_get_store_cost (stmt_info, ncopies, inside_cost, body_cost_vec);
1290
1291   if (dump_enabled_p ())
1292     dump_printf_loc (MSG_NOTE, vect_location,
1293                      "vect_get_data_access_cost: inside_cost = %d, "
1294                      "outside_cost = %d.\n", *inside_cost, *outside_cost);
1295 }
1296
1297
1298 typedef struct _vect_peel_info
1299 {
1300   dr_vec_info *dr_info;
1301   int npeel;
1302   unsigned int count;
1303 } *vect_peel_info;
1304
1305 typedef struct _vect_peel_extended_info
1306 {
1307   struct _vect_peel_info peel_info;
1308   unsigned int inside_cost;
1309   unsigned int outside_cost;
1310 } *vect_peel_extended_info;
1311
1312
1313 /* Peeling hashtable helpers.  */
1314
1315 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1316 {
1317   static inline hashval_t hash (const _vect_peel_info *);
1318   static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1319 };
1320
1321 inline hashval_t
1322 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1323 {
1324   return (hashval_t) peel_info->npeel;
1325 }
1326
1327 inline bool
1328 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1329 {
1330   return (a->npeel == b->npeel);
1331 }
1332
1333
1334 /* Insert DR_INFO into peeling hash table with NPEEL as key.  */
1335
1336 static void
1337 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1338                           loop_vec_info loop_vinfo, dr_vec_info *dr_info,
1339                           int npeel)
1340 {
1341   struct _vect_peel_info elem, *slot;
1342   _vect_peel_info **new_slot;
1343   bool supportable_dr_alignment
1344     = vect_supportable_dr_alignment (dr_info, true);
1345
1346   elem.npeel = npeel;
1347   slot = peeling_htab->find (&elem);
1348   if (slot)
1349     slot->count++;
1350   else
1351     {
1352       slot = XNEW (struct _vect_peel_info);
1353       slot->npeel = npeel;
1354       slot->dr_info = dr_info;
1355       slot->count = 1;
1356       new_slot = peeling_htab->find_slot (slot, INSERT);
1357       *new_slot = slot;
1358     }
1359
1360   if (!supportable_dr_alignment
1361       && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1362     slot->count += VECT_MAX_COST;
1363 }
1364
1365
1366 /* Traverse peeling hash table to find peeling option that aligns maximum
1367    number of data accesses.  */
1368
1369 int
1370 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1371                                      _vect_peel_extended_info *max)
1372 {
1373   vect_peel_info elem = *slot;
1374
1375   if (elem->count > max->peel_info.count
1376       || (elem->count == max->peel_info.count
1377           && max->peel_info.npeel > elem->npeel))
1378     {
1379       max->peel_info.npeel = elem->npeel;
1380       max->peel_info.count = elem->count;
1381       max->peel_info.dr_info = elem->dr_info;
1382     }
1383
1384   return 1;
1385 }
1386
1387 /* Get the costs of peeling NPEEL iterations for LOOP_VINFO, checking
1388    data access costs for all data refs.  If UNKNOWN_MISALIGNMENT is true,
1389    we assume DR0_INFO's misalignment will be zero after peeling.  */
1390
1391 static void
1392 vect_get_peeling_costs_all_drs (loop_vec_info loop_vinfo,
1393                                 dr_vec_info *dr0_info,
1394                                 unsigned int *inside_cost,
1395                                 unsigned int *outside_cost,
1396                                 stmt_vector_for_cost *body_cost_vec,
1397                                 stmt_vector_for_cost *prologue_cost_vec,
1398                                 unsigned int npeel,
1399                                 bool unknown_misalignment)
1400 {
1401   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1402   unsigned i;
1403   data_reference *dr;
1404
1405   FOR_EACH_VEC_ELT (datarefs, i, dr)
1406     {
1407       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1408       stmt_vec_info stmt_info = dr_info->stmt;
1409       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1410         continue;
1411
1412       /* For interleaving, only the alignment of the first access
1413          matters.  */
1414       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1415           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1416         continue;
1417
1418       /* Strided accesses perform only component accesses, alignment is
1419          irrelevant for them.  */
1420       if (STMT_VINFO_STRIDED_P (stmt_info)
1421           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1422         continue;
1423
1424       int save_misalignment;
1425       save_misalignment = DR_MISALIGNMENT (dr_info);
1426       if (npeel == 0)
1427         ;
1428       else if (unknown_misalignment && dr_info == dr0_info)
1429         SET_DR_MISALIGNMENT (dr_info, 0);
1430       else
1431         vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1432       vect_get_data_access_cost (dr_info, inside_cost, outside_cost,
1433                                  body_cost_vec, prologue_cost_vec);
1434       SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1435     }
1436 }
1437
1438 /* Traverse peeling hash table and calculate cost for each peeling option.
1439    Find the one with the lowest cost.  */
1440
1441 int
1442 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1443                                    _vect_peel_extended_info *min)
1444 {
1445   vect_peel_info elem = *slot;
1446   int dummy;
1447   unsigned int inside_cost = 0, outside_cost = 0;
1448   stmt_vec_info stmt_info = elem->dr_info->stmt;
1449   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1450   stmt_vector_for_cost prologue_cost_vec, body_cost_vec,
1451                        epilogue_cost_vec;
1452
1453   prologue_cost_vec.create (2);
1454   body_cost_vec.create (2);
1455   epilogue_cost_vec.create (2);
1456
1457   vect_get_peeling_costs_all_drs (loop_vinfo, elem->dr_info, &inside_cost,
1458                                   &outside_cost, &body_cost_vec,
1459                                   &prologue_cost_vec, elem->npeel, false);
1460
1461   body_cost_vec.release ();
1462
1463   outside_cost += vect_get_known_peeling_cost
1464     (loop_vinfo, elem->npeel, &dummy,
1465      &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1466      &prologue_cost_vec, &epilogue_cost_vec);
1467
1468   /* Prologue and epilogue costs are added to the target model later.
1469      These costs depend only on the scalar iteration cost, the
1470      number of peeling iterations finally chosen, and the number of
1471      misaligned statements.  So discard the information found here.  */
1472   prologue_cost_vec.release ();
1473   epilogue_cost_vec.release ();
1474
1475   if (inside_cost < min->inside_cost
1476       || (inside_cost == min->inside_cost
1477           && outside_cost < min->outside_cost))
1478     {
1479       min->inside_cost = inside_cost;
1480       min->outside_cost = outside_cost;
1481       min->peel_info.dr_info = elem->dr_info;
1482       min->peel_info.npeel = elem->npeel;
1483       min->peel_info.count = elem->count;
1484     }
1485
1486   return 1;
1487 }
1488
1489
1490 /* Choose best peeling option by traversing peeling hash table and either
1491    choosing an option with the lowest cost (if cost model is enabled) or the
1492    option that aligns as many accesses as possible.  */
1493
1494 static struct _vect_peel_extended_info
1495 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1496                                        loop_vec_info loop_vinfo)
1497 {
1498    struct _vect_peel_extended_info res;
1499
1500    res.peel_info.dr_info = NULL;
1501
1502    if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1503      {
1504        res.inside_cost = INT_MAX;
1505        res.outside_cost = INT_MAX;
1506        peeling_htab->traverse <_vect_peel_extended_info *,
1507                                vect_peeling_hash_get_lowest_cost> (&res);
1508      }
1509    else
1510      {
1511        res.peel_info.count = 0;
1512        peeling_htab->traverse <_vect_peel_extended_info *,
1513                                vect_peeling_hash_get_most_frequent> (&res);
1514        res.inside_cost = 0;
1515        res.outside_cost = 0;
1516      }
1517
1518    return res;
1519 }
1520
1521 /* Return true if the new peeling NPEEL is supported.  */
1522
1523 static bool
1524 vect_peeling_supportable (loop_vec_info loop_vinfo, dr_vec_info *dr0_info,
1525                           unsigned npeel)
1526 {
1527   unsigned i;
1528   struct data_reference *dr = NULL;
1529   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1530   enum dr_alignment_support supportable_dr_alignment;
1531
1532   /* Ensure that all data refs can be vectorized after the peel.  */
1533   FOR_EACH_VEC_ELT (datarefs, i, dr)
1534     {
1535       int save_misalignment;
1536
1537       if (dr == dr0_info->dr)
1538         continue;
1539
1540       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1541       stmt_vec_info stmt_info = dr_info->stmt;
1542       /* For interleaving, only the alignment of the first access
1543          matters.  */
1544       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1545           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1546         continue;
1547
1548       /* Strided accesses perform only component accesses, alignment is
1549          irrelevant for them.  */
1550       if (STMT_VINFO_STRIDED_P (stmt_info)
1551           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1552         continue;
1553
1554       save_misalignment = DR_MISALIGNMENT (dr_info);
1555       vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
1556       supportable_dr_alignment
1557         = vect_supportable_dr_alignment (dr_info, false);
1558       SET_DR_MISALIGNMENT (dr_info, save_misalignment);
1559
1560       if (!supportable_dr_alignment)
1561         return false;
1562     }
1563
1564   return true;
1565 }
1566
1567 /* Function vect_enhance_data_refs_alignment
1568
1569    This pass will use loop versioning and loop peeling in order to enhance
1570    the alignment of data references in the loop.
1571
1572    FOR NOW: we assume that whatever versioning/peeling takes place, only the
1573    original loop is to be vectorized.  Any other loops that are created by
1574    the transformations performed in this pass - are not supposed to be
1575    vectorized.  This restriction will be relaxed.
1576
1577    This pass will require a cost model to guide it whether to apply peeling
1578    or versioning or a combination of the two.  For example, the scheme that
1579    intel uses when given a loop with several memory accesses, is as follows:
1580    choose one memory access ('p') which alignment you want to force by doing
1581    peeling.  Then, either (1) generate a loop in which 'p' is aligned and all
1582    other accesses are not necessarily aligned, or (2) use loop versioning to
1583    generate one loop in which all accesses are aligned, and another loop in
1584    which only 'p' is necessarily aligned.
1585
1586    ("Automatic Intra-Register Vectorization for the Intel Architecture",
1587    Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1588    Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1589
1590    Devising a cost model is the most critical aspect of this work.  It will
1591    guide us on which access to peel for, whether to use loop versioning, how
1592    many versions to create, etc.  The cost model will probably consist of
1593    generic considerations as well as target specific considerations (on
1594    powerpc for example, misaligned stores are more painful than misaligned
1595    loads).
1596
1597    Here are the general steps involved in alignment enhancements:
1598
1599      -- original loop, before alignment analysis:
1600         for (i=0; i<N; i++){
1601           x = q[i];                     # DR_MISALIGNMENT(q) = unknown
1602           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1603         }
1604
1605      -- After vect_compute_data_refs_alignment:
1606         for (i=0; i<N; i++){
1607           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1608           p[i] = y;                     # DR_MISALIGNMENT(p) = unknown
1609         }
1610
1611      -- Possibility 1: we do loop versioning:
1612      if (p is aligned) {
1613         for (i=0; i<N; i++){    # loop 1A
1614           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1615           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1616         }
1617      }
1618      else {
1619         for (i=0; i<N; i++){    # loop 1B
1620           x = q[i];                     # DR_MISALIGNMENT(q) = 3
1621           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1622         }
1623      }
1624
1625      -- Possibility 2: we do loop peeling:
1626      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1627         x = q[i];
1628         p[i] = y;
1629      }
1630      for (i = 3; i < N; i++){   # loop 2A
1631         x = q[i];                       # DR_MISALIGNMENT(q) = 0
1632         p[i] = y;                       # DR_MISALIGNMENT(p) = unknown
1633      }
1634
1635      -- Possibility 3: combination of loop peeling and versioning:
1636      for (i = 0; i < 3; i++){   # (scalar loop, not to be vectorized).
1637         x = q[i];
1638         p[i] = y;
1639      }
1640      if (p is aligned) {
1641         for (i = 3; i<N; i++){  # loop 3A
1642           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1643           p[i] = y;                     # DR_MISALIGNMENT(p) = 0
1644         }
1645      }
1646      else {
1647         for (i = 3; i<N; i++){  # loop 3B
1648           x = q[i];                     # DR_MISALIGNMENT(q) = 0
1649           p[i] = y;                     # DR_MISALIGNMENT(p) = unaligned
1650         }
1651      }
1652
1653      These loops are later passed to loop_transform to be vectorized.  The
1654      vectorizer will use the alignment information to guide the transformation
1655      (whether to generate regular loads/stores, or with special handling for
1656      misalignment).  */
1657
1658 opt_result
1659 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1660 {
1661   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1662   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1663   enum dr_alignment_support supportable_dr_alignment;
1664   dr_vec_info *first_store = NULL;
1665   dr_vec_info *dr0_info = NULL;
1666   struct data_reference *dr;
1667   unsigned int i, j;
1668   bool do_peeling = false;
1669   bool do_versioning = false;
1670   unsigned int npeel = 0;
1671   bool one_misalignment_known = false;
1672   bool one_misalignment_unknown = false;
1673   bool one_dr_unsupportable = false;
1674   dr_vec_info *unsupportable_dr_info = NULL;
1675   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1676   unsigned possible_npeel_number = 1;
1677   tree vectype;
1678   unsigned int mis, same_align_drs_max = 0;
1679   hash_table<peel_info_hasher> peeling_htab (1);
1680
1681   DUMP_VECT_SCOPE ("vect_enhance_data_refs_alignment");
1682
1683   /* Reset data so we can safely be called multiple times.  */
1684   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1685   LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1686
1687   /* While cost model enhancements are expected in the future, the high level
1688      view of the code at this time is as follows:
1689
1690      A) If there is a misaligned access then see if peeling to align
1691         this access can make all data references satisfy
1692         vect_supportable_dr_alignment.  If so, update data structures
1693         as needed and return true.
1694
1695      B) If peeling wasn't possible and there is a data reference with an
1696         unknown misalignment that does not satisfy vect_supportable_dr_alignment
1697         then see if loop versioning checks can be used to make all data
1698         references satisfy vect_supportable_dr_alignment.  If so, update
1699         data structures as needed and return true.
1700
1701      C) If neither peeling nor versioning were successful then return false if
1702         any data reference does not satisfy vect_supportable_dr_alignment.
1703
1704      D) Return true (all data references satisfy vect_supportable_dr_alignment).
1705
1706      Note, Possibility 3 above (which is peeling and versioning together) is not
1707      being done at this time.  */
1708
1709   /* (1) Peeling to force alignment.  */
1710
1711   /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1712      Considerations:
1713      + How many accesses will become aligned due to the peeling
1714      - How many accesses will become unaligned due to the peeling,
1715        and the cost of misaligned accesses.
1716      - The cost of peeling (the extra runtime checks, the increase
1717        in code size).  */
1718
1719   FOR_EACH_VEC_ELT (datarefs, i, dr)
1720     {
1721       dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1722       stmt_vec_info stmt_info = dr_info->stmt;
1723
1724       if (!STMT_VINFO_RELEVANT_P (stmt_info))
1725         continue;
1726
1727       /* For interleaving, only the alignment of the first access
1728          matters.  */
1729       if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1730           && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info)
1731         continue;
1732
1733       /* For scatter-gather or invariant accesses there is nothing
1734          to enhance.  */
1735       if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1736           || integer_zerop (DR_STEP (dr)))
1737         continue;
1738
1739       /* Strided accesses perform only component accesses, alignment is
1740          irrelevant for them.  */
1741       if (STMT_VINFO_STRIDED_P (stmt_info)
1742           && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1743         continue;
1744
1745       supportable_dr_alignment = vect_supportable_dr_alignment (dr_info, true);
1746       do_peeling = vector_alignment_reachable_p (dr_info);
1747       if (do_peeling)
1748         {
1749           if (known_alignment_for_access_p (dr_info))
1750             {
1751               unsigned int npeel_tmp = 0;
1752               bool negative = tree_int_cst_compare (DR_STEP (dr),
1753                                                     size_zero_node) < 0;
1754
1755               vectype = STMT_VINFO_VECTYPE (stmt_info);
1756               /* If known_alignment_for_access_p then we have set
1757                  DR_MISALIGNMENT which is only done if we know it at compiler
1758                  time, so it is safe to assume target alignment is constant.
1759                */
1760               unsigned int target_align =
1761                 DR_TARGET_ALIGNMENT (dr_info).to_constant ();
1762               unsigned int dr_size = vect_get_scalar_dr_size (dr_info);
1763               mis = (negative
1764                      ? DR_MISALIGNMENT (dr_info)
1765                      : -DR_MISALIGNMENT (dr_info));
1766               if (DR_MISALIGNMENT (dr_info) != 0)
1767                 npeel_tmp = (mis & (target_align - 1)) / dr_size;
1768
1769               /* For multiple types, it is possible that the bigger type access
1770                  will have more than one peeling option.  E.g., a loop with two
1771                  types: one of size (vector size / 4), and the other one of
1772                  size (vector size / 8).  Vectorization factor will 8.  If both
1773                  accesses are misaligned by 3, the first one needs one scalar
1774                  iteration to be aligned, and the second one needs 5.  But the
1775                  first one will be aligned also by peeling 5 scalar
1776                  iterations, and in that case both accesses will be aligned.
1777                  Hence, except for the immediate peeling amount, we also want
1778                  to try to add full vector size, while we don't exceed
1779                  vectorization factor.
1780                  We do this automatically for cost model, since we calculate
1781                  cost for every peeling option.  */
1782               if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1783                 {
1784                   poly_uint64 nscalars = (STMT_SLP_TYPE (stmt_info)
1785                                           ? vf * DR_GROUP_SIZE (stmt_info) : vf);
1786                   possible_npeel_number
1787                     = vect_get_num_vectors (nscalars, vectype);
1788
1789                   /* NPEEL_TMP is 0 when there is no misalignment, but also
1790                      allow peeling NELEMENTS.  */
1791                   if (DR_MISALIGNMENT (dr_info) == 0)
1792                     possible_npeel_number++;
1793                 }
1794
1795               /* Save info about DR in the hash table.  Also include peeling
1796                  amounts according to the explanation above.  */
1797               for (j = 0; j < possible_npeel_number; j++)
1798                 {
1799                   vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1800                                             dr_info, npeel_tmp);
1801                   npeel_tmp += target_align / dr_size;
1802                 }
1803
1804               one_misalignment_known = true;
1805             }
1806           else
1807             {
1808               /* If we don't know any misalignment values, we prefer
1809                  peeling for data-ref that has the maximum number of data-refs
1810                  with the same alignment, unless the target prefers to align
1811                  stores over load.  */
1812               unsigned same_align_drs
1813                 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1814               if (!dr0_info
1815                   || same_align_drs_max < same_align_drs)
1816                 {
1817                   same_align_drs_max = same_align_drs;
1818                   dr0_info = dr_info;
1819                 }
1820               /* For data-refs with the same number of related
1821                  accesses prefer the one where the misalign
1822                  computation will be invariant in the outermost loop.  */
1823               else if (same_align_drs_max == same_align_drs)
1824                 {
1825                   struct loop *ivloop0, *ivloop;
1826                   ivloop0 = outermost_invariant_loop_for_expr
1827                     (loop, DR_BASE_ADDRESS (dr0_info->dr));
1828                   ivloop = outermost_invariant_loop_for_expr
1829                     (loop, DR_BASE_ADDRESS (dr));
1830                   if ((ivloop && !ivloop0)
1831                       || (ivloop && ivloop0
1832                           && flow_loop_nested_p (ivloop, ivloop0)))
1833                     dr0_info = dr_info;
1834                 }
1835
1836               one_misalignment_unknown = true;
1837
1838               /* Check for data refs with unsupportable alignment that
1839                  can be peeled.  */
1840               if (!supportable_dr_alignment)
1841               {
1842                 one_dr_unsupportable = true;
1843                 unsupportable_dr_info = dr_info;
1844               }
1845
1846               if (!first_store && DR_IS_WRITE (dr))
1847                 first_store = dr_info;
1848             }
1849         }
1850       else
1851         {
1852           if (!aligned_access_p (dr_info))
1853             {
1854               if (dump_enabled_p ())
1855                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856                                  "vector alignment may not be reachable\n");
1857               break;
1858             }
1859         }
1860     }
1861
1862   /* Check if we can possibly peel the loop.  */
1863   if (!vect_can_advance_ivs_p (loop_vinfo)
1864       || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1865       || loop->inner)
1866     do_peeling = false;
1867
1868   struct _vect_peel_extended_info peel_for_known_alignment;
1869   struct _vect_peel_extended_info peel_for_unknown_alignment;
1870   struct _vect_peel_extended_info best_peel;
1871
1872   peel_for_unknown_alignment.inside_cost = INT_MAX;
1873   peel_for_unknown_alignment.outside_cost = INT_MAX;
1874   peel_for_unknown_alignment.peel_info.count = 0;
1875
1876   if (do_peeling
1877       && one_misalignment_unknown)
1878     {
1879       /* Check if the target requires to prefer stores over loads, i.e., if
1880          misaligned stores are more expensive than misaligned loads (taking
1881          drs with same alignment into account).  */
1882       unsigned int load_inside_cost = 0;
1883       unsigned int load_outside_cost = 0;
1884       unsigned int store_inside_cost = 0;
1885       unsigned int store_outside_cost = 0;
1886       unsigned int estimated_npeels = vect_vf_for_cost (loop_vinfo) / 2;
1887
1888       stmt_vector_for_cost dummy;
1889       dummy.create (2);
1890       vect_get_peeling_costs_all_drs (loop_vinfo, dr0_info,
1891                                       &load_inside_cost,
1892                                       &load_outside_cost,
1893                                       &dummy, &dummy, estimated_npeels, true);
1894       dummy.release ();
1895
1896       if (first_store)
1897         {
1898           dummy.create (2);
1899           vect_get_peeling_costs_all_drs (loop_vinfo, first_store,
1900                                           &store_inside_cost,
1901                                           &store_outside_cost,
1902                                           &dummy, &dummy,
1903                                           estimated_npeels, true);
1904           dummy.release ();
1905         }
1906       else
1907         {
1908           store_inside_cost = INT_MAX;
1909           store_outside_cost = INT_MAX;
1910         }
1911
1912       if (load_inside_cost > store_inside_cost
1913           || (load_inside_cost == store_inside_cost
1914               && load_outside_cost > store_outside_cost))
1915         {
1916           dr0_info = first_store;
1917           peel_for_unknown_alignment.inside_cost = store_inside_cost;
1918           peel_for_unknown_alignment.outside_cost = store_outside_cost;
1919         }
1920       else
1921         {
1922           peel_for_unknown_alignment.inside_cost = load_inside_cost;
1923           peel_for_unknown_alignment.outside_cost = load_outside_cost;
1924         }
1925
1926       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1927       prologue_cost_vec.create (2);
1928       epilogue_cost_vec.create (2);
1929
1930       int dummy2;
1931       peel_for_unknown_alignment.outside_cost += vect_get_known_peeling_cost
1932         (loop_vinfo, estimated_npeels, &dummy2,
1933          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1934          &prologue_cost_vec, &epilogue_cost_vec);
1935
1936       prologue_cost_vec.release ();
1937       epilogue_cost_vec.release ();
1938
1939       peel_for_unknown_alignment.peel_info.count = 1
1940         + STMT_VINFO_SAME_ALIGN_REFS (dr0_info->stmt).length ();
1941     }
1942
1943   peel_for_unknown_alignment.peel_info.npeel = 0;
1944   peel_for_unknown_alignment.peel_info.dr_info = dr0_info;
1945
1946   best_peel = peel_for_unknown_alignment;
1947
1948   peel_for_known_alignment.inside_cost = INT_MAX;
1949   peel_for_known_alignment.outside_cost = INT_MAX;
1950   peel_for_known_alignment.peel_info.count = 0;
1951   peel_for_known_alignment.peel_info.dr_info = NULL;
1952
1953   if (do_peeling && one_misalignment_known)
1954     {
1955       /* Peeling is possible, but there is no data access that is not supported
1956          unless aligned.  So we try to choose the best possible peeling from
1957          the hash table.  */
1958       peel_for_known_alignment = vect_peeling_hash_choose_best_peeling
1959         (&peeling_htab, loop_vinfo);
1960     }
1961
1962   /* Compare costs of peeling for known and unknown alignment. */
1963   if (peel_for_known_alignment.peel_info.dr_info != NULL
1964       && peel_for_unknown_alignment.inside_cost
1965       >= peel_for_known_alignment.inside_cost)
1966     {
1967       best_peel = peel_for_known_alignment;
1968
1969       /* If the best peeling for known alignment has NPEEL == 0, perform no
1970          peeling at all except if there is an unsupportable dr that we can
1971          align.  */
1972       if (best_peel.peel_info.npeel == 0 && !one_dr_unsupportable)
1973         do_peeling = false;
1974     }
1975
1976   /* If there is an unsupportable data ref, prefer this over all choices so far
1977      since we'd have to discard a chosen peeling except when it accidentally
1978      aligned the unsupportable data ref.  */
1979   if (one_dr_unsupportable)
1980     dr0_info = unsupportable_dr_info;
1981   else if (do_peeling)
1982     {
1983       /* Calculate the penalty for no peeling, i.e. leaving everything as-is.
1984          TODO: Use nopeel_outside_cost or get rid of it?  */
1985       unsigned nopeel_inside_cost = 0;
1986       unsigned nopeel_outside_cost = 0;
1987
1988       stmt_vector_for_cost dummy;
1989       dummy.create (2);
1990       vect_get_peeling_costs_all_drs (loop_vinfo, NULL, &nopeel_inside_cost,
1991                                       &nopeel_outside_cost, &dummy, &dummy,
1992                                       0, false);
1993       dummy.release ();
1994
1995       /* Add epilogue costs.  As we do not peel for alignment here, no prologue
1996          costs will be recorded.  */
1997       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
1998       prologue_cost_vec.create (2);
1999       epilogue_cost_vec.create (2);
2000
2001       int dummy2;
2002       nopeel_outside_cost += vect_get_known_peeling_cost
2003         (loop_vinfo, 0, &dummy2,
2004          &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
2005          &prologue_cost_vec, &epilogue_cost_vec);
2006
2007       prologue_cost_vec.release ();
2008       epilogue_cost_vec.release ();
2009
2010       npeel = best_peel.peel_info.npeel;
2011       dr0_info = best_peel.peel_info.dr_info;
2012
2013       /* If no peeling is not more expensive than the best peeling we
2014          have so far, don't perform any peeling.  */
2015       if (nopeel_inside_cost <= best_peel.inside_cost)
2016         do_peeling = false;
2017     }
2018
2019   if (do_peeling)
2020     {
2021       stmt_vec_info stmt_info = dr0_info->stmt;
2022       vectype = STMT_VINFO_VECTYPE (stmt_info);
2023
2024       if (known_alignment_for_access_p (dr0_info))
2025         {
2026           bool negative = tree_int_cst_compare (DR_STEP (dr0_info->dr),
2027                                                 size_zero_node) < 0;
2028           if (!npeel)
2029             {
2030               /* Since it's known at compile time, compute the number of
2031                  iterations in the peeled loop (the peeling factor) for use in
2032                  updating DR_MISALIGNMENT values.  The peeling factor is the
2033                  vectorization factor minus the misalignment as an element
2034                  count.  */
2035               mis = (negative
2036                      ? DR_MISALIGNMENT (dr0_info)
2037                      : -DR_MISALIGNMENT (dr0_info));
2038               /* If known_alignment_for_access_p then we have set
2039                  DR_MISALIGNMENT which is only done if we know it at compiler
2040                  time, so it is safe to assume target alignment is constant.
2041                */
2042               unsigned int target_align =
2043                 DR_TARGET_ALIGNMENT (dr0_info).to_constant ();
2044               npeel = ((mis & (target_align - 1))
2045                        / vect_get_scalar_dr_size (dr0_info));
2046             }
2047
2048           /* For interleaved data access every iteration accesses all the
2049              members of the group, therefore we divide the number of iterations
2050              by the group size.  */
2051           if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2052             npeel /= DR_GROUP_SIZE (stmt_info);
2053
2054           if (dump_enabled_p ())
2055             dump_printf_loc (MSG_NOTE, vect_location,
2056                              "Try peeling by %d\n", npeel);
2057         }
2058
2059       /* Ensure that all datarefs can be vectorized after the peel.  */
2060       if (!vect_peeling_supportable (loop_vinfo, dr0_info, npeel))
2061         do_peeling = false;
2062
2063       /* Check if all datarefs are supportable and log.  */
2064       if (do_peeling && known_alignment_for_access_p (dr0_info) && npeel == 0)
2065         {
2066           opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2067           if (!stat)
2068             do_peeling = false;
2069           else
2070             return stat;
2071         }
2072
2073       /* Cost model #1 - honor --param vect-max-peeling-for-alignment.  */
2074       if (do_peeling)
2075         {
2076           unsigned max_allowed_peel
2077             = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
2078           if (max_allowed_peel != (unsigned)-1)
2079             {
2080               unsigned max_peel = npeel;
2081               if (max_peel == 0)
2082                 {
2083                   poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr0_info);
2084                   unsigned HOST_WIDE_INT target_align_c;
2085                   if (target_align.is_constant (&target_align_c))
2086                     max_peel =
2087                       target_align_c / vect_get_scalar_dr_size (dr0_info) - 1;
2088                   else
2089                     {
2090                       do_peeling = false;
2091                       if (dump_enabled_p ())
2092                         dump_printf_loc (MSG_NOTE, vect_location,
2093                           "Disable peeling, max peels set and vector"
2094                           " alignment unknown\n");
2095                     }
2096                 }
2097               if (max_peel > max_allowed_peel)
2098                 {
2099                   do_peeling = false;
2100                   if (dump_enabled_p ())
2101                     dump_printf_loc (MSG_NOTE, vect_location,
2102                         "Disable peeling, max peels reached: %d\n", max_peel);
2103                 }
2104             }
2105         }
2106
2107       /* Cost model #2 - if peeling may result in a remaining loop not
2108          iterating enough to be vectorized then do not peel.  Since this
2109          is a cost heuristic rather than a correctness decision, use the
2110          most likely runtime value for variable vectorization factors.  */
2111       if (do_peeling
2112           && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2113         {
2114           unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2115           unsigned int max_peel = npeel == 0 ? assumed_vf - 1 : npeel;
2116           if ((unsigned HOST_WIDE_INT) LOOP_VINFO_INT_NITERS (loop_vinfo)
2117               < assumed_vf + max_peel)
2118             do_peeling = false;
2119         }
2120
2121       if (do_peeling)
2122         {
2123           /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
2124              If the misalignment of DR_i is identical to that of dr0 then set
2125              DR_MISALIGNMENT (DR_i) to zero.  If the misalignment of DR_i and
2126              dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
2127              by the peeling factor times the element size of DR_i (MOD the
2128              vectorization factor times the size).  Otherwise, the
2129              misalignment of DR_i must be set to unknown.  */
2130           FOR_EACH_VEC_ELT (datarefs, i, dr)
2131             if (dr != dr0_info->dr)
2132               {
2133                 /* Strided accesses perform only component accesses, alignment
2134                    is irrelevant for them.  */
2135                 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2136                 stmt_info = dr_info->stmt;
2137                 if (STMT_VINFO_STRIDED_P (stmt_info)
2138                     && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
2139                   continue;
2140
2141                 vect_update_misalignment_for_peel (dr_info, dr0_info, npeel);
2142               }
2143
2144           LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0_info;
2145           if (npeel)
2146             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
2147           else
2148             LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2149               = DR_MISALIGNMENT (dr0_info);
2150           SET_DR_MISALIGNMENT (dr0_info, 0);
2151           if (dump_enabled_p ())
2152             {
2153               dump_printf_loc (MSG_NOTE, vect_location,
2154                                "Alignment of access forced using peeling.\n");
2155               dump_printf_loc (MSG_NOTE, vect_location,
2156                                "Peeling for alignment will be applied.\n");
2157             }
2158
2159           /* The inside-loop cost will be accounted for in vectorizable_load
2160              and vectorizable_store correctly with adjusted alignments.
2161              Drop the body_cst_vec on the floor here.  */
2162           opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2163           gcc_assert (stat);
2164           return stat;
2165         }
2166     }
2167
2168   /* (2) Versioning to force alignment.  */
2169
2170   /* Try versioning if:
2171      1) optimize loop for speed
2172      2) there is at least one unsupported misaligned data ref with an unknown
2173         misalignment, and
2174      3) all misaligned data refs with a known misalignment are supported, and
2175      4) the number of runtime alignment checks is within reason.  */
2176
2177   do_versioning =
2178         optimize_loop_nest_for_speed_p (loop)
2179         && (!loop->inner); /* FORNOW */
2180
2181   if (do_versioning)
2182     {
2183       FOR_EACH_VEC_ELT (datarefs, i, dr)
2184         {
2185           dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
2186           stmt_vec_info stmt_info = dr_info->stmt;
2187
2188           /* For interleaving, only the alignment of the first access
2189              matters.  */
2190           if (aligned_access_p (dr_info)
2191               || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2192                   && DR_GROUP_FIRST_ELEMENT (stmt_info) != stmt_info))
2193             continue;
2194
2195           if (STMT_VINFO_STRIDED_P (stmt_info))
2196             {
2197               /* Strided loads perform only component accesses, alignment is
2198                  irrelevant for them.  */
2199               if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
2200                 continue;
2201               do_versioning = false;
2202               break;
2203             }
2204
2205           supportable_dr_alignment
2206             = vect_supportable_dr_alignment (dr_info, false);
2207
2208           if (!supportable_dr_alignment)
2209             {
2210               int mask;
2211               tree vectype;
2212
2213               if (known_alignment_for_access_p (dr_info)
2214                   || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
2215                      >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2216                 {
2217                   do_versioning = false;
2218                   break;
2219                 }
2220
2221               vectype = STMT_VINFO_VECTYPE (stmt_info);
2222               gcc_assert (vectype);
2223
2224               /* At present we don't support versioning for alignment
2225                  with variable VF, since there's no guarantee that the
2226                  VF is a power of two.  We could relax this if we added
2227                  a way of enforcing a power-of-two size.  */
2228               unsigned HOST_WIDE_INT size;
2229               if (!GET_MODE_SIZE (TYPE_MODE (vectype)).is_constant (&size))
2230                 {
2231                   do_versioning = false;
2232                   break;
2233                 }
2234
2235               /* Forcing alignment in the first iteration is no good if
2236                  we don't keep it across iterations.  For now, just disable
2237                  versioning in this case.
2238                  ?? We could actually unroll the loop to achieve the required
2239                  overall step alignment, and forcing the alignment could be
2240                  done by doing some iterations of the non-vectorized loop.  */
2241               if (!multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
2242                                * DR_STEP_ALIGNMENT (dr),
2243                                DR_TARGET_ALIGNMENT (dr_info)))
2244                 {
2245                   do_versioning = false;
2246                   break;
2247                 }
2248
2249               /* The rightmost bits of an aligned address must be zeros.
2250                  Construct the mask needed for this test.  For example,
2251                  GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2252                  mask must be 15 = 0xf. */
2253               mask = size - 1;
2254
2255               /* FORNOW: use the same mask to test all potentially unaligned
2256                  references in the loop.  The vectorizer currently supports
2257                  a single vector size, see the reference to
2258                  GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2259                  vectorization factor is computed.  */
2260               gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2261                           || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2262               LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2263               LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (stmt_info);
2264             }
2265         }
2266
2267       /* Versioning requires at least one misaligned data reference.  */
2268       if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2269         do_versioning = false;
2270       else if (!do_versioning)
2271         LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
2272     }
2273
2274   if (do_versioning)
2275     {
2276       vec<stmt_vec_info> may_misalign_stmts
2277         = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2278       stmt_vec_info stmt_info;
2279
2280       /* It can now be assumed that the data references in the statements
2281          in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2282          of the loop being vectorized.  */
2283       FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2284         {
2285           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
2286           SET_DR_MISALIGNMENT (dr_info, 0);
2287           if (dump_enabled_p ())
2288             dump_printf_loc (MSG_NOTE, vect_location,
2289                              "Alignment of access forced using versioning.\n");
2290         }
2291
2292       if (dump_enabled_p ())
2293         dump_printf_loc (MSG_NOTE, vect_location,
2294                          "Versioning for alignment will be applied.\n");
2295
2296       /* Peeling and versioning can't be done together at this time.  */
2297       gcc_assert (! (do_peeling && do_versioning));
2298
2299       opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2300       gcc_assert (stat);
2301       return stat;
2302     }
2303
2304   /* This point is reached if neither peeling nor versioning is being done.  */
2305   gcc_assert (! (do_peeling || do_versioning));
2306
2307   opt_result stat = vect_verify_datarefs_alignment (loop_vinfo);
2308   return stat;
2309 }
2310
2311
2312 /* Function vect_find_same_alignment_drs.
2313
2314    Update group and alignment relations in VINFO according to the chosen
2315    vectorization factor.  */
2316
2317 static void
2318 vect_find_same_alignment_drs (vec_info *vinfo, data_dependence_relation *ddr)
2319 {
2320   struct data_reference *dra = DDR_A (ddr);
2321   struct data_reference *drb = DDR_B (ddr);
2322   dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2323   dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2324   stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2325   stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2326
2327   if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2328     return;
2329
2330   if (dra == drb)
2331     return;
2332
2333   if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
2334       || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2335     return;
2336
2337   if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
2338       || !operand_equal_p (DR_OFFSET (dra), DR_OFFSET (drb), 0)
2339       || !operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2340     return;
2341
2342   /* Two references with distance zero have the same alignment.  */
2343   poly_offset_int diff = (wi::to_poly_offset (DR_INIT (dra))
2344                           - wi::to_poly_offset (DR_INIT (drb)));
2345   if (maybe_ne (diff, 0))
2346     {
2347       /* Get the wider of the two alignments.  */
2348       poly_uint64 align_a =
2349         exact_div (vect_calculate_target_alignment (dr_info_a),
2350                    BITS_PER_UNIT);
2351       poly_uint64 align_b =
2352         exact_div (vect_calculate_target_alignment (dr_info_b),
2353                    BITS_PER_UNIT);
2354       unsigned HOST_WIDE_INT align_a_c, align_b_c;
2355       if (!align_a.is_constant (&align_a_c)
2356           || !align_b.is_constant (&align_b_c))
2357         return;
2358
2359       unsigned HOST_WIDE_INT max_align = MAX (align_a_c, align_b_c);
2360
2361       /* Require the gap to be a multiple of the larger vector alignment.  */
2362       if (!multiple_p (diff, max_align))
2363         return;
2364     }
2365
2366   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2367   STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2368   if (dump_enabled_p ())
2369     dump_printf_loc (MSG_NOTE, vect_location,
2370                      "accesses have the same alignment: %T and %T\n",
2371                      DR_REF (dra), DR_REF (drb));
2372 }
2373
2374
2375 /* Function vect_analyze_data_refs_alignment
2376
2377    Analyze the alignment of the data-references in the loop.
2378    Return FALSE if a data reference is found that cannot be vectorized.  */
2379
2380 opt_result
2381 vect_analyze_data_refs_alignment (loop_vec_info vinfo)
2382 {
2383   DUMP_VECT_SCOPE ("vect_analyze_data_refs_alignment");
2384
2385   /* Mark groups of data references with same alignment using
2386      data dependence information.  */
2387   vec<ddr_p> ddrs = vinfo->shared->ddrs;
2388   struct data_dependence_relation *ddr;
2389   unsigned int i;
2390
2391   FOR_EACH_VEC_ELT (ddrs, i, ddr)
2392     vect_find_same_alignment_drs (vinfo, ddr);
2393
2394   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2395   struct data_reference *dr;
2396
2397   vect_record_base_alignments (vinfo);
2398   FOR_EACH_VEC_ELT (datarefs, i, dr)
2399     {
2400       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
2401       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt))
2402         vect_compute_data_ref_alignment (dr_info);
2403     }
2404
2405   return opt_result::success ();
2406 }
2407
2408
2409 /* Analyze alignment of DRs of stmts in NODE.  */
2410
2411 static bool
2412 vect_slp_analyze_and_verify_node_alignment (slp_tree node)
2413 {
2414   /* We vectorize from the first scalar stmt in the node unless
2415      the node is permuted in which case we start from the first
2416      element in the group.  */
2417   stmt_vec_info first_stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
2418   dr_vec_info *first_dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2419   if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2420     first_stmt_info = DR_GROUP_FIRST_ELEMENT (first_stmt_info);
2421
2422   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_stmt_info);
2423   vect_compute_data_ref_alignment (dr_info);
2424   /* For creating the data-ref pointer we need alignment of the
2425      first element anyway.  */
2426   if (dr_info != first_dr_info)
2427     vect_compute_data_ref_alignment (first_dr_info);
2428   if (! verify_data_ref_alignment (dr_info))
2429     {
2430       if (dump_enabled_p ())
2431         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2432                          "not vectorized: bad data alignment in basic "
2433                          "block.\n");
2434       return false;
2435     }
2436
2437   return true;
2438 }
2439
2440 /* Function vect_slp_analyze_instance_alignment
2441
2442    Analyze the alignment of the data-references in the SLP instance.
2443    Return FALSE if a data reference is found that cannot be vectorized.  */
2444
2445 bool
2446 vect_slp_analyze_and_verify_instance_alignment (slp_instance instance)
2447 {
2448   DUMP_VECT_SCOPE ("vect_slp_analyze_and_verify_instance_alignment");
2449
2450   slp_tree node;
2451   unsigned i;
2452   FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, node)
2453     if (! vect_slp_analyze_and_verify_node_alignment (node))
2454       return false;
2455
2456   node = SLP_INSTANCE_TREE (instance);
2457   if (STMT_VINFO_DATA_REF (SLP_TREE_SCALAR_STMTS (node)[0])
2458       && ! vect_slp_analyze_and_verify_node_alignment
2459              (SLP_INSTANCE_TREE (instance)))
2460     return false;
2461
2462   return true;
2463 }
2464
2465
2466 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2467    accesses of legal size, step, etc.  Detect gaps, single element
2468    interleaving, and other special cases. Set grouped access info.
2469    Collect groups of strided stores for further use in SLP analysis.
2470    Worker for vect_analyze_group_access.  */
2471
2472 static bool
2473 vect_analyze_group_access_1 (dr_vec_info *dr_info)
2474 {
2475   data_reference *dr = dr_info->dr;
2476   tree step = DR_STEP (dr);
2477   tree scalar_type = TREE_TYPE (DR_REF (dr));
2478   HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2479   stmt_vec_info stmt_info = dr_info->stmt;
2480   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2481   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2482   HOST_WIDE_INT dr_step = -1;
2483   HOST_WIDE_INT groupsize, last_accessed_element = 1;
2484   bool slp_impossible = false;
2485
2486   /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2487      size of the interleaving group (including gaps).  */
2488   if (tree_fits_shwi_p (step))
2489     {
2490       dr_step = tree_to_shwi (step);
2491       /* Check that STEP is a multiple of type size.  Otherwise there is
2492          a non-element-sized gap at the end of the group which we
2493          cannot represent in DR_GROUP_GAP or DR_GROUP_SIZE.
2494          ???  As we can handle non-constant step fine here we should
2495          simply remove uses of DR_GROUP_GAP between the last and first
2496          element and instead rely on DR_STEP.  DR_GROUP_SIZE then would
2497          simply not include that gap.  */
2498       if ((dr_step % type_size) != 0)
2499         {
2500           if (dump_enabled_p ())
2501             dump_printf_loc (MSG_NOTE, vect_location,
2502                              "Step %T is not a multiple of the element size"
2503                              " for %T\n",
2504                              step, DR_REF (dr));
2505           return false;
2506         }
2507       groupsize = absu_hwi (dr_step) / type_size;
2508     }
2509   else
2510     groupsize = 0;
2511
2512   /* Not consecutive access is possible only if it is a part of interleaving.  */
2513   if (!DR_GROUP_FIRST_ELEMENT (stmt_info))
2514     {
2515       /* Check if it this DR is a part of interleaving, and is a single
2516          element of the group that is accessed in the loop.  */
2517
2518       /* Gaps are supported only for loads. STEP must be a multiple of the type
2519          size.  */
2520       if (DR_IS_READ (dr)
2521           && (dr_step % type_size) == 0
2522           && groupsize > 0)
2523         {
2524           DR_GROUP_FIRST_ELEMENT (stmt_info) = stmt_info;
2525           DR_GROUP_SIZE (stmt_info) = groupsize;
2526           DR_GROUP_GAP (stmt_info) = groupsize - 1;
2527           if (dump_enabled_p ())
2528             dump_printf_loc (MSG_NOTE, vect_location,
2529                              "Detected single element interleaving %T"
2530                              " step %T\n",
2531                              DR_REF (dr), step);
2532
2533           return true;
2534         }
2535
2536       if (dump_enabled_p ())
2537         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2538                          "not consecutive access %G", stmt_info->stmt);
2539
2540       if (bb_vinfo)
2541         {
2542           /* Mark the statement as unvectorizable.  */
2543           STMT_VINFO_VECTORIZABLE (stmt_info) = false;
2544           return true;
2545         }
2546
2547       if (dump_enabled_p ())
2548         dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2549       STMT_VINFO_STRIDED_P (stmt_info) = true;
2550       return true;
2551     }
2552
2553   if (DR_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info)
2554     {
2555       /* First stmt in the interleaving chain. Check the chain.  */
2556       stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2557       struct data_reference *data_ref = dr;
2558       unsigned int count = 1;
2559       tree prev_init = DR_INIT (data_ref);
2560       HOST_WIDE_INT diff, gaps = 0;
2561
2562       /* By construction, all group members have INTEGER_CST DR_INITs.  */
2563       while (next)
2564         {
2565           /* We never have the same DR multiple times.  */
2566           gcc_assert (tree_int_cst_compare (DR_INIT (data_ref),
2567                                 DR_INIT (STMT_VINFO_DATA_REF (next))) != 0);
2568
2569           data_ref = STMT_VINFO_DATA_REF (next);
2570
2571           /* All group members have the same STEP by construction.  */
2572           gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2573
2574           /* Check that the distance between two accesses is equal to the type
2575              size. Otherwise, we have gaps.  */
2576           diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2577                   - TREE_INT_CST_LOW (prev_init)) / type_size;
2578           if (diff != 1)
2579             {
2580               /* FORNOW: SLP of accesses with gaps is not supported.  */
2581               slp_impossible = true;
2582               if (DR_IS_WRITE (data_ref))
2583                 {
2584                   if (dump_enabled_p ())
2585                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2586                                      "interleaved store with gaps\n");
2587                   return false;
2588                 }
2589
2590               gaps += diff - 1;
2591             }
2592
2593           last_accessed_element += diff;
2594
2595           /* Store the gap from the previous member of the group. If there is no
2596              gap in the access, DR_GROUP_GAP is always 1.  */
2597           DR_GROUP_GAP (next) = diff;
2598
2599           prev_init = DR_INIT (data_ref);
2600           next = DR_GROUP_NEXT_ELEMENT (next);
2601           /* Count the number of data-refs in the chain.  */
2602           count++;
2603         }
2604
2605       if (groupsize == 0)
2606         groupsize = count + gaps;
2607
2608       /* This could be UINT_MAX but as we are generating code in a very
2609          inefficient way we have to cap earlier.  See PR78699 for example.  */
2610       if (groupsize > 4096)
2611         {
2612           if (dump_enabled_p ())
2613             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2614                              "group is too large\n");
2615           return false;
2616         }
2617
2618       /* Check that the size of the interleaving is equal to count for stores,
2619          i.e., that there are no gaps.  */
2620       if (groupsize != count
2621           && !DR_IS_READ (dr))
2622         {
2623           groupsize = count;
2624           STMT_VINFO_STRIDED_P (stmt_info) = true;
2625         }
2626
2627       /* If there is a gap after the last load in the group it is the
2628          difference between the groupsize and the last accessed
2629          element.
2630          When there is no gap, this difference should be 0.  */
2631       DR_GROUP_GAP (stmt_info) = groupsize - last_accessed_element;
2632
2633       DR_GROUP_SIZE (stmt_info) = groupsize;
2634       if (dump_enabled_p ())
2635         {
2636           dump_printf_loc (MSG_NOTE, vect_location,
2637                            "Detected interleaving ");
2638           if (DR_IS_READ (dr))
2639             dump_printf (MSG_NOTE, "load ");
2640           else if (STMT_VINFO_STRIDED_P (stmt_info))
2641             dump_printf (MSG_NOTE, "strided store ");
2642           else
2643             dump_printf (MSG_NOTE, "store ");
2644           dump_printf (MSG_NOTE, "of size %u\n",
2645                        (unsigned)groupsize);
2646           dump_printf_loc (MSG_NOTE, vect_location, "\t%G", stmt_info->stmt);
2647           next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2648           while (next)
2649             {
2650               if (DR_GROUP_GAP (next) != 1)
2651                 dump_printf_loc (MSG_NOTE, vect_location,
2652                                  "\t<gap of %d elements>\n",
2653                                  DR_GROUP_GAP (next) - 1);
2654               dump_printf_loc (MSG_NOTE, vect_location, "\t%G", next->stmt);
2655               next = DR_GROUP_NEXT_ELEMENT (next);
2656             }
2657           if (DR_GROUP_GAP (stmt_info) != 0)
2658             dump_printf_loc (MSG_NOTE, vect_location,
2659                              "\t<gap of %d elements>\n",
2660                              DR_GROUP_GAP (stmt_info));
2661         }
2662
2663       /* SLP: create an SLP data structure for every interleaving group of
2664          stores for further analysis in vect_analyse_slp.  */
2665       if (DR_IS_WRITE (dr) && !slp_impossible)
2666         {
2667           if (loop_vinfo)
2668             LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt_info);
2669           if (bb_vinfo)
2670             BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt_info);
2671         }
2672     }
2673
2674   return true;
2675 }
2676
2677 /* Analyze groups of accesses: check that DR_INFO belongs to a group of
2678    accesses of legal size, step, etc.  Detect gaps, single element
2679    interleaving, and other special cases. Set grouped access info.
2680    Collect groups of strided stores for further use in SLP analysis.  */
2681
2682 static bool
2683 vect_analyze_group_access (dr_vec_info *dr_info)
2684 {
2685   if (!vect_analyze_group_access_1 (dr_info))
2686     {
2687       /* Dissolve the group if present.  */
2688       stmt_vec_info stmt_info = DR_GROUP_FIRST_ELEMENT (dr_info->stmt);
2689       while (stmt_info)
2690         {
2691           stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (stmt_info);
2692           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2693           DR_GROUP_NEXT_ELEMENT (stmt_info) = NULL;
2694           stmt_info = next;
2695         }
2696       return false;
2697     }
2698   return true;
2699 }
2700
2701 /* Analyze the access pattern of the data-reference DR_INFO.
2702    In case of non-consecutive accesses call vect_analyze_group_access() to
2703    analyze groups of accesses.  */
2704
2705 static bool
2706 vect_analyze_data_ref_access (dr_vec_info *dr_info)
2707 {
2708   data_reference *dr = dr_info->dr;
2709   tree step = DR_STEP (dr);
2710   tree scalar_type = TREE_TYPE (DR_REF (dr));
2711   stmt_vec_info stmt_info = dr_info->stmt;
2712   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2713   struct loop *loop = NULL;
2714
2715   if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2716     return true;
2717
2718   if (loop_vinfo)
2719     loop = LOOP_VINFO_LOOP (loop_vinfo);
2720
2721   if (loop_vinfo && !step)
2722     {
2723       if (dump_enabled_p ())
2724         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2725                          "bad data-ref access in loop\n");
2726       return false;
2727     }
2728
2729   /* Allow loads with zero step in inner-loop vectorization.  */
2730   if (loop_vinfo && integer_zerop (step))
2731     {
2732       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2733       if (!nested_in_vect_loop_p (loop, stmt_info))
2734         return DR_IS_READ (dr);
2735       /* Allow references with zero step for outer loops marked
2736          with pragma omp simd only - it guarantees absence of
2737          loop-carried dependencies between inner loop iterations.  */
2738       if (loop->safelen < 2)
2739         {
2740           if (dump_enabled_p ())
2741             dump_printf_loc (MSG_NOTE, vect_location,
2742                              "zero step in inner loop of nest\n");
2743           return false;
2744         }
2745     }
2746
2747   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2748     {
2749       /* Interleaved accesses are not yet supported within outer-loop
2750         vectorization for references in the inner-loop.  */
2751       DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2752
2753       /* For the rest of the analysis we use the outer-loop step.  */
2754       step = STMT_VINFO_DR_STEP (stmt_info);
2755       if (integer_zerop (step))
2756         {
2757           if (dump_enabled_p ())
2758             dump_printf_loc (MSG_NOTE, vect_location,
2759                              "zero step in outer loop.\n");
2760           return DR_IS_READ (dr);
2761         }
2762     }
2763
2764   /* Consecutive?  */
2765   if (TREE_CODE (step) == INTEGER_CST)
2766     {
2767       HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2768       if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2769           || (dr_step < 0
2770               && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2771         {
2772           /* Mark that it is not interleaving.  */
2773           DR_GROUP_FIRST_ELEMENT (stmt_info) = NULL;
2774           return true;
2775         }
2776     }
2777
2778   if (loop && nested_in_vect_loop_p (loop, stmt_info))
2779     {
2780       if (dump_enabled_p ())
2781         dump_printf_loc (MSG_NOTE, vect_location,
2782                          "grouped access in outer loop.\n");
2783       return false;
2784     }
2785
2786
2787   /* Assume this is a DR handled by non-constant strided load case.  */
2788   if (TREE_CODE (step) != INTEGER_CST)
2789     return (STMT_VINFO_STRIDED_P (stmt_info)
2790             && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2791                 || vect_analyze_group_access (dr_info)));
2792
2793   /* Not consecutive access - check if it's a part of interleaving group.  */
2794   return vect_analyze_group_access (dr_info);
2795 }
2796
2797 /* Compare two data-references DRA and DRB to group them into chunks
2798    suitable for grouping.  */
2799
2800 static int
2801 dr_group_sort_cmp (const void *dra_, const void *drb_)
2802 {
2803   data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2804   data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2805   int cmp;
2806
2807   /* Stabilize sort.  */
2808   if (dra == drb)
2809     return 0;
2810
2811   /* DRs in different loops never belong to the same group.  */
2812   loop_p loopa = gimple_bb (DR_STMT (dra))->loop_father;
2813   loop_p loopb = gimple_bb (DR_STMT (drb))->loop_father;
2814   if (loopa != loopb)
2815     return loopa->num < loopb->num ? -1 : 1;
2816
2817   /* Ordering of DRs according to base.  */
2818   cmp = data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2819                                DR_BASE_ADDRESS (drb));
2820   if (cmp != 0)
2821     return cmp;
2822
2823   /* And according to DR_OFFSET.  */
2824   cmp = data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2825   if (cmp != 0)
2826     return cmp;
2827
2828   /* Put reads before writes.  */
2829   if (DR_IS_READ (dra) != DR_IS_READ (drb))
2830     return DR_IS_READ (dra) ? -1 : 1;
2831
2832   /* Then sort after access size.  */
2833   cmp = data_ref_compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2834                                TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2835   if (cmp != 0)
2836     return cmp;
2837
2838   /* And after step.  */
2839   cmp = data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb));
2840   if (cmp != 0)
2841     return cmp;
2842
2843   /* Then sort after DR_INIT.  In case of identical DRs sort after stmt UID.  */
2844   cmp = data_ref_compare_tree (DR_INIT (dra), DR_INIT (drb));
2845   if (cmp == 0)
2846     return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2847   return cmp;
2848 }
2849
2850 /* If OP is the result of a conversion, return the unconverted value,
2851    otherwise return null.  */
2852
2853 static tree
2854 strip_conversion (tree op)
2855 {
2856   if (TREE_CODE (op) != SSA_NAME)
2857     return NULL_TREE;
2858   gimple *stmt = SSA_NAME_DEF_STMT (op);
2859   if (!is_gimple_assign (stmt)
2860       || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (stmt)))
2861     return NULL_TREE;
2862   return gimple_assign_rhs1 (stmt);
2863 }
2864
2865 /* Return true if vectorizable_* routines can handle statements STMT1_INFO
2866    and STMT2_INFO being in a single group.  When ALLOW_SLP_P, masked loads can
2867    be grouped in SLP mode.  */
2868
2869 static bool
2870 can_group_stmts_p (stmt_vec_info stmt1_info, stmt_vec_info stmt2_info,
2871                    bool allow_slp_p)
2872 {
2873   if (gimple_assign_single_p (stmt1_info->stmt))
2874     return gimple_assign_single_p (stmt2_info->stmt);
2875
2876   gcall *call1 = dyn_cast <gcall *> (stmt1_info->stmt);
2877   if (call1 && gimple_call_internal_p (call1))
2878     {
2879       /* Check for two masked loads or two masked stores.  */
2880       gcall *call2 = dyn_cast <gcall *> (stmt2_info->stmt);
2881       if (!call2 || !gimple_call_internal_p (call2))
2882         return false;
2883       internal_fn ifn = gimple_call_internal_fn (call1);
2884       if (ifn != IFN_MASK_LOAD && ifn != IFN_MASK_STORE)
2885         return false;
2886       if (ifn != gimple_call_internal_fn (call2))
2887         return false;
2888
2889       /* Check that the masks are the same.  Cope with casts of masks,
2890          like those created by build_mask_conversion.  */
2891       tree mask1 = gimple_call_arg (call1, 2);
2892       tree mask2 = gimple_call_arg (call2, 2);
2893       if (!operand_equal_p (mask1, mask2, 0)
2894           && (ifn == IFN_MASK_STORE || !allow_slp_p))
2895         {
2896           mask1 = strip_conversion (mask1);
2897           if (!mask1)
2898             return false;
2899           mask2 = strip_conversion (mask2);
2900           if (!mask2)
2901             return false;
2902           if (!operand_equal_p (mask1, mask2, 0))
2903             return false;
2904         }
2905       return true;
2906     }
2907
2908   return false;
2909 }
2910
2911 /* Function vect_analyze_data_ref_accesses.
2912
2913    Analyze the access pattern of all the data references in the loop.
2914
2915    FORNOW: the only access pattern that is considered vectorizable is a
2916            simple step 1 (consecutive) access.
2917
2918    FORNOW: handle only arrays and pointer accesses.  */
2919
2920 opt_result
2921 vect_analyze_data_ref_accesses (vec_info *vinfo)
2922 {
2923   unsigned int i;
2924   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
2925   struct data_reference *dr;
2926
2927   DUMP_VECT_SCOPE ("vect_analyze_data_ref_accesses");
2928
2929   if (datarefs.is_empty ())
2930     return opt_result::success ();
2931
2932   /* Sort the array of datarefs to make building the interleaving chains
2933      linear.  Don't modify the original vector's order, it is needed for
2934      determining what dependencies are reversed.  */
2935   vec<data_reference_p> datarefs_copy = datarefs.copy ();
2936   datarefs_copy.qsort (dr_group_sort_cmp);
2937   hash_set<stmt_vec_info> to_fixup;
2938
2939   /* Build the interleaving chains.  */
2940   for (i = 0; i < datarefs_copy.length () - 1;)
2941     {
2942       data_reference_p dra = datarefs_copy[i];
2943       dr_vec_info *dr_info_a = vinfo->lookup_dr (dra);
2944       stmt_vec_info stmtinfo_a = dr_info_a->stmt;
2945       stmt_vec_info lastinfo = NULL;
2946       if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
2947           || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a))
2948         {
2949           ++i;
2950           continue;
2951         }
2952       for (i = i + 1; i < datarefs_copy.length (); ++i)
2953         {
2954           data_reference_p drb = datarefs_copy[i];
2955           dr_vec_info *dr_info_b = vinfo->lookup_dr (drb);
2956           stmt_vec_info stmtinfo_b = dr_info_b->stmt;
2957           if (!STMT_VINFO_VECTORIZABLE (stmtinfo_b)
2958               || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
2959             break;
2960
2961           /* ???  Imperfect sorting (non-compatible types, non-modulo
2962              accesses, same accesses) can lead to a group to be artificially
2963              split here as we don't just skip over those.  If it really
2964              matters we can push those to a worklist and re-iterate
2965              over them.  The we can just skip ahead to the next DR here.  */
2966
2967           /* DRs in a different loop should not be put into the same
2968              interleaving group.  */
2969           if (gimple_bb (DR_STMT (dra))->loop_father
2970               != gimple_bb (DR_STMT (drb))->loop_father)
2971             break;
2972
2973           /* Check that the data-refs have same first location (except init)
2974              and they are both either store or load (not load and store,
2975              not masked loads or stores).  */
2976           if (DR_IS_READ (dra) != DR_IS_READ (drb)
2977               || data_ref_compare_tree (DR_BASE_ADDRESS (dra),
2978                                         DR_BASE_ADDRESS (drb)) != 0
2979               || data_ref_compare_tree (DR_OFFSET (dra), DR_OFFSET (drb)) != 0
2980               || !can_group_stmts_p (stmtinfo_a, stmtinfo_b, true))
2981             break;
2982
2983           /* Check that the data-refs have the same constant size.  */
2984           tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2985           tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2986           if (!tree_fits_uhwi_p (sza)
2987               || !tree_fits_uhwi_p (szb)
2988               || !tree_int_cst_equal (sza, szb))
2989             break;
2990
2991           /* Check that the data-refs have the same step.  */
2992           if (data_ref_compare_tree (DR_STEP (dra), DR_STEP (drb)) != 0)
2993             break;
2994
2995           /* Check the types are compatible.
2996              ???  We don't distinguish this during sorting.  */
2997           if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2998                                    TREE_TYPE (DR_REF (drb))))
2999             break;
3000
3001           /* Check that the DR_INITs are compile-time constants.  */
3002           if (TREE_CODE (DR_INIT (dra)) != INTEGER_CST
3003               || TREE_CODE (DR_INIT (drb)) != INTEGER_CST)
3004             break;
3005
3006           /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb).  */
3007           HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
3008           HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
3009           HOST_WIDE_INT init_prev
3010             = TREE_INT_CST_LOW (DR_INIT (datarefs_copy[i-1]));
3011           gcc_assert (init_a <= init_b
3012                       && init_a <= init_prev
3013                       && init_prev <= init_b);
3014
3015           /* Do not place the same access in the interleaving chain twice.  */
3016           if (init_b == init_prev)
3017             {
3018               gcc_assert (gimple_uid (DR_STMT (datarefs_copy[i-1]))
3019                           < gimple_uid (DR_STMT (drb)));
3020               /* Simply link in duplicates and fix up the chain below.  */
3021             }
3022           else
3023             {
3024               /* If init_b == init_a + the size of the type * k, we have an
3025                  interleaving, and DRA is accessed before DRB.  */
3026               HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
3027               if (type_size_a == 0
3028                   || (init_b - init_a) % type_size_a != 0)
3029                 break;
3030
3031               /* If we have a store, the accesses are adjacent.  This splits
3032                  groups into chunks we support (we don't support vectorization
3033                  of stores with gaps).  */
3034               if (!DR_IS_READ (dra) && init_b - init_prev != type_size_a)
3035                 break;
3036
3037               /* If the step (if not zero or non-constant) is greater than the
3038                  difference between data-refs' inits this splits groups into
3039                  suitable sizes.  */
3040               if (tree_fits_shwi_p (DR_STEP (dra)))
3041                 {
3042                   HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
3043                   if (step != 0 && step <= (init_b - init_a))
3044                     break;
3045                 }
3046             }
3047
3048           if (dump_enabled_p ())
3049             dump_printf_loc (MSG_NOTE, vect_location,
3050                              DR_IS_READ (dra)
3051                              ? "Detected interleaving load %T and %T\n"
3052                              : "Detected interleaving store %T and %T\n",
3053                              DR_REF (dra), DR_REF (drb));
3054
3055           /* Link the found element into the group list.  */
3056           if (!DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3057             {
3058               DR_GROUP_FIRST_ELEMENT (stmtinfo_a) = stmtinfo_a;
3059               lastinfo = stmtinfo_a;
3060             }
3061           DR_GROUP_FIRST_ELEMENT (stmtinfo_b) = stmtinfo_a;
3062           DR_GROUP_NEXT_ELEMENT (lastinfo) = stmtinfo_b;
3063           lastinfo = stmtinfo_b;
3064
3065           STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a)
3066             = !can_group_stmts_p (stmtinfo_a, stmtinfo_b, false);
3067
3068           if (dump_enabled_p () && STMT_VINFO_SLP_VECT_ONLY (stmtinfo_a))
3069             dump_printf_loc (MSG_NOTE, vect_location,
3070                              "Load suitable for SLP vectorization only.\n");
3071
3072           if (init_b == init_prev
3073               && !to_fixup.add (DR_GROUP_FIRST_ELEMENT (stmtinfo_a))
3074               && dump_enabled_p ())
3075             dump_printf_loc (MSG_NOTE, vect_location,
3076                              "Queuing group with duplicate access for fixup\n");
3077         }
3078     }
3079
3080   /* Fixup groups with duplicate entries by splitting it.  */
3081   while (1)
3082     {
3083       hash_set<stmt_vec_info>::iterator it = to_fixup.begin ();
3084       if (!(it != to_fixup.end ()))
3085         break;
3086       stmt_vec_info grp = *it;
3087       to_fixup.remove (grp);
3088
3089       /* Find the earliest duplicate group member.  */
3090       unsigned first_duplicate = -1u;
3091       stmt_vec_info next, g = grp;
3092       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3093         {
3094           if (tree_int_cst_equal (DR_INIT (STMT_VINFO_DR_INFO (next)->dr),
3095                                   DR_INIT (STMT_VINFO_DR_INFO (g)->dr))
3096               && gimple_uid (STMT_VINFO_STMT (next)) < first_duplicate)
3097             first_duplicate = gimple_uid (STMT_VINFO_STMT (next));
3098           g = next;
3099         }
3100       if (first_duplicate == -1U)
3101         continue;
3102
3103       /* Then move all stmts after the first duplicate to a new group.
3104          Note this is a heuristic but one with the property that *it
3105          is fixed up completely.  */
3106       g = grp;
3107       stmt_vec_info newgroup = NULL, ng = grp;
3108       while ((next = DR_GROUP_NEXT_ELEMENT (g)))
3109         {
3110           if (gimple_uid (STMT_VINFO_STMT (next)) >= first_duplicate)
3111             {
3112               DR_GROUP_NEXT_ELEMENT (g) = DR_GROUP_NEXT_ELEMENT (next);
3113               if (!newgroup)
3114                 newgroup = next;
3115               else
3116                 DR_GROUP_NEXT_ELEMENT (ng) = next;
3117               ng = next;
3118               DR_GROUP_FIRST_ELEMENT (ng) = newgroup;
3119             }
3120           else
3121             g = DR_GROUP_NEXT_ELEMENT (g);
3122         }
3123       DR_GROUP_NEXT_ELEMENT (ng) = NULL;
3124
3125       /* Fixup the new group which still may contain duplicates.  */
3126       to_fixup.add (newgroup);
3127     }
3128
3129   FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
3130     {
3131       dr_vec_info *dr_info = vinfo->lookup_dr (dr);
3132       if (STMT_VINFO_VECTORIZABLE (dr_info->stmt)
3133           && !vect_analyze_data_ref_access (dr_info))
3134         {
3135           if (dump_enabled_p ())
3136             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3137                              "not vectorized: complicated access pattern.\n");
3138
3139           if (is_a <bb_vec_info> (vinfo))
3140             {
3141               /* Mark the statement as not vectorizable.  */
3142               STMT_VINFO_VECTORIZABLE (dr_info->stmt) = false;
3143               continue;
3144             }
3145           else
3146             {
3147               datarefs_copy.release ();
3148               return opt_result::failure_at (dr_info->stmt->stmt,
3149                                              "not vectorized:"
3150                                              " complicated access pattern.\n");
3151             }
3152         }
3153     }
3154
3155   datarefs_copy.release ();
3156   return opt_result::success ();
3157 }
3158
3159 /* Function vect_vfa_segment_size.
3160
3161    Input:
3162      DR_INFO: The data reference.
3163      LENGTH_FACTOR: segment length to consider.
3164
3165    Return a value suitable for the dr_with_seg_len::seg_len field.
3166    This is the "distance travelled" by the pointer from the first
3167    iteration in the segment to the last.  Note that it does not include
3168    the size of the access; in effect it only describes the first byte.  */
3169
3170 static tree
3171 vect_vfa_segment_size (dr_vec_info *dr_info, tree length_factor)
3172 {
3173   length_factor = size_binop (MINUS_EXPR,
3174                               fold_convert (sizetype, length_factor),
3175                               size_one_node);
3176   return size_binop (MULT_EXPR, fold_convert (sizetype, DR_STEP (dr_info->dr)),
3177                      length_factor);
3178 }
3179
3180 /* Return a value that, when added to abs (vect_vfa_segment_size (DR_INFO)),
3181    gives the worst-case number of bytes covered by the segment.  */
3182
3183 static unsigned HOST_WIDE_INT
3184 vect_vfa_access_size (dr_vec_info *dr_info)
3185 {
3186   stmt_vec_info stmt_vinfo = dr_info->stmt;
3187   tree ref_type = TREE_TYPE (DR_REF (dr_info->dr));
3188   unsigned HOST_WIDE_INT ref_size = tree_to_uhwi (TYPE_SIZE_UNIT (ref_type));
3189   unsigned HOST_WIDE_INT access_size = ref_size;
3190   if (DR_GROUP_FIRST_ELEMENT (stmt_vinfo))
3191     {
3192       gcc_assert (DR_GROUP_FIRST_ELEMENT (stmt_vinfo) == stmt_vinfo);
3193       access_size *= DR_GROUP_SIZE (stmt_vinfo) - DR_GROUP_GAP (stmt_vinfo);
3194     }
3195   if (STMT_VINFO_VEC_STMT (stmt_vinfo)
3196       && (vect_supportable_dr_alignment (dr_info, false)
3197           == dr_explicit_realign_optimized))
3198     {
3199       /* We might access a full vector's worth.  */
3200       tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
3201       access_size += tree_to_uhwi (TYPE_SIZE_UNIT (vectype)) - ref_size;
3202     }
3203   return access_size;
3204 }
3205
3206 /* Get the minimum alignment for all the scalar accesses that DR_INFO
3207    describes.  */
3208
3209 static unsigned int
3210 vect_vfa_align (dr_vec_info *dr_info)
3211 {
3212   return TYPE_ALIGN_UNIT (TREE_TYPE (DR_REF (dr_info->dr)));
3213 }
3214
3215 /* Function vect_no_alias_p.
3216
3217    Given data references A and B with equal base and offset, see whether
3218    the alias relation can be decided at compilation time.  Return 1 if
3219    it can and the references alias, 0 if it can and the references do
3220    not alias, and -1 if we cannot decide at compile time.  SEGMENT_LENGTH_A,
3221    SEGMENT_LENGTH_B, ACCESS_SIZE_A and ACCESS_SIZE_B are the equivalent
3222    of dr_with_seg_len::{seg_len,access_size} for A and B.  */
3223
3224 static int
3225 vect_compile_time_alias (dr_vec_info *a, dr_vec_info *b,
3226                          tree segment_length_a, tree segment_length_b,
3227                          unsigned HOST_WIDE_INT access_size_a,
3228                          unsigned HOST_WIDE_INT access_size_b)
3229 {
3230   poly_offset_int offset_a = wi::to_poly_offset (DR_INIT (a->dr));
3231   poly_offset_int offset_b = wi::to_poly_offset (DR_INIT (b->dr));
3232   poly_uint64 const_length_a;
3233   poly_uint64 const_length_b;
3234
3235   /* For negative step, we need to adjust address range by TYPE_SIZE_UNIT
3236      bytes, e.g., int a[3] -> a[1] range is [a+4, a+16) instead of
3237      [a, a+12) */
3238   if (tree_int_cst_compare (DR_STEP (a->dr), size_zero_node) < 0)
3239     {
3240       const_length_a = (-wi::to_poly_wide (segment_length_a)).force_uhwi ();
3241       offset_a = (offset_a + access_size_a) - const_length_a;
3242     }
3243   else
3244     const_length_a = tree_to_poly_uint64 (segment_length_a);
3245   if (tree_int_cst_compare (DR_STEP (b->dr), size_zero_node) < 0)
3246     {
3247       const_length_b = (-wi::to_poly_wide (segment_length_b)).force_uhwi ();
3248       offset_b = (offset_b + access_size_b) - const_length_b;
3249     }
3250   else
3251     const_length_b = tree_to_poly_uint64 (segment_length_b);
3252
3253   const_length_a += access_size_a;
3254   const_length_b += access_size_b;
3255
3256   if (ranges_known_overlap_p (offset_a, const_length_a,
3257                               offset_b, const_length_b))
3258     return 1;
3259
3260   if (!ranges_maybe_overlap_p (offset_a, const_length_a,
3261                                offset_b, const_length_b))
3262     return 0;
3263
3264   return -1;
3265 }
3266
3267 /* Return true if the minimum nonzero dependence distance for loop LOOP_DEPTH
3268    in DDR is >= VF.  */
3269
3270 static bool
3271 dependence_distance_ge_vf (data_dependence_relation *ddr,
3272                            unsigned int loop_depth, poly_uint64 vf)
3273 {
3274   if (DDR_ARE_DEPENDENT (ddr) != NULL_TREE
3275       || DDR_NUM_DIST_VECTS (ddr) == 0)
3276     return false;
3277
3278   /* If the dependence is exact, we should have limited the VF instead.  */
3279   gcc_checking_assert (DDR_COULD_BE_INDEPENDENT_P (ddr));
3280
3281   unsigned int i;
3282   lambda_vector dist_v;
3283   FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
3284     {
3285       HOST_WIDE_INT dist = dist_v[loop_depth];
3286       if (dist != 0
3287           && !(dist > 0 && DDR_REVERSED_P (ddr))
3288           && maybe_lt ((unsigned HOST_WIDE_INT) abs_hwi (dist), vf))
3289         return false;
3290     }
3291
3292   if (dump_enabled_p ())
3293     dump_printf_loc (MSG_NOTE, vect_location,
3294                      "dependence distance between %T and %T is >= VF\n",
3295                      DR_REF (DDR_A (ddr)), DR_REF (DDR_B (ddr)));
3296
3297   return true;
3298 }
3299
3300 /* Dump LOWER_BOUND using flags DUMP_KIND.  Dumps are known to be enabled.  */
3301
3302 static void
3303 dump_lower_bound (dump_flags_t dump_kind, const vec_lower_bound &lower_bound)
3304 {
3305   dump_printf (dump_kind, "%s (%T) >= ",
3306                lower_bound.unsigned_p ? "unsigned" : "abs",
3307                lower_bound.expr);
3308   dump_dec (dump_kind, lower_bound.min_value);
3309 }
3310
3311 /* Record that the vectorized loop requires the vec_lower_bound described
3312    by EXPR, UNSIGNED_P and MIN_VALUE.  */
3313
3314 static void
3315 vect_check_lower_bound (loop_vec_info loop_vinfo, tree expr, bool unsigned_p,
3316                         poly_uint64 min_value)
3317 {
3318   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
3319   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
3320     if (operand_equal_p (lower_bounds[i].expr, expr, 0))
3321       {
3322         unsigned_p &= lower_bounds[i].unsigned_p;
3323         min_value = upper_bound (lower_bounds[i].min_value, min_value);
3324         if (lower_bounds[i].unsigned_p != unsigned_p
3325             || maybe_lt (lower_bounds[i].min_value, min_value))
3326           {
3327             lower_bounds[i].unsigned_p = unsigned_p;
3328             lower_bounds[i].min_value = min_value;
3329             if (dump_enabled_p ())
3330               {
3331                 dump_printf_loc (MSG_NOTE, vect_location,
3332                                  "updating run-time check to ");
3333                 dump_lower_bound (MSG_NOTE, lower_bounds[i]);
3334                 dump_printf (MSG_NOTE, "\n");
3335               }
3336           }
3337         return;
3338       }
3339
3340   vec_lower_bound lower_bound (expr, unsigned_p, min_value);
3341   if (dump_enabled_p ())
3342     {
3343       dump_printf_loc (MSG_NOTE, vect_location, "need a run-time check that ");
3344       dump_lower_bound (MSG_NOTE, lower_bound);
3345       dump_printf (MSG_NOTE, "\n");
3346     }
3347   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).safe_push (lower_bound);
3348 }
3349
3350 /* Return true if it's unlikely that the step of the vectorized form of DR_INFO
3351    will span fewer than GAP bytes.  */
3352
3353 static bool
3354 vect_small_gap_p (loop_vec_info loop_vinfo, dr_vec_info *dr_info,
3355                   poly_int64 gap)
3356 {
3357   stmt_vec_info stmt_info = dr_info->stmt;
3358   HOST_WIDE_INT count
3359     = estimated_poly_value (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
3360   if (DR_GROUP_FIRST_ELEMENT (stmt_info))
3361     count *= DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (stmt_info));
3362   return (estimated_poly_value (gap)
3363           <= count * vect_get_scalar_dr_size (dr_info));
3364 }
3365
3366 /* Return true if we know that there is no alias between DR_INFO_A and
3367    DR_INFO_B when abs (DR_STEP (DR_INFO_A->dr)) >= N for some N.
3368    When returning true, set *LOWER_BOUND_OUT to this N.  */
3369
3370 static bool
3371 vectorizable_with_step_bound_p (dr_vec_info *dr_info_a, dr_vec_info *dr_info_b,
3372                                 poly_uint64 *lower_bound_out)
3373 {
3374   /* Check that there is a constant gap of known sign between DR_A
3375      and DR_B.  */
3376   data_reference *dr_a = dr_info_a->dr;
3377   data_reference *dr_b = dr_info_b->dr;
3378   poly_int64 init_a, init_b;
3379   if (!operand_equal_p (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b), 0)
3380       || !operand_equal_p (DR_OFFSET (dr_a), DR_OFFSET (dr_b), 0)
3381       || !operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0)
3382       || !poly_int_tree_p (DR_INIT (dr_a), &init_a)
3383       || !poly_int_tree_p (DR_INIT (dr_b), &init_b)
3384       || !ordered_p (init_a, init_b))
3385     return false;
3386
3387   /* Sort DR_A and DR_B by the address they access.  */
3388   if (maybe_lt (init_b, init_a))
3389     {
3390       std::swap (init_a, init_b);
3391       std::swap (dr_info_a, dr_info_b);
3392       std::swap (dr_a, dr_b);
3393     }
3394
3395   /* If the two accesses could be dependent within a scalar iteration,
3396      make sure that we'd retain their order.  */
3397   if (maybe_gt (init_a + vect_get_scalar_dr_size (dr_info_a), init_b)
3398       && !vect_preserves_scalar_order_p (dr_info_a, dr_info_b))
3399     return false;
3400
3401   /* There is no alias if abs (DR_STEP) is greater than or equal to
3402      the bytes spanned by the combination of the two accesses.  */
3403   *lower_bound_out = init_b + vect_get_scalar_dr_size (dr_info_b) - init_a;
3404   return true;
3405 }
3406
3407 /* Function vect_prune_runtime_alias_test_list.
3408
3409    Prune a list of ddrs to be tested at run-time by versioning for alias.
3410    Merge several alias checks into one if possible.
3411    Return FALSE if resulting list of ddrs is longer then allowed by
3412    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
3413
3414 opt_result
3415 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
3416 {
3417   typedef pair_hash <tree_operand_hash, tree_operand_hash> tree_pair_hash;
3418   hash_set <tree_pair_hash> compared_objects;
3419
3420   vec<ddr_p> may_alias_ddrs = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
3421   vec<dr_with_seg_len_pair_t> &comp_alias_ddrs
3422     = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
3423   vec<vec_object_pair> &check_unequal_addrs
3424     = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
3425   poly_uint64 vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3426   tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
3427
3428   ddr_p ddr;
3429   unsigned int i;
3430   tree length_factor;
3431
3432   DUMP_VECT_SCOPE ("vect_prune_runtime_alias_test_list");
3433
3434   /* Step values are irrelevant for aliasing if the number of vector
3435      iterations is equal to the number of scalar iterations (which can
3436      happen for fully-SLP loops).  */
3437   bool ignore_step_p = known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U);
3438
3439   if (!ignore_step_p)
3440     {
3441       /* Convert the checks for nonzero steps into bound tests.  */
3442       tree value;
3443       FOR_EACH_VEC_ELT (LOOP_VINFO_CHECK_NONZERO (loop_vinfo), i, value)
3444         vect_check_lower_bound (loop_vinfo, value, true, 1);
3445     }
3446
3447   if (may_alias_ddrs.is_empty ())
3448     return opt_result::success ();
3449
3450   comp_alias_ddrs.create (may_alias_ddrs.length ());
3451
3452   unsigned int loop_depth
3453     = index_in_loop_nest (LOOP_VINFO_LOOP (loop_vinfo)->num,
3454                           LOOP_VINFO_LOOP_NEST (loop_vinfo));
3455
3456   /* First, we collect all data ref pairs for aliasing checks.  */
3457   FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
3458     {
3459       int comp_res;
3460       poly_uint64 lower_bound;
3461       tree segment_length_a, segment_length_b;
3462       unsigned HOST_WIDE_INT access_size_a, access_size_b;
3463       unsigned int align_a, align_b;
3464
3465       /* Ignore the alias if the VF we chose ended up being no greater
3466          than the dependence distance.  */
3467       if (dependence_distance_ge_vf (ddr, loop_depth, vect_factor))
3468         continue;
3469
3470       if (DDR_OBJECT_A (ddr))
3471         {
3472           vec_object_pair new_pair (DDR_OBJECT_A (ddr), DDR_OBJECT_B (ddr));
3473           if (!compared_objects.add (new_pair))
3474             {
3475               if (dump_enabled_p ())
3476                 dump_printf_loc (MSG_NOTE, vect_location,
3477                                  "checking that %T and %T"
3478                                  " have different addresses\n",
3479                                  new_pair.first, new_pair.second);
3480               LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).safe_push (new_pair);
3481             }
3482           continue;
3483         }
3484
3485       dr_vec_info *dr_info_a = loop_vinfo->lookup_dr (DDR_A (ddr));
3486       stmt_vec_info stmt_info_a = dr_info_a->stmt;
3487
3488       dr_vec_info *dr_info_b = loop_vinfo->lookup_dr (DDR_B (ddr));
3489       stmt_vec_info stmt_info_b = dr_info_b->stmt;
3490
3491       /* Skip the pair if inter-iteration dependencies are irrelevant
3492          and intra-iteration dependencies are guaranteed to be honored.  */
3493       if (ignore_step_p
3494           && (vect_preserves_scalar_order_p (dr_info_a, dr_info_b)
3495               || vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3496                                                  &lower_bound)))
3497         {
3498           if (dump_enabled_p ())
3499             dump_printf_loc (MSG_NOTE, vect_location,
3500                              "no need for alias check between "
3501                              "%T and %T when VF is 1\n",
3502                              DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3503           continue;
3504         }
3505
3506       /* See whether we can handle the alias using a bounds check on
3507          the step, and whether that's likely to be the best approach.
3508          (It might not be, for example, if the minimum step is much larger
3509          than the number of bytes handled by one vector iteration.)  */
3510       if (!ignore_step_p
3511           && TREE_CODE (DR_STEP (dr_info_a->dr)) != INTEGER_CST
3512           && vectorizable_with_step_bound_p (dr_info_a, dr_info_b,
3513                                              &lower_bound)
3514           && (vect_small_gap_p (loop_vinfo, dr_info_a, lower_bound)
3515               || vect_small_gap_p (loop_vinfo, dr_info_b, lower_bound)))
3516         {
3517           bool unsigned_p = dr_known_forward_stride_p (dr_info_a->dr);
3518           if (dump_enabled_p ())
3519             {
3520               dump_printf_loc (MSG_NOTE, vect_location, "no alias between "
3521                                "%T and %T when the step %T is outside ",
3522                                DR_REF (dr_info_a->dr),
3523                                DR_REF (dr_info_b->dr),
3524                                DR_STEP (dr_info_a->dr));
3525               if (unsigned_p)
3526                 dump_printf (MSG_NOTE, "[0");
3527               else
3528                 {
3529                   dump_printf (MSG_NOTE, "(");
3530                   dump_dec (MSG_NOTE, poly_int64 (-lower_bound));
3531                 }
3532               dump_printf (MSG_NOTE, ", ");
3533               dump_dec (MSG_NOTE, lower_bound);
3534               dump_printf (MSG_NOTE, ")\n");
3535             }
3536           vect_check_lower_bound (loop_vinfo, DR_STEP (dr_info_a->dr),
3537                                   unsigned_p, lower_bound);
3538           continue;
3539         }
3540
3541       stmt_vec_info dr_group_first_a = DR_GROUP_FIRST_ELEMENT (stmt_info_a);
3542       if (dr_group_first_a)
3543         {
3544           stmt_info_a = dr_group_first_a;
3545           dr_info_a = STMT_VINFO_DR_INFO (stmt_info_a);
3546         }
3547
3548       stmt_vec_info dr_group_first_b = DR_GROUP_FIRST_ELEMENT (stmt_info_b);
3549       if (dr_group_first_b)
3550         {
3551           stmt_info_b = dr_group_first_b;
3552           dr_info_b = STMT_VINFO_DR_INFO (stmt_info_b);
3553         }
3554
3555       if (ignore_step_p)
3556         {
3557           segment_length_a = size_zero_node;
3558           segment_length_b = size_zero_node;
3559         }
3560       else
3561         {
3562           if (!operand_equal_p (DR_STEP (dr_info_a->dr),
3563                                 DR_STEP (dr_info_b->dr), 0))
3564             length_factor = scalar_loop_iters;
3565           else
3566             length_factor = size_int (vect_factor);
3567           segment_length_a = vect_vfa_segment_size (dr_info_a, length_factor);
3568           segment_length_b = vect_vfa_segment_size (dr_info_b, length_factor);
3569         }
3570       access_size_a = vect_vfa_access_size (dr_info_a);
3571       access_size_b = vect_vfa_access_size (dr_info_b);
3572       align_a = vect_vfa_align (dr_info_a);
3573       align_b = vect_vfa_align (dr_info_b);
3574
3575       comp_res = data_ref_compare_tree (DR_BASE_ADDRESS (dr_info_a->dr),
3576                                         DR_BASE_ADDRESS (dr_info_b->dr));
3577       if (comp_res == 0)
3578         comp_res = data_ref_compare_tree (DR_OFFSET (dr_info_a->dr),
3579                                           DR_OFFSET (dr_info_b->dr));
3580
3581       /* See whether the alias is known at compilation time.  */
3582       if (comp_res == 0
3583           && TREE_CODE (DR_STEP (dr_info_a->dr)) == INTEGER_CST
3584           && TREE_CODE (DR_STEP (dr_info_b->dr)) == INTEGER_CST
3585           && poly_int_tree_p (segment_length_a)
3586           && poly_int_tree_p (segment_length_b))
3587         {
3588           int res = vect_compile_time_alias (dr_info_a, dr_info_b,
3589                                              segment_length_a,
3590                                              segment_length_b,
3591                                              access_size_a,
3592                                              access_size_b);
3593           if (res >= 0 && dump_enabled_p ())
3594             {
3595               dump_printf_loc (MSG_NOTE, vect_location,
3596                                "can tell at compile time that %T and %T",
3597                                DR_REF (dr_info_a->dr), DR_REF (dr_info_b->dr));
3598               if (res == 0)
3599                 dump_printf (MSG_NOTE, " do not alias\n");
3600               else
3601                 dump_printf (MSG_NOTE, " alias\n");
3602             }
3603
3604           if (res == 0)
3605             continue;
3606
3607           if (res == 1)
3608             return opt_result::failure_at (stmt_info_b->stmt,
3609                                            "not vectorized:"
3610                                            " compilation time alias: %G%G",
3611                                            stmt_info_a->stmt,
3612                                            stmt_info_b->stmt);
3613         }
3614
3615       dr_with_seg_len_pair_t dr_with_seg_len_pair
3616         (dr_with_seg_len (dr_info_a->dr, segment_length_a,
3617                           access_size_a, align_a),
3618          dr_with_seg_len (dr_info_b->dr, segment_length_b,
3619                           access_size_b, align_b));
3620
3621       /* Canonicalize pairs by sorting the two DR members.  */
3622       if (comp_res > 0)
3623         std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
3624
3625       comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
3626     }
3627
3628   prune_runtime_alias_test_list (&comp_alias_ddrs, vect_factor);
3629
3630   unsigned int count = (comp_alias_ddrs.length ()
3631                         + check_unequal_addrs.length ());
3632
3633   if (dump_enabled_p ())
3634     dump_printf_loc (MSG_NOTE, vect_location,
3635                      "improved number of alias checks from %d to %d\n",
3636                      may_alias_ddrs.length (), count);
3637   if ((int) count > PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3638     return opt_result::failure_at
3639       (vect_location,
3640        "number of versioning for alias "
3641        "run-time tests exceeds %d "
3642        "(--param vect-max-version-for-alias-checks)\n",
3643        PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
3644
3645   return opt_result::success ();
3646 }
3647
3648 /* Check whether we can use an internal function for a gather load
3649    or scatter store.  READ_P is true for loads and false for stores.
3650    MASKED_P is true if the load or store is conditional.  MEMORY_TYPE is
3651    the type of the memory elements being loaded or stored.  OFFSET_BITS
3652    is the number of bits in each scalar offset and OFFSET_SIGN is the
3653    sign of the offset.  SCALE is the amount by which the offset should
3654    be multiplied *after* it has been converted to address width.
3655
3656    Return true if the function is supported, storing the function
3657    id in *IFN_OUT and the type of a vector element in *ELEMENT_TYPE_OUT.  */
3658
3659 bool
3660 vect_gather_scatter_fn_p (bool read_p, bool masked_p, tree vectype,
3661                           tree memory_type, unsigned int offset_bits,
3662                           signop offset_sign, int scale,
3663                           internal_fn *ifn_out, tree *element_type_out)
3664 {
3665   unsigned int memory_bits = tree_to_uhwi (TYPE_SIZE (memory_type));
3666   unsigned int element_bits = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
3667   if (offset_bits > element_bits)
3668     /* Internal functions require the offset to be the same width as
3669        the vector elements.  We can extend narrower offsets, but it isn't
3670        safe to truncate wider offsets.  */
3671     return false;
3672
3673   if (element_bits != memory_bits)
3674     /* For now the vector elements must be the same width as the
3675        memory elements.  */
3676     return false;
3677
3678   /* Work out which function we need.  */
3679   internal_fn ifn;
3680   if (read_p)
3681     ifn = masked_p ? IFN_MASK_GATHER_LOAD : IFN_GATHER_LOAD;
3682   else
3683     ifn = masked_p ? IFN_MASK_SCATTER_STORE : IFN_SCATTER_STORE;
3684
3685   /* Test whether the target supports this combination.  */
3686   if (!internal_gather_scatter_fn_supported_p (ifn, vectype, memory_type,
3687                                                offset_sign, scale))
3688     return false;
3689
3690   *ifn_out = ifn;
3691   *element_type_out = TREE_TYPE (vectype);
3692   return true;
3693 }
3694
3695 /* STMT_INFO is a call to an internal gather load or scatter store function.
3696    Describe the operation in INFO.  */
3697
3698 static void
3699 vect_describe_gather_scatter_call (stmt_vec_info stmt_info,
3700                                    gather_scatter_info *info)
3701 {
3702   gcall *call = as_a <gcall *> (stmt_info->stmt);
3703   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3704   data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3705
3706   info->ifn = gimple_call_internal_fn (call);
3707   info->decl = NULL_TREE;
3708   info->base = gimple_call_arg (call, 0);
3709   info->offset = gimple_call_arg (call, 1);
3710   info->offset_dt = vect_unknown_def_type;
3711   info->offset_vectype = NULL_TREE;
3712   info->scale = TREE_INT_CST_LOW (gimple_call_arg (call, 2));
3713   info->element_type = TREE_TYPE (vectype);
3714   info->memory_type = TREE_TYPE (DR_REF (dr));
3715 }
3716
3717 /* Return true if a non-affine read or write in STMT_INFO is suitable for a
3718    gather load or scatter store.  Describe the operation in *INFO if so.  */
3719
3720 bool
3721 vect_check_gather_scatter (stmt_vec_info stmt_info, loop_vec_info loop_vinfo,
3722                            gather_scatter_info *info)
3723 {
3724   HOST_WIDE_INT scale = 1;
3725   poly_int64 pbitpos, pbitsize;
3726   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3727   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3728   tree offtype = NULL_TREE;
3729   tree decl = NULL_TREE, base, off;
3730   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3731   tree memory_type = TREE_TYPE (DR_REF (dr));
3732   machine_mode pmode;
3733   int punsignedp, reversep, pvolatilep = 0;
3734   internal_fn ifn;
3735   tree element_type;
3736   bool masked_p = false;
3737
3738   /* See whether this is already a call to a gather/scatter internal function.
3739      If not, see whether it's a masked load or store.  */
3740   gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
3741   if (call && gimple_call_internal_p (call))
3742     {
3743       ifn = gimple_call_internal_fn (call);
3744       if (internal_gather_scatter_fn_p (ifn))
3745         {
3746           vect_describe_gather_scatter_call (stmt_info, info);
3747           return true;
3748         }
3749       masked_p = (ifn == IFN_MASK_LOAD || ifn == IFN_MASK_STORE);
3750     }
3751
3752   /* True if we should aim to use internal functions rather than
3753      built-in functions.  */
3754   bool use_ifn_p = (DR_IS_READ (dr)
3755                     ? supports_vec_gather_load_p ()
3756                     : supports_vec_scatter_store_p ());
3757
3758   base = DR_REF (dr);
3759   /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3760      see if we can use the def stmt of the address.  */
3761   if (masked_p
3762       && TREE_CODE (base) == MEM_REF
3763       && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3764       && integer_zerop (TREE_OPERAND (base, 1))
3765       && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3766     {
3767       gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3768       if (is_gimple_assign (def_stmt)
3769           && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3770         base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3771     }
3772
3773   /* The gather and scatter builtins need address of the form
3774      loop_invariant + vector * {1, 2, 4, 8}
3775      or
3776      loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3777      Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3778      of loop invariants/SSA_NAMEs defined in the loop, with casts,
3779      multiplications and additions in it.  To get a vector, we need
3780      a single SSA_NAME that will be defined in the loop and will
3781      contain everything that is not loop invariant and that can be
3782      vectorized.  The following code attempts to find such a preexistng
3783      SSA_NAME OFF and put the loop invariants into a tree BASE
3784      that can be gimplified before the loop.  */
3785   base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3786                               &punsignedp, &reversep, &pvolatilep);
3787   if (reversep)
3788     return false;
3789
3790   poly_int64 pbytepos = exact_div (pbitpos, BITS_PER_UNIT);
3791
3792   if (TREE_CODE (base) == MEM_REF)
3793     {
3794       if (!integer_zerop (TREE_OPERAND (base, 1)))
3795         {
3796           if (off == NULL_TREE)
3797             off = wide_int_to_tree (sizetype, mem_ref_offset (base));
3798           else
3799             off = size_binop (PLUS_EXPR, off,
3800                               fold_convert (sizetype, TREE_OPERAND (base, 1)));
3801         }
3802       base = TREE_OPERAND (base, 0);
3803     }
3804   else
3805     base = build_fold_addr_expr (base);
3806
3807   if (off == NULL_TREE)
3808     off = size_zero_node;
3809
3810   /* If base is not loop invariant, either off is 0, then we start with just
3811      the constant offset in the loop invariant BASE and continue with base
3812      as OFF, otherwise give up.
3813      We could handle that case by gimplifying the addition of base + off
3814      into some SSA_NAME and use that as off, but for now punt.  */
3815   if (!expr_invariant_in_loop_p (loop, base))
3816     {
3817       if (!integer_zerop (off))
3818         return false;
3819       off = base;
3820       base = size_int (pbytepos);
3821     }
3822   /* Otherwise put base + constant offset into the loop invariant BASE
3823      and continue with OFF.  */
3824   else
3825     {
3826       base = fold_convert (sizetype, base);
3827       base = size_binop (PLUS_EXPR, base, size_int (pbytepos));
3828     }
3829
3830   /* OFF at this point may be either a SSA_NAME or some tree expression
3831      from get_inner_reference.  Try to peel off loop invariants from it
3832      into BASE as long as possible.  */
3833   STRIP_NOPS (off);
3834   while (offtype == NULL_TREE)
3835     {
3836       enum tree_code code;
3837       tree op0, op1, add = NULL_TREE;
3838
3839       if (TREE_CODE (off) == SSA_NAME)
3840         {
3841           gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3842
3843           if (expr_invariant_in_loop_p (loop, off))
3844             return false;
3845
3846           if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3847             break;
3848
3849           op0 = gimple_assign_rhs1 (def_stmt);
3850           code = gimple_assign_rhs_code (def_stmt);
3851           op1 = gimple_assign_rhs2 (def_stmt);
3852         }
3853       else
3854         {
3855           if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3856             return false;
3857           code = TREE_CODE (off);
3858           extract_ops_from_tree (off, &code, &op0, &op1);
3859         }
3860       switch (code)
3861         {
3862         case POINTER_PLUS_EXPR:
3863         case PLUS_EXPR:
3864           if (expr_invariant_in_loop_p (loop, op0))
3865             {
3866               add = op0;
3867               off = op1;
3868             do_add:
3869               add = fold_convert (sizetype, add);
3870               if (scale != 1)
3871                 add = size_binop (MULT_EXPR, add, size_int (scale));
3872               base = size_binop (PLUS_EXPR, base, add);
3873               continue;
3874             }
3875           if (expr_invariant_in_loop_p (loop, op1))
3876             {
3877               add = op1;
3878               off = op0;
3879               goto do_add;
3880             }
3881           break;
3882         case MINUS_EXPR:
3883           if (expr_invariant_in_loop_p (loop, op1))
3884             {
3885               add = fold_convert (sizetype, op1);
3886               add = size_binop (MINUS_EXPR, size_zero_node, add);
3887               off = op0;
3888               goto do_add;
3889             }
3890           break;
3891         case MULT_EXPR:
3892           if (scale == 1 && tree_fits_shwi_p (op1))
3893             {
3894               int new_scale = tree_to_shwi (op1);
3895               /* Only treat this as a scaling operation if the target
3896                  supports it.  */
3897               if (use_ifn_p
3898                   && !vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p,
3899                                                 vectype, memory_type, 1,
3900                                                 TYPE_SIGN (TREE_TYPE (op0)),
3901                                                 new_scale, &ifn,
3902                                                 &element_type))
3903                 break;
3904               scale = new_scale;
3905               off = op0;
3906               continue;
3907             }
3908           break;
3909         case SSA_NAME:
3910           off = op0;
3911           continue;
3912         CASE_CONVERT:
3913           if (!POINTER_TYPE_P (TREE_TYPE (op0))
3914               && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3915             break;
3916           if (TYPE_PRECISION (TREE_TYPE (op0))
3917               == TYPE_PRECISION (TREE_TYPE (off)))
3918             {
3919               off = op0;
3920               continue;
3921             }
3922
3923           /* The internal functions need the offset to be the same width
3924              as the elements of VECTYPE.  Don't include operations that
3925              cast the offset from that width to a different width.  */
3926           if (use_ifn_p
3927               && (int_size_in_bytes (TREE_TYPE (vectype))
3928                   == int_size_in_bytes (TREE_TYPE (off))))
3929             break;
3930
3931           if (TYPE_PRECISION (TREE_TYPE (op0))
3932               < TYPE_PRECISION (TREE_TYPE (off)))
3933             {
3934               off = op0;
3935               offtype = TREE_TYPE (off);
3936               STRIP_NOPS (off);
3937               continue;
3938             }
3939           break;
3940         default:
3941           break;
3942         }
3943       break;
3944     }
3945
3946   /* If at the end OFF still isn't a SSA_NAME or isn't
3947      defined in the loop, punt.  */
3948   if (TREE_CODE (off) != SSA_NAME
3949       || expr_invariant_in_loop_p (loop, off))
3950     return false;
3951
3952   if (offtype == NULL_TREE)
3953     offtype = TREE_TYPE (off);
3954
3955   if (use_ifn_p)
3956     {
3957       if (!vect_gather_scatter_fn_p (DR_IS_READ (dr), masked_p, vectype,
3958                                      memory_type, TYPE_PRECISION (offtype),
3959                                      TYPE_SIGN (offtype), scale, &ifn,
3960                                      &element_type))
3961         return false;
3962     }
3963   else
3964     {
3965       if (DR_IS_READ (dr))
3966         {
3967           if (targetm.vectorize.builtin_gather)
3968             decl = targetm.vectorize.builtin_gather (vectype, offtype, scale);
3969         }
3970       else
3971         {
3972           if (targetm.vectorize.builtin_scatter)
3973             decl = targetm.vectorize.builtin_scatter (vectype, offtype, scale);
3974         }
3975
3976       if (!decl)
3977         return false;
3978
3979       ifn = IFN_LAST;
3980       element_type = TREE_TYPE (vectype);
3981     }
3982
3983   info->ifn = ifn;
3984   info->decl = decl;
3985   info->base = base;
3986   info->offset = off;
3987   info->offset_dt = vect_unknown_def_type;
3988   info->offset_vectype = NULL_TREE;
3989   info->scale = scale;
3990   info->element_type = element_type;
3991   info->memory_type = memory_type;
3992   return true;
3993 }
3994
3995 /* Find the data references in STMT, analyze them with respect to LOOP and
3996    append them to DATAREFS.  Return false if datarefs in this stmt cannot
3997    be handled.  */
3998
3999 opt_result
4000 vect_find_stmt_data_reference (loop_p loop, gimple *stmt,
4001                                vec<data_reference_p> *datarefs)
4002 {
4003   /* We can ignore clobbers for dataref analysis - they are removed during
4004      loop vectorization and BB vectorization checks dependences with a
4005      stmt walk.  */
4006   if (gimple_clobber_p (stmt))
4007     return opt_result::success ();
4008
4009   if (gimple_has_volatile_ops (stmt))
4010     return opt_result::failure_at (stmt, "not vectorized: volatile type: %G",
4011                                    stmt);
4012
4013   if (stmt_can_throw_internal (cfun, stmt))
4014     return opt_result::failure_at (stmt,
4015                                    "not vectorized:"
4016                                    " statement can throw an exception: %G",
4017                                    stmt);
4018
4019   auto_vec<data_reference_p, 2> refs;
4020   opt_result res = find_data_references_in_stmt (loop, stmt, &refs);
4021   if (!res)
4022     return res;
4023
4024   if (refs.is_empty ())
4025     return opt_result::success ();
4026
4027   if (refs.length () > 1)
4028     return opt_result::failure_at (stmt,
4029                                    "not vectorized:"
4030                                    " more than one data ref in stmt: %G", stmt);
4031
4032   if (gcall *call = dyn_cast <gcall *> (stmt))
4033     if (!gimple_call_internal_p (call)
4034         || (gimple_call_internal_fn (call) != IFN_MASK_LOAD
4035             && gimple_call_internal_fn (call) != IFN_MASK_STORE))
4036       return opt_result::failure_at (stmt,
4037                                      "not vectorized: dr in a call %G", stmt);
4038
4039   data_reference_p dr = refs.pop ();
4040   if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
4041       && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
4042     return opt_result::failure_at (stmt,
4043                                    "not vectorized:"
4044                                    " statement is bitfield access %G", stmt);
4045
4046   if (DR_BASE_ADDRESS (dr)
4047       && TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
4048     return opt_result::failure_at (stmt,
4049                                    "not vectorized:"
4050                                    " base addr of dr is a constant\n");
4051
4052   /* Check whether this may be a SIMD lane access and adjust the
4053      DR to make it easier for us to handle it.  */
4054   if (loop
4055       && loop->simduid
4056       && (!DR_BASE_ADDRESS (dr)
4057           || !DR_OFFSET (dr)
4058           || !DR_INIT (dr)
4059           || !DR_STEP (dr)))
4060     {
4061       struct data_reference *newdr
4062         = create_data_ref (NULL, loop_containing_stmt (stmt), DR_REF (dr), stmt,
4063                            DR_IS_READ (dr), DR_IS_CONDITIONAL_IN_STMT (dr));
4064       if (DR_BASE_ADDRESS (newdr)
4065           && DR_OFFSET (newdr)
4066           && DR_INIT (newdr)
4067           && DR_STEP (newdr)
4068           && integer_zerop (DR_STEP (newdr)))
4069         {
4070           tree off = DR_OFFSET (newdr);
4071           STRIP_NOPS (off);
4072           if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
4073               && TREE_CODE (off) == MULT_EXPR
4074               && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
4075             {
4076               tree step = TREE_OPERAND (off, 1);
4077               off = TREE_OPERAND (off, 0);
4078               STRIP_NOPS (off);
4079               if (CONVERT_EXPR_P (off)
4080                   && (TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off, 0)))
4081                       < TYPE_PRECISION (TREE_TYPE (off))))
4082                 off = TREE_OPERAND (off, 0);
4083               if (TREE_CODE (off) == SSA_NAME)
4084                 {
4085                   gimple *def = SSA_NAME_DEF_STMT (off);
4086                   tree reft = TREE_TYPE (DR_REF (newdr));
4087                   if (is_gimple_call (def)
4088                       && gimple_call_internal_p (def)
4089                       && (gimple_call_internal_fn (def) == IFN_GOMP_SIMD_LANE))
4090                     {
4091                       tree arg = gimple_call_arg (def, 0);
4092                       gcc_assert (TREE_CODE (arg) == SSA_NAME);
4093                       arg = SSA_NAME_VAR (arg);
4094                       if (arg == loop->simduid
4095                           /* For now.  */
4096                           && tree_int_cst_equal (TYPE_SIZE_UNIT (reft), step))
4097                         {
4098                           DR_OFFSET (newdr) = ssize_int (0);
4099                           DR_STEP (newdr) = step;
4100                           DR_OFFSET_ALIGNMENT (newdr) = BIGGEST_ALIGNMENT;
4101                           DR_STEP_ALIGNMENT (newdr)
4102                             = highest_pow2_factor (step);
4103                           /* Mark as simd-lane access.  */
4104                           newdr->aux = (void *)-1;
4105                           free_data_ref (dr);
4106                           datarefs->safe_push (newdr);
4107                           return opt_result::success ();
4108                         }
4109                     }
4110                 }
4111             }
4112         }
4113       free_data_ref (newdr);
4114     }
4115
4116   datarefs->safe_push (dr);
4117   return opt_result::success ();
4118 }
4119
4120 /* Function vect_analyze_data_refs.
4121
4122   Find all the data references in the loop or basic block.
4123
4124    The general structure of the analysis of data refs in the vectorizer is as
4125    follows:
4126    1- vect_analyze_data_refs(loop/bb): call
4127       compute_data_dependences_for_loop/bb to find and analyze all data-refs
4128       in the loop/bb and their dependences.
4129    2- vect_analyze_dependences(): apply dependence testing using ddrs.
4130    3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
4131    4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
4132
4133 */
4134
4135 opt_result
4136 vect_analyze_data_refs (vec_info *vinfo, poly_uint64 *min_vf)
4137 {
4138   struct loop *loop = NULL;
4139   unsigned int i;
4140   struct data_reference *dr;
4141   tree scalar_type;
4142
4143   DUMP_VECT_SCOPE ("vect_analyze_data_refs");
4144
4145   if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4146     loop = LOOP_VINFO_LOOP (loop_vinfo);
4147
4148   /* Go through the data-refs, check that the analysis succeeded.  Update
4149      pointer from stmt_vec_info struct to DR and vectype.  */
4150
4151   vec<data_reference_p> datarefs = vinfo->shared->datarefs;
4152   FOR_EACH_VEC_ELT (datarefs, i, dr)
4153     {
4154       enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
4155       poly_uint64 vf;
4156
4157       gcc_assert (DR_REF (dr));
4158       stmt_vec_info stmt_info = vinfo->lookup_stmt (DR_STMT (dr));
4159       gcc_assert (!stmt_info->dr_aux.dr);
4160       stmt_info->dr_aux.dr = dr;
4161       stmt_info->dr_aux.stmt = stmt_info;
4162
4163       /* Check that analysis of the data-ref succeeded.  */
4164       if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
4165           || !DR_STEP (dr))
4166         {
4167           bool maybe_gather
4168             = DR_IS_READ (dr)
4169               && !TREE_THIS_VOLATILE (DR_REF (dr))
4170               && (targetm.vectorize.builtin_gather != NULL
4171                   || supports_vec_gather_load_p ());
4172           bool maybe_scatter
4173             = DR_IS_WRITE (dr)
4174               && !TREE_THIS_VOLATILE (DR_REF (dr))
4175               && (targetm.vectorize.builtin_scatter != NULL
4176                   || supports_vec_scatter_store_p ());
4177
4178           /* If target supports vector gather loads or scatter stores,
4179              see if they can't be used.  */
4180           if (is_a <loop_vec_info> (vinfo)
4181               && !nested_in_vect_loop_p (loop, stmt_info))
4182             {
4183               if (maybe_gather || maybe_scatter)
4184                 {
4185                   if (maybe_gather)
4186                     gatherscatter = GATHER;
4187                   else
4188                     gatherscatter = SCATTER;
4189                 }
4190             }
4191
4192           if (gatherscatter == SG_NONE)
4193             {
4194               if (dump_enabled_p ())
4195                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4196                                  "not vectorized: data ref analysis "
4197                                  "failed %G", stmt_info->stmt);
4198               if (is_a <bb_vec_info> (vinfo))
4199                 {
4200                   /* In BB vectorization the ref can still participate
4201                      in dependence analysis, we just can't vectorize it.  */
4202                   STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4203                   continue;
4204                 }
4205               return opt_result::failure_at (stmt_info->stmt,
4206                                              "not vectorized:"
4207                                              " data ref analysis failed: %G",
4208                                              stmt_info->stmt);
4209             }
4210         }
4211
4212       /* See if this was detected as SIMD lane access.  */
4213       if (dr->aux == (void *)-1)
4214         {
4215           if (nested_in_vect_loop_p (loop, stmt_info))
4216             return opt_result::failure_at (stmt_info->stmt,
4217                                            "not vectorized:"
4218                                            " data ref analysis failed: %G",
4219                                            stmt_info->stmt);
4220           STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
4221         }
4222
4223       tree base = get_base_address (DR_REF (dr));
4224       if (base && VAR_P (base) && DECL_NONALIASED (base))
4225         {
4226           if (dump_enabled_p ())
4227             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4228                              "not vectorized: base object not addressable "
4229                              "for stmt: %G", stmt_info->stmt);
4230           if (is_a <bb_vec_info> (vinfo))
4231             {
4232               /* In BB vectorization the ref can still participate
4233                  in dependence analysis, we just can't vectorize it.  */
4234               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4235               continue;
4236             }
4237           return opt_result::failure_at (stmt_info->stmt,
4238                                          "not vectorized: base object not"
4239                                          " addressable for stmt: %G",
4240                                          stmt_info->stmt);
4241         }
4242
4243       if (is_a <loop_vec_info> (vinfo)
4244           && DR_STEP (dr)
4245           && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
4246         {
4247           if (nested_in_vect_loop_p (loop, stmt_info))
4248             return opt_result::failure_at (stmt_info->stmt,
4249                                            "not vectorized:"
4250                                            "not suitable for strided load %G",
4251                                            stmt_info->stmt);
4252           STMT_VINFO_STRIDED_P (stmt_info) = true;
4253         }
4254
4255       /* Update DR field in stmt_vec_info struct.  */
4256
4257       /* If the dataref is in an inner-loop of the loop that is considered for
4258          for vectorization, we also want to analyze the access relative to
4259          the outer-loop (DR contains information only relative to the
4260          inner-most enclosing loop).  We do that by building a reference to the
4261          first location accessed by the inner-loop, and analyze it relative to
4262          the outer-loop.  */
4263       if (loop && nested_in_vect_loop_p (loop, stmt_info))
4264         {
4265           /* Build a reference to the first location accessed by the
4266              inner loop: *(BASE + INIT + OFFSET).  By construction,
4267              this address must be invariant in the inner loop, so we
4268              can consider it as being used in the outer loop.  */
4269           tree base = unshare_expr (DR_BASE_ADDRESS (dr));
4270           tree offset = unshare_expr (DR_OFFSET (dr));
4271           tree init = unshare_expr (DR_INIT (dr));
4272           tree init_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset),
4273                                           init, offset);
4274           tree init_addr = fold_build_pointer_plus (base, init_offset);
4275           tree init_ref = build_fold_indirect_ref (init_addr);
4276
4277           if (dump_enabled_p ())
4278             dump_printf_loc (MSG_NOTE, vect_location,
4279                              "analyze in outer loop: %T\n", init_ref);
4280
4281           opt_result res
4282             = dr_analyze_innermost (&STMT_VINFO_DR_WRT_VEC_LOOP (stmt_info),
4283                                     init_ref, loop, stmt_info->stmt);
4284           if (!res)
4285             /* dr_analyze_innermost already explained the failure.  */
4286             return res;
4287
4288           if (dump_enabled_p ())
4289             dump_printf_loc (MSG_NOTE, vect_location,
4290                              "\touter base_address: %T\n"
4291                              "\touter offset from base address: %T\n"
4292                              "\touter constant offset from base address: %T\n"
4293                              "\touter step: %T\n"
4294                              "\touter base alignment: %d\n\n"
4295                              "\touter base misalignment: %d\n"
4296                              "\touter offset alignment: %d\n"
4297                              "\touter step alignment: %d\n",
4298                              STMT_VINFO_DR_BASE_ADDRESS (stmt_info),
4299                              STMT_VINFO_DR_OFFSET (stmt_info),
4300                              STMT_VINFO_DR_INIT (stmt_info),
4301                              STMT_VINFO_DR_STEP (stmt_info),
4302                              STMT_VINFO_DR_BASE_ALIGNMENT (stmt_info),
4303                              STMT_VINFO_DR_BASE_MISALIGNMENT (stmt_info),
4304                              STMT_VINFO_DR_OFFSET_ALIGNMENT (stmt_info),
4305                              STMT_VINFO_DR_STEP_ALIGNMENT (stmt_info));
4306         }
4307
4308       /* Set vectype for STMT.  */
4309       scalar_type = TREE_TYPE (DR_REF (dr));
4310       STMT_VINFO_VECTYPE (stmt_info)
4311         = get_vectype_for_scalar_type (scalar_type);
4312       if (!STMT_VINFO_VECTYPE (stmt_info))
4313         {
4314           if (dump_enabled_p ())
4315             {
4316               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4317                                "not vectorized: no vectype for stmt: %G",
4318                                stmt_info->stmt);
4319               dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
4320               dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
4321                                  scalar_type);
4322               dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
4323             }
4324
4325           if (is_a <bb_vec_info> (vinfo))
4326             {
4327               /* No vector type is fine, the ref can still participate
4328                  in dependence analysis, we just can't vectorize it.  */
4329               STMT_VINFO_VECTORIZABLE (stmt_info) = false;
4330               continue;
4331             }
4332           return opt_result::failure_at (stmt_info->stmt,
4333                                          "not vectorized:"
4334                                          " no vectype for stmt: %G"
4335                                          " scalar_type: %T\n",
4336                                          stmt_info->stmt, scalar_type);
4337         }
4338       else
4339         {
4340           if (dump_enabled_p ())
4341             dump_printf_loc (MSG_NOTE, vect_location,
4342                              "got vectype for stmt: %G%T\n",
4343                              stmt_info->stmt, STMT_VINFO_VECTYPE (stmt_info));
4344         }
4345
4346       /* Adjust the minimal vectorization factor according to the
4347          vector type.  */
4348       vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4349       *min_vf = upper_bound (*min_vf, vf);
4350
4351       if (gatherscatter != SG_NONE)
4352         {
4353           gather_scatter_info gs_info;
4354           if (!vect_check_gather_scatter (stmt_info,
4355                                           as_a <loop_vec_info> (vinfo),
4356                                           &gs_info)
4357               || !get_vectype_for_scalar_type (TREE_TYPE (gs_info.offset)))
4358             return opt_result::failure_at
4359               (stmt_info->stmt,
4360                (gatherscatter == GATHER) ?
4361                "not vectorized: not suitable for gather load %G" :
4362                "not vectorized: not suitable for scatter store %G",
4363                stmt_info->stmt);
4364           STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
4365         }
4366     }
4367
4368   /* We used to stop processing and prune the list here.  Verify we no
4369      longer need to.  */
4370   gcc_assert (i == datarefs.length ());
4371
4372   return opt_result::success ();
4373 }
4374
4375
4376 /* Function vect_get_new_vect_var.
4377
4378    Returns a name for a new variable.  The current naming scheme appends the
4379    prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
4380    the name of vectorizer generated variables, and appends that to NAME if
4381    provided.  */
4382
4383 tree
4384 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
4385 {
4386   const char *prefix;
4387   tree new_vect_var;
4388
4389   switch (var_kind)
4390   {
4391   case vect_simple_var:
4392     prefix = "vect";
4393     break;
4394   case vect_scalar_var:
4395     prefix = "stmp";
4396     break;
4397   case vect_mask_var:
4398     prefix = "mask";
4399     break;
4400   case vect_pointer_var:
4401     prefix = "vectp";
4402     break;
4403   default:
4404     gcc_unreachable ();
4405   }
4406
4407   if (name)
4408     {
4409       char* tmp = concat (prefix, "_", name, NULL);
4410       new_vect_var = create_tmp_reg (type, tmp);
4411       free (tmp);
4412     }
4413   else
4414     new_vect_var = create_tmp_reg (type, prefix);
4415
4416   return new_vect_var;
4417 }
4418
4419 /* Like vect_get_new_vect_var but return an SSA name.  */
4420
4421 tree
4422 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
4423 {
4424   const char *prefix;
4425   tree new_vect_var;
4426
4427   switch (var_kind)
4428   {
4429   case vect_simple_var:
4430     prefix = "vect";
4431     break;
4432   case vect_scalar_var:
4433     prefix = "stmp";
4434     break;
4435   case vect_pointer_var:
4436     prefix = "vectp";
4437     break;
4438   default:
4439     gcc_unreachable ();
4440   }
4441
4442   if (name)
4443     {
4444       char* tmp = concat (prefix, "_", name, NULL);
4445       new_vect_var = make_temp_ssa_name (type, NULL, tmp);
4446       free (tmp);
4447     }
4448   else
4449     new_vect_var = make_temp_ssa_name (type, NULL, prefix);
4450
4451   return new_vect_var;
4452 }
4453
4454 /* Duplicate ptr info and set alignment/misaligment on NAME from DR_INFO.  */
4455
4456 static void
4457 vect_duplicate_ssa_name_ptr_info (tree name, dr_vec_info *dr_info)
4458 {
4459   duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr_info->dr));
4460   int misalign = DR_MISALIGNMENT (dr_info);
4461   if (misalign == DR_MISALIGNMENT_UNKNOWN)
4462     mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
4463   else
4464     set_ptr_info_alignment (SSA_NAME_PTR_INFO (name),
4465                             known_alignment (DR_TARGET_ALIGNMENT (dr_info)),
4466                             misalign);
4467 }
4468
4469 /* Function vect_create_addr_base_for_vector_ref.
4470
4471    Create an expression that computes the address of the first memory location
4472    that will be accessed for a data reference.
4473
4474    Input:
4475    STMT_INFO: The statement containing the data reference.
4476    NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
4477    OFFSET: Optional. If supplied, it is be added to the initial address.
4478    LOOP:    Specify relative to which loop-nest should the address be computed.
4479             For example, when the dataref is in an inner-loop nested in an
4480             outer-loop that is now being vectorized, LOOP can be either the
4481             outer-loop, or the inner-loop.  The first memory location accessed
4482             by the following dataref ('in' points to short):
4483
4484                 for (i=0; i<N; i++)
4485                    for (j=0; j<M; j++)
4486                      s += in[i+j]
4487
4488             is as follows:
4489             if LOOP=i_loop:     &in             (relative to i_loop)
4490             if LOOP=j_loop:     &in+i*2B        (relative to j_loop)
4491    BYTE_OFFSET: Optional, defaulted to NULL.  If supplied, it is added to the
4492             initial address.  Unlike OFFSET, which is number of elements to
4493             be added, BYTE_OFFSET is measured in bytes.
4494
4495    Output:
4496    1. Return an SSA_NAME whose value is the address of the memory location of
4497       the first vector of the data reference.
4498    2. If new_stmt_list is not NULL_TREE after return then the caller must insert
4499       these statement(s) which define the returned SSA_NAME.
4500
4501    FORNOW: We are only handling array accesses with step 1.  */
4502
4503 tree
4504 vect_create_addr_base_for_vector_ref (stmt_vec_info stmt_info,
4505                                       gimple_seq *new_stmt_list,
4506                                       tree offset,
4507                                       tree byte_offset)
4508 {
4509   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4510   struct data_reference *dr = dr_info->dr;
4511   const char *base_name;
4512   tree addr_base;
4513   tree dest;
4514   gimple_seq seq = NULL;
4515   tree vect_ptr_type;
4516   tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
4517   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4518   innermost_loop_behavior *drb = vect_dr_behavior (dr_info);
4519
4520   tree data_ref_base = unshare_expr (drb->base_address);
4521   tree base_offset = unshare_expr (drb->offset);
4522   tree init = unshare_expr (drb->init);
4523
4524   if (loop_vinfo)
4525     base_name = get_name (data_ref_base);
4526   else
4527     {
4528       base_offset = ssize_int (0);
4529       init = ssize_int (0);
4530       base_name = get_name (DR_REF (dr));
4531     }
4532
4533   /* Create base_offset */
4534   base_offset = size_binop (PLUS_EXPR,
4535                             fold_convert (sizetype, base_offset),
4536                             fold_convert (sizetype, init));
4537
4538   if (offset)
4539     {
4540       offset = fold_build2 (MULT_EXPR, sizetype,
4541                             fold_convert (sizetype, offset), step);
4542       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4543                                  base_offset, offset);
4544     }
4545   if (byte_offset)
4546     {
4547       byte_offset = fold_convert (sizetype, byte_offset);
4548       base_offset = fold_build2 (PLUS_EXPR, sizetype,
4549                                  base_offset, byte_offset);
4550     }
4551
4552   /* base + base_offset */
4553   if (loop_vinfo)
4554     addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4555   else
4556     {
4557       addr_base = build1 (ADDR_EXPR,
4558                           build_pointer_type (TREE_TYPE (DR_REF (dr))),
4559                           unshare_expr (DR_REF (dr)));
4560     }
4561
4562   vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4563   dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4564   addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4565   gimple_seq_add_seq (new_stmt_list, seq);
4566
4567   if (DR_PTR_INFO (dr)
4568       && TREE_CODE (addr_base) == SSA_NAME
4569       && !SSA_NAME_PTR_INFO (addr_base))
4570     {
4571       vect_duplicate_ssa_name_ptr_info (addr_base, dr_info);
4572       if (offset || byte_offset)
4573         mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4574     }
4575
4576   if (dump_enabled_p ())
4577     dump_printf_loc (MSG_NOTE, vect_location, "created %T\n", addr_base);
4578
4579   return addr_base;
4580 }
4581
4582
4583 /* Function vect_create_data_ref_ptr.
4584
4585    Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4586    location accessed in the loop by STMT_INFO, along with the def-use update
4587    chain to appropriately advance the pointer through the loop iterations.
4588    Also set aliasing information for the pointer.  This pointer is used by
4589    the callers to this function to create a memory reference expression for
4590    vector load/store access.
4591
4592    Input:
4593    1. STMT_INFO: a stmt that references memory. Expected to be of the form
4594          GIMPLE_ASSIGN <name, data-ref> or
4595          GIMPLE_ASSIGN <data-ref, name>.
4596    2. AGGR_TYPE: the type of the reference, which should be either a vector
4597         or an array.
4598    3. AT_LOOP: the loop where the vector memref is to be created.
4599    4. OFFSET (optional): an offset to be added to the initial address accessed
4600         by the data-ref in STMT_INFO.
4601    5. BSI: location where the new stmts are to be placed if there is no loop
4602    6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4603         pointing to the initial address.
4604    7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4605         to the initial address accessed by the data-ref in STMT_INFO.  This is
4606         similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4607         in bytes.
4608    8. IV_STEP (optional, defaults to NULL): the amount that should be added
4609         to the IV during each iteration of the loop.  NULL says to move
4610         by one copy of AGGR_TYPE up or down, depending on the step of the
4611         data reference.
4612
4613    Output:
4614    1. Declare a new ptr to vector_type, and have it point to the base of the
4615       data reference (initial addressed accessed by the data reference).
4616       For example, for vector of type V8HI, the following code is generated:
4617
4618       v8hi *ap;
4619       ap = (v8hi *)initial_address;
4620
4621       if OFFSET is not supplied:
4622          initial_address = &a[init];
4623       if OFFSET is supplied:
4624          initial_address = &a[init + OFFSET];
4625       if BYTE_OFFSET is supplied:
4626          initial_address = &a[init] + BYTE_OFFSET;
4627
4628       Return the initial_address in INITIAL_ADDRESS.
4629
4630    2. If ONLY_INIT is true, just return the initial pointer.  Otherwise, also
4631       update the pointer in each iteration of the loop.
4632
4633       Return the increment stmt that updates the pointer in PTR_INCR.
4634
4635    3. Return the pointer.  */
4636
4637 tree
4638 vect_create_data_ref_ptr (stmt_vec_info stmt_info, tree aggr_type,
4639                           struct loop *at_loop, tree offset,
4640                           tree *initial_address, gimple_stmt_iterator *gsi,
4641                           gimple **ptr_incr, bool only_init,
4642                           tree byte_offset, tree iv_step)
4643 {
4644   const char *base_name;
4645   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4646   struct loop *loop = NULL;
4647   bool nested_in_vect_loop = false;
4648   struct loop *containing_loop = NULL;
4649   tree aggr_ptr_type;
4650   tree aggr_ptr;
4651   tree new_temp;
4652   gimple_seq new_stmt_list = NULL;
4653   edge pe = NULL;
4654   basic_block new_bb;
4655   tree aggr_ptr_init;
4656   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
4657   struct data_reference *dr = dr_info->dr;
4658   tree aptr;
4659   gimple_stmt_iterator incr_gsi;
4660   bool insert_after;
4661   tree indx_before_incr, indx_after_incr;
4662   gimple *incr;
4663   bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4664
4665   gcc_assert (iv_step != NULL_TREE
4666               || TREE_CODE (aggr_type) == ARRAY_TYPE
4667               || TREE_CODE (aggr_type) == VECTOR_TYPE);
4668
4669   if (loop_vinfo)
4670     {
4671       loop = LOOP_VINFO_LOOP (loop_vinfo);
4672       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
4673       containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
4674       pe = loop_preheader_edge (loop);
4675     }
4676   else
4677     {
4678       gcc_assert (bb_vinfo);
4679       only_init = true;
4680       *ptr_incr = NULL;
4681     }
4682
4683   /* Create an expression for the first address accessed by this load
4684      in LOOP.  */
4685   base_name = get_name (DR_BASE_ADDRESS (dr));
4686
4687   if (dump_enabled_p ())
4688     {
4689       tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4690       dump_printf_loc (MSG_NOTE, vect_location,
4691                        "create %s-pointer variable to type: %T",
4692                        get_tree_code_name (TREE_CODE (aggr_type)),
4693                        aggr_type);
4694       if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4695         dump_printf (MSG_NOTE, "  vectorizing an array ref: ");
4696       else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4697         dump_printf (MSG_NOTE, "  vectorizing a vector ref: ");
4698       else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4699         dump_printf (MSG_NOTE, "  vectorizing a record based array ref: ");
4700       else
4701         dump_printf (MSG_NOTE, "  vectorizing a pointer ref: ");
4702       dump_printf (MSG_NOTE, "%T\n", DR_BASE_OBJECT (dr));
4703     }
4704
4705   /* (1) Create the new aggregate-pointer variable.
4706      Vector and array types inherit the alias set of their component
4707      type by default so we need to use a ref-all pointer if the data
4708      reference does not conflict with the created aggregated data
4709      reference because it is not addressable.  */
4710   bool need_ref_all = false;
4711   if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4712                               get_alias_set (DR_REF (dr))))
4713     need_ref_all = true;
4714   /* Likewise for any of the data references in the stmt group.  */
4715   else if (DR_GROUP_SIZE (stmt_info) > 1)
4716     {
4717       stmt_vec_info sinfo = DR_GROUP_FIRST_ELEMENT (stmt_info);
4718       do
4719         {
4720           struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4721           if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4722                                       get_alias_set (DR_REF (sdr))))
4723             {
4724               need_ref_all = true;
4725               break;
4726             }
4727           sinfo = DR_GROUP_NEXT_ELEMENT (sinfo);
4728         }
4729       while (sinfo);
4730     }
4731   aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4732                                                need_ref_all);
4733   aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4734
4735
4736   /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4737      vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4738      def-use update cycles for the pointer: one relative to the outer-loop
4739      (LOOP), which is what steps (3) and (4) below do.  The other is relative
4740      to the inner-loop (which is the inner-most loop containing the dataref),
4741      and this is done be step (5) below.
4742
4743      When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4744      inner-most loop, and so steps (3),(4) work the same, and step (5) is
4745      redundant.  Steps (3),(4) create the following:
4746
4747         vp0 = &base_addr;
4748         LOOP:   vp1 = phi(vp0,vp2)
4749                 ...
4750                 ...
4751                 vp2 = vp1 + step
4752                 goto LOOP
4753
4754      If there is an inner-loop nested in loop, then step (5) will also be
4755      applied, and an additional update in the inner-loop will be created:
4756
4757         vp0 = &base_addr;
4758         LOOP:   vp1 = phi(vp0,vp2)
4759                 ...
4760         inner:     vp3 = phi(vp1,vp4)
4761                    vp4 = vp3 + inner_step
4762                    if () goto inner
4763                 ...
4764                 vp2 = vp1 + step
4765                 if () goto LOOP   */
4766
4767   /* (2) Calculate the initial address of the aggregate-pointer, and set
4768      the aggregate-pointer to point to it before the loop.  */
4769
4770   /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader.  */
4771
4772   new_temp = vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
4773                                                    offset, byte_offset);
4774   if (new_stmt_list)
4775     {
4776       if (pe)
4777         {
4778           new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4779           gcc_assert (!new_bb);
4780         }
4781       else
4782         gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4783     }
4784
4785   *initial_address = new_temp;
4786   aggr_ptr_init = new_temp;
4787
4788   /* (3) Handle the updating of the aggregate-pointer inside the loop.
4789      This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4790      inner-loop nested in LOOP (during outer-loop vectorization).  */
4791
4792   /* No update in loop is required.  */
4793   if (only_init && (!loop_vinfo || at_loop == loop))
4794     aptr = aggr_ptr_init;
4795   else
4796     {
4797       /* Accesses to invariant addresses should be handled specially
4798          by the caller.  */
4799       tree step = vect_dr_behavior (dr_info)->step;
4800       gcc_assert (!integer_zerop (step));
4801
4802       if (iv_step == NULL_TREE)
4803         {
4804           /* The step of the aggregate pointer is the type size,
4805              negated for downward accesses.  */
4806           iv_step = TYPE_SIZE_UNIT (aggr_type);
4807           if (tree_int_cst_sgn (step) == -1)
4808             iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4809         }
4810
4811       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4812
4813       create_iv (aggr_ptr_init,
4814                  fold_convert (aggr_ptr_type, iv_step),
4815                  aggr_ptr, loop, &incr_gsi, insert_after,
4816                  &indx_before_incr, &indx_after_incr);
4817       incr = gsi_stmt (incr_gsi);
4818       loop_vinfo->add_stmt (incr);
4819
4820       /* Copy the points-to information if it exists. */
4821       if (DR_PTR_INFO (dr))
4822         {
4823           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4824           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4825         }
4826       if (ptr_incr)
4827         *ptr_incr = incr;
4828
4829       aptr = indx_before_incr;
4830     }
4831
4832   if (!nested_in_vect_loop || only_init)
4833     return aptr;
4834
4835
4836   /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4837      nested in LOOP, if exists.  */
4838
4839   gcc_assert (nested_in_vect_loop);
4840   if (!only_init)
4841     {
4842       standard_iv_increment_position (containing_loop, &incr_gsi,
4843                                       &insert_after);
4844       create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4845                  containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4846                  &indx_after_incr);
4847       incr = gsi_stmt (incr_gsi);
4848       loop_vinfo->add_stmt (incr);
4849
4850       /* Copy the points-to information if it exists. */
4851       if (DR_PTR_INFO (dr))
4852         {
4853           vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr_info);
4854           vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr_info);
4855         }
4856       if (ptr_incr)
4857         *ptr_incr = incr;
4858
4859       return indx_before_incr;
4860     }
4861   else
4862     gcc_unreachable ();
4863 }
4864
4865
4866 /* Function bump_vector_ptr
4867
4868    Increment a pointer (to a vector type) by vector-size. If requested,
4869    i.e. if PTR-INCR is given, then also connect the new increment stmt
4870    to the existing def-use update-chain of the pointer, by modifying
4871    the PTR_INCR as illustrated below:
4872
4873    The pointer def-use update-chain before this function:
4874                         DATAREF_PTR = phi (p_0, p_2)
4875                         ....
4876         PTR_INCR:       p_2 = DATAREF_PTR + step
4877
4878    The pointer def-use update-chain after this function:
4879                         DATAREF_PTR = phi (p_0, p_2)
4880                         ....
4881                         NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4882                         ....
4883         PTR_INCR:       p_2 = NEW_DATAREF_PTR + step
4884
4885    Input:
4886    DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4887                  in the loop.
4888    PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4889               the loop.  The increment amount across iterations is expected
4890               to be vector_size.
4891    BSI - location where the new update stmt is to be placed.
4892    STMT_INFO - the original scalar memory-access stmt that is being vectorized.
4893    BUMP - optional. The offset by which to bump the pointer. If not given,
4894           the offset is assumed to be vector_size.
4895
4896    Output: Return NEW_DATAREF_PTR as illustrated above.
4897
4898 */
4899
4900 tree
4901 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4902                  stmt_vec_info stmt_info, tree bump)
4903 {
4904   struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4905   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4906   tree update = TYPE_SIZE_UNIT (vectype);
4907   gassign *incr_stmt;
4908   ssa_op_iter iter;
4909   use_operand_p use_p;
4910   tree new_dataref_ptr;
4911
4912   if (bump)
4913     update = bump;
4914
4915   if (TREE_CODE (dataref_ptr) == SSA_NAME)
4916     new_dataref_ptr = copy_ssa_name (dataref_ptr);
4917   else
4918     new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4919   incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4920                                    dataref_ptr, update);
4921   vect_finish_stmt_generation (stmt_info, incr_stmt, gsi);
4922
4923   /* Copy the points-to information if it exists. */
4924   if (DR_PTR_INFO (dr))
4925     {
4926       duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4927       mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4928     }
4929
4930   if (!ptr_incr)
4931     return new_dataref_ptr;
4932
4933   /* Update the vector-pointer's cross-iteration increment.  */
4934   FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4935     {
4936       tree use = USE_FROM_PTR (use_p);
4937
4938       if (use == dataref_ptr)
4939         SET_USE (use_p, new_dataref_ptr);
4940       else
4941         gcc_assert (operand_equal_p (use, update, 0));
4942     }
4943
4944   return new_dataref_ptr;
4945 }
4946
4947
4948 /* Copy memory reference info such as base/clique from the SRC reference
4949    to the DEST MEM_REF.  */
4950
4951 void
4952 vect_copy_ref_info (tree dest, tree src)
4953 {
4954   if (TREE_CODE (dest) != MEM_REF)
4955     return;
4956
4957   tree src_base = src;
4958   while (handled_component_p (src_base))
4959     src_base = TREE_OPERAND (src_base, 0);
4960   if (TREE_CODE (src_base) != MEM_REF
4961       && TREE_CODE (src_base) != TARGET_MEM_REF)
4962     return;
4963
4964   MR_DEPENDENCE_CLIQUE (dest) = MR_DEPENDENCE_CLIQUE (src_base);
4965   MR_DEPENDENCE_BASE (dest) = MR_DEPENDENCE_BASE (src_base);
4966 }
4967
4968
4969 /* Function vect_create_destination_var.
4970
4971    Create a new temporary of type VECTYPE.  */
4972
4973 tree
4974 vect_create_destination_var (tree scalar_dest, tree vectype)
4975 {
4976   tree vec_dest;
4977   const char *name;
4978   char *new_name;
4979   tree type;
4980   enum vect_var_kind kind;
4981
4982   kind = vectype
4983     ? VECTOR_BOOLEAN_TYPE_P (vectype)
4984     ? vect_mask_var
4985     : vect_simple_var
4986     : vect_scalar_var;
4987   type = vectype ? vectype : TREE_TYPE (scalar_dest);
4988
4989   gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4990
4991   name = get_name (scalar_dest);
4992   if (name)
4993     new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4994   else
4995     new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4996   vec_dest = vect_get_new_vect_var (type, kind, new_name);
4997   free (new_name);
4998
4999   return vec_dest;
5000 }
5001
5002 /* Function vect_grouped_store_supported.
5003
5004    Returns TRUE if interleave high and interleave low permutations
5005    are supported, and FALSE otherwise.  */
5006
5007 bool
5008 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
5009 {
5010   machine_mode mode = TYPE_MODE (vectype);
5011
5012   /* vect_permute_store_chain requires the group size to be equal to 3 or
5013      be a power of two.  */
5014   if (count != 3 && exact_log2 (count) == -1)
5015     {
5016       if (dump_enabled_p ())
5017         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5018                          "the size of the group of accesses"
5019                          " is not a power of 2 or not eqaul to 3\n");
5020       return false;
5021     }
5022
5023   /* Check that the permutation is supported.  */
5024   if (VECTOR_MODE_P (mode))
5025     {
5026       unsigned int i;
5027       if (count == 3)
5028         {
5029           unsigned int j0 = 0, j1 = 0, j2 = 0;
5030           unsigned int i, j;
5031
5032           unsigned int nelt;
5033           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5034             {
5035               if (dump_enabled_p ())
5036                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5037                                  "cannot handle groups of 3 stores for"
5038                                  " variable-length vectors\n");
5039               return false;
5040             }
5041
5042           vec_perm_builder sel (nelt, nelt, 1);
5043           sel.quick_grow (nelt);
5044           vec_perm_indices indices;
5045           for (j = 0; j < 3; j++)
5046             {
5047               int nelt0 = ((3 - j) * nelt) % 3;
5048               int nelt1 = ((3 - j) * nelt + 1) % 3;
5049               int nelt2 = ((3 - j) * nelt + 2) % 3;
5050               for (i = 0; i < nelt; i++)
5051                 {
5052                   if (3 * i + nelt0 < nelt)
5053                     sel[3 * i + nelt0] = j0++;
5054                   if (3 * i + nelt1 < nelt)
5055                     sel[3 * i + nelt1] = nelt + j1++;
5056                   if (3 * i + nelt2 < nelt)
5057                     sel[3 * i + nelt2] = 0;
5058                 }
5059               indices.new_vector (sel, 2, nelt);
5060               if (!can_vec_perm_const_p (mode, indices))
5061                 {
5062                   if (dump_enabled_p ())
5063                     dump_printf (MSG_MISSED_OPTIMIZATION,
5064                                  "permutation op not supported by target.\n");
5065                   return false;
5066                 }
5067
5068               for (i = 0; i < nelt; i++)
5069                 {
5070                   if (3 * i + nelt0 < nelt)
5071                     sel[3 * i + nelt0] = 3 * i + nelt0;
5072                   if (3 * i + nelt1 < nelt)
5073                     sel[3 * i + nelt1] = 3 * i + nelt1;
5074                   if (3 * i + nelt2 < nelt)
5075                     sel[3 * i + nelt2] = nelt + j2++;
5076                 }
5077               indices.new_vector (sel, 2, nelt);
5078               if (!can_vec_perm_const_p (mode, indices))
5079                 {
5080                   if (dump_enabled_p ())
5081                     dump_printf (MSG_MISSED_OPTIMIZATION,
5082                                  "permutation op not supported by target.\n");
5083                   return false;
5084                 }
5085             }
5086           return true;
5087         }
5088       else
5089         {
5090           /* If length is not equal to 3 then only power of 2 is supported.  */
5091           gcc_assert (pow2p_hwi (count));
5092           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5093
5094           /* The encoding has 2 interleaved stepped patterns.  */
5095           vec_perm_builder sel (nelt, 2, 3);
5096           sel.quick_grow (6);
5097           for (i = 0; i < 3; i++)
5098             {
5099               sel[i * 2] = i;
5100               sel[i * 2 + 1] = i + nelt;
5101             }
5102           vec_perm_indices indices (sel, 2, nelt);
5103           if (can_vec_perm_const_p (mode, indices))
5104             {
5105               for (i = 0; i < 6; i++)
5106                 sel[i] += exact_div (nelt, 2);
5107               indices.new_vector (sel, 2, nelt);
5108               if (can_vec_perm_const_p (mode, indices))
5109                 return true;
5110             }
5111         }
5112     }
5113
5114   if (dump_enabled_p ())
5115     dump_printf (MSG_MISSED_OPTIMIZATION,
5116                  "permutation op not supported by target.\n");
5117   return false;
5118 }
5119
5120
5121 /* Return TRUE if vec_{mask_}store_lanes is available for COUNT vectors of
5122    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5123
5124 bool
5125 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5126                             bool masked_p)
5127 {
5128   if (masked_p)
5129     return vect_lanes_optab_supported_p ("vec_mask_store_lanes",
5130                                          vec_mask_store_lanes_optab,
5131                                          vectype, count);
5132   else
5133     return vect_lanes_optab_supported_p ("vec_store_lanes",
5134                                          vec_store_lanes_optab,
5135                                          vectype, count);
5136 }
5137
5138
5139 /* Function vect_permute_store_chain.
5140
5141    Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
5142    a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
5143    the data correctly for the stores.  Return the final references for stores
5144    in RESULT_CHAIN.
5145
5146    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5147    The input is 4 vectors each containing 8 elements.  We assign a number to
5148    each element, the input sequence is:
5149
5150    1st vec:   0  1  2  3  4  5  6  7
5151    2nd vec:   8  9 10 11 12 13 14 15
5152    3rd vec:  16 17 18 19 20 21 22 23
5153    4th vec:  24 25 26 27 28 29 30 31
5154
5155    The output sequence should be:
5156
5157    1st vec:  0  8 16 24  1  9 17 25
5158    2nd vec:  2 10 18 26  3 11 19 27
5159    3rd vec:  4 12 20 28  5 13 21 30
5160    4th vec:  6 14 22 30  7 15 23 31
5161
5162    i.e., we interleave the contents of the four vectors in their order.
5163
5164    We use interleave_high/low instructions to create such output.  The input of
5165    each interleave_high/low operation is two vectors:
5166    1st vec    2nd vec
5167    0 1 2 3    4 5 6 7
5168    the even elements of the result vector are obtained left-to-right from the
5169    high/low elements of the first vector.  The odd elements of the result are
5170    obtained left-to-right from the high/low elements of the second vector.
5171    The output of interleave_high will be:   0 4 1 5
5172    and of interleave_low:                   2 6 3 7
5173
5174
5175    The permutation is done in log LENGTH stages.  In each stage interleave_high
5176    and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
5177    where the first argument is taken from the first half of DR_CHAIN and the
5178    second argument from it's second half.
5179    In our example,
5180
5181    I1: interleave_high (1st vec, 3rd vec)
5182    I2: interleave_low (1st vec, 3rd vec)
5183    I3: interleave_high (2nd vec, 4th vec)
5184    I4: interleave_low (2nd vec, 4th vec)
5185
5186    The output for the first stage is:
5187
5188    I1:  0 16  1 17  2 18  3 19
5189    I2:  4 20  5 21  6 22  7 23
5190    I3:  8 24  9 25 10 26 11 27
5191    I4: 12 28 13 29 14 30 15 31
5192
5193    The output of the second stage, i.e. the final result is:
5194
5195    I1:  0  8 16 24  1  9 17 25
5196    I2:  2 10 18 26  3 11 19 27
5197    I3:  4 12 20 28  5 13 21 30
5198    I4:  6 14 22 30  7 15 23 31.  */
5199
5200 void
5201 vect_permute_store_chain (vec<tree> dr_chain,
5202                           unsigned int length,
5203                           stmt_vec_info stmt_info,
5204                           gimple_stmt_iterator *gsi,
5205                           vec<tree> *result_chain)
5206 {
5207   tree vect1, vect2, high, low;
5208   gimple *perm_stmt;
5209   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5210   tree perm_mask_low, perm_mask_high;
5211   tree data_ref;
5212   tree perm3_mask_low, perm3_mask_high;
5213   unsigned int i, j, n, log_length = exact_log2 (length);
5214
5215   result_chain->quick_grow (length);
5216   memcpy (result_chain->address (), dr_chain.address (),
5217           length * sizeof (tree));
5218
5219   if (length == 3)
5220     {
5221       /* vect_grouped_store_supported ensures that this is constant.  */
5222       unsigned int nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5223       unsigned int j0 = 0, j1 = 0, j2 = 0;
5224
5225       vec_perm_builder sel (nelt, nelt, 1);
5226       sel.quick_grow (nelt);
5227       vec_perm_indices indices;
5228       for (j = 0; j < 3; j++)
5229         {
5230           int nelt0 = ((3 - j) * nelt) % 3;
5231           int nelt1 = ((3 - j) * nelt + 1) % 3;
5232           int nelt2 = ((3 - j) * nelt + 2) % 3;
5233
5234           for (i = 0; i < nelt; i++)
5235             {
5236               if (3 * i + nelt0 < nelt)
5237                 sel[3 * i + nelt0] = j0++;
5238               if (3 * i + nelt1 < nelt)
5239                 sel[3 * i + nelt1] = nelt + j1++;
5240               if (3 * i + nelt2 < nelt)
5241                 sel[3 * i + nelt2] = 0;
5242             }
5243           indices.new_vector (sel, 2, nelt);
5244           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5245
5246           for (i = 0; i < nelt; i++)
5247             {
5248               if (3 * i + nelt0 < nelt)
5249                 sel[3 * i + nelt0] = 3 * i + nelt0;
5250               if (3 * i + nelt1 < nelt)
5251                 sel[3 * i + nelt1] = 3 * i + nelt1;
5252               if (3 * i + nelt2 < nelt)
5253                 sel[3 * i + nelt2] = nelt + j2++;
5254             }
5255           indices.new_vector (sel, 2, nelt);
5256           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5257
5258           vect1 = dr_chain[0];
5259           vect2 = dr_chain[1];
5260
5261           /* Create interleaving stmt:
5262              low = VEC_PERM_EXPR <vect1, vect2,
5263                                   {j, nelt, *, j + 1, nelt + j + 1, *,
5264                                    j + 2, nelt + j + 2, *, ...}>  */
5265           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5266           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5267                                            vect2, perm3_mask_low);
5268           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5269
5270           vect1 = data_ref;
5271           vect2 = dr_chain[2];
5272           /* Create interleaving stmt:
5273              low = VEC_PERM_EXPR <vect1, vect2,
5274                                   {0, 1, nelt + j, 3, 4, nelt + j + 1,
5275                                    6, 7, nelt + j + 2, ...}>  */
5276           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5277           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
5278                                            vect2, perm3_mask_high);
5279           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5280           (*result_chain)[j] = data_ref;
5281         }
5282     }
5283   else
5284     {
5285       /* If length is not equal to 3 then only power of 2 is supported.  */
5286       gcc_assert (pow2p_hwi (length));
5287
5288       /* The encoding has 2 interleaved stepped patterns.  */
5289       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5290       vec_perm_builder sel (nelt, 2, 3);
5291       sel.quick_grow (6);
5292       for (i = 0; i < 3; i++)
5293         {
5294           sel[i * 2] = i;
5295           sel[i * 2 + 1] = i + nelt;
5296         }
5297         vec_perm_indices indices (sel, 2, nelt);
5298         perm_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5299
5300         for (i = 0; i < 6; i++)
5301           sel[i] += exact_div (nelt, 2);
5302         indices.new_vector (sel, 2, nelt);
5303         perm_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5304
5305         for (i = 0, n = log_length; i < n; i++)
5306           {
5307             for (j = 0; j < length/2; j++)
5308               {
5309                 vect1 = dr_chain[j];
5310                 vect2 = dr_chain[j+length/2];
5311
5312                 /* Create interleaving stmt:
5313                    high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
5314                                                         ...}>  */
5315                 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
5316                 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
5317                                                  vect2, perm_mask_high);
5318                 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5319                 (*result_chain)[2*j] = high;
5320
5321                 /* Create interleaving stmt:
5322                    low = VEC_PERM_EXPR <vect1, vect2,
5323                                         {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
5324                                          ...}>  */
5325                 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
5326                 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
5327                                                  vect2, perm_mask_low);
5328                 vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5329                 (*result_chain)[2*j+1] = low;
5330               }
5331             memcpy (dr_chain.address (), result_chain->address (),
5332                     length * sizeof (tree));
5333           }
5334     }
5335 }
5336
5337 /* Function vect_setup_realignment
5338
5339    This function is called when vectorizing an unaligned load using
5340    the dr_explicit_realign[_optimized] scheme.
5341    This function generates the following code at the loop prolog:
5342
5343       p = initial_addr;
5344    x  msq_init = *(floor(p));   # prolog load
5345       realignment_token = call target_builtin;
5346     loop:
5347    x  msq = phi (msq_init, ---)
5348
5349    The stmts marked with x are generated only for the case of
5350    dr_explicit_realign_optimized.
5351
5352    The code above sets up a new (vector) pointer, pointing to the first
5353    location accessed by STMT_INFO, and a "floor-aligned" load using that
5354    pointer.  It also generates code to compute the "realignment-token"
5355    (if the relevant target hook was defined), and creates a phi-node at the
5356    loop-header bb whose arguments are the result of the prolog-load (created
5357    by this function) and the result of a load that takes place in the loop
5358    (to be created by the caller to this function).
5359
5360    For the case of dr_explicit_realign_optimized:
5361    The caller to this function uses the phi-result (msq) to create the
5362    realignment code inside the loop, and sets up the missing phi argument,
5363    as follows:
5364     loop:
5365       msq = phi (msq_init, lsq)
5366       lsq = *(floor(p'));        # load in loop
5367       result = realign_load (msq, lsq, realignment_token);
5368
5369    For the case of dr_explicit_realign:
5370     loop:
5371       msq = *(floor(p));        # load in loop
5372       p' = p + (VS-1);
5373       lsq = *(floor(p'));       # load in loop
5374       result = realign_load (msq, lsq, realignment_token);
5375
5376    Input:
5377    STMT_INFO - (scalar) load stmt to be vectorized. This load accesses
5378                a memory location that may be unaligned.
5379    BSI - place where new code is to be inserted.
5380    ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
5381                               is used.
5382
5383    Output:
5384    REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
5385                        target hook, if defined.
5386    Return value - the result of the loop-header phi node.  */
5387
5388 tree
5389 vect_setup_realignment (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5390                         tree *realignment_token,
5391                         enum dr_alignment_support alignment_support_scheme,
5392                         tree init_addr,
5393                         struct loop **at_loop)
5394 {
5395   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5396   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5397   dr_vec_info *dr_info = STMT_VINFO_DR_INFO (stmt_info);
5398   struct data_reference *dr = dr_info->dr;
5399   struct loop *loop = NULL;
5400   edge pe = NULL;
5401   tree scalar_dest = gimple_assign_lhs (stmt_info->stmt);
5402   tree vec_dest;
5403   gimple *inc;
5404   tree ptr;
5405   tree data_ref;
5406   basic_block new_bb;
5407   tree msq_init = NULL_TREE;
5408   tree new_temp;
5409   gphi *phi_stmt;
5410   tree msq = NULL_TREE;
5411   gimple_seq stmts = NULL;
5412   bool compute_in_loop = false;
5413   bool nested_in_vect_loop = false;
5414   struct loop *containing_loop = (gimple_bb (stmt_info->stmt))->loop_father;
5415   struct loop *loop_for_initial_load = NULL;
5416
5417   if (loop_vinfo)
5418     {
5419       loop = LOOP_VINFO_LOOP (loop_vinfo);
5420       nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt_info);
5421     }
5422
5423   gcc_assert (alignment_support_scheme == dr_explicit_realign
5424               || alignment_support_scheme == dr_explicit_realign_optimized);
5425
5426   /* We need to generate three things:
5427      1. the misalignment computation
5428      2. the extra vector load (for the optimized realignment scheme).
5429      3. the phi node for the two vectors from which the realignment is
5430       done (for the optimized realignment scheme).  */
5431
5432   /* 1. Determine where to generate the misalignment computation.
5433
5434      If INIT_ADDR is NULL_TREE, this indicates that the misalignment
5435      calculation will be generated by this function, outside the loop (in the
5436      preheader).  Otherwise, INIT_ADDR had already been computed for us by the
5437      caller, inside the loop.
5438
5439      Background: If the misalignment remains fixed throughout the iterations of
5440      the loop, then both realignment schemes are applicable, and also the
5441      misalignment computation can be done outside LOOP.  This is because we are
5442      vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
5443      are a multiple of VS (the Vector Size), and therefore the misalignment in
5444      different vectorized LOOP iterations is always the same.
5445      The problem arises only if the memory access is in an inner-loop nested
5446      inside LOOP, which is now being vectorized using outer-loop vectorization.
5447      This is the only case when the misalignment of the memory access may not
5448      remain fixed throughout the iterations of the inner-loop (as explained in
5449      detail in vect_supportable_dr_alignment).  In this case, not only is the
5450      optimized realignment scheme not applicable, but also the misalignment
5451      computation (and generation of the realignment token that is passed to
5452      REALIGN_LOAD) have to be done inside the loop.
5453
5454      In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
5455      or not, which in turn determines if the misalignment is computed inside
5456      the inner-loop, or outside LOOP.  */
5457
5458   if (init_addr != NULL_TREE || !loop_vinfo)
5459     {
5460       compute_in_loop = true;
5461       gcc_assert (alignment_support_scheme == dr_explicit_realign);
5462     }
5463
5464
5465   /* 2. Determine where to generate the extra vector load.
5466
5467      For the optimized realignment scheme, instead of generating two vector
5468      loads in each iteration, we generate a single extra vector load in the
5469      preheader of the loop, and in each iteration reuse the result of the
5470      vector load from the previous iteration.  In case the memory access is in
5471      an inner-loop nested inside LOOP, which is now being vectorized using
5472      outer-loop vectorization, we need to determine whether this initial vector
5473      load should be generated at the preheader of the inner-loop, or can be
5474      generated at the preheader of LOOP.  If the memory access has no evolution
5475      in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
5476      to be generated inside LOOP (in the preheader of the inner-loop).  */
5477
5478   if (nested_in_vect_loop)
5479     {
5480       tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
5481       bool invariant_in_outerloop =
5482             (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
5483       loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
5484     }
5485   else
5486     loop_for_initial_load = loop;
5487   if (at_loop)
5488     *at_loop = loop_for_initial_load;
5489
5490   if (loop_for_initial_load)
5491     pe = loop_preheader_edge (loop_for_initial_load);
5492
5493   /* 3. For the case of the optimized realignment, create the first vector
5494       load at the loop preheader.  */
5495
5496   if (alignment_support_scheme == dr_explicit_realign_optimized)
5497     {
5498       /* Create msq_init = *(floor(p1)) in the loop preheader  */
5499       gassign *new_stmt;
5500
5501       gcc_assert (!compute_in_loop);
5502       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5503       ptr = vect_create_data_ref_ptr (stmt_info, vectype,
5504                                       loop_for_initial_load, NULL_TREE,
5505                                       &init_addr, NULL, &inc, true);
5506       if (TREE_CODE (ptr) == SSA_NAME)
5507         new_temp = copy_ssa_name (ptr);
5508       else
5509         new_temp = make_ssa_name (TREE_TYPE (ptr));
5510       poly_uint64 align = DR_TARGET_ALIGNMENT (dr_info);
5511       tree type = TREE_TYPE (ptr);
5512       new_stmt = gimple_build_assign
5513                    (new_temp, BIT_AND_EXPR, ptr,
5514                     fold_build2 (MINUS_EXPR, type,
5515                                  build_int_cst (type, 0),
5516                                  build_int_cst (type, align)));
5517       new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5518       gcc_assert (!new_bb);
5519       data_ref
5520         = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
5521                   build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
5522       vect_copy_ref_info (data_ref, DR_REF (dr));
5523       new_stmt = gimple_build_assign (vec_dest, data_ref);
5524       new_temp = make_ssa_name (vec_dest, new_stmt);
5525       gimple_assign_set_lhs (new_stmt, new_temp);
5526       if (pe)
5527         {
5528           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5529           gcc_assert (!new_bb);
5530         }
5531       else
5532          gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5533
5534       msq_init = gimple_assign_lhs (new_stmt);
5535     }
5536
5537   /* 4. Create realignment token using a target builtin, if available.
5538       It is done either inside the containing loop, or before LOOP (as
5539       determined above).  */
5540
5541   if (targetm.vectorize.builtin_mask_for_load)
5542     {
5543       gcall *new_stmt;
5544       tree builtin_decl;
5545
5546       /* Compute INIT_ADDR - the initial addressed accessed by this memref.  */
5547       if (!init_addr)
5548         {
5549           /* Generate the INIT_ADDR computation outside LOOP.  */
5550           init_addr = vect_create_addr_base_for_vector_ref (stmt_info, &stmts,
5551                                                             NULL_TREE);
5552           if (loop)
5553             {
5554               pe = loop_preheader_edge (loop);
5555               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
5556               gcc_assert (!new_bb);
5557             }
5558           else
5559              gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
5560         }
5561
5562       builtin_decl = targetm.vectorize.builtin_mask_for_load ();
5563       new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
5564       vec_dest =
5565         vect_create_destination_var (scalar_dest,
5566                                      gimple_call_return_type (new_stmt));
5567       new_temp = make_ssa_name (vec_dest, new_stmt);
5568       gimple_call_set_lhs (new_stmt, new_temp);
5569
5570       if (compute_in_loop)
5571         gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5572       else
5573         {
5574           /* Generate the misalignment computation outside LOOP.  */
5575           pe = loop_preheader_edge (loop);
5576           new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5577           gcc_assert (!new_bb);
5578         }
5579
5580       *realignment_token = gimple_call_lhs (new_stmt);
5581
5582       /* The result of the CALL_EXPR to this builtin is determined from
5583          the value of the parameter and no global variables are touched
5584          which makes the builtin a "const" function.  Requiring the
5585          builtin to have the "const" attribute makes it unnecessary
5586          to call mark_call_clobbered.  */
5587       gcc_assert (TREE_READONLY (builtin_decl));
5588     }
5589
5590   if (alignment_support_scheme == dr_explicit_realign)
5591     return msq;
5592
5593   gcc_assert (!compute_in_loop);
5594   gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5595
5596
5597   /* 5. Create msq = phi <msq_init, lsq> in loop  */
5598
5599   pe = loop_preheader_edge (containing_loop);
5600   vec_dest = vect_create_destination_var (scalar_dest, vectype);
5601   msq = make_ssa_name (vec_dest);
5602   phi_stmt = create_phi_node (msq, containing_loop->header);
5603   add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5604
5605   return msq;
5606 }
5607
5608
5609 /* Function vect_grouped_load_supported.
5610
5611    COUNT is the size of the load group (the number of statements plus the
5612    number of gaps).  SINGLE_ELEMENT_P is true if there is actually
5613    only one statement, with a gap of COUNT - 1.
5614
5615    Returns true if a suitable permute exists.  */
5616
5617 bool
5618 vect_grouped_load_supported (tree vectype, bool single_element_p,
5619                              unsigned HOST_WIDE_INT count)
5620 {
5621   machine_mode mode = TYPE_MODE (vectype);
5622
5623   /* If this is single-element interleaving with an element distance
5624      that leaves unused vector loads around punt - we at least create
5625      very sub-optimal code in that case (and blow up memory,
5626      see PR65518).  */
5627   if (single_element_p && maybe_gt (count, TYPE_VECTOR_SUBPARTS (vectype)))
5628     {
5629       if (dump_enabled_p ())
5630         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5631                          "single-element interleaving not supported "
5632                          "for not adjacent vector loads\n");
5633       return false;
5634     }
5635
5636   /* vect_permute_load_chain requires the group size to be equal to 3 or
5637      be a power of two.  */
5638   if (count != 3 && exact_log2 (count) == -1)
5639     {
5640       if (dump_enabled_p ())
5641         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5642                          "the size of the group of accesses"
5643                          " is not a power of 2 or not equal to 3\n");
5644       return false;
5645     }
5646
5647   /* Check that the permutation is supported.  */
5648   if (VECTOR_MODE_P (mode))
5649     {
5650       unsigned int i, j;
5651       if (count == 3)
5652         {
5653           unsigned int nelt;
5654           if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5655             {
5656               if (dump_enabled_p ())
5657                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5658                                  "cannot handle groups of 3 loads for"
5659                                  " variable-length vectors\n");
5660               return false;
5661             }
5662
5663           vec_perm_builder sel (nelt, nelt, 1);
5664           sel.quick_grow (nelt);
5665           vec_perm_indices indices;
5666           unsigned int k;
5667           for (k = 0; k < 3; k++)
5668             {
5669               for (i = 0; i < nelt; i++)
5670                 if (3 * i + k < 2 * nelt)
5671                   sel[i] = 3 * i + k;
5672                 else
5673                   sel[i] = 0;
5674               indices.new_vector (sel, 2, nelt);
5675               if (!can_vec_perm_const_p (mode, indices))
5676                 {
5677                   if (dump_enabled_p ())
5678                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5679                                      "shuffle of 3 loads is not supported by"
5680                                      " target\n");
5681                   return false;
5682                 }
5683               for (i = 0, j = 0; i < nelt; i++)
5684                 if (3 * i + k < 2 * nelt)
5685                   sel[i] = i;
5686                 else
5687                   sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5688               indices.new_vector (sel, 2, nelt);
5689               if (!can_vec_perm_const_p (mode, indices))
5690                 {
5691                   if (dump_enabled_p ())
5692                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5693                                      "shuffle of 3 loads is not supported by"
5694                                      " target\n");
5695                   return false;
5696                 }
5697             }
5698           return true;
5699         }
5700       else
5701         {
5702           /* If length is not equal to 3 then only power of 2 is supported.  */
5703           gcc_assert (pow2p_hwi (count));
5704           poly_uint64 nelt = GET_MODE_NUNITS (mode);
5705
5706           /* The encoding has a single stepped pattern.  */
5707           vec_perm_builder sel (nelt, 1, 3);
5708           sel.quick_grow (3);
5709           for (i = 0; i < 3; i++)
5710             sel[i] = i * 2;
5711           vec_perm_indices indices (sel, 2, nelt);
5712           if (can_vec_perm_const_p (mode, indices))
5713             {
5714               for (i = 0; i < 3; i++)
5715                 sel[i] = i * 2 + 1;
5716               indices.new_vector (sel, 2, nelt);
5717               if (can_vec_perm_const_p (mode, indices))
5718                 return true;
5719             }
5720         }
5721     }
5722
5723   if (dump_enabled_p ())
5724     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5725                      "extract even/odd not supported by target\n");
5726   return false;
5727 }
5728
5729 /* Return TRUE if vec_{masked_}load_lanes is available for COUNT vectors of
5730    type VECTYPE.  MASKED_P says whether the masked form is needed.  */
5731
5732 bool
5733 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count,
5734                            bool masked_p)
5735 {
5736   if (masked_p)
5737     return vect_lanes_optab_supported_p ("vec_mask_load_lanes",
5738                                          vec_mask_load_lanes_optab,
5739                                          vectype, count);
5740   else
5741     return vect_lanes_optab_supported_p ("vec_load_lanes",
5742                                          vec_load_lanes_optab,
5743                                          vectype, count);
5744 }
5745
5746 /* Function vect_permute_load_chain.
5747
5748    Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5749    a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5750    the input data correctly.  Return the final references for loads in
5751    RESULT_CHAIN.
5752
5753    E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5754    The input is 4 vectors each containing 8 elements. We assign a number to each
5755    element, the input sequence is:
5756
5757    1st vec:   0  1  2  3  4  5  6  7
5758    2nd vec:   8  9 10 11 12 13 14 15
5759    3rd vec:  16 17 18 19 20 21 22 23
5760    4th vec:  24 25 26 27 28 29 30 31
5761
5762    The output sequence should be:
5763
5764    1st vec:  0 4  8 12 16 20 24 28
5765    2nd vec:  1 5  9 13 17 21 25 29
5766    3rd vec:  2 6 10 14 18 22 26 30
5767    4th vec:  3 7 11 15 19 23 27 31
5768
5769    i.e., the first output vector should contain the first elements of each
5770    interleaving group, etc.
5771
5772    We use extract_even/odd instructions to create such output.  The input of
5773    each extract_even/odd operation is two vectors
5774    1st vec    2nd vec
5775    0 1 2 3    4 5 6 7
5776
5777    and the output is the vector of extracted even/odd elements.  The output of
5778    extract_even will be:   0 2 4 6
5779    and of extract_odd:     1 3 5 7
5780
5781
5782    The permutation is done in log LENGTH stages.  In each stage extract_even
5783    and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5784    their order.  In our example,
5785
5786    E1: extract_even (1st vec, 2nd vec)
5787    E2: extract_odd (1st vec, 2nd vec)
5788    E3: extract_even (3rd vec, 4th vec)
5789    E4: extract_odd (3rd vec, 4th vec)
5790
5791    The output for the first stage will be:
5792
5793    E1:  0  2  4  6  8 10 12 14
5794    E2:  1  3  5  7  9 11 13 15
5795    E3: 16 18 20 22 24 26 28 30
5796    E4: 17 19 21 23 25 27 29 31
5797
5798    In order to proceed and create the correct sequence for the next stage (or
5799    for the correct output, if the second stage is the last one, as in our
5800    example), we first put the output of extract_even operation and then the
5801    output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5802    The input for the second stage is:
5803
5804    1st vec (E1):  0  2  4  6  8 10 12 14
5805    2nd vec (E3): 16 18 20 22 24 26 28 30
5806    3rd vec (E2):  1  3  5  7  9 11 13 15
5807    4th vec (E4): 17 19 21 23 25 27 29 31
5808
5809    The output of the second stage:
5810
5811    E1: 0 4  8 12 16 20 24 28
5812    E2: 2 6 10 14 18 22 26 30
5813    E3: 1 5  9 13 17 21 25 29
5814    E4: 3 7 11 15 19 23 27 31
5815
5816    And RESULT_CHAIN after reordering:
5817
5818    1st vec (E1):  0 4  8 12 16 20 24 28
5819    2nd vec (E3):  1 5  9 13 17 21 25 29
5820    3rd vec (E2):  2 6 10 14 18 22 26 30
5821    4th vec (E4):  3 7 11 15 19 23 27 31.  */
5822
5823 static void
5824 vect_permute_load_chain (vec<tree> dr_chain,
5825                          unsigned int length,
5826                          stmt_vec_info stmt_info,
5827                          gimple_stmt_iterator *gsi,
5828                          vec<tree> *result_chain)
5829 {
5830   tree data_ref, first_vect, second_vect;
5831   tree perm_mask_even, perm_mask_odd;
5832   tree perm3_mask_low, perm3_mask_high;
5833   gimple *perm_stmt;
5834   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5835   unsigned int i, j, log_length = exact_log2 (length);
5836
5837   result_chain->quick_grow (length);
5838   memcpy (result_chain->address (), dr_chain.address (),
5839           length * sizeof (tree));
5840
5841   if (length == 3)
5842     {
5843       /* vect_grouped_load_supported ensures that this is constant.  */
5844       unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5845       unsigned int k;
5846
5847       vec_perm_builder sel (nelt, nelt, 1);
5848       sel.quick_grow (nelt);
5849       vec_perm_indices indices;
5850       for (k = 0; k < 3; k++)
5851         {
5852           for (i = 0; i < nelt; i++)
5853             if (3 * i + k < 2 * nelt)
5854               sel[i] = 3 * i + k;
5855             else
5856               sel[i] = 0;
5857           indices.new_vector (sel, 2, nelt);
5858           perm3_mask_low = vect_gen_perm_mask_checked (vectype, indices);
5859
5860           for (i = 0, j = 0; i < nelt; i++)
5861             if (3 * i + k < 2 * nelt)
5862               sel[i] = i;
5863             else
5864               sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5865           indices.new_vector (sel, 2, nelt);
5866           perm3_mask_high = vect_gen_perm_mask_checked (vectype, indices);
5867
5868           first_vect = dr_chain[0];
5869           second_vect = dr_chain[1];
5870
5871           /* Create interleaving stmt (low part of):
5872              low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5873                                                              ...}>  */
5874           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5875           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5876                                            second_vect, perm3_mask_low);
5877           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5878
5879           /* Create interleaving stmt (high part of):
5880              high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5881                                                               ...}>  */
5882           first_vect = data_ref;
5883           second_vect = dr_chain[2];
5884           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5885           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5886                                            second_vect, perm3_mask_high);
5887           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5888           (*result_chain)[k] = data_ref;
5889         }
5890     }
5891   else
5892     {
5893       /* If length is not equal to 3 then only power of 2 is supported.  */
5894       gcc_assert (pow2p_hwi (length));
5895
5896       /* The encoding has a single stepped pattern.  */
5897       poly_uint64 nelt = TYPE_VECTOR_SUBPARTS (vectype);
5898       vec_perm_builder sel (nelt, 1, 3);
5899       sel.quick_grow (3);
5900       for (i = 0; i < 3; ++i)
5901         sel[i] = i * 2;
5902       vec_perm_indices indices (sel, 2, nelt);
5903       perm_mask_even = vect_gen_perm_mask_checked (vectype, indices);
5904
5905       for (i = 0; i < 3; ++i)
5906         sel[i] = i * 2 + 1;
5907       indices.new_vector (sel, 2, nelt);
5908       perm_mask_odd = vect_gen_perm_mask_checked (vectype, indices);
5909
5910       for (i = 0; i < log_length; i++)
5911         {
5912           for (j = 0; j < length; j += 2)
5913             {
5914               first_vect = dr_chain[j];
5915               second_vect = dr_chain[j+1];
5916
5917               /* data_ref = permute_even (first_data_ref, second_data_ref);  */
5918               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5919               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5920                                                first_vect, second_vect,
5921                                                perm_mask_even);
5922               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5923               (*result_chain)[j/2] = data_ref;
5924
5925               /* data_ref = permute_odd (first_data_ref, second_data_ref);  */
5926               data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5927               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5928                                                first_vect, second_vect,
5929                                                perm_mask_odd);
5930               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
5931               (*result_chain)[j/2+length/2] = data_ref;
5932             }
5933           memcpy (dr_chain.address (), result_chain->address (),
5934                   length * sizeof (tree));
5935         }
5936     }
5937 }
5938
5939 /* Function vect_shift_permute_load_chain.
5940
5941    Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5942    sequence of stmts to reorder the input data accordingly.
5943    Return the final references for loads in RESULT_CHAIN.
5944    Return true if successed, false otherwise.
5945
5946    E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5947    The input is 3 vectors each containing 8 elements.  We assign a
5948    number to each element, the input sequence is:
5949
5950    1st vec:   0  1  2  3  4  5  6  7
5951    2nd vec:   8  9 10 11 12 13 14 15
5952    3rd vec:  16 17 18 19 20 21 22 23
5953
5954    The output sequence should be:
5955
5956    1st vec:  0 3 6  9 12 15 18 21
5957    2nd vec:  1 4 7 10 13 16 19 22
5958    3rd vec:  2 5 8 11 14 17 20 23
5959
5960    We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5961
5962    First we shuffle all 3 vectors to get correct elements order:
5963
5964    1st vec:  ( 0  3  6) ( 1  4  7) ( 2  5)
5965    2nd vec:  ( 8 11 14) ( 9 12 15) (10 13)
5966    3rd vec:  (16 19 22) (17 20 23) (18 21)
5967
5968    Next we unite and shift vector 3 times:
5969
5970    1st step:
5971      shift right by 6 the concatenation of:
5972      "1st vec" and  "2nd vec"
5973        ( 0  3  6) ( 1  4  7) |( 2  5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5974      "2nd vec" and  "3rd vec"
5975        ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5976      "3rd vec" and  "1st vec"
5977        (16 19 22) (17 20 23) |(18 21) _ ( 0  3  6) ( 1  4  7)| ( 2  5)
5978                              | New vectors                   |
5979
5980      So that now new vectors are:
5981
5982      1st vec:  ( 2  5) ( 8 11 14) ( 9 12 15)
5983      2nd vec:  (10 13) (16 19 22) (17 20 23)
5984      3rd vec:  (18 21) ( 0  3  6) ( 1  4  7)
5985
5986    2nd step:
5987      shift right by 5 the concatenation of:
5988      "1st vec" and  "3rd vec"
5989        ( 2  5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0  3  6)| ( 1  4  7)
5990      "2nd vec" and  "1st vec"
5991        (10 13) (16 19 22) |(17 20 23) _ ( 2  5) ( 8 11 14)| ( 9 12 15)
5992      "3rd vec" and  "2nd vec"
5993        (18 21) ( 0  3  6) |( 1  4  7) _ (10 13) (16 19 22)| (17 20 23)
5994                           | New vectors                   |
5995
5996      So that now new vectors are:
5997
5998      1st vec:  ( 9 12 15) (18 21) ( 0  3  6)
5999      2nd vec:  (17 20 23) ( 2  5) ( 8 11 14)
6000      3rd vec:  ( 1  4  7) (10 13) (16 19 22) READY
6001
6002    3rd step:
6003      shift right by 5 the concatenation of:
6004      "1st vec" and  "1st vec"
6005        ( 9 12 15) (18 21) |( 0  3  6) _ ( 9 12 15) (18 21)| ( 0  3  6)
6006      shift right by 3 the concatenation of:
6007      "2nd vec" and  "2nd vec"
6008                (17 20 23) |( 2  5) ( 8 11 14) _ (17 20 23)| ( 2  5) ( 8 11 14)
6009                           | New vectors                   |
6010
6011      So that now all vectors are READY:
6012      1st vec:  ( 0  3  6) ( 9 12 15) (18 21)
6013      2nd vec:  ( 2  5) ( 8 11 14) (17 20 23)
6014      3rd vec:  ( 1  4  7) (10 13) (16 19 22)
6015
6016    This algorithm is faster than one in vect_permute_load_chain if:
6017      1.  "shift of a concatination" is faster than general permutation.
6018          This is usually so.
6019      2.  The TARGET machine can't execute vector instructions in parallel.
6020          This is because each step of the algorithm depends on previous.
6021          The algorithm in vect_permute_load_chain is much more parallel.
6022
6023    The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
6024 */
6025
6026 static bool
6027 vect_shift_permute_load_chain (vec<tree> dr_chain,
6028                                unsigned int length,
6029                                stmt_vec_info stmt_info,
6030                                gimple_stmt_iterator *gsi,
6031                                vec<tree> *result_chain)
6032 {
6033   tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
6034   tree perm2_mask1, perm2_mask2, perm3_mask;
6035   tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
6036   gimple *perm_stmt;
6037
6038   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6039   unsigned int i;
6040   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6041
6042   unsigned HOST_WIDE_INT nelt, vf;
6043   if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant (&nelt)
6044       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&vf))
6045     /* Not supported for variable-length vectors.  */
6046     return false;
6047
6048   vec_perm_builder sel (nelt, nelt, 1);
6049   sel.quick_grow (nelt);
6050
6051   result_chain->quick_grow (length);
6052   memcpy (result_chain->address (), dr_chain.address (),
6053           length * sizeof (tree));
6054
6055   if (pow2p_hwi (length) && vf > 4)
6056     {
6057       unsigned int j, log_length = exact_log2 (length);
6058       for (i = 0; i < nelt / 2; ++i)
6059         sel[i] = i * 2;
6060       for (i = 0; i < nelt / 2; ++i)
6061         sel[nelt / 2 + i] = i * 2 + 1;
6062       vec_perm_indices indices (sel, 2, nelt);
6063       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6064         {
6065           if (dump_enabled_p ())
6066             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6067                              "shuffle of 2 fields structure is not \
6068                               supported by target\n");
6069           return false;
6070         }
6071       perm2_mask1 = vect_gen_perm_mask_checked (vectype, indices);
6072
6073       for (i = 0; i < nelt / 2; ++i)
6074         sel[i] = i * 2 + 1;
6075       for (i = 0; i < nelt / 2; ++i)
6076         sel[nelt / 2 + i] = i * 2;
6077       indices.new_vector (sel, 2, nelt);
6078       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6079         {
6080           if (dump_enabled_p ())
6081             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6082                              "shuffle of 2 fields structure is not \
6083                               supported by target\n");
6084           return false;
6085         }
6086       perm2_mask2 = vect_gen_perm_mask_checked (vectype, indices);
6087
6088       /* Generating permutation constant to shift all elements.
6089          For vector length 8 it is {4 5 6 7 8 9 10 11}.  */
6090       for (i = 0; i < nelt; i++)
6091         sel[i] = nelt / 2 + i;
6092       indices.new_vector (sel, 2, nelt);
6093       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6094         {
6095           if (dump_enabled_p ())
6096             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6097                              "shift permutation is not supported by target\n");
6098           return false;
6099         }
6100       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6101
6102       /* Generating permutation constant to select vector from 2.
6103          For vector length 8 it is {0 1 2 3 12 13 14 15}.  */
6104       for (i = 0; i < nelt / 2; i++)
6105         sel[i] = i;
6106       for (i = nelt / 2; i < nelt; i++)
6107         sel[i] = nelt + i;
6108       indices.new_vector (sel, 2, nelt);
6109       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6110         {
6111           if (dump_enabled_p ())
6112             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6113                              "select is not supported by target\n");
6114           return false;
6115         }
6116       select_mask = vect_gen_perm_mask_checked (vectype, indices);
6117
6118       for (i = 0; i < log_length; i++)
6119         {
6120           for (j = 0; j < length; j += 2)
6121             {
6122               first_vect = dr_chain[j];
6123               second_vect = dr_chain[j + 1];
6124
6125               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6126               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6127                                                first_vect, first_vect,
6128                                                perm2_mask1);
6129               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6130               vect[0] = data_ref;
6131
6132               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
6133               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6134                                                second_vect, second_vect,
6135                                                perm2_mask2);
6136               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6137               vect[1] = data_ref;
6138
6139               data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
6140               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6141                                                vect[0], vect[1], shift1_mask);
6142               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6143               (*result_chain)[j/2 + length/2] = data_ref;
6144
6145               data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
6146               perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6147                                                vect[0], vect[1], select_mask);
6148               vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6149               (*result_chain)[j/2] = data_ref;
6150             }
6151           memcpy (dr_chain.address (), result_chain->address (),
6152                   length * sizeof (tree));
6153         }
6154       return true;
6155     }
6156   if (length == 3 && vf > 2)
6157     {
6158       unsigned int k = 0, l = 0;
6159
6160       /* Generating permutation constant to get all elements in rigth order.
6161          For vector length 8 it is {0 3 6 1 4 7 2 5}.  */
6162       for (i = 0; i < nelt; i++)
6163         {
6164           if (3 * k + (l % 3) >= nelt)
6165             {
6166               k = 0;
6167               l += (3 - (nelt % 3));
6168             }
6169           sel[i] = 3 * k + (l % 3);
6170           k++;
6171         }
6172       vec_perm_indices indices (sel, 2, nelt);
6173       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6174         {
6175           if (dump_enabled_p ())
6176             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6177                              "shuffle of 3 fields structure is not \
6178                               supported by target\n");
6179           return false;
6180         }
6181       perm3_mask = vect_gen_perm_mask_checked (vectype, indices);
6182
6183       /* Generating permutation constant to shift all elements.
6184          For vector length 8 it is {6 7 8 9 10 11 12 13}.  */
6185       for (i = 0; i < nelt; i++)
6186         sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
6187       indices.new_vector (sel, 2, nelt);
6188       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6189         {
6190           if (dump_enabled_p ())
6191             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6192                              "shift permutation is not supported by target\n");
6193           return false;
6194         }
6195       shift1_mask = vect_gen_perm_mask_checked (vectype, indices);
6196
6197       /* Generating permutation constant to shift all elements.
6198          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6199       for (i = 0; i < nelt; i++)
6200         sel[i] = 2 * (nelt / 3) + 1 + i;
6201       indices.new_vector (sel, 2, nelt);
6202       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6203         {
6204           if (dump_enabled_p ())
6205             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6206                              "shift permutation is not supported by target\n");
6207           return false;
6208         }
6209       shift2_mask = vect_gen_perm_mask_checked (vectype, indices);
6210
6211       /* Generating permutation constant to shift all elements.
6212          For vector length 8 it is {3 4 5 6 7 8 9 10}.  */
6213       for (i = 0; i < nelt; i++)
6214         sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
6215       indices.new_vector (sel, 2, nelt);
6216       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6217         {
6218           if (dump_enabled_p ())
6219             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6220                              "shift permutation is not supported by target\n");
6221           return false;
6222         }
6223       shift3_mask = vect_gen_perm_mask_checked (vectype, indices);
6224
6225       /* Generating permutation constant to shift all elements.
6226          For vector length 8 it is {5 6 7 8 9 10 11 12}.  */
6227       for (i = 0; i < nelt; i++)
6228         sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
6229       indices.new_vector (sel, 2, nelt);
6230       if (!can_vec_perm_const_p (TYPE_MODE (vectype), indices))
6231         {
6232           if (dump_enabled_p ())
6233             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6234                              "shift permutation is not supported by target\n");
6235           return false;
6236         }
6237       shift4_mask = vect_gen_perm_mask_checked (vectype, indices);
6238
6239       for (k = 0; k < 3; k++)
6240         {
6241           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
6242           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6243                                            dr_chain[k], dr_chain[k],
6244                                            perm3_mask);
6245           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6246           vect[k] = data_ref;
6247         }
6248
6249       for (k = 0; k < 3; k++)
6250         {
6251           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
6252           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6253                                            vect[k % 3], vect[(k + 1) % 3],
6254                                            shift1_mask);
6255           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6256           vect_shift[k] = data_ref;
6257         }
6258
6259       for (k = 0; k < 3; k++)
6260         {
6261           data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
6262           perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
6263                                            vect_shift[(4 - k) % 3],
6264                                            vect_shift[(3 - k) % 3],
6265                                            shift2_mask);
6266           vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6267           vect[k] = data_ref;
6268         }
6269
6270       (*result_chain)[3 - (nelt % 3)] = vect[2];
6271
6272       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
6273       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
6274                                        vect[0], shift3_mask);
6275       vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6276       (*result_chain)[nelt % 3] = data_ref;
6277
6278       data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
6279       perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
6280                                        vect[1], shift4_mask);
6281       vect_finish_stmt_generation (stmt_info, perm_stmt, gsi);
6282       (*result_chain)[0] = data_ref;
6283       return true;
6284     }
6285   return false;
6286 }
6287
6288 /* Function vect_transform_grouped_load.
6289
6290    Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
6291    to perform their permutation and ascribe the result vectorized statements to
6292    the scalar statements.
6293 */
6294
6295 void
6296 vect_transform_grouped_load (stmt_vec_info stmt_info, vec<tree> dr_chain,
6297                              int size, gimple_stmt_iterator *gsi)
6298 {
6299   machine_mode mode;
6300   vec<tree> result_chain = vNULL;
6301
6302   /* DR_CHAIN contains input data-refs that are a part of the interleaving.
6303      RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
6304      vectors, that are ready for vector computation.  */
6305   result_chain.create (size);
6306
6307   /* If reassociation width for vector type is 2 or greater target machine can
6308      execute 2 or more vector instructions in parallel.  Otherwise try to
6309      get chain for loads group using vect_shift_permute_load_chain.  */
6310   mode = TYPE_MODE (STMT_VINFO_VECTYPE (stmt_info));
6311   if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
6312       || pow2p_hwi (size)
6313       || !vect_shift_permute_load_chain (dr_chain, size, stmt_info,
6314                                          gsi, &result_chain))
6315     vect_permute_load_chain (dr_chain, size, stmt_info, gsi, &result_chain);
6316   vect_record_grouped_load_vectors (stmt_info, result_chain);
6317   result_chain.release ();
6318 }
6319
6320 /* RESULT_CHAIN contains the output of a group of grouped loads that were
6321    generated as part of the vectorization of STMT_INFO.  Assign the statement
6322    for each vector to the associated scalar statement.  */
6323
6324 void
6325 vect_record_grouped_load_vectors (stmt_vec_info stmt_info,
6326                                   vec<tree> result_chain)
6327 {
6328   vec_info *vinfo = stmt_info->vinfo;
6329   stmt_vec_info first_stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
6330   unsigned int i, gap_count;
6331   tree tmp_data_ref;
6332
6333   /* Put a permuted data-ref in the VECTORIZED_STMT field.
6334      Since we scan the chain starting from it's first node, their order
6335      corresponds the order of data-refs in RESULT_CHAIN.  */
6336   stmt_vec_info next_stmt_info = first_stmt_info;
6337   gap_count = 1;
6338   FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
6339     {
6340       if (!next_stmt_info)
6341         break;
6342
6343       /* Skip the gaps.  Loads created for the gaps will be removed by dead
6344        code elimination pass later.  No need to check for the first stmt in
6345        the group, since it always exists.
6346        DR_GROUP_GAP is the number of steps in elements from the previous
6347        access (if there is no gap DR_GROUP_GAP is 1).  We skip loads that
6348        correspond to the gaps.  */
6349       if (next_stmt_info != first_stmt_info
6350           && gap_count < DR_GROUP_GAP (next_stmt_info))
6351         {
6352           gap_count++;
6353           continue;
6354         }
6355
6356       /* ???  The following needs cleanup after the removal of
6357          DR_GROUP_SAME_DR_STMT.  */
6358       if (next_stmt_info)
6359         {
6360           stmt_vec_info new_stmt_info = vinfo->lookup_def (tmp_data_ref);
6361           /* We assume that if VEC_STMT is not NULL, this is a case of multiple
6362              copies, and we put the new vector statement in the first available
6363              RELATED_STMT.  */
6364           if (!STMT_VINFO_VEC_STMT (next_stmt_info))
6365             STMT_VINFO_VEC_STMT (next_stmt_info) = new_stmt_info;
6366           else
6367             {
6368               stmt_vec_info prev_stmt_info
6369                 = STMT_VINFO_VEC_STMT (next_stmt_info);
6370               stmt_vec_info rel_stmt_info
6371                 = STMT_VINFO_RELATED_STMT (prev_stmt_info);
6372               while (rel_stmt_info)
6373                 {
6374                   prev_stmt_info = rel_stmt_info;
6375                   rel_stmt_info = STMT_VINFO_RELATED_STMT (rel_stmt_info);
6376                 }
6377
6378               STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6379             }
6380
6381           next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
6382           gap_count = 1;
6383         }
6384     }
6385 }
6386
6387 /* Function vect_force_dr_alignment_p.
6388
6389    Returns whether the alignment of a DECL can be forced to be aligned
6390    on ALIGNMENT bit boundary.  */
6391
6392 bool
6393 vect_can_force_dr_alignment_p (const_tree decl, poly_uint64 alignment)
6394 {
6395   if (!VAR_P (decl))
6396     return false;
6397
6398   if (decl_in_symtab_p (decl)
6399       && !symtab_node::get (decl)->can_increase_alignment_p ())
6400     return false;
6401
6402   if (TREE_STATIC (decl))
6403     return (known_le (alignment,
6404                       (unsigned HOST_WIDE_INT) MAX_OFILE_ALIGNMENT));
6405   else
6406     return (known_le (alignment, (unsigned HOST_WIDE_INT) MAX_STACK_ALIGNMENT));
6407 }
6408
6409
6410 /* Return whether the data reference DR_INFO is supported with respect to its
6411    alignment.
6412    If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
6413    it is aligned, i.e., check if it is possible to vectorize it with different
6414    alignment.  */
6415
6416 enum dr_alignment_support
6417 vect_supportable_dr_alignment (dr_vec_info *dr_info,
6418                                bool check_aligned_accesses)
6419 {
6420   data_reference *dr = dr_info->dr;
6421   stmt_vec_info stmt_info = dr_info->stmt;
6422   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6423   machine_mode mode = TYPE_MODE (vectype);
6424   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6425   struct loop *vect_loop = NULL;
6426   bool nested_in_vect_loop = false;
6427
6428   if (aligned_access_p (dr_info) && !check_aligned_accesses)
6429     return dr_aligned;
6430
6431   /* For now assume all conditional loads/stores support unaligned
6432      access without any special code.  */
6433   if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
6434     if (gimple_call_internal_p (stmt)
6435         && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
6436             || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
6437       return dr_unaligned_supported;
6438
6439   if (loop_vinfo)
6440     {
6441       vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
6442       nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt_info);
6443     }
6444
6445   /* Possibly unaligned access.  */
6446
6447   /* We can choose between using the implicit realignment scheme (generating
6448      a misaligned_move stmt) and the explicit realignment scheme (generating
6449      aligned loads with a REALIGN_LOAD).  There are two variants to the
6450      explicit realignment scheme: optimized, and unoptimized.
6451      We can optimize the realignment only if the step between consecutive
6452      vector loads is equal to the vector size.  Since the vector memory
6453      accesses advance in steps of VS (Vector Size) in the vectorized loop, it
6454      is guaranteed that the misalignment amount remains the same throughout the
6455      execution of the vectorized loop.  Therefore, we can create the
6456      "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
6457      at the loop preheader.
6458
6459      However, in the case of outer-loop vectorization, when vectorizing a
6460      memory access in the inner-loop nested within the LOOP that is now being
6461      vectorized, while it is guaranteed that the misalignment of the
6462      vectorized memory access will remain the same in different outer-loop
6463      iterations, it is *not* guaranteed that is will remain the same throughout
6464      the execution of the inner-loop.  This is because the inner-loop advances
6465      with the original scalar step (and not in steps of VS).  If the inner-loop
6466      step happens to be a multiple of VS, then the misalignment remains fixed
6467      and we can use the optimized realignment scheme.  For example:
6468
6469       for (i=0; i<N; i++)
6470         for (j=0; j<M; j++)
6471           s += a[i+j];
6472
6473      When vectorizing the i-loop in the above example, the step between
6474      consecutive vector loads is 1, and so the misalignment does not remain
6475      fixed across the execution of the inner-loop, and the realignment cannot
6476      be optimized (as illustrated in the following pseudo vectorized loop):
6477
6478       for (i=0; i<N; i+=4)
6479         for (j=0; j<M; j++){
6480           vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
6481                          // when j is {0,1,2,3,4,5,6,7,...} respectively.
6482                          // (assuming that we start from an aligned address).
6483           }
6484
6485      We therefore have to use the unoptimized realignment scheme:
6486
6487       for (i=0; i<N; i+=4)
6488           for (j=k; j<M; j+=4)
6489           vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
6490                            // that the misalignment of the initial address is
6491                            // 0).
6492
6493      The loop can then be vectorized as follows:
6494
6495       for (k=0; k<4; k++){
6496         rt = get_realignment_token (&vp[k]);
6497         for (i=0; i<N; i+=4){
6498           v1 = vp[i+k];
6499           for (j=k; j<M; j+=4){
6500             v2 = vp[i+j+VS-1];
6501             va = REALIGN_LOAD <v1,v2,rt>;
6502             vs += va;
6503             v1 = v2;
6504           }
6505         }
6506     } */
6507
6508   if (DR_IS_READ (dr))
6509     {
6510       bool is_packed = false;
6511       tree type = (TREE_TYPE (DR_REF (dr)));
6512
6513       if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
6514           && (!targetm.vectorize.builtin_mask_for_load
6515               || targetm.vectorize.builtin_mask_for_load ()))
6516         {
6517           tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6518
6519           /* If we are doing SLP then the accesses need not have the
6520              same alignment, instead it depends on the SLP group size.  */
6521           if (loop_vinfo
6522               && STMT_SLP_TYPE (stmt_info)
6523               && !multiple_p (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6524                               * (DR_GROUP_SIZE
6525                                  (DR_GROUP_FIRST_ELEMENT (stmt_info))),
6526                               TYPE_VECTOR_SUBPARTS (vectype)))
6527             ;
6528           else if (!loop_vinfo
6529                    || (nested_in_vect_loop
6530                        && maybe_ne (TREE_INT_CST_LOW (DR_STEP (dr)),
6531                                     GET_MODE_SIZE (TYPE_MODE (vectype)))))
6532             return dr_explicit_realign;
6533           else
6534             return dr_explicit_realign_optimized;
6535         }
6536       if (!known_alignment_for_access_p (dr_info))
6537         is_packed = not_size_aligned (DR_REF (dr));
6538
6539       if (targetm.vectorize.support_vector_misalignment
6540             (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6541         /* Can't software pipeline the loads, but can at least do them.  */
6542         return dr_unaligned_supported;
6543     }
6544   else
6545     {
6546       bool is_packed = false;
6547       tree type = (TREE_TYPE (DR_REF (dr)));
6548
6549       if (!known_alignment_for_access_p (dr_info))
6550         is_packed = not_size_aligned (DR_REF (dr));
6551
6552      if (targetm.vectorize.support_vector_misalignment
6553            (mode, type, DR_MISALIGNMENT (dr_info), is_packed))
6554        return dr_unaligned_supported;
6555     }
6556
6557   /* Unsupported.  */
6558   return dr_unaligned_unsupported;
6559 }