gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it was manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors, for now will need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
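
/* As a concrete illustration of the target support check described
   above (a simplified sketch; the real checks are spread over the
   vectorizable_* routines), testing whether V8HI addition is
   supported boils down to:

     bool supported
       = optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;

   If the handler is CODE_FOR_nothing there is no target support and
   the stmt cannot be vectorized.  */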
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
158 bool *, bool *);
159
160 /* Subroutine of vect_determine_vf_for_stmt that handles only one
161 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
162 may already be set for general statements (not just data refs). */
163
164 static opt_result
165 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
166 bool vectype_maybe_set_p,
167 poly_uint64 *vf,
168 vec<stmt_vec_info > *mask_producers)
169 {
170 gimple *stmt = stmt_info->stmt;
171
172 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
173 && !STMT_VINFO_LIVE_P (stmt_info))
174 || gimple_clobber_p (stmt))
175 {
176 if (dump_enabled_p ())
177 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
178 return opt_result::success ();
179 }
180
181 tree stmt_vectype, nunits_vectype;
182 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
183 &nunits_vectype);
184 if (!res)
185 return res;
186
187 if (stmt_vectype)
188 {
189 if (STMT_VINFO_VECTYPE (stmt_info))
190 /* The only case when a vectype had been already set is for stmts
191 that contain a data ref, or for "pattern-stmts" (stmts generated
192 by the vectorizer to represent/replace a certain idiom). */
193 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
194 || vectype_maybe_set_p)
195 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
196 else if (stmt_vectype == boolean_type_node)
197 mask_producers->safe_push (stmt_info);
198 else
199 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 }
201
202 if (nunits_vectype)
203 vect_update_max_nunits (vf, nunits_vectype);
204
205 return opt_result::success ();
206 }
207
208 /* Subroutine of vect_determine_vectorization_factor. Set the vector
209 types of STMT_INFO and all attached pattern statements and update
210 the vectorization factor VF accordingly. If some of the statements
211 produce a mask result whose vector type can only be calculated later,
212 add them to MASK_PRODUCERS. Return true on success or false if
213 something prevented vectorization. */
214
215 static opt_result
216 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
217 vec<stmt_vec_info > *mask_producers)
218 {
219 vec_info *vinfo = stmt_info->vinfo;
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res
224 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
225 if (!res)
226 return res;
227
228 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
229 && STMT_VINFO_RELATED_STMT (stmt_info))
230 {
231 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
232 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233
234 /* If a pattern statement has def stmts, analyze them too. */
235 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
236 !gsi_end_p (si); gsi_next (&si))
237 {
238 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
239 if (dump_enabled_p ())
240 dump_printf_loc (MSG_NOTE, vect_location,
241 "==> examining pattern def stmt: %G",
242 def_stmt_info->stmt);
243 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
244 vf, mask_producers);
247 if (!res)
248 return res;
249 }
250
251 if (dump_enabled_p ())
252 dump_printf_loc (MSG_NOTE, vect_location,
253 "==> examining pattern statement: %G",
254 stmt_info->stmt);
255 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
256 if (!res)
257 return res;
258 }
259
260 return opt_result::success ();
261 }
262
263 /* Function vect_determine_vectorization_factor
264
265 Determine the vectorization factor (VF). VF is the number of data elements
266 that are operated upon in parallel in a single iteration of the vectorized
267 loop. For example, when vectorizing a loop that operates on 4byte elements,
268 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
269 elements can fit in a single vector register.
270
271 We currently support vectorization of loops in which all types operated upon
272 are of the same size. Therefore this function currently sets VF according to
273 the size of the types operated upon, and fails if there are multiple sizes
274 in the loop.
275
276 VF is also the factor by which the loop iterations are strip-mined, e.g.:
277 original loop:
278 for (i=0; i<N; i++){
279 a[i] = b[i] + c[i];
280 }
281
282 vectorized loop:
283 for (i=0; i<N; i+=VF){
284 a[i:VF] = b[i:VF] + c[i:VF];
285 }
286 */
287
288 static opt_result
289 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
290 {
291 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
292 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
293 unsigned nbbs = loop->num_nodes;
294 poly_uint64 vectorization_factor = 1;
295 tree scalar_type = NULL_TREE;
296 gphi *phi;
297 tree vectype;
298 stmt_vec_info stmt_info;
299 unsigned i;
300 auto_vec<stmt_vec_info> mask_producers;
301
302 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
303
304 for (i = 0; i < nbbs; i++)
305 {
306 basic_block bb = bbs[i];
307
308 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
309 gsi_next (&si))
310 {
311 phi = si.phi ();
312 stmt_info = loop_vinfo->lookup_stmt (phi);
313 if (dump_enabled_p ())
314 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
315 phi);
316
317 gcc_assert (stmt_info);
318
319 if (STMT_VINFO_RELEVANT_P (stmt_info)
320 || STMT_VINFO_LIVE_P (stmt_info))
321 {
322 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
323 scalar_type = TREE_TYPE (PHI_RESULT (phi));
324
325 if (dump_enabled_p ())
326 dump_printf_loc (MSG_NOTE, vect_location,
327 "get vectype for scalar type: %T\n",
328 scalar_type);
329
330 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
331 if (!vectype)
332 return opt_result::failure_at (phi,
333 "not vectorized: unsupported "
334 "data-type %T\n",
335 scalar_type);
336 STMT_VINFO_VECTYPE (stmt_info) = vectype;
337
338 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
340 vectype);
341
342 if (dump_enabled_p ())
343 {
344 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
345 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
346 dump_printf (MSG_NOTE, "\n");
347 }
348
349 vect_update_max_nunits (&vectorization_factor, vectype);
350 }
351 }
352
353 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
354 gsi_next (&si))
355 {
356 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
357 opt_result res
358 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
359 &mask_producers);
360 if (!res)
361 return res;
362 }
363 }
364
365 /* TODO: Analyze cost. Decide if worth while to vectorize. */
366 if (dump_enabled_p ())
367 {
368 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
369 dump_dec (MSG_NOTE, vectorization_factor);
370 dump_printf (MSG_NOTE, "\n");
371 }
372
373 if (known_le (vectorization_factor, 1U))
374 return opt_result::failure_at (vect_location,
375 "not vectorized: unsupported data-type\n");
376 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
377
378 for (i = 0; i < mask_producers.length (); i++)
379 {
380 stmt_info = mask_producers[i];
381 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
382 if (!mask_type)
383 return opt_result::propagate_failure (mask_type);
384 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
385 }
386
387 return opt_result::success ();
388 }
389
390
391 /* Function vect_is_simple_iv_evolution.
392
393 FORNOW: A simple evolution of an induction variable in the loop is
394 considered a polynomial evolution. */
395
396 static bool
397 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
398 tree * step)
399 {
400 tree init_expr;
401 tree step_expr;
402 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
403 basic_block bb;
404
405 /* When there is no evolution in this loop, the evolution function
406 is not "simple". */
407 if (evolution_part == NULL_TREE)
408 return false;
409
410 /* When the evolution is a polynomial of degree >= 2
411 the evolution function is not "simple". */
412 if (tree_is_chrec (evolution_part))
413 return false;
414
415 step_expr = evolution_part;
416 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
417
418 if (dump_enabled_p ())
419 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
420 step_expr, init_expr);
421
422 *init = init_expr;
423 *step = step_expr;
424
425 if (TREE_CODE (step_expr) != INTEGER_CST
426 && (TREE_CODE (step_expr) != SSA_NAME
427 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
428 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
429 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
430 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
431 || !flag_associative_math)))
432 && (TREE_CODE (step_expr) != REAL_CST
433 || !flag_associative_math))
434 {
435 if (dump_enabled_p ())
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
437 "step unknown.\n");
438 return false;
439 }
440
441 return true;
442 }
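
/* For example (illustrative only), for the canonical induction

     i_1 = PHI <0(preheader), i_2(latch)>
     ...
     i_2 = i_1 + 4;

   analyze_scalar_evolution returns the chrec {0, +, 4}_1, so the
   function above would set *INIT to 0 and *STEP to 4.  A PHI whose
   evolution part is itself a chrec (a polynomial of degree >= 2) or
   whose step is defined inside the loop is rejected.  */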
443
444 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
445 what we are assuming is a double reduction. For example, given
446 a structure like this:
447
448 outer1:
449 x_1 = PHI <x_4(outer2), ...>;
450 ...
451
452 inner:
453 x_2 = PHI <x_1(outer1), ...>;
454 ...
455 x_3 = ...;
456 ...
457
458 outer2:
459 x_4 = PHI <x_3(inner)>;
460 ...
461
462 outer loop analysis would treat x_1 as a double reduction phi and
463 this function would then return true for x_2. */
464
465 static bool
466 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
467 {
468 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
469 use_operand_p use_p;
470 ssa_op_iter op_iter;
471 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
472 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
473 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
474 return true;
475 return false;
476 }
477
478 /* Function vect_analyze_scalar_cycles_1.
479
480 Examine the cross iteration def-use cycles of scalar variables
481 in LOOP. LOOP_VINFO represents the loop that is now being
482 considered for vectorization (can be LOOP, or an outer-loop
483 enclosing LOOP). */
484
485 static void
486 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
487 {
488 basic_block bb = loop->header;
489 tree init, step;
490 auto_vec<stmt_vec_info, 64> worklist;
491 gphi_iterator gsi;
492 bool double_reduc, reduc_chain;
493
494 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
495
496 /* First - identify all inductions. Reduction detection assumes that all the
497 inductions have been identified, therefore, this order must not be
498 changed. */
499 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
500 {
501 gphi *phi = gsi.phi ();
502 tree access_fn = NULL;
503 tree def = PHI_RESULT (phi);
504 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
505
506 if (dump_enabled_p ())
507 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
508
509 /* Skip virtual phi's. The data dependences that are associated with
510 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
511 if (virtual_operand_p (def))
512 continue;
513
514 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
515
516 /* Analyze the evolution function. */
517 access_fn = analyze_scalar_evolution (loop, def);
518 if (access_fn)
519 {
520 STRIP_NOPS (access_fn);
521 if (dump_enabled_p ())
522 dump_printf_loc (MSG_NOTE, vect_location,
523 "Access function of PHI: %T\n", access_fn);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
525 = initial_condition_in_loop_num (access_fn, loop->num);
526 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
527 = evolution_part_in_loop_num (access_fn, loop->num);
528 }
529
530 if (!access_fn
531 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
532 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
533 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
534 && TREE_CODE (step) != INTEGER_CST))
535 {
536 worklist.safe_push (stmt_vinfo);
537 continue;
538 }
539
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
541 != NULL_TREE);
542 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
543
544 if (dump_enabled_p ())
545 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
546 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
547 }
548
549
550 /* Second - identify all reductions and nested cycles. */
551 while (worklist.length () > 0)
552 {
553 stmt_vec_info stmt_vinfo = worklist.pop ();
554 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
555 tree def = PHI_RESULT (phi);
556
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
559
560 gcc_assert (!virtual_operand_p (def)
561 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
562
563 stmt_vec_info reduc_stmt_info
564 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
565 &reduc_chain);
566 if (reduc_stmt_info)
567 {
568 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
569 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
570 if (double_reduc)
571 {
572 if (dump_enabled_p ())
573 dump_printf_loc (MSG_NOTE, vect_location,
574 "Detected double reduction.\n");
575
576 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
577 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
578 }
579 else
580 {
581 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
582 {
583 if (dump_enabled_p ())
584 dump_printf_loc (MSG_NOTE, vect_location,
585 "Detected vectorizable nested cycle.\n");
586
587 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
588 }
589 else
590 {
591 if (dump_enabled_p ())
592 dump_printf_loc (MSG_NOTE, vect_location,
593 "Detected reduction.\n");
594
595 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
596 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
597 /* Store the reduction cycles for possible vectorization in
598 loop-aware SLP if it was not detected as reduction
599 chain. */
600 if (! reduc_chain)
601 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
602 (reduc_stmt_info);
603 }
604 }
605 }
606 else
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
609 "Unknown def-use cycle pattern.\n");
610 }
611 }
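
/* For example (a sketch): when the outer loop of

     for (i = 0; i < N; i++)
       for (j = 0; j < M; j++)
         sum += a[i][j];

   is the loop being analyzed, the PHIs for "sum" form a double
   reduction ("Detected double reduction."), whereas an inner-loop
   cycle whose result is used in the outer loop without reducing
   across it is classified as a vectorizable nested cycle.  */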
612
613
614 /* Function vect_analyze_scalar_cycles.
615
616 Examine the cross iteration def-use cycles of scalar variables, by
617 analyzing the loop-header PHIs of scalar variables. Classify each
618 cycle as one of the following: invariant, induction, reduction, unknown.
619 We do that for the loop represented by LOOP_VINFO, and also for its
620 inner-loop, if it exists.
621 Examples for scalar cycles:
622
623 Example1: reduction:
624
625 loop1:
626 for (i=0; i<N; i++)
627 sum += a[i];
628
629 Example2: induction:
630
631 loop2:
632 for (i=0; i<N; i++)
633 a[i] = i; */
634
635 static void
636 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
637 {
638 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
639
640 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
641
642 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
643 Reductions in such inner-loop therefore have different properties than
644 the reductions in the nest that gets vectorized:
645 1. When vectorized, they are executed in the same order as in the original
646 scalar loop, so we can't change the order of computation when
647 vectorizing them.
648 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
649 current checks are too strict. */
650
651 if (loop->inner)
652 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
653 }
654
655 /* Transfer group and reduction information from STMT_INFO to its
656 pattern stmt. */
657
658 static void
659 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
660 {
661 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
662 stmt_vec_info stmtp;
663 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
664 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
665 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
666 do
667 {
668 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
669 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
670 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
671 if (stmt_info)
672 REDUC_GROUP_NEXT_ELEMENT (stmtp)
673 = STMT_VINFO_RELATED_STMT (stmt_info);
674 }
675 while (stmt_info);
676 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
677 }
678
679 /* Fixup scalar cycles that now have their stmts detected as patterns. */
680
681 static void
682 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
683 {
684 stmt_vec_info first;
685 unsigned i;
686
687 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
688 if (STMT_VINFO_IN_PATTERN_P (first))
689 {
690 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
691 while (next)
692 {
693 if (! STMT_VINFO_IN_PATTERN_P (next)
694 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
695 break;
696 next = REDUC_GROUP_NEXT_ELEMENT (next);
697 }
698 /* If not all stmt in the chain are patterns or if we failed
699 to update STMT_VINFO_REDUC_IDX try to handle the chain
700 without patterns. */
701 if (! next
702 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
703 {
704 vect_fixup_reduc_chain (first);
705 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
706 = STMT_VINFO_RELATED_STMT (first);
707 }
708 }
709 }
710
711 /* Function vect_get_loop_niters.
712
713 Determine how many iterations the loop is executed and place it
714 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
715 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
716 niter information holds in ASSUMPTIONS.
717
718 Return the loop exit condition. */
719
720
721 static gcond *
722 vect_get_loop_niters (class loop *loop, tree *assumptions,
723 tree *number_of_iterations, tree *number_of_iterationsm1)
724 {
725 edge exit = single_exit (loop);
726 class tree_niter_desc niter_desc;
727 tree niter_assumptions, niter, may_be_zero;
728 gcond *cond = get_loop_exit_condition (loop);
729
730 *assumptions = boolean_true_node;
731 *number_of_iterationsm1 = chrec_dont_know;
732 *number_of_iterations = chrec_dont_know;
733 DUMP_VECT_SCOPE ("get_loop_niters");
734
735 if (!exit)
736 return cond;
737
738 may_be_zero = NULL_TREE;
739 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
740 || chrec_contains_undetermined (niter_desc.niter))
741 return cond;
742
743 niter_assumptions = niter_desc.assumptions;
744 may_be_zero = niter_desc.may_be_zero;
745 niter = niter_desc.niter;
746
747 if (may_be_zero && integer_zerop (may_be_zero))
748 may_be_zero = NULL_TREE;
749
750 if (may_be_zero)
751 {
752 if (COMPARISON_CLASS_P (may_be_zero))
753 {
754 /* Try to combine may_be_zero with assumptions, this can simplify
755 computation of niter expression. */
756 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
757 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
758 niter_assumptions,
759 fold_build1 (TRUTH_NOT_EXPR,
760 boolean_type_node,
761 may_be_zero));
762 else
763 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
764 build_int_cst (TREE_TYPE (niter), 0),
765 rewrite_to_non_trapping_overflow (niter));
766
767 may_be_zero = NULL_TREE;
768 }
769 else if (integer_nonzerop (may_be_zero))
770 {
771 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
772 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
773 return cond;
774 }
775 else
776 return cond;
777 }
778
779 *assumptions = niter_assumptions;
780 *number_of_iterationsm1 = niter;
781
782 /* We want the number of loop header executions which is the number
783 of latch executions plus one.
784 ??? For UINT_MAX latch executions this number overflows to zero
785 for loops like do { n++; } while (n != 0); */
786 if (niter && !chrec_contains_undetermined (niter))
787 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
788 build_int_cst (TREE_TYPE (niter), 1));
789 *number_of_iterations = niter;
790
791 return cond;
792 }
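
/* For instance (an illustrative sketch), for a loop known to execute
   its header n times with n > 0:

     for (i = 0; i < n; i++)
       ...

   niter analysis gives n - 1 latch executions, so the function above
   would return NUMBER_OF_ITERATIONSM1 == n - 1 and
   NUMBER_OF_ITERATIONS == n, with ASSUMPTIONS left as
   boolean_true_node when the niter expression holds unconditionally.  */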
793
794 /* Function bb_in_loop_p
795
796 Used as predicate for dfs order traversal of the loop bbs. */
797
798 static bool
799 bb_in_loop_p (const_basic_block bb, const void *data)
800 {
801 const class loop *const loop = (const class loop *)data;
802 if (flow_bb_inside_loop_p (loop, bb))
803 return true;
804 return false;
805 }
806
807
808 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
809 stmt_vec_info structs for all the stmts in LOOP_IN. */
810
811 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
812 : vec_info (vec_info::loop, init_cost (loop_in), shared),
813 loop (loop_in),
814 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
815 num_itersm1 (NULL_TREE),
816 num_iters (NULL_TREE),
817 num_iters_unchanged (NULL_TREE),
818 num_iters_assumptions (NULL_TREE),
819 th (0),
820 versioning_threshold (0),
821 vectorization_factor (0),
822 max_vectorization_factor (0),
823 mask_skip_niters (NULL_TREE),
824 mask_compare_type (NULL_TREE),
825 simd_if_cond (NULL_TREE),
826 unaligned_dr (NULL),
827 peeling_for_alignment (0),
828 ptr_mask (0),
829 ivexpr_map (NULL),
830 scan_map (NULL),
831 slp_unrolling_factor (1),
832 single_scalar_iteration_cost (0),
833 vectorizable (false),
834 can_fully_mask_p (true),
835 fully_masked_p (false),
836 peeling_for_gaps (false),
837 peeling_for_niter (false),
838 no_data_dependencies (false),
839 has_mask_store (false),
840 scalar_loop_scaling (profile_probability::uninitialized ()),
841 scalar_loop (NULL),
842 orig_loop_info (NULL)
843 {
844 /* CHECKME: We want to visit all BBs before their successors (except for
845 latch blocks, for which this assertion wouldn't hold). In the simple
846 case of the loop forms we allow, a dfs order of the BBs would be the same
847 as reversed postorder traversal, so we are safe. */
848
849 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
850 bbs, loop->num_nodes, loop);
851 gcc_assert (nbbs == loop->num_nodes);
852
853 for (unsigned int i = 0; i < nbbs; i++)
854 {
855 basic_block bb = bbs[i];
856 gimple_stmt_iterator si;
857
858 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
859 {
860 gimple *phi = gsi_stmt (si);
861 gimple_set_uid (phi, 0);
862 add_stmt (phi);
863 }
864
865 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
866 {
867 gimple *stmt = gsi_stmt (si);
868 gimple_set_uid (stmt, 0);
869 add_stmt (stmt);
870 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
871 third argument is the #pragma omp simd if (x) condition: when it is 0,
872 the loop shouldn't be vectorized; when it is a non-zero constant, it
873 should be vectorized normally; otherwise the loop is versioned, with
874 the vectorized copy used only if the condition is non-zero at runtime. */
875 if (loop_in->simduid
876 && is_gimple_call (stmt)
877 && gimple_call_internal_p (stmt)
878 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
879 && gimple_call_num_args (stmt) >= 3
880 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
881 && (loop_in->simduid
882 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
883 {
884 tree arg = gimple_call_arg (stmt, 2);
885 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
886 simd_if_cond = arg;
887 else
888 gcc_assert (integer_nonzerop (arg));
889 }
890 }
891 }
892
893 epilogue_vinfos.create (6);
894 }
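
/* As an illustration of the IFN_GOMP_SIMD_LANE handling above (a
   sketch of the source form only), a loop such as

     #pragma omp simd if (x)
     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   is lowered so that its body contains a .GOMP_SIMD_LANE call with a
   third argument derived from "x"; per the code above that argument
   is recorded in simd_if_cond so that the loop can be versioned on it
   at runtime (or not vectorized at all when the condition is a
   literal 0).  */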
895
896 /* Free all levels of MASKS. */
897
898 void
899 release_vec_loop_masks (vec_loop_masks *masks)
900 {
901 rgroup_masks *rgm;
902 unsigned int i;
903 FOR_EACH_VEC_ELT (*masks, i, rgm)
904 rgm->masks.release ();
905 masks->release ();
906 }
907
908 /* Free all memory used by the _loop_vec_info, as well as all the
909 stmt_vec_info structs of all the stmts in the loop. */
910
911 _loop_vec_info::~_loop_vec_info ()
912 {
913 free (bbs);
914
915 release_vec_loop_masks (&masks);
916 delete ivexpr_map;
917 delete scan_map;
918 epilogue_vinfos.release ();
919
920 loop->aux = NULL;
921 }
922
923 /* Return an invariant or register for EXPR and emit necessary
924 computations in the LOOP_VINFO loop preheader. */
925
926 tree
927 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
928 {
929 if (is_gimple_reg (expr)
930 || is_gimple_min_invariant (expr))
931 return expr;
932
933 if (! loop_vinfo->ivexpr_map)
934 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
935 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
936 if (! cached)
937 {
938 gimple_seq stmts = NULL;
939 cached = force_gimple_operand (unshare_expr (expr),
940 &stmts, true, NULL_TREE);
941 if (stmts)
942 {
943 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
944 gsi_insert_seq_on_edge_immediate (e, stmts);
945 }
946 }
947 return cached;
948 }
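
/* A typical use (an illustrative sketch; the SSA names are made up):
   asking for an expression such as "n_8 * 4" returns an SSA name
   computed once on the preheader edge,

     _15 = n_8 * 4;

   and a second request for the same expression returns the cached
   _15 instead of emitting the computation again.  */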
949
950 /* Return true if we can use CMP_TYPE as the comparison type to produce
951 all masks required to mask LOOP_VINFO. */
952
953 static bool
954 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
955 {
956 rgroup_masks *rgm;
957 unsigned int i;
958 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
959 if (rgm->mask_type != NULL_TREE
960 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
961 cmp_type, rgm->mask_type,
962 OPTIMIZE_FOR_SPEED))
963 return false;
964 return true;
965 }
966
967 /* Calculate the maximum number of scalars per iteration for every
968 rgroup in LOOP_VINFO. */
969
970 static unsigned int
971 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
972 {
973 unsigned int res = 1;
974 unsigned int i;
975 rgroup_masks *rgm;
976 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
977 res = MAX (res, rgm->max_nscalars_per_iter);
978 return res;
979 }
980
981 /* Each statement in LOOP_VINFO can be masked where necessary. Check
982 whether we can actually generate the masks required. Return true if so,
983 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
984
985 static bool
986 vect_verify_full_masking (loop_vec_info loop_vinfo)
987 {
988 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
989 unsigned int min_ni_width;
990 unsigned int max_nscalars_per_iter
991 = vect_get_max_nscalars_per_iter (loop_vinfo);
992
993 /* Use a normal loop if there are no statements that need masking.
994 This only happens in rare degenerate cases: it means that the loop
995 has no loads, no stores, and no live-out values. */
996 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
997 return false;
998
999 /* Get the maximum number of iterations that is representable
1000 in the counter type. */
1001 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1002 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1003
1004 /* Get a more refined estimate for the number of iterations. */
1005 widest_int max_back_edges;
1006 if (max_loop_iterations (loop, &max_back_edges))
1007 max_ni = wi::smin (max_ni, max_back_edges + 1);
1008
1009 /* Account for rgroup masks, in which each bit is replicated N times. */
1010 max_ni *= max_nscalars_per_iter;
1011
1012 /* Work out how many bits we need to represent the limit. */
1013 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1014
1015 /* Find a scalar mode for which WHILE_ULT is supported. */
1016 opt_scalar_int_mode cmp_mode_iter;
1017 tree cmp_type = NULL_TREE;
1018 tree iv_type = NULL_TREE;
1019 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1020 unsigned int iv_precision = UINT_MAX;
1021
1022 if (iv_limit != -1)
1023 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1024 UNSIGNED);
1025
1026 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1027 {
1028 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1029 if (cmp_bits >= min_ni_width
1030 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1031 {
1032 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1033 if (this_type
1034 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1035 {
1036 /* Although we could stop as soon as we find a valid mode,
1037 there are at least two reasons why that's not always the
1038 best choice:
1039
1040 - An IV that's Pmode or wider is more likely to be reusable
1041 in address calculations than an IV that's narrower than
1042 Pmode.
1043
1044 - Doing the comparison in IV_PRECISION or wider allows
1045 a natural 0-based IV, whereas using a narrower comparison
1046 type requires mitigations against wrap-around.
1047
1048 Conversely, if the IV limit is variable, doing the comparison
1049 in a wider type than the original type can introduce
1050 unnecessary extensions, so picking the widest valid mode
1051 is not always a good choice either.
1052
1053 Here we prefer the first IV type that's Pmode or wider,
1054 and the first comparison type that's IV_PRECISION or wider.
1055 (The comparison type must be no wider than the IV type,
1056 to avoid extensions in the vector loop.)
1057
1058 ??? We might want to try continuing beyond Pmode for ILP32
1059 targets if CMP_BITS < IV_PRECISION. */
1060 iv_type = this_type;
1061 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1062 cmp_type = this_type;
1063 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1064 break;
1065 }
1066 }
1067 }
1068
1069 if (!cmp_type)
1070 return false;
1071
1072 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1073 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1074 return true;
1075 }
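
/* Worked example (with made-up numbers): if the loop is known to run
   at most 1000 iterations and the largest rgroup has
   max_nscalars_per_iter == 2, then max_ni is scaled to 2000 and
   min_ni_width = wi::min_precision (2000, UNSIGNED) = 11 bits, so any
   integer mode of at least 11 bits for which the target supports
   IFN_WHILE_ULT against every rgroup mask type is acceptable, with
   preference given to Pmode-or-wider IV types as explained above.  */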
1076
1077 /* Calculate the cost of one scalar iteration of the loop. */
1078 static void
1079 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1080 {
1081 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1082 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1083 int nbbs = loop->num_nodes, factor;
1084 int innerloop_iters, i;
1085
1086 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1087
1088 /* Gather costs for statements in the scalar loop. */
1089
1090 /* FORNOW. */
1091 innerloop_iters = 1;
1092 if (loop->inner)
1093 innerloop_iters = 50; /* FIXME */
1094
1095 for (i = 0; i < nbbs; i++)
1096 {
1097 gimple_stmt_iterator si;
1098 basic_block bb = bbs[i];
1099
1100 if (bb->loop_father == loop->inner)
1101 factor = innerloop_iters;
1102 else
1103 factor = 1;
1104
1105 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1106 {
1107 gimple *stmt = gsi_stmt (si);
1108 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1109
1110 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1111 continue;
1112
1113 /* Skip stmts that are not vectorized inside the loop. */
1114 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1115 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1116 && (!STMT_VINFO_LIVE_P (vstmt_info)
1117 || !VECTORIZABLE_CYCLE_DEF
1118 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1119 continue;
1120
1121 vect_cost_for_stmt kind;
1122 if (STMT_VINFO_DATA_REF (stmt_info))
1123 {
1124 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1125 kind = scalar_load;
1126 else
1127 kind = scalar_store;
1128 }
1129 else
1130 kind = scalar_stmt;
1131
1132 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1133 factor, kind, stmt_info, 0, vect_prologue);
1134 }
1135 }
1136
1137 /* Now accumulate cost. */
1138 void *target_cost_data = init_cost (loop);
1139 stmt_info_for_cost *si;
1140 int j;
1141 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1142 j, si)
1143 (void) add_stmt_cost (target_cost_data, si->count,
1144 si->kind, si->stmt_info, si->misalign,
1145 vect_body);
1146 unsigned dummy, body_cost = 0;
1147 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1148 destroy_cost_data (target_cost_data);
1149 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1150 }
1151
1152
1153 /* Function vect_analyze_loop_form_1.
1154
1155 Verify that certain CFG restrictions hold, including:
1156 - the loop has a pre-header
1157 - the loop has a single entry and exit
1158 - the loop exit condition is simple enough
1159 - the number of iterations can be analyzed, i.e., a countable loop. The
1160 niter could be analyzed under some assumptions. */
1161
1162 opt_result
1163 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1164 tree *assumptions, tree *number_of_iterationsm1,
1165 tree *number_of_iterations, gcond **inner_loop_cond)
1166 {
1167 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1168
1169 /* Different restrictions apply when we are considering an inner-most loop,
1170 vs. an outer (nested) loop.
1171 (FORNOW. May want to relax some of these restrictions in the future). */
1172
1173 if (!loop->inner)
1174 {
1175 /* Inner-most loop. We currently require that the number of BBs is
1176 exactly 2 (the header and latch). Vectorizable inner-most loops
1177 look like this:
1178
1179 (pre-header)
1180 |
1181 header <--------+
1182 | | |
1183 | +--> latch --+
1184 |
1185 (exit-bb) */
1186
1187 if (loop->num_nodes != 2)
1188 return opt_result::failure_at (vect_location,
1189 "not vectorized:"
1190 " control flow in loop.\n");
1191
1192 if (empty_block_p (loop->header))
1193 return opt_result::failure_at (vect_location,
1194 "not vectorized: empty loop.\n");
1195 }
1196 else
1197 {
1198 class loop *innerloop = loop->inner;
1199 edge entryedge;
1200
1201 /* Nested loop. We currently require that the loop is doubly-nested,
1202 contains a single inner loop, and the number of BBs is exactly 5.
1203 Vectorizable outer-loops look like this:
1204
1205 (pre-header)
1206 |
1207 header <---+
1208 | |
1209 inner-loop |
1210 | |
1211 tail ------+
1212 |
1213 (exit-bb)
1214
1215 The inner-loop has the properties expected of inner-most loops
1216 as described above. */
1217
1218 if ((loop->inner)->inner || (loop->inner)->next)
1219 return opt_result::failure_at (vect_location,
1220 "not vectorized:"
1221 " multiple nested loops.\n");
1222
1223 if (loop->num_nodes != 5)
1224 return opt_result::failure_at (vect_location,
1225 "not vectorized:"
1226 " control flow in loop.\n");
1227
1228 entryedge = loop_preheader_edge (innerloop);
1229 if (entryedge->src != loop->header
1230 || !single_exit (innerloop)
1231 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1232 return opt_result::failure_at (vect_location,
1233 "not vectorized:"
1234 " unsupported outerloop form.\n");
1235
1236 /* Analyze the inner-loop. */
1237 tree inner_niterm1, inner_niter, inner_assumptions;
1238 opt_result res
1239 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1240 &inner_assumptions, &inner_niterm1,
1241 &inner_niter, NULL);
1242 if (!res)
1243 {
1244 if (dump_enabled_p ())
1245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1246 "not vectorized: Bad inner loop.\n");
1247 return res;
1248 }
1249
1250 /* Don't support analyzing niter under assumptions for inner
1251 loop. */
1252 if (!integer_onep (inner_assumptions))
1253 return opt_result::failure_at (vect_location,
1254 "not vectorized: Bad inner loop.\n");
1255
1256 if (!expr_invariant_in_loop_p (loop, inner_niter))
1257 return opt_result::failure_at (vect_location,
1258 "not vectorized: inner-loop count not"
1259 " invariant.\n");
1260
1261 if (dump_enabled_p ())
1262 dump_printf_loc (MSG_NOTE, vect_location,
1263 "Considering outer-loop vectorization.\n");
1264 }
1265
1266 if (!single_exit (loop))
1267 return opt_result::failure_at (vect_location,
1268 "not vectorized: multiple exits.\n");
1269 if (EDGE_COUNT (loop->header->preds) != 2)
1270 return opt_result::failure_at (vect_location,
1271 "not vectorized:"
1272 " too many incoming edges.\n");
1273
1274 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1275 that the loop is represented as a do-while (with a proper if-guard
1276 before the loop if needed), where the loop header contains all the
1277 executable statements, and the latch is empty. */
1278 if (!empty_block_p (loop->latch)
1279 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1280 return opt_result::failure_at (vect_location,
1281 "not vectorized: latch block not empty.\n");
1282
1283 /* Make sure the exit is not abnormal. */
1284 edge e = single_exit (loop);
1285 if (e->flags & EDGE_ABNORMAL)
1286 return opt_result::failure_at (vect_location,
1287 "not vectorized:"
1288 " abnormal loop exit edge.\n");
1289
1290 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1291 number_of_iterationsm1);
1292 if (!*loop_cond)
1293 return opt_result::failure_at
1294 (vect_location,
1295 "not vectorized: complicated exit condition.\n");
1296
1297 if (integer_zerop (*assumptions)
1298 || !*number_of_iterations
1299 || chrec_contains_undetermined (*number_of_iterations))
1300 return opt_result::failure_at
1301 (*loop_cond,
1302 "not vectorized: number of iterations cannot be computed.\n");
1303
1304 if (integer_zerop (*number_of_iterations))
1305 return opt_result::failure_at
1306 (*loop_cond,
1307 "not vectorized: number of iterations = 0.\n");
1308
1309 return opt_result::success ();
1310 }
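
/* For example (illustrative), an inner-most loop such as

     for (i = 0; i < n; i++)
       if (b[i])
         a[i] = 0;

   has more than two basic blocks before if-conversion, so it would be
   rejected here with "control flow in loop" unless if-conversion has
   flattened the body into a single block first.  */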
1311
1312 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1313
1314 opt_loop_vec_info
1315 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1316 {
1317 tree assumptions, number_of_iterations, number_of_iterationsm1;
1318 gcond *loop_cond, *inner_loop_cond = NULL;
1319
1320 opt_result res
1321 = vect_analyze_loop_form_1 (loop, &loop_cond,
1322 &assumptions, &number_of_iterationsm1,
1323 &number_of_iterations, &inner_loop_cond);
1324 if (!res)
1325 return opt_loop_vec_info::propagate_failure (res);
1326
1327 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1328 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1329 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1330 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1331 if (!integer_onep (assumptions))
1332 {
1333 /* We consider to vectorize this loop by versioning it under
1334 some assumptions. In order to do this, we need to clear
1335 existing information computed by scev and niter analyzer. */
1336 scev_reset_htab ();
1337 free_numbers_of_iterations_estimates (loop);
1338 /* Also set flag for this loop so that following scev and niter
1339 analysis are done under the assumptions. */
1340 loop_constraint_set (loop, LOOP_C_FINITE);
1341 /* Also record the assumptions for versioning. */
1342 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1343 }
1344
1345 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1346 {
1347 if (dump_enabled_p ())
1348 {
1349 dump_printf_loc (MSG_NOTE, vect_location,
1350 "Symbolic number of iterations is ");
1351 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1352 dump_printf (MSG_NOTE, "\n");
1353 }
1354 }
1355
1356 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1357 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1358 if (inner_loop_cond)
1359 {
1360 stmt_vec_info inner_loop_cond_info
1361 = loop_vinfo->lookup_stmt (inner_loop_cond);
1362 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1363 }
1364
1365 gcc_assert (!loop->aux);
1366 loop->aux = loop_vinfo;
1367 return opt_loop_vec_info::success (loop_vinfo);
1368 }
1369
1370
1371
1372 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1373 statements update the vectorization factor. */
1374
1375 static void
1376 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1377 {
1378 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1379 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1380 int nbbs = loop->num_nodes;
1381 poly_uint64 vectorization_factor;
1382 int i;
1383
1384 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1385
1386 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1387 gcc_assert (known_ne (vectorization_factor, 0U));
1388
1389 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1390 vectorization factor of the loop is the unrolling factor required by
1391 the SLP instances. If that unrolling factor is 1, we say that we
1392 perform pure SLP on the loop - cross iteration parallelism is not
1393 exploited. */
1394 bool only_slp_in_loop = true;
1395 for (i = 0; i < nbbs; i++)
1396 {
1397 basic_block bb = bbs[i];
1398 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1399 gsi_next (&si))
1400 {
1401 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1402 stmt_info = vect_stmt_to_vectorize (stmt_info);
1403 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1404 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1405 && !PURE_SLP_STMT (stmt_info))
1406 /* STMT needs both SLP and loop-based vectorization. */
1407 only_slp_in_loop = false;
1408 }
1409 }
1410
1411 if (only_slp_in_loop)
1412 {
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_NOTE, vect_location,
1415 "Loop contains only SLP stmts\n");
1416 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1417 }
1418 else
1419 {
1420 if (dump_enabled_p ())
1421 dump_printf_loc (MSG_NOTE, vect_location,
1422 "Loop contains SLP and non-SLP stmts\n");
1423 /* Both the vectorization factor and unroll factor have the form
1424 loop_vinfo->vector_size * X for some rational X, so they must have
1425 a common multiple. */
1426 vectorization_factor
1427 = force_common_multiple (vectorization_factor,
1428 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1429 }
1430
1431 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1432 if (dump_enabled_p ())
1433 {
1434 dump_printf_loc (MSG_NOTE, vect_location,
1435 "Updating vectorization factor to ");
1436 dump_dec (MSG_NOTE, vectorization_factor);
1437 dump_printf (MSG_NOTE, ".\n");
1438 }
1439 }
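
/* Worked example (with made-up constants): if the non-SLP stmts
   require a vectorization factor of 4 while the SLP instances need an
   unrolling factor of 6, force_common_multiple (4, 6) yields 12 and
   the loop is vectorized with VF = 12; if every stmt is covered by
   SLP instances whose unrolling factor is 1, pure SLP is used and the
   VF simply becomes that unrolling factor.  */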
1440
1441 /* Return true if STMT_INFO describes a double reduction phi and if
1442 the other phi in the reduction is also relevant for vectorization.
1443 This rejects cases such as:
1444
1445 outer1:
1446 x_1 = PHI <x_3(outer2), ...>;
1447 ...
1448
1449 inner:
1450 x_2 = ...;
1451 ...
1452
1453 outer2:
1454 x_3 = PHI <x_2(inner)>;
1455
1456 if nothing in x_2 or elsewhere makes x_1 relevant. */
1457
1458 static bool
1459 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1460 {
1461 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1462 return false;
1463
1464 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1465 }
1466
1467 /* Function vect_analyze_loop_operations.
1468
1469 Scan the loop stmts and make sure they are all vectorizable. */
1470
1471 static opt_result
1472 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1473 {
1474 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1475 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1476 int nbbs = loop->num_nodes;
1477 int i;
1478 stmt_vec_info stmt_info;
1479 bool need_to_vectorize = false;
1480 bool ok;
1481
1482 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1483
1484 auto_vec<stmt_info_for_cost> cost_vec;
1485
1486 for (i = 0; i < nbbs; i++)
1487 {
1488 basic_block bb = bbs[i];
1489
1490 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1491 gsi_next (&si))
1492 {
1493 gphi *phi = si.phi ();
1494 ok = true;
1495
1496 stmt_info = loop_vinfo->lookup_stmt (phi);
1497 if (dump_enabled_p ())
1498 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1499 if (virtual_operand_p (gimple_phi_result (phi)))
1500 continue;
1501
1502 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1503 (i.e., a phi in the tail of the outer-loop). */
1504 if (! is_loop_header_bb_p (bb))
1505 {
1506 /* FORNOW: we currently don't support the case that these phis
1507 are not used in the outerloop (unless it is a double reduction,
1508 i.e., this phi is vect_reduction_def), because that case
1509 would require us to actually do something here. */
1510 if (STMT_VINFO_LIVE_P (stmt_info)
1511 && !vect_active_double_reduction_p (stmt_info))
1512 return opt_result::failure_at (phi,
1513 "Unsupported loop-closed phi"
1514 " in outer-loop.\n");
1515
1516 /* If PHI is used in the outer loop, we check that its operand
1517 is defined in the inner loop. */
1518 if (STMT_VINFO_RELEVANT_P (stmt_info))
1519 {
1520 tree phi_op;
1521
1522 if (gimple_phi_num_args (phi) != 1)
1523 return opt_result::failure_at (phi, "unsupported phi");
1524
1525 phi_op = PHI_ARG_DEF (phi, 0);
1526 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1527 if (!op_def_info)
1528 return opt_result::failure_at (phi, "unsupported phi\n");
1529
1530 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1531 && (STMT_VINFO_RELEVANT (op_def_info)
1532 != vect_used_in_outer_by_reduction))
1533 return opt_result::failure_at (phi, "unsupported phi\n");
1534
1535 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1536 || (STMT_VINFO_DEF_TYPE (stmt_info)
1537 == vect_double_reduction_def))
1538 && !vectorizable_lc_phi (stmt_info, NULL, NULL))
1539 return opt_result::failure_at (phi, "unsupported phi\n");
1540 }
1541
1542 continue;
1543 }
1544
1545 gcc_assert (stmt_info);
1546
1547 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1548 || STMT_VINFO_LIVE_P (stmt_info))
1549 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1550 /* A scalar-dependence cycle that we don't support. */
1551 return opt_result::failure_at (phi,
1552 "not vectorized:"
1553 " scalar dependence cycle.\n");
1554
1555 if (STMT_VINFO_RELEVANT_P (stmt_info))
1556 {
1557 need_to_vectorize = true;
1558 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1559 && ! PURE_SLP_STMT (stmt_info))
1560 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1561 &cost_vec);
1562 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1563 || (STMT_VINFO_DEF_TYPE (stmt_info)
1564 == vect_double_reduction_def)
1565 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1566 && ! PURE_SLP_STMT (stmt_info))
1567 ok = vectorizable_reduction (stmt_info, NULL, NULL, &cost_vec);
1568 }
1569
1570 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1571 if (ok
1572 && STMT_VINFO_LIVE_P (stmt_info)
1573 && !PURE_SLP_STMT (stmt_info))
1574 ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL,
1575 -1, false, &cost_vec);
1576
1577 if (!ok)
1578 return opt_result::failure_at (phi,
1579 "not vectorized: relevant phi not "
1580 "supported: %G",
1581 static_cast <gimple *> (phi));
1582 }
1583
1584 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1585 gsi_next (&si))
1586 {
1587 gimple *stmt = gsi_stmt (si);
1588 if (!gimple_clobber_p (stmt))
1589 {
1590 opt_result res
1591 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1592 &need_to_vectorize,
1593 NULL, NULL, &cost_vec);
1594 if (!res)
1595 return res;
1596 }
1597 }
1598 } /* bbs */
1599
1600 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1601
1602 /* All operations in the loop are either irrelevant (deal with loop
1603 control, or dead), or only used outside the loop and can be moved
1604 out of the loop (e.g. invariants, inductions). The loop can be
1605 optimized away by scalar optimizations. We're better off not
1606 touching this loop. */
1607 if (!need_to_vectorize)
1608 {
1609 if (dump_enabled_p ())
1610 dump_printf_loc (MSG_NOTE, vect_location,
1611 "All the computation can be taken out of the loop.\n");
1612 return opt_result::failure_at
1613 (vect_location,
1614 "not vectorized: redundant loop. no profit to vectorize.\n");
1615 }
1616
1617 return opt_result::success ();
1618 }
1619
1620 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1621 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1622 definitely no, or -1 if it's worth retrying. */
1623
1624 static int
1625 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1626 {
1627 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1628 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1629
1630 /* Only fully-masked loops can have iteration counts less than the
1631 vectorization factor. */
1632 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1633 {
1634 HOST_WIDE_INT max_niter;
1635
1636 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1637 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1638 else
1639 max_niter = max_stmt_executions_int (loop);
1640
1641 if (max_niter != -1
1642 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1643 {
1644 if (dump_enabled_p ())
1645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1646 "not vectorized: iteration count smaller than "
1647 "vectorization factor.\n");
1648 return 0;
1649 }
1650 }
1651
1652 int min_profitable_iters, min_profitable_estimate;
1653 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1654 &min_profitable_estimate);
1655
1656 if (min_profitable_iters < 0)
1657 {
1658 if (dump_enabled_p ())
1659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1660 "not vectorized: vectorization not profitable.\n");
1661 if (dump_enabled_p ())
1662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1663 "not vectorized: vector version will never be "
1664 "profitable.\n");
1665 return -1;
1666 }
1667
1668 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1669 * assumed_vf);
1670
1671 /* Use the cost model only if it is more conservative than user specified
1672 threshold. */
1673 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1674 min_profitable_iters);
1675
1676 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1677
1678 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1679 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1680 {
1681 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683 "not vectorized: vectorization not profitable.\n");
1684 if (dump_enabled_p ())
1685 dump_printf_loc (MSG_NOTE, vect_location,
1686 "not vectorized: iteration count smaller than user "
1687 "specified loop bound parameter or minimum profitable "
1688 "iterations (whichever is more conservative).\n");
1689 return 0;
1690 }
1691
1692 HOST_WIDE_INT estimated_niter;
1693
1694 /* If we are vectorizing an epilogue then we know the maximum number of
1695 scalar iterations it will cover is at least one lower than the
1696 vectorization factor of the main loop. */
1697 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1698 estimated_niter
1699 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1700 else
1701 {
1702 estimated_niter = estimated_stmt_executions_int (loop);
1703 if (estimated_niter == -1)
1704 estimated_niter = likely_max_stmt_executions_int (loop);
1705 }
1706 if (estimated_niter != -1
1707 && ((unsigned HOST_WIDE_INT) estimated_niter
1708 < MAX (th, (unsigned) min_profitable_estimate)))
1709 {
1710 if (dump_enabled_p ())
1711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1712 "not vectorized: estimated iteration count too "
1713 "small.\n");
1714 if (dump_enabled_p ())
1715 dump_printf_loc (MSG_NOTE, vect_location,
1716 "not vectorized: estimated iteration count smaller "
1717 "than specified loop bound parameter or minimum "
1718 "profitable iterations (whichever is more "
1719 "conservative).\n");
1720 return -1;
1721 }
1722
1723 return 1;
1724 }
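/* As an illustration of the threshold logic above (the numbers are purely
   hypothetical): with --param min-vect-loop-bound=3 and an assumed
   vectorization factor of 4, min_scalar_loop_bound is 3 * 4 = 12.  If the
   cost model reports min_profitable_iters = 10, the final threshold is
   th = MAX (12, 10) = 12, and a loop with a known iteration count of 8
   (which still satisfies 8 >= VF) is rejected as not profitable.  */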
1725
1726 static opt_result
1727 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1728 vec<data_reference_p> *datarefs,
1729 unsigned int *n_stmts)
1730 {
1731 *n_stmts = 0;
1732 for (unsigned i = 0; i < loop->num_nodes; i++)
1733 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1734 !gsi_end_p (gsi); gsi_next (&gsi))
1735 {
1736 gimple *stmt = gsi_stmt (gsi);
1737 if (is_gimple_debug (stmt))
1738 continue;
1739 ++(*n_stmts);
1740 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1741 if (!res)
1742 {
1743 if (is_gimple_call (stmt) && loop->safelen)
1744 {
1745 tree fndecl = gimple_call_fndecl (stmt), op;
1746 if (fndecl != NULL_TREE)
1747 {
1748 cgraph_node *node = cgraph_node::get (fndecl);
1749 if (node != NULL && node->simd_clones != NULL)
1750 {
1751 unsigned int j, n = gimple_call_num_args (stmt);
1752 for (j = 0; j < n; j++)
1753 {
1754 op = gimple_call_arg (stmt, j);
1755 if (DECL_P (op)
1756 || (REFERENCE_CLASS_P (op)
1757 && get_base_address (op)))
1758 break;
1759 }
1760 op = gimple_call_lhs (stmt);
1761 /* Ignore #pragma omp declare simd functions
1762 if they don't have data references in the
1763 call stmt itself. */
1764 if (j == n
1765 && !(op
1766 && (DECL_P (op)
1767 || (REFERENCE_CLASS_P (op)
1768 && get_base_address (op)))))
1769 continue;
1770 }
1771 }
1772 }
1773 return res;
1774 }
1775 /* If dependence analysis will give up due to the limit on the
1776 number of datarefs, stop here and fail fatally. */
1777 if (datarefs->length ()
1778 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1779 return opt_result::failure_at (stmt, "exceeded param "
1780 "loop-max-datarefs-for-datadeps\n");
1781 }
1782 return opt_result::success ();
1783 }
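/* For example (hypothetical source), in a loop such as

     #pragma omp declare simd
     extern int f (int);
     ...
     #pragma omp simd
     for (i = 0; i < n; i++)
       a[i] = f (b[i]);

   safelen is set and f has SIMD clones; the call is not known to be pure,
   so no data reference can be extracted from it, but because neither its
   arguments nor its lhs are DECLs or memory references the failure is
   ignored above and the call is left for the SIMD-clone handling later.  */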
1784
1785 /* Look for SLP-only access groups and turn each individual access into its own
1786 group. */
1787 static void
1788 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1789 {
1790 unsigned int i;
1791 struct data_reference *dr;
1792
1793 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1794
1795 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1796 FOR_EACH_VEC_ELT (datarefs, i, dr)
1797 {
1798 gcc_assert (DR_REF (dr));
1799 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1800
1801 /* Check if the load is a part of an interleaving chain. */
1802 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1803 {
1804 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1805 unsigned int group_size = DR_GROUP_SIZE (first_element);
1806
1807 /* Check whether this is an SLP-only group. */
1808 if (!STMT_SLP_TYPE (stmt_info)
1809 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1810 {
1811 /* Dissolve the group. */
1812 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1813
1814 stmt_vec_info vinfo = first_element;
1815 while (vinfo)
1816 {
1817 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1818 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1819 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1820 DR_GROUP_SIZE (vinfo) = 1;
1821 DR_GROUP_GAP (vinfo) = group_size - 1;
1822 vinfo = next;
1823 }
1824 }
1825 }
1826 }
1827 }
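/* For example, a grouped access A, B, C, D with DR_GROUP_SIZE 4 whose first
   element is marked STMT_VINFO_SLP_VECT_ONLY but which ends up not being
   SLP-vectorized is split by the loop above into four independent accesses,
   each with DR_GROUP_SIZE 1 and DR_GROUP_GAP 3 (the remaining elements of
   the former group).  */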
1828
1829
1830 /* Decides whether we need to create an epilogue loop to handle
1831 remaining scalar iterations and sets PEELING_FOR_NITERS accordingly. */
1832
1833 void
1834 determine_peel_for_niter (loop_vec_info loop_vinfo)
1835 {
1836 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1837
1838 unsigned HOST_WIDE_INT const_vf;
1839 HOST_WIDE_INT max_niter
1840 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1841
1842 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1843 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1844 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1845 (loop_vinfo));
1846
1847 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1848 /* The main loop handles all iterations. */
1849 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1850 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1851 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1852 {
1853 /* Work out the (constant) number of iterations that need to be
1854 peeled for reasons other than niters. */
1855 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1856 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1857 peel_niter += 1;
1858 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1859 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1860 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1861 }
1862 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1863 /* ??? When peeling for gaps but not alignment, we could
1864 try to check whether the (variable) niters is known to be
1865 VF * N + 1. That's something of a niche case though. */
1866 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1867 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1868 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1869 < (unsigned) exact_log2 (const_vf))
1870 /* In case of versioning, check if the maximum number of
1871 iterations is greater than th. If they are identical,
1872 the epilogue is unnecessary. */
1873 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1874 || ((unsigned HOST_WIDE_INT) max_niter
1875 > (th / const_vf) * const_vf))))
1876 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1877 }
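/* Worked example for the constant-niters case above (hypothetical numbers):
   with a known iteration count of 100, a vectorization factor of 8 and 3
   iterations peeled for alignment, 100 - 3 = 97 is not a multiple of 8, so
   PEELING_FOR_NITER is set.  With 99 iterations instead, 99 - 3 = 96 is a
   multiple of 8 and no epilogue is needed for niters.  */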
1878
1879
1880 /* Function vect_analyze_loop_2.
1881
1882 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1883 for it. The different analyses will record information in the
1884 loop_vec_info struct. */
1885 static opt_result
1886 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1887 {
1888 opt_result ok = opt_result::success ();
1889 int res;
1890 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1891 poly_uint64 min_vf = 2;
1892 loop_vec_info orig_loop_vinfo = NULL;
1893
1894 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
1895 loop_vec_info of the first vectorized loop. */
1896 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1897 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1898 else
1899 orig_loop_vinfo = loop_vinfo;
1900 gcc_assert (orig_loop_vinfo);
1901
1902 /* The first group of checks is independent of the vector size. */
1903 fatal = true;
1904
1905 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1906 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1907 return opt_result::failure_at (vect_location,
1908 "not vectorized: simd if(0)\n");
1909
1910 /* Find all data references in the loop (which correspond to vdefs/vuses)
1911 and analyze their evolution in the loop. */
1912
1913 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1914
1915 /* Gather the data references and count stmts in the loop. */
1916 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1917 {
1918 opt_result res
1919 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1920 &LOOP_VINFO_DATAREFS (loop_vinfo),
1921 n_stmts);
1922 if (!res)
1923 {
1924 if (dump_enabled_p ())
1925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1926 "not vectorized: loop contains function "
1927 "calls or data references that cannot "
1928 "be analyzed\n");
1929 return res;
1930 }
1931 loop_vinfo->shared->save_datarefs ();
1932 }
1933 else
1934 loop_vinfo->shared->check_datarefs ();
1935
1936 /* Analyze the data references and also adjust the minimal
1937 vectorization factor according to the loads and stores. */
1938
1939 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1940 if (!ok)
1941 {
1942 if (dump_enabled_p ())
1943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1944 "bad data references.\n");
1945 return ok;
1946 }
1947
1948 /* Classify all cross-iteration scalar data-flow cycles.
1949 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1950 vect_analyze_scalar_cycles (loop_vinfo);
1951
1952 vect_pattern_recog (loop_vinfo);
1953
1954 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1955
1956 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1957 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1958
1959 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1960 if (!ok)
1961 {
1962 if (dump_enabled_p ())
1963 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1964 "bad data access.\n");
1965 return ok;
1966 }
1967
1968 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1969
1970 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1971 if (!ok)
1972 {
1973 if (dump_enabled_p ())
1974 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1975 "unexpected pattern.\n");
1976 return ok;
1977 }
1978
1979 /* The rest of the analysis below depends on the vector size in some way, so from here on failures are not fatal. */
1980 fatal = false;
1981
1982 /* Analyze data dependences between the data-refs in the loop
1983 and adjust the maximum vectorization factor according to
1984 the dependences.
1985 FORNOW: fail at the first data dependence that we encounter. */
1986
1987 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1988 if (!ok)
1989 {
1990 if (dump_enabled_p ())
1991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1992 "bad data dependence.\n");
1993 return ok;
1994 }
1995 if (max_vf != MAX_VECTORIZATION_FACTOR
1996 && maybe_lt (max_vf, min_vf))
1997 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1998 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1999
2000 ok = vect_determine_vectorization_factor (loop_vinfo);
2001 if (!ok)
2002 {
2003 if (dump_enabled_p ())
2004 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2005 "can't determine vectorization factor.\n");
2006 return ok;
2007 }
2008 if (max_vf != MAX_VECTORIZATION_FACTOR
2009 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2010 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2011
2012 /* Compute the scalar iteration cost. */
2013 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2014
2015 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2016
2017 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2018 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2019 if (!ok)
2020 return ok;
2021
2022 /* If there are any SLP instances mark them as pure_slp. */
2023 bool slp = vect_make_slp_decision (loop_vinfo);
2024 if (slp)
2025 {
2026 /* Find stmts that need to be both vectorized and SLPed. */
2027 vect_detect_hybrid_slp (loop_vinfo);
2028
2029 /* Update the vectorization factor based on the SLP decision. */
2030 vect_update_vf_for_slp (loop_vinfo);
2031 }
2032
2033 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2034
2035 /* We don't expect to have to roll back to anything other than an empty
2036 set of rgroups. */
2037 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2038
2039 /* This is the point where we can re-start analysis with SLP forced off. */
2040 start_over:
2041
2042 /* Now the vectorization factor is final. */
2043 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2044 gcc_assert (known_ne (vectorization_factor, 0U));
2045
2046 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2047 {
2048 dump_printf_loc (MSG_NOTE, vect_location,
2049 "vectorization_factor = ");
2050 dump_dec (MSG_NOTE, vectorization_factor);
2051 dump_printf (MSG_NOTE, ", niters = %wd\n",
2052 LOOP_VINFO_INT_NITERS (loop_vinfo));
2053 }
2054
2055 /* Analyze the alignment of the data-refs in the loop.
2056 Fail if a data reference is found that cannot be vectorized. */
2057
2058 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2059 if (!ok)
2060 {
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2063 "bad data alignment.\n");
2064 return ok;
2065 }
2066
2067 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2068 It is important to call pruning after vect_analyze_data_ref_accesses,
2069 since we use grouping information gathered by interleaving analysis. */
2070 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2071 if (!ok)
2072 return ok;
2073
2074 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2075 vectorization, since we do not want to add extra peeling or
2076 add versioning for alignment. */
2077 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2078 /* This pass will decide on using loop versioning and/or loop peeling in
2079 order to enhance the alignment of data references in the loop. */
2080 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2081 else
2082 ok = vect_verify_datarefs_alignment (loop_vinfo);
2083 if (!ok)
2084 return ok;
2085
2086 if (slp)
2087 {
2088 /* Analyze operations in the SLP instances. Note this may
2089 remove unsupported SLP instances which makes the above
2090 SLP kind detection invalid. */
2091 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2092 vect_slp_analyze_operations (loop_vinfo);
2093 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2094 {
2095 ok = opt_result::failure_at (vect_location,
2096 "unsupported SLP instances\n");
2097 goto again;
2098 }
2099 }
2100
2101 /* Dissolve SLP-only groups. */
2102 vect_dissolve_slp_only_groups (loop_vinfo);
2103
2104 /* Scan all the remaining operations in the loop that are not subject
2105 to SLP and make sure they are vectorizable. */
2106 ok = vect_analyze_loop_operations (loop_vinfo);
2107 if (!ok)
2108 {
2109 if (dump_enabled_p ())
2110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2111 "bad operation or unsupported loop bound.\n");
2112 return ok;
2113 }
2114
2115 /* Decide whether to use a fully-masked loop for this vectorization
2116 factor. */
2117 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2118 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2119 && vect_verify_full_masking (loop_vinfo));
2120 if (dump_enabled_p ())
2121 {
2122 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2123 dump_printf_loc (MSG_NOTE, vect_location,
2124 "using a fully-masked loop.\n");
2125 else
2126 dump_printf_loc (MSG_NOTE, vect_location,
2127 "not using a fully-masked loop.\n");
2128 }
2129
2130 /* If epilog loop is required because of data accesses with gaps,
2131 one additional iteration needs to be peeled. Check if there are
2132 enough iterations for vectorization. */
2133 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2134 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2135 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2136 {
2137 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2138 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2139
2140 if (known_lt (wi::to_widest (scalar_niters), vf))
2141 return opt_result::failure_at (vect_location,
2142 "loop has no enough iterations to"
2143 " support peeling for gaps.\n");
2144 }
2145
2146 /* If we're vectorizing an epilogue loop, we either need a fully-masked
2147 loop or a loop that has a lower VF than the main loop. */
2148 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2149 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2150 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2151 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2152 return opt_result::failure_at (vect_location,
2153 "Vectorization factor too high for"
2154 " epilogue loop.\n");
2155
2156 /* Check that the costings of the loop make vectorizing worthwhile. */
2157 res = vect_analyze_loop_costing (loop_vinfo);
2158 if (res < 0)
2159 {
2160 ok = opt_result::failure_at (vect_location,
2161 "Loop costings may not be worthwhile.\n");
2162 goto again;
2163 }
2164 if (!res)
2165 return opt_result::failure_at (vect_location,
2166 "Loop costings not worthwhile.\n");
2167
2168 determine_peel_for_niter (loop_vinfo);
2169 /* If an epilogue loop is required make sure we can create one. */
2170 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2171 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2172 {
2173 if (dump_enabled_p ())
2174 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2175 if (!vect_can_advance_ivs_p (loop_vinfo)
2176 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2177 single_exit (LOOP_VINFO_LOOP
2178 (loop_vinfo))))
2179 {
2180 ok = opt_result::failure_at (vect_location,
2181 "not vectorized: can't create required "
2182 "epilog loop\n");
2183 goto again;
2184 }
2185 }
2186
2187 /* During peeling, we need to check if the number of loop iterations is
2188 enough for both the peeled prolog loop and the vector loop. This check
2189 can be merged along with threshold check of loop versioning, so
2190 increase threshold for this case if necessary.
2191
2192 If we are analyzing an epilogue we still want to check what its
2193 versioning threshold would be. If we decide to vectorize the epilogues we
2194 will want to use the lowest versioning threshold of all epilogues and main
2195 loop. This will enable us to enter a vectorized epilogue even when
2196 versioning the loop. We can't simply check whether the epilogue requires
2197 versioning though since we may have skipped some versioning checks when
2198 analyzing the epilogue. For instance, checks for alias versioning will be
2199 skipped when dealing with epilogues as we assume we already checked them
2200 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2201 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2202 {
2203 poly_uint64 niters_th = 0;
2204 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2205
2206 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2207 {
2208 /* Niters for peeled prolog loop. */
2209 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2210 {
2211 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2212 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2213 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2214 }
2215 else
2216 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2217 }
2218
2219 /* Niters for at least one iteration of vectorized loop. */
2220 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2221 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2222 /* One additional iteration because of peeling for gap. */
2223 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2224 niters_th += 1;
2225
2226 /* Use the same condition as vect_transform_loop to decide when to use
2227 the cost to determine a versioning threshold. */
2228 if (th >= vect_vf_for_cost (loop_vinfo)
2229 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2230 && ordered_p (th, niters_th))
2231 niters_th = ordered_max (poly_uint64 (th), niters_th);
2232
2233 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2234 }
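/* As an illustration (hypothetical numbers): for a loop that is not fully
   masked, with 2 iterations peeled for alignment, a vectorization factor of
   8 and peeling for gaps, niters_th is 2 + 8 + 1 = 11.  If the cost model
   threshold th is 16 (>= VF) and the iteration count is unknown, the
   versioning threshold is raised to 16 instead.  */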
2235
2236 gcc_assert (known_eq (vectorization_factor,
2237 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2238
2239 /* Ok to vectorize! */
2240 return opt_result::success ();
2241
2242 again:
2243 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2244 gcc_assert (!ok);
2245
2246 /* Try again with SLP forced off, but if we didn't do any SLP there is
2247 no point in re-trying. */
2248 if (!slp)
2249 return ok;
2250
2251 /* If there are reduction chains re-trying will fail anyway. */
2252 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2253 return ok;
2254
2255 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2256 via interleaving or lane instructions. */
2257 slp_instance instance;
2258 slp_tree node;
2259 unsigned i, j;
2260 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2261 {
2262 stmt_vec_info vinfo;
2263 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2264 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2265 continue;
2266 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2267 unsigned int size = DR_GROUP_SIZE (vinfo);
2268 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2269 if (! vect_store_lanes_supported (vectype, size, false)
2270 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2271 && ! vect_grouped_store_supported (vectype, size))
2272 return opt_result::failure_at (vinfo->stmt,
2273 "unsupported grouped store\n");
2274 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2275 {
2276 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2277 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2278 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2279 size = DR_GROUP_SIZE (vinfo);
2280 vectype = STMT_VINFO_VECTYPE (vinfo);
2281 if (! vect_load_lanes_supported (vectype, size, false)
2282 && ! vect_grouped_load_supported (vectype, single_element_p,
2283 size))
2284 return opt_result::failure_at (vinfo->stmt,
2285 "unsupported grouped load\n");
2286 }
2287 }
2288
2289 if (dump_enabled_p ())
2290 dump_printf_loc (MSG_NOTE, vect_location,
2291 "re-trying with SLP disabled\n");
2292
2293 /* Roll back state appropriately. No SLP this time. */
2294 slp = false;
2295 /* Restore vectorization factor as it were without SLP. */
2296 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2297 /* Free the SLP instances. */
2298 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2299 vect_free_slp_instance (instance, false);
2300 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2301 /* Reset SLP type to loop_vect on all stmts. */
2302 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2303 {
2304 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2305 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2306 !gsi_end_p (si); gsi_next (&si))
2307 {
2308 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2309 STMT_SLP_TYPE (stmt_info) = loop_vect;
2310 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2311 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2312 {
2313 /* vectorizable_reduction adjusts reduction stmt def-types,
2314 restore them to that of the PHI. */
2315 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2316 = STMT_VINFO_DEF_TYPE (stmt_info);
2317 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2318 (STMT_VINFO_REDUC_DEF (stmt_info)))
2319 = STMT_VINFO_DEF_TYPE (stmt_info);
2320 }
2321 }
2322 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2323 !gsi_end_p (si); gsi_next (&si))
2324 {
2325 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2326 STMT_SLP_TYPE (stmt_info) = loop_vect;
2327 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2328 {
2329 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2330 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2331 STMT_SLP_TYPE (stmt_info) = loop_vect;
2332 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2333 !gsi_end_p (pi); gsi_next (&pi))
2334 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2335 = loop_vect;
2336 }
2337 }
2338 }
2339 /* Free optimized alias test DDRS. */
2340 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2341 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2342 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2343 /* Reset target cost data. */
2344 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2345 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2346 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2347 /* Reset accumulated rgroup information. */
2348 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2349 /* Reset assorted flags. */
2350 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2351 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2352 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2353 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2354 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2355
2356 goto start_over;
2357 }
2358
2359 /* Function vect_analyze_loop.
2360
2361 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2362 for it. The different analyses will record information in the
2363 loop_vec_info struct. */
2364 opt_loop_vec_info
2365 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2366 {
2367 auto_vector_sizes vector_sizes;
2368
2369 /* Autodetect first vector size we try. */
2370 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
2371 loop->simdlen != 0);
2372 unsigned int next_size = 0;
2373
2374 DUMP_VECT_SCOPE ("analyze_loop_nest");
2375
2376 if (loop_outer (loop)
2377 && loop_vec_info_for_loop (loop_outer (loop))
2378 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2379 return opt_loop_vec_info::failure_at (vect_location,
2380 "outer-loop already vectorized.\n");
2381
2382 if (!find_loop_nest (loop, &shared->loop_nest))
2383 return opt_loop_vec_info::failure_at
2384 (vect_location,
2385 "not vectorized: loop nest containing two or more consecutive inner"
2386 " loops cannot be vectorized\n");
2387
2388 unsigned n_stmts = 0;
2389 poly_uint64 autodetected_vector_size = 0;
2390 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2391 poly_uint64 next_vector_size = 0;
2392 poly_uint64 lowest_th = 0;
2393 unsigned vectorized_loops = 0;
2394
2395 bool vect_epilogues = false;
2396 opt_result res = opt_result::success ();
2397 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2398 while (1)
2399 {
2400 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2401 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2402 if (!loop_vinfo)
2403 {
2404 if (dump_enabled_p ())
2405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2406 "bad loop form.\n");
2407 gcc_checking_assert (first_loop_vinfo == NULL);
2408 return loop_vinfo;
2409 }
2410 loop_vinfo->vector_size = next_vector_size;
2411
2412 bool fatal = false;
2413
2414 if (vect_epilogues)
2415 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2416
2417 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2418 if (next_size == 0)
2419 autodetected_vector_size = loop_vinfo->vector_size;
2420
2421 loop->aux = NULL;
2422 if (res)
2423 {
2424 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2425 vectorized_loops++;
2426
2427 /* Once we hit the desired simdlen for the first time,
2428 discard any previous attempts. */
2429 if (simdlen
2430 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2431 {
2432 delete first_loop_vinfo;
2433 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2434 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2435 simdlen = 0;
2436 }
2437
2438 if (first_loop_vinfo == NULL)
2439 {
2440 first_loop_vinfo = loop_vinfo;
2441 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2442 }
2443 else if (vect_epilogues)
2444 {
2445 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2446 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2447 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2448 || maybe_ne (lowest_th, 0U));
2449 /* Keep track of the known smallest versioning
2450 threshold. */
2451 if (ordered_p (lowest_th, th))
2452 lowest_th = ordered_min (lowest_th, th);
2453 }
2454 else
2455 delete loop_vinfo;
2456
2457 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2458 enabled, this is not a simd loop, and it is the innermost loop. */
2459 vect_epilogues = (!loop->simdlen
2460 && loop->inner == NULL
2461 && PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK)
2462 /* For now only allow one epilogue loop. */
2463 && first_loop_vinfo->epilogue_vinfos.is_empty ());
2464
2465 /* Commit to first_loop_vinfo if we have no reason to try
2466 alternatives. */
2467 if (!simdlen && !vect_epilogues)
2468 break;
2469 }
2470 else
2471 {
2472 delete loop_vinfo;
2473 if (fatal)
2474 {
2475 gcc_checking_assert (first_loop_vinfo == NULL);
2476 break;
2477 }
2478 }
2479
2480 if (next_size < vector_sizes.length ()
2481 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2482 next_size += 1;
2483
2484 if (next_size == vector_sizes.length ()
2485 || known_eq (autodetected_vector_size, 0U))
2486 break;
2487
2488 /* Try the next biggest vector size. */
2489 next_vector_size = vector_sizes[next_size++];
2490 if (dump_enabled_p ())
2491 {
2492 dump_printf_loc (MSG_NOTE, vect_location,
2493 "***** Re-trying analysis with "
2494 "vector size ");
2495 dump_dec (MSG_NOTE, next_vector_size);
2496 dump_printf (MSG_NOTE, "\n");
2497 }
2498 }
2499
2500 if (first_loop_vinfo)
2501 {
2502 loop->aux = (loop_vec_info) first_loop_vinfo;
2503 if (dump_enabled_p ())
2504 {
2505 dump_printf_loc (MSG_NOTE, vect_location,
2506 "***** Choosing vector size ");
2507 dump_dec (MSG_NOTE, first_loop_vinfo->vector_size);
2508 dump_printf (MSG_NOTE, "\n");
2509 }
2510 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2511 return first_loop_vinfo;
2512 }
2513
2514 return opt_loop_vec_info::propagate_failure (res);
2515 }
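/* For example, if the loop carries #pragma omp simd simdlen(8), any earlier
   successful analysis with a smaller vectorization factor is thrown away as
   soon as an attempt reaches VF == 8, and that attempt becomes
   first_loop_vinfo.  Without a simdlen request, the first successful
   analysis is kept, and with PARAM_VECT_EPILOGUES_NOMASK enabled the
   remaining vector sizes are analyzed as candidate epilogue loops of it for
   innermost loops (currently at most one such epilogue is recorded).  */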
2516
2517 /* Return true if there is an in-order reduction function for CODE, storing
2518 it in *REDUC_FN if so. */
2519
2520 static bool
2521 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2522 {
2523 switch (code)
2524 {
2525 case PLUS_EXPR:
2526 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2527 return true;
2528
2529 default:
2530 return false;
2531 }
2532 }
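/* For example, on targets that implement IFN_FOLD_LEFT_PLUS the strict sum

     double s = 0.0;
     for (i = 0; i < n; i++)
       s += a[i];

   can still be vectorized by accumulating the vector elements in order.
   No in-order internal function is known here for other codes, so e.g. an
   in-order MULT_EXPR reduction is not handled.  */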
2533
2534 /* Function reduction_fn_for_scalar_code
2535
2536 Input:
2537 CODE - tree_code of the reduction operation.
2538
2539 Output:
2540 REDUC_FN - the corresponding internal function to be used to reduce the
2541 vector of partial results into a single scalar result, or IFN_LAST
2542 if the operation is a supported reduction operation, but does not have
2543 such an internal function.
2544
2545 Return FALSE if CODE currently cannot be vectorized as reduction. */
2546
2547 static bool
2548 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2549 {
2550 switch (code)
2551 {
2552 case MAX_EXPR:
2553 *reduc_fn = IFN_REDUC_MAX;
2554 return true;
2555
2556 case MIN_EXPR:
2557 *reduc_fn = IFN_REDUC_MIN;
2558 return true;
2559
2560 case PLUS_EXPR:
2561 *reduc_fn = IFN_REDUC_PLUS;
2562 return true;
2563
2564 case BIT_AND_EXPR:
2565 *reduc_fn = IFN_REDUC_AND;
2566 return true;
2567
2568 case BIT_IOR_EXPR:
2569 *reduc_fn = IFN_REDUC_IOR;
2570 return true;
2571
2572 case BIT_XOR_EXPR:
2573 *reduc_fn = IFN_REDUC_XOR;
2574 return true;
2575
2576 case MULT_EXPR:
2577 case MINUS_EXPR:
2578 *reduc_fn = IFN_LAST;
2579 return true;
2580
2581 default:
2582 return false;
2583 }
2584 }
2585
2586 /* If there is a neutral value X such that SLP reduction NODE would not
2587 be affected by the introduction of additional X elements, return that X,
2588 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2589 is true if the SLP statements perform a single reduction, false if each
2590 statement performs an independent reduction. */
2591
2592 static tree
2593 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2594 bool reduc_chain)
2595 {
2596 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2597 stmt_vec_info stmt_vinfo = stmts[0];
2598 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2599 tree scalar_type = TREE_TYPE (vector_type);
2600 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2601 gcc_assert (loop);
2602
2603 switch (code)
2604 {
2605 case WIDEN_SUM_EXPR:
2606 case DOT_PROD_EXPR:
2607 case SAD_EXPR:
2608 case PLUS_EXPR:
2609 case MINUS_EXPR:
2610 case BIT_IOR_EXPR:
2611 case BIT_XOR_EXPR:
2612 return build_zero_cst (scalar_type);
2613
2614 case MULT_EXPR:
2615 return build_one_cst (scalar_type);
2616
2617 case BIT_AND_EXPR:
2618 return build_all_ones_cst (scalar_type);
2619
2620 case MAX_EXPR:
2621 case MIN_EXPR:
2622 /* For MIN/MAX the initial values are neutral. A reduction chain
2623 has only a single initial value, so that value is neutral for
2624 all statements. */
2625 if (reduc_chain)
2626 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2627 loop_preheader_edge (loop));
2628 return NULL_TREE;
2629
2630 default:
2631 return NULL_TREE;
2632 }
2633 }
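/* For example, an SLP reduction over three scalar statements that is
   widened to four vector lanes can be padded with one neutral element:
   0 for PLUS/MINUS/IOR/XOR (and the widening variants above), 1 for MULT,
   and an all-ones value for AND.  MIN/MAX have no universal neutral value,
   but a reduction chain has a single initial value which is neutral for
   every statement in the chain, so that value is reused.  */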
2634
2635 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2636 STMT is printed with a message MSG. */
2637
2638 static void
2639 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2640 {
2641 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2642 }
2643
2644 /* Return true if we need an in-order reduction for operation CODE
2645 on type TYPE. */
2647
2648 bool
2649 needs_fold_left_reduction_p (tree type, tree_code code)
2650 {
2651 /* CHECKME: check for !flag_finite_math_only too? */
2652 if (SCALAR_FLOAT_TYPE_P (type))
2653 switch (code)
2654 {
2655 case MIN_EXPR:
2656 case MAX_EXPR:
2657 return false;
2658
2659 default:
2660 return !flag_associative_math;
2661 }
2662
2663 if (INTEGRAL_TYPE_P (type))
2664 {
2665 if (!operation_no_trapping_overflow (type, code))
2666 return true;
2667 return false;
2668 }
2669
2670 if (SAT_FIXED_POINT_TYPE_P (type))
2671 return true;
2672
2673 return false;
2674 }
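/* For instance, floating-point addition needs an in-order reduction unless
   -fassociative-math is in effect, because reassociating the additions can
   change the rounded result, whereas float MIN/MAX can be reordered freely.
   An integer reduction needs in-order evaluation only if the operation may
   trap on overflow; saturating fixed-point reductions always do.  */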
2675
2676 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2677 has a handled computation expression. Store the main reduction
2678 operation in *CODE. */
2679
2680 static bool
2681 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2682 tree loop_arg, enum tree_code *code,
2683 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2684 {
2685 auto_bitmap visited;
2686 tree lookfor = PHI_RESULT (phi);
2687 ssa_op_iter curri;
2688 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2689 while (USE_FROM_PTR (curr) != loop_arg)
2690 curr = op_iter_next_use (&curri);
2691 curri.i = curri.numops;
2692 do
2693 {
2694 path.safe_push (std::make_pair (curri, curr));
2695 tree use = USE_FROM_PTR (curr);
2696 if (use == lookfor)
2697 break;
2698 gimple *def = SSA_NAME_DEF_STMT (use);
2699 if (gimple_nop_p (def)
2700 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2701 {
2702 pop:
2703 do
2704 {
2705 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2706 curri = x.first;
2707 curr = x.second;
2708 do
2709 curr = op_iter_next_use (&curri);
2710 /* Skip already visited or non-SSA operands (from iterating
2711 over PHI args). */
2712 while (curr != NULL_USE_OPERAND_P
2713 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2714 || ! bitmap_set_bit (visited,
2715 SSA_NAME_VERSION
2716 (USE_FROM_PTR (curr)))));
2717 }
2718 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2719 if (curr == NULL_USE_OPERAND_P)
2720 break;
2721 }
2722 else
2723 {
2724 if (gimple_code (def) == GIMPLE_PHI)
2725 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2726 else
2727 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2728 while (curr != NULL_USE_OPERAND_P
2729 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2730 || ! bitmap_set_bit (visited,
2731 SSA_NAME_VERSION
2732 (USE_FROM_PTR (curr)))))
2733 curr = op_iter_next_use (&curri);
2734 if (curr == NULL_USE_OPERAND_P)
2735 goto pop;
2736 }
2737 }
2738 while (1);
2739 if (dump_file && (dump_flags & TDF_DETAILS))
2740 {
2741 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2742 unsigned i;
2743 std::pair<ssa_op_iter, use_operand_p> *x;
2744 FOR_EACH_VEC_ELT (path, i, x)
2745 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2746 dump_printf (MSG_NOTE, "\n");
2747 }
2748
2749 /* Check whether the reduction path detected is valid. */
2750 bool fail = path.length () == 0;
2751 bool neg = false;
2752 int sign = -1;
2753 *code = ERROR_MARK;
2754 for (unsigned i = 1; i < path.length (); ++i)
2755 {
2756 gimple *use_stmt = USE_STMT (path[i].second);
2757 tree op = USE_FROM_PTR (path[i].second);
2758 if (! is_gimple_assign (use_stmt)
2759 /* The following makes sure we can compute the operand index
2760 easily, plus it mostly disallows chaining via COND_EXPR condition
2761 operands. */
2762 || (gimple_assign_rhs1 (use_stmt) != op
2763 && gimple_assign_rhs2 (use_stmt) != op
2764 && gimple_assign_rhs3 (use_stmt) != op))
2765 {
2766 fail = true;
2767 break;
2768 }
2769 /* Check that the op is used in only a single stmt inside
2770 the loop. */
2771 imm_use_iterator imm_iter;
2772 gimple *op_use_stmt;
2773 unsigned cnt = 0;
2774 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
2775 if (!is_gimple_debug (op_use_stmt)
2776 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
2777 cnt++;
2778 if (cnt != 1)
2779 {
2780 fail = true;
2781 break;
2782 }
2783 tree_code use_code = gimple_assign_rhs_code (use_stmt);
2784 if (use_code == MINUS_EXPR)
2785 {
2786 use_code = PLUS_EXPR;
2787 /* Track whether we negate the reduction value each iteration. */
2788 if (gimple_assign_rhs2 (use_stmt) == op)
2789 neg = ! neg;
2790 }
2791 if (CONVERT_EXPR_CODE_P (use_code)
2792 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
2793 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
2794 ;
2795 else if (*code == ERROR_MARK)
2796 {
2797 *code = use_code;
2798 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
2799 }
2800 else if (use_code != *code)
2801 {
2802 fail = true;
2803 break;
2804 }
2805 else if ((use_code == MIN_EXPR
2806 || use_code == MAX_EXPR)
2807 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
2808 {
2809 fail = true;
2810 break;
2811 }
2812 }
2813 return ! fail && ! neg && *code != ERROR_MARK;
2814 }
2815
2816 bool
2817 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2818 tree loop_arg, enum tree_code code)
2819 {
2820 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2821 enum tree_code code_;
2822 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
2823 && code_ == code);
2824 }
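/* As an illustration, for the canonical sum reduction

     s_1 = PHI <s_0(preheader), s_2(latch)>
     ...
     s_2 = s_1 + a[i];

   the path from the latch value s_2 back to the PHI result consists of a
   single PLUS_EXPR statement, so *CODE becomes PLUS_EXPR.  A path that
   mixes different codes (say PLUS_EXPR and MAX_EXPR), or in which an
   intermediate value has more than one use inside the loop, is rejected.  */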
2825
2826
2827
2828 /* Function vect_is_simple_reduction
2829
2830 (1) Detect a cross-iteration def-use cycle that represents a simple
2831 reduction computation. We look for the following pattern:
2832
2833 loop_header:
2834 a1 = phi < a0, a2 >
2835 a3 = ...
2836 a2 = operation (a3, a1)
2837
2838 or
2839
2840 a3 = ...
2841 loop_header:
2842 a1 = phi < a0, a2 >
2843 a2 = operation (a3, a1)
2844
2845 such that:
2846 1. operation is commutative and associative and it is safe to
2847 change the order of the computation
2848 2. no uses for a2 in the loop (a2 is used out of the loop)
2849 3. no uses of a1 in the loop besides the reduction operation
2850 4. no uses of a1 outside the loop.
2851
2852 Conditions 1,4 are tested here.
2853 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2854
2855 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2856 nested cycles.
2857
2858 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2859 reductions:
2860
2861 a1 = phi < a0, a2 >
2862 inner loop (def of a3)
2863 a2 = phi < a3 >
2864
2865 (4) Detect condition expressions, ie:
2866 for (int i = 0; i < N; i++)
2867 if (a[i] < val)
2868 ret_val = a[i];
2869
2870 */
2871
2872 static stmt_vec_info
2873 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2874 bool *double_reduc, bool *reduc_chain_p)
2875 {
2876 gphi *phi = as_a <gphi *> (phi_info->stmt);
2877 gimple *phi_use_stmt = NULL;
2878 imm_use_iterator imm_iter;
2879 use_operand_p use_p;
2880
2881 *double_reduc = false;
2882 *reduc_chain_p = false;
2883 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
2884
2885 tree phi_name = PHI_RESULT (phi);
2886 /* ??? If there are no uses of the PHI result the inner loop reduction
2887 won't be detected as possibly double-reduction by vectorizable_reduction
2888 because that tries to walk the PHI arg from the preheader edge which
2889 can be constant. See PR60382. */
2890 if (has_zero_uses (phi_name))
2891 return NULL;
2892 class loop *loop = (gimple_bb (phi))->loop_father;
2893 unsigned nphi_def_loop_uses = 0;
2894 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2895 {
2896 gimple *use_stmt = USE_STMT (use_p);
2897 if (is_gimple_debug (use_stmt))
2898 continue;
2899
2900 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2901 {
2902 if (dump_enabled_p ())
2903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2904 "intermediate value used outside loop.\n");
2905
2906 return NULL;
2907 }
2908
2909 nphi_def_loop_uses++;
2910 phi_use_stmt = use_stmt;
2911 }
2912
2913 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
2914 if (TREE_CODE (latch_def) != SSA_NAME)
2915 {
2916 if (dump_enabled_p ())
2917 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2918 "reduction: not ssa_name: %T\n", latch_def);
2919 return NULL;
2920 }
2921
2922 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
2923 if (!def_stmt_info
2924 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2925 return NULL;
2926
2927 bool nested_in_vect_loop
2928 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
2929 unsigned nlatch_def_loop_uses = 0;
2930 auto_vec<gphi *, 3> lcphis;
2931 bool inner_loop_of_double_reduc = false;
2932 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
2933 {
2934 gimple *use_stmt = USE_STMT (use_p);
2935 if (is_gimple_debug (use_stmt))
2936 continue;
2937 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2938 nlatch_def_loop_uses++;
2939 else
2940 {
2941 /* We can have more than one loop-closed PHI. */
2942 lcphis.safe_push (as_a <gphi *> (use_stmt));
2943 if (nested_in_vect_loop
2944 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2945 == vect_double_reduction_def))
2946 inner_loop_of_double_reduc = true;
2947 }
2948 }
2949
2950 /* If we are vectorizing an inner reduction, we execute it in the
2951 original order only when we are not dealing with a double
2952 reduction. */
2953 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
2954 {
2955 if (dump_enabled_p ())
2956 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
2957 "detected nested cycle: ");
2958 return def_stmt_info;
2959 }
2960
2961 /* If this isn't a nested cycle or if the nested cycle reduction value
2962 is used outside of the inner loop, we cannot handle uses of the reduction
2963 value. */
2964 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
2965 {
2966 if (dump_enabled_p ())
2967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2968 "reduction used in loop.\n");
2969 return NULL;
2970 }
2971
2972 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2973 defined in the inner loop. */
2974 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2975 {
2976 tree op1 = PHI_ARG_DEF (def_stmt, 0);
2977 if (gimple_phi_num_args (def_stmt) != 1
2978 || TREE_CODE (op1) != SSA_NAME)
2979 {
2980 if (dump_enabled_p ())
2981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2982 "unsupported phi node definition.\n");
2983
2984 return NULL;
2985 }
2986
2987 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2988 if (gimple_bb (def1)
2989 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2990 && loop->inner
2991 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2992 && is_gimple_assign (def1)
2993 && is_a <gphi *> (phi_use_stmt)
2994 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2995 {
2996 if (dump_enabled_p ())
2997 report_vect_op (MSG_NOTE, def_stmt,
2998 "detected double reduction: ");
2999
3000 *double_reduc = true;
3001 return def_stmt_info;
3002 }
3003
3004 return NULL;
3005 }
3006
3007 /* Look for the expression computing latch_def from the loop PHI result. */
3008 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3009 enum tree_code code;
3010 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3011 path))
3012 {
3013 STMT_VINFO_REDUC_CODE (phi_info) = code;
3014 if (code == COND_EXPR && !nested_in_vect_loop)
3015 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3016
3017 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3018 reduction chain for which the additional restriction is that
3019 all operations in the chain are the same. */
3020 auto_vec<stmt_vec_info, 8> reduc_chain;
3021 unsigned i;
3022 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3023 for (i = path.length () - 1; i >= 1; --i)
3024 {
3025 gimple *stmt = USE_STMT (path[i].second);
3026 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3027 STMT_VINFO_REDUC_IDX (stmt_info)
3028 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3029 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3030 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3031 && (i == 1 || i == path.length () - 1));
3032 if ((stmt_code != code && !leading_conversion)
3033 /* We can only handle the final value in epilogue
3034 generation for reduction chains. */
3035 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3036 is_slp_reduc = false;
3037 /* For reduction chains we support trailing/leading
3038 conversions. We do not store those in the actual chain. */
3039 if (leading_conversion)
3040 continue;
3041 reduc_chain.safe_push (stmt_info);
3042 }
3043 if (is_slp_reduc && reduc_chain.length () > 1)
3044 {
3045 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3046 {
3047 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3048 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3049 }
3050 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3051 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3052
3053 /* Save the chain for further analysis in SLP detection. */
3054 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3055 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3056
3057 *reduc_chain_p = true;
3058 if (dump_enabled_p ())
3059 dump_printf_loc (MSG_NOTE, vect_location,
3060 "reduction: detected reduction chain\n");
3061 }
3062 else if (dump_enabled_p ())
3063 dump_printf_loc (MSG_NOTE, vect_location,
3064 "reduction: detected reduction\n");
3065
3066 return def_stmt_info;
3067 }
3068
3069 if (dump_enabled_p ())
3070 dump_printf_loc (MSG_NOTE, vect_location,
3071 "reduction: unknown pattern\n");
3072
3073 return NULL;
3074 }
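/* For example (illustrative source), in

     for (i = 0; i < n; i++)
       {
         s += a[i];
         s += b[i];
       }

   both additions feed the same reduction PHI for s, form a valid reduction
   path with code PLUS_EXPR, and are linked together via
   REDUC_GROUP_FIRST/NEXT_ELEMENT as a reduction chain of size 2 that is
   recorded in LOOP_VINFO_REDUCTION_CHAINS for SLP detection.  */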
3075
3076 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3077 int
3078 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3079 int *peel_iters_epilogue,
3080 stmt_vector_for_cost *scalar_cost_vec,
3081 stmt_vector_for_cost *prologue_cost_vec,
3082 stmt_vector_for_cost *epilogue_cost_vec)
3083 {
3084 int retval = 0;
3085 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3086
3087 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3088 {
3089 *peel_iters_epilogue = assumed_vf / 2;
3090 if (dump_enabled_p ())
3091 dump_printf_loc (MSG_NOTE, vect_location,
3092 "cost model: epilogue peel iters set to vf/2 "
3093 "because loop iterations are unknown .\n");
3094
3095 /* If peeled iterations are known but the number of scalar loop
3096 iterations is unknown, count a taken branch per peeled loop. */
3097 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3098 NULL, 0, vect_prologue);
3099 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3100 NULL, 0, vect_epilogue);
3101 }
3102 else
3103 {
3104 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3105 peel_iters_prologue = niters < peel_iters_prologue ?
3106 niters : peel_iters_prologue;
3107 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3108 /* If we need to peel for gaps but no epilogue peeling would otherwise
3109 be required, we have to peel VF iterations. */
3110 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3111 *peel_iters_epilogue = assumed_vf;
3112 }
3113
3114 stmt_info_for_cost *si;
3115 int j;
3116 if (peel_iters_prologue)
3117 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3118 retval += record_stmt_cost (prologue_cost_vec,
3119 si->count * peel_iters_prologue,
3120 si->kind, si->stmt_info, si->misalign,
3121 vect_prologue);
3122 if (*peel_iters_epilogue)
3123 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3124 retval += record_stmt_cost (epilogue_cost_vec,
3125 si->count * *peel_iters_epilogue,
3126 si->kind, si->stmt_info, si->misalign,
3127 vect_epilogue);
3128
3129 return retval;
3130 }
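/* Worked example (hypothetical numbers): with a known iteration count of
   100, an assumed vectorization factor of 8 and PEEL_ITERS_PROLOGUE = 3,
   *PEEL_ITERS_EPILOGUE becomes (100 - 3) % 8 = 1, so every scalar statement
   cost is added three times to PROLOGUE_COST_VEC and once to
   EPILOGUE_COST_VEC.  If peeling for gaps were required and the remainder
   were 0, the epilogue would instead cover a full 8 iterations.  */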
3131
3132 /* Function vect_estimate_min_profitable_iters
3133
3134 Return the number of iterations required for the vector version of the
3135 loop to be profitable relative to the cost of the scalar version of the
3136 loop.
3137
3138 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3139 of iterations for vectorization. -1 value means loop vectorization
3140 is not profitable. This returned value may be used for dynamic
3141 profitability check.
3142
3143 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3144 for static check against estimated number of iterations. */
3145
3146 static void
3147 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3148 int *ret_min_profitable_niters,
3149 int *ret_min_profitable_estimate)
3150 {
3151 int min_profitable_iters;
3152 int min_profitable_estimate;
3153 int peel_iters_prologue;
3154 int peel_iters_epilogue;
3155 unsigned vec_inside_cost = 0;
3156 int vec_outside_cost = 0;
3157 unsigned vec_prologue_cost = 0;
3158 unsigned vec_epilogue_cost = 0;
3159 int scalar_single_iter_cost = 0;
3160 int scalar_outside_cost = 0;
3161 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3162 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3163 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3164
3165 /* Cost model disabled. */
3166 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3167 {
3168 if (dump_enabled_p ())
3169 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3170 *ret_min_profitable_niters = 0;
3171 *ret_min_profitable_estimate = 0;
3172 return;
3173 }
3174
3175 /* Requires loop versioning tests to handle misalignment. */
3176 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3177 {
3178 /* FIXME: Make cost depend on complexity of individual check. */
3179 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3180 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3181 vect_prologue);
3182 if (dump_enabled_p ())
3183 dump_printf (MSG_NOTE,
3184 "cost model: Adding cost of checks for loop "
3185 "versioning to treat misalignment.\n");
3186 }
3187
3188 /* Requires loop versioning with alias checks. */
3189 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3190 {
3191 /* FIXME: Make cost depend on complexity of individual check. */
3192 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3193 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3194 vect_prologue);
3195 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3196 if (len)
3197 /* Count LEN - 1 ANDs and LEN comparisons. */
3198 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3199 NULL, 0, vect_prologue);
3200 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3201 if (len)
3202 {
3203 /* Count LEN - 1 ANDs and LEN comparisons. */
3204 unsigned int nstmts = len * 2 - 1;
3205 /* +1 for each bias that needs adding. */
3206 for (unsigned int i = 0; i < len; ++i)
3207 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3208 nstmts += 1;
3209 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3210 NULL, 0, vect_prologue);
3211 }
3212 if (dump_enabled_p ())
3213 dump_printf (MSG_NOTE,
3214 "cost model: Adding cost of checks for loop "
3215 "versioning aliasing.\n");
3216 }
3217
3218 /* Requires loop versioning with niter checks. */
3219 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3220 {
3221 /* FIXME: Make cost depend on complexity of individual check. */
3222 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3223 vect_prologue);
3224 if (dump_enabled_p ())
3225 dump_printf (MSG_NOTE,
3226 "cost model: Adding cost of checks for loop "
3227 "versioning niters.\n");
3228 }
3229
3230 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3231 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3232 vect_prologue);
3233
3234 /* Count statements in scalar loop. Using this as scalar cost for a single
3235 iteration for now.
3236
3237 TODO: Add outer loop support.
3238
3239 TODO: Consider assigning different costs to different scalar
3240 statements. */
3241
3242 scalar_single_iter_cost
3243 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3244
3245 /* Add additional cost for the peeled instructions in prologue and epilogue
3246 loop. (For fully-masked loops there will be no peeling.)
3247
3248 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3249 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3250
3251 TODO: Build an expression that represents peel_iters for prologue and
3252 epilogue to be used in a run-time test. */
3253
3254 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3255 {
3256 peel_iters_prologue = 0;
3257 peel_iters_epilogue = 0;
3258
3259 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3260 {
3261 /* We need to peel exactly one iteration. */
3262 peel_iters_epilogue += 1;
3263 stmt_info_for_cost *si;
3264 int j;
3265 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3266 j, si)
3267 (void) add_stmt_cost (target_cost_data, si->count,
3268 si->kind, si->stmt_info, si->misalign,
3269 vect_epilogue);
3270 }
3271 }
3272 else if (npeel < 0)
3273 {
3274 peel_iters_prologue = assumed_vf / 2;
3275 if (dump_enabled_p ())
3276 dump_printf (MSG_NOTE, "cost model: "
3277 "prologue peel iters set to vf/2.\n");
3278
3279 /* If peeling for alignment is unknown, the loop bound of the main loop
3280 becomes unknown. */
3281 peel_iters_epilogue = assumed_vf / 2;
3282 if (dump_enabled_p ())
3283 dump_printf (MSG_NOTE, "cost model: "
3284 "epilogue peel iters set to vf/2 because "
3285 "peeling for alignment is unknown.\n");
3286
3287 /* If peeled iterations are unknown, count a taken branch and a not taken
3288 branch per peeled loop. Even if scalar loop iterations are known,
3289 vector iterations are not known since peeled prologue iterations are
3290 not known. Hence guards remain the same. */
3291 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3292 NULL, 0, vect_prologue);
3293 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3294 NULL, 0, vect_prologue);
3295 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3296 NULL, 0, vect_epilogue);
3297 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3298 NULL, 0, vect_epilogue);
3299 stmt_info_for_cost *si;
3300 int j;
3301 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3302 {
3303 (void) add_stmt_cost (target_cost_data,
3304 si->count * peel_iters_prologue,
3305 si->kind, si->stmt_info, si->misalign,
3306 vect_prologue);
3307 (void) add_stmt_cost (target_cost_data,
3308 si->count * peel_iters_epilogue,
3309 si->kind, si->stmt_info, si->misalign,
3310 vect_epilogue);
3311 }
3312 }
3313 else
3314 {
3315 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3316 stmt_info_for_cost *si;
3317 int j;
3318 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3319
3320 prologue_cost_vec.create (2);
3321 epilogue_cost_vec.create (2);
3322 peel_iters_prologue = npeel;
3323
3324 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3325 &peel_iters_epilogue,
3326 &LOOP_VINFO_SCALAR_ITERATION_COST
3327 (loop_vinfo),
3328 &prologue_cost_vec,
3329 &epilogue_cost_vec);
3330
3331 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3332 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3333 si->misalign, vect_prologue);
3334
3335 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3336 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3337 si->misalign, vect_epilogue);
3338
3339 prologue_cost_vec.release ();
3340 epilogue_cost_vec.release ();
3341 }
3342
3343 /* FORNOW: The scalar outside cost is incremented in one of the
3344 following ways:
3345
3346 1. The vectorizer checks for alignment and aliasing and generates
3347 a condition that allows dynamic vectorization. A cost model
3348 check is ANDED with the versioning condition. Hence scalar code
3349 path now has the added cost of the versioning check.
3350
3351 if (cost > th & versioning_check)
3352 jmp to vector code
3353
3354 Hence run-time scalar is incremented by not-taken branch cost.
3355
3356 2. The vectorizer then checks if a prologue is required. If the
3357 cost model check was not done before during versioning, it has to
3358 be done before the prologue check.
3359
3360 if (cost <= th)
3361 prologue = scalar_iters
3362 if (prologue == 0)
3363 jmp to vector code
3364 else
3365 execute prologue
3366 if (prologue == num_iters)
3367 go to exit
3368
3369 Hence the run-time scalar cost is incremented by a taken branch,
3370 plus a not-taken branch, plus a taken branch cost.
3371
3372 3. The vectorizer then checks if an epilogue is required. If the
3373 cost model check was not done before during prologue check, it
3374 has to be done with the epilogue check.
3375
3376 if (prologue == 0)
3377 jmp to vector code
3378 else
3379 execute prologue
3380 if (prologue == num_iters)
3381 go to exit
3382 vector code:
3383 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3384 jmp to epilogue
3385
3386 Hence the run-time scalar cost should be incremented by 2 taken
3387 branches.
3388
3389 TODO: The back end may reorder the BBs differently and reverse
3390 conditions/branch directions. Change the estimates below to
3391 something more reasonable. */
3392
3393 /* If the number of iterations is known and we do not do versioning, we can
3394 decide whether to vectorize at compile time. Hence the scalar version
3395 does not carry cost model guard costs. */
3396 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3397 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3398 {
3399 /* Cost model check occurs at versioning. */
3400 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3401 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3402 else
3403 {
3404 /* Cost model check occurs at prologue generation. */
3405 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3406 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3407 + vect_get_stmt_cost (cond_branch_not_taken);
3408 /* Cost model check occurs at epilogue generation. */
3409 else
3410 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3411 }
3412 }
3413
3414 /* Complete the target-specific cost calculations. */
3415 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3416 &vec_inside_cost, &vec_epilogue_cost);
3417
3418 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3419
3420 if (dump_enabled_p ())
3421 {
3422 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3423 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3424 vec_inside_cost);
3425 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3426 vec_prologue_cost);
3427 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3428 vec_epilogue_cost);
3429 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3430 scalar_single_iter_cost);
3431 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3432 scalar_outside_cost);
3433 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3434 vec_outside_cost);
3435 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3436 peel_iters_prologue);
3437 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3438 peel_iters_epilogue);
3439 }
3440
3441 /* Calculate the number of iterations required to make the vector version
3442 profitable, relative to the loop bodies only. The following condition
3443 must hold true:
3444 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3445 where
3446 SIC = scalar iteration cost, VIC = vector iteration cost,
3447 VOC = vector outside cost, VF = vectorization factor,
3448 NPEEL = prologue iterations + epilogue iterations,
3449 SOC = scalar outside cost for run time cost model check. */
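 /* As a purely illustrative example (the numbers below are not taken from
    any real target's cost tables): with SIC = 4, VIC = 6, VF = 4, NPEEL = 0
    and SOC = 0, a vector outside cost of VOC = 20 gives
      4 * niters > 6 * (niters / 4) + 20,
    which first holds at niters = 9, so the vector version would only pay
    off from nine scalar iterations onwards. */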
3450
3451 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3452 - vec_inside_cost);
3453 if (saving_per_viter <= 0)
3454 {
3455 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3456 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3457 "vectorization did not happen for a simd loop");
3458
3459 if (dump_enabled_p ())
3460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3461 "cost model: the vector iteration cost = %d "
3462 "divided by the scalar iteration cost = %d "
3463 "is greater or equal to the vectorization factor = %d"
3464 ".\n",
3465 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3466 *ret_min_profitable_niters = -1;
3467 *ret_min_profitable_estimate = -1;
3468 return;
3469 }
3470
3471 /* ??? The "if" arm is written to handle all cases; see below for what
3472 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3473 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3474 {
3475 /* Rewriting the condition above in terms of the number of
3476 vector iterations (vniters) rather than the number of
3477 scalar iterations (niters) gives:
3478
3479 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3480
3481 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3482
3483 For integer N, X and Y when X > 0:
3484
3485 N * X > Y <==> N >= (Y /[floor] X) + 1. */
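      /* For instance, with Y = 10 and X = 4, N * 4 > 10 first holds at
	 N = 10 / 4 + 1 = 3 (floor division): 2 * 4 = 8 is not greater
	 than 10, but 3 * 4 = 12 is. */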
3486 int outside_overhead = (vec_outside_cost
3487 - scalar_single_iter_cost * peel_iters_prologue
3488 - scalar_single_iter_cost * peel_iters_epilogue
3489 - scalar_outside_cost);
3490 /* We're only interested in cases that require at least one
3491 vector iteration. */
3492 int min_vec_niters = 1;
3493 if (outside_overhead > 0)
3494 min_vec_niters = outside_overhead / saving_per_viter + 1;
3495
3496 if (dump_enabled_p ())
3497 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3498 min_vec_niters);
3499
3500 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3501 {
3502 /* Now that we know the minimum number of vector iterations,
3503 find the minimum niters for which the scalar cost is larger:
3504
3505 SIC * niters > VIC * vniters + VOC - SOC
3506
3507 We know that the minimum niters is no more than
3508 vniters * VF + NPEEL, but it might be (and often is) less
3509 than that if a partial vector iteration is cheaper than the
3510 equivalent scalar code. */
3511 int threshold = (vec_inside_cost * min_vec_niters
3512 + vec_outside_cost
3513 - scalar_outside_cost);
3514 if (threshold <= 0)
3515 min_profitable_iters = 1;
3516 else
3517 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3518 }
3519 else
3520 /* Convert the number of vector iterations into a number of
3521 scalar iterations. */
3522 min_profitable_iters = (min_vec_niters * assumed_vf
3523 + peel_iters_prologue
3524 + peel_iters_epilogue);
3525 }
3526 else
3527 {
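      /* A sketch of the computation below, ignoring the flooring of
	 (niters - NPEEL) / VF and the rounding adjustment that follows:
	   SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
	 multiplied through by VF gives
	   niters * (SIC * VF - VIC) > (VOC - SOC) * VF - VIC * NPEEL,
	 which is what the division by saving_per_viter computes. */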
3528 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3529 * assumed_vf
3530 - vec_inside_cost * peel_iters_prologue
3531 - vec_inside_cost * peel_iters_epilogue);
3532 if (min_profitable_iters <= 0)
3533 min_profitable_iters = 0;
3534 else
3535 {
3536 min_profitable_iters /= saving_per_viter;
3537
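	  /* The truncating division above can undershoot; if the truncated
	     value does not yet satisfy the (scaled) profitability
	     inequality, round up by one iteration. */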
3538 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3539 <= (((int) vec_inside_cost * min_profitable_iters)
3540 + (((int) vec_outside_cost - scalar_outside_cost)
3541 * assumed_vf)))
3542 min_profitable_iters++;
3543 }
3544 }
3545
3546 if (dump_enabled_p ())
3547 dump_printf (MSG_NOTE,
3548 " Calculated minimum iters for profitability: %d\n",
3549 min_profitable_iters);
3550
3551 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3552 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3553 /* We want the vectorized loop to execute at least once. */
3554 min_profitable_iters = assumed_vf + peel_iters_prologue;
3555
3556 if (dump_enabled_p ())
3557 dump_printf_loc (MSG_NOTE, vect_location,
3558 " Runtime profitability threshold = %d\n",
3559 min_profitable_iters);
3560
3561 *ret_min_profitable_niters = min_profitable_iters;
3562
3563 /* Calculate the number of iterations required to make the vector version
3564 profitable, relative to the loop bodies only.
3565
3566 The non-vectorized variant costs SIC * niters and it must win over the
3567 vector variant on the expected loop trip count. The following condition must hold true:
3568 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
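 /* Continuing the illustrative numbers used above (SIC = 4, VIC = 6, VF = 4,
    VOC = 20, NPEEL = 0), but now charging a hypothetical SOC = 12 to the
    vector side, gives 4 * niters > 6 * (niters / 4) + 32, which first holds
    at niters = 13. */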
3569
3570 if (vec_outside_cost <= 0)
3571 min_profitable_estimate = 0;
3572 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3573 {
3574 /* This is a repeat of the code above, but with + SOC rather
3575 than - SOC. */
3576 int outside_overhead = (vec_outside_cost
3577 - scalar_single_iter_cost * peel_iters_prologue
3578 - scalar_single_iter_cost * peel_iters_epilogue
3579 + scalar_outside_cost);
3580 int min_vec_niters = 1;
3581 if (outside_overhead > 0)
3582 min_vec_niters = outside_overhead / saving_per_viter + 1;
3583
3584 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3585 {
3586 int threshold = (vec_inside_cost * min_vec_niters
3587 + vec_outside_cost
3588 + scalar_outside_cost);
3589 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3590 }
3591 else
3592 min_profitable_estimate = (min_vec_niters * assumed_vf
3593 + peel_iters_prologue
3594 + peel_iters_epilogue);
3595 }
3596 else
3597 {
3598 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3599 * assumed_vf
3600 - vec_inside_cost * peel_iters_prologue
3601 - vec_inside_cost * peel_iters_epilogue)
3602 / ((scalar_single_iter_cost * assumed_vf)
3603 - vec_inside_cost);
3604 }
3605 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3606 if (dump_enabled_p ())
3607 dump_printf_loc (MSG_NOTE, vect_location,
3608 " Static estimate profitability threshold = %d\n",
3609 min_profitable_estimate);
3610
3611 *ret_min_profitable_estimate = min_profitable_estimate;
3612 }
3613
3614 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3615 vector elements (not bits) for a vector with NELT elements. */
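/* For instance, for OFFSET = 2 and NELT = 8 the stepped encoding {2, 3, 4}
   expands to the selector {2, 3, 4, 5, 6, 7, 8, 9}, i.e. elements 2..7 of
   the first input followed by elements 0..1 of the second input. */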
3616 static void
3617 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3618 vec_perm_builder *sel)
3619 {
3620 /* The encoding is a single stepped pattern. Any wrap-around is handled
3621 by vec_perm_indices. */
3622 sel->new_vector (nelt, 1, 3);
3623 for (unsigned int i = 0; i < 3; i++)
3624 sel->quick_push (i + offset);
3625 }
3626
3627 /* Checks whether the target supports whole-vector shifts for vectors of mode
3628 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3629 it supports vec_perm_const with masks for all necessary shift amounts. */
3630 static bool
3631 have_whole_vector_shift (machine_mode mode)
3632 {
3633 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3634 return true;
3635
3636 /* Variable-length vectors should be handled via the optab. */
3637 unsigned int nelt;
3638 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3639 return false;
3640
3641 vec_perm_builder sel;
3642 vec_perm_indices indices;
3643 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3644 {
3645 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3646 indices.new_vector (sel, 2, nelt);
3647 if (!can_vec_perm_const_p (mode, indices, false))
3648 return false;
3649 }
3650 return true;
3651 }
3652
3653 /* TODO: There is a close dependency between the vect_model_*_cost and
3654 vectorizable_* functions. Design this better to avoid maintenance issues. */
3655
3656 /* Function vect_model_reduction_cost.
3657
3658 Models cost for a reduction operation, including the vector ops
3659 generated within the strip-mine loop, the initial definition before
3660 the loop, and the epilogue code that must be generated. */
3661
3662 static void
3663 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3664 vect_reduction_type reduction_type,
3665 int ncopies, stmt_vector_for_cost *cost_vec)
3666 {
3667 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3668 enum tree_code code;
3669 optab optab;
3670 tree vectype;
3671 machine_mode mode;
3672 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3673 class loop *loop = NULL;
3674
3675 if (loop_vinfo)
3676 loop = LOOP_VINFO_LOOP (loop_vinfo);
3677
3678 /* Condition reductions generate two reductions in the loop. */
3679 if (reduction_type == COND_REDUCTION)
3680 ncopies *= 2;
3681
3682 vectype = STMT_VINFO_VECTYPE (stmt_info);
3683 mode = TYPE_MODE (vectype);
3684 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3685
3686 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3687
3688 if (reduction_type == EXTRACT_LAST_REDUCTION
3689 || reduction_type == FOLD_LEFT_REDUCTION)
3690 {
3691 /* No extra instructions needed in the prologue. */
3692 prologue_cost = 0;
3693
3694 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3695 /* Count one reduction-like operation per vector. */
3696 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3697 stmt_info, 0, vect_body);
3698 else
3699 {
3700 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3701 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3702 inside_cost = record_stmt_cost (cost_vec, nelements,
3703 vec_to_scalar, stmt_info, 0,
3704 vect_body);
3705 inside_cost += record_stmt_cost (cost_vec, nelements,
3706 scalar_stmt, stmt_info, 0,
3707 vect_body);
3708 }
3709 }
3710 else
3711 {
3712 /* Add in cost for initial definition.
3713 For cond reduction we have four vectors: initial index, step,
3714 initial result of the data reduction, initial value of the index
3715 reduction. */
3716 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3717 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3718 scalar_to_vec, stmt_info, 0,
3719 vect_prologue);
3720
3721 /* Cost of reduction op inside loop. */
3722 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3723 stmt_info, 0, vect_body);
3724 }
3725
3726 /* Determine cost of epilogue code.
3727
3728 We have a reduction operator that will reduce the vector in one statement.
3729 Also requires scalar extract. */
3730
3731 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3732 {
3733 if (reduc_fn != IFN_LAST)
3734 {
3735 if (reduction_type == COND_REDUCTION)
3736 {
3737 /* An EQ stmt and a COND_EXPR stmt. */
3738 epilogue_cost += record_stmt_cost (cost_vec, 2,
3739 vector_stmt, stmt_info, 0,
3740 vect_epilogue);
3741 /* Reduction of the max index and a reduction of the found
3742 values. */
3743 epilogue_cost += record_stmt_cost (cost_vec, 2,
3744 vec_to_scalar, stmt_info, 0,
3745 vect_epilogue);
3746 /* A broadcast of the max value. */
3747 epilogue_cost += record_stmt_cost (cost_vec, 1,
3748 scalar_to_vec, stmt_info, 0,
3749 vect_epilogue);
3750 }
3751 else
3752 {
3753 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3754 stmt_info, 0, vect_epilogue);
3755 epilogue_cost += record_stmt_cost (cost_vec, 1,
3756 vec_to_scalar, stmt_info, 0,
3757 vect_epilogue);
3758 }
3759 }
3760 else if (reduction_type == COND_REDUCTION)
3761 {
3762 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3763 /* Extraction of scalar elements. */
3764 epilogue_cost += record_stmt_cost (cost_vec,
3765 2 * estimated_nunits,
3766 vec_to_scalar, stmt_info, 0,
3767 vect_epilogue);
3768 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3769 epilogue_cost += record_stmt_cost (cost_vec,
3770 2 * estimated_nunits - 3,
3771 scalar_stmt, stmt_info, 0,
3772 vect_epilogue);
3773 }
3774 else if (reduction_type == EXTRACT_LAST_REDUCTION
3775 || reduction_type == FOLD_LEFT_REDUCTION)
3776 /* No extra instructions needed in the epilogue. */
3777 ;
3778 else
3779 {
3780 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3781 tree bitsize =
3782 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3783 int element_bitsize = tree_to_uhwi (bitsize);
3784 int nelements = vec_size_in_bits / element_bitsize;
3785
3786 if (code == COND_EXPR)
3787 code = MAX_EXPR;
3788
3789 optab = optab_for_tree_code (code, vectype, optab_default);
3790
3791 /* We have a whole vector shift available. */
3792 if (optab != unknown_optab
3793 && VECTOR_MODE_P (mode)
3794 && optab_handler (optab, mode) != CODE_FOR_nothing
3795 && have_whole_vector_shift (mode))
3796 {
3797 /* Final reduction via vector shifts and the reduction operator.
3798 Also requires scalar extract. */
3799 epilogue_cost += record_stmt_cost (cost_vec,
3800 exact_log2 (nelements) * 2,
3801 vector_stmt, stmt_info, 0,
3802 vect_epilogue);
3803 epilogue_cost += record_stmt_cost (cost_vec, 1,
3804 vec_to_scalar, stmt_info, 0,
3805 vect_epilogue);
3806 }
3807 else
3808 /* Use extracts and reduction op for final reduction. For N
3809 elements, we have N extracts and N-1 reduction ops. */
3810 epilogue_cost += record_stmt_cost (cost_vec,
3811 nelements + nelements - 1,
3812 vector_stmt, stmt_info, 0,
3813 vect_epilogue);
3814 }
3815 }
3816
3817 if (dump_enabled_p ())
3818 dump_printf (MSG_NOTE,
3819 "vect_model_reduction_cost: inside_cost = %d, "
3820 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3821 prologue_cost, epilogue_cost);
3822 }
3823
3824
3825 /* Function vect_model_induction_cost.
3826
3827 Models cost for induction operations. */
3828
3829 static void
3830 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3831 stmt_vector_for_cost *cost_vec)
3832 {
3833 unsigned inside_cost, prologue_cost;
3834
3835 if (PURE_SLP_STMT (stmt_info))
3836 return;
3837
3838 /* loop cost for vec_loop. */
3839 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3840 stmt_info, 0, vect_body);
3841
3842 /* prologue cost for vec_init and vec_step. */
3843 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3844 stmt_info, 0, vect_prologue);
3845
3846 if (dump_enabled_p ())
3847 dump_printf_loc (MSG_NOTE, vect_location,
3848 "vect_model_induction_cost: inside_cost = %d, "
3849 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3850 }
3851
3852
3853
3854 /* Function get_initial_def_for_reduction
3855
3856 Input:
3857 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3858 INIT_VAL - the initial value of the reduction variable
3859
3860 Output:
3861 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3862 of the reduction (used for adjusting the epilog - see below).
3863 Return a vector variable, initialized according to the operation that
3864 STMT_VINFO performs. This vector will be used as the initial value
3865 of the vector of partial results.
3866
3867 Option1 (adjust in epilog): Initialize the vector as follows:
3868 add/bit or/xor: [0,0,...,0,0]
3869 mult/bit and: [1,1,...,1,1]
3870 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3871 and when necessary (e.g. add/mult case) let the caller know
3872 that it needs to adjust the result by init_val.
3873
3874 Option2: Initialize the vector as follows:
3875 add/bit or/xor: [init_val,0,0,...,0]
3876 mult/bit and: [init_val,1,1,...,1]
3877 min/max/cond_expr: [init_val,init_val,...,init_val]
3878 and no adjustments are needed.
3879
3880 For example, for the following code:
3881
3882 s = init_val;
3883 for (i=0;i<n;i++)
3884 s = s + a[i];
3885
3886 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
3887 For a vector of 4 units, we want to return either [0,0,0,init_val],
3888 or [0,0,0,0] and let the caller know that it needs to adjust
3889 the result at the end by 'init_val'.
3890
3891 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3892 is not NULL, because this way the initialization vector is simpler (same
3893 element in all entries), and Option2 otherwise.
3894
3895 A cost model should help decide between these two schemes. */
3896
3897 static tree
3898 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo,
3899 enum tree_code code, tree init_val,
3900 tree *adjustment_def)
3901 {
3902 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3903 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3904 tree scalar_type = TREE_TYPE (init_val);
3905 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
3906 tree def_for_init;
3907 tree init_def;
3908 REAL_VALUE_TYPE real_init_val = dconst0;
3909 int int_init_val = 0;
3910 gimple_seq stmts = NULL;
3911
3912 gcc_assert (vectype);
3913
3914 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3915 || SCALAR_FLOAT_TYPE_P (scalar_type));
3916
3917 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
3918 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
3919
3920 /* ADJUSTMENT_DEF is NULL when called from
3921 vect_create_epilog_for_reduction to vectorize double reduction. */
3922 if (adjustment_def)
3923 *adjustment_def = NULL;
3924
3925 switch (code)
3926 {
3927 case WIDEN_SUM_EXPR:
3928 case DOT_PROD_EXPR:
3929 case SAD_EXPR:
3930 case PLUS_EXPR:
3931 case MINUS_EXPR:
3932 case BIT_IOR_EXPR:
3933 case BIT_XOR_EXPR:
3934 case MULT_EXPR:
3935 case BIT_AND_EXPR:
3936 {
3937 if (code == MULT_EXPR)
3938 {
3939 real_init_val = dconst1;
3940 int_init_val = 1;
3941 }
3942
3943 if (code == BIT_AND_EXPR)
3944 int_init_val = -1;
3945
3946 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3947 def_for_init = build_real (scalar_type, real_init_val);
3948 else
3949 def_for_init = build_int_cst (scalar_type, int_init_val);
3950
3951 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
3952 {
3953 /* Option1: the first element is '0' or '1' as well. */
3954 if (!operand_equal_p (def_for_init, init_val, 0))
3955 *adjustment_def = init_val;
3956 init_def = gimple_build_vector_from_val (&stmts, vectype,
3957 def_for_init);
3958 }
3959 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
3960 {
3961 /* Option2 (variable length): the first element is INIT_VAL. */
3962 init_def = gimple_build_vector_from_val (&stmts, vectype,
3963 def_for_init);
3964 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
3965 vectype, init_def, init_val);
3966 }
3967 else
3968 {
3969 /* Option2: the first element is INIT_VAL. */
3970 tree_vector_builder elts (vectype, 1, 2);
3971 elts.quick_push (init_val);
3972 elts.quick_push (def_for_init);
3973 init_def = gimple_build_vector (&stmts, &elts);
3974 }
3975 }
3976 break;
3977
3978 case MIN_EXPR:
3979 case MAX_EXPR:
3980 case COND_EXPR:
3981 {
3982 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
3983 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
3984 }
3985 break;
3986
3987 default:
3988 gcc_unreachable ();
3989 }
3990
3991 if (stmts)
3992 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
3993 return init_def;
3994 }
3995
3996 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
3997 NUMBER_OF_VECTORS is the number of vector defs to create.
3998 If NEUTRAL_OP is nonnull, introducing extra elements of that
3999 value will not change the result. */
4000
4001 static void
4002 get_initial_defs_for_reduction (slp_tree slp_node,
4003 vec<tree> *vec_oprnds,
4004 unsigned int number_of_vectors,
4005 bool reduc_chain, tree neutral_op)
4006 {
4007 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4008 stmt_vec_info stmt_vinfo = stmts[0];
4009 vec_info *vinfo = stmt_vinfo->vinfo;
4010 unsigned HOST_WIDE_INT nunits;
4011 unsigned j, number_of_places_left_in_vector;
4012 tree vector_type;
4013 unsigned int group_size = stmts.length ();
4014 unsigned int i;
4015 class loop *loop;
4016
4017 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4018
4019 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4020
4021 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4022 gcc_assert (loop);
4023 edge pe = loop_preheader_edge (loop);
4024
4025 gcc_assert (!reduc_chain || neutral_op);
4026
4027 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4028 created vectors. It is greater than 1 if unrolling is performed.
4029
4030 For example, we have two scalar operands, s1 and s2 (e.g., group of
4031 strided accesses of size two), while NUNITS is four (i.e., four scalars
4032 of this type can be packed in a vector). The output vector will contain
4033 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4034 will be 2).
4035
4036 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4037 vectors containing the operands.
4038
4039 For example, NUNITS is four as before, and the group size is 8
4040 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4041 {s5, s6, s7, s8}. */
4042
4043 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4044 nunits = group_size;
4045
4046 number_of_places_left_in_vector = nunits;
4047 bool constant_p = true;
4048 tree_vector_builder elts (vector_type, nunits, 1);
4049 elts.quick_grow (nunits);
4050 gimple_seq ctor_seq = NULL;
4051 for (j = 0; j < nunits * number_of_vectors; ++j)
4052 {
4053 tree op;
4054 i = j % group_size;
4055 stmt_vinfo = stmts[i];
4056
4057 /* Get the def before the loop. In a reduction chain we have only one
4058 initial value; otherwise we have as many as there are PHIs in the group. */
4059 if (reduc_chain)
4060 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4061 else if (((vec_oprnds->length () + 1) * nunits
4062 - number_of_places_left_in_vector >= group_size)
4063 && neutral_op)
4064 op = neutral_op;
4065 else
4066 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4067
4068 /* Create 'vect_ = {op0,op1,...,opn}'. */
4069 number_of_places_left_in_vector--;
4070 elts[nunits - number_of_places_left_in_vector - 1] = op;
4071 if (!CONSTANT_CLASS_P (op))
4072 constant_p = false;
4073
4074 if (number_of_places_left_in_vector == 0)
4075 {
4076 tree init;
4077 if (constant_p && !neutral_op
4078 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4079 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4080 /* Build the vector directly from ELTS. */
4081 init = gimple_build_vector (&ctor_seq, &elts);
4082 else if (neutral_op)
4083 {
4084 /* Build a vector of the neutral value and shift the
4085 other elements into place. */
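	      /* E.g. for ELTS = { x, y, n, n } with neutral value n this
		 starts from { n, n, n, n } and shl-inserts y and then x,
		 giving { x, y, n, n }. */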
4086 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4087 neutral_op);
4088 int k = nunits;
4089 while (k > 0 && elts[k - 1] == neutral_op)
4090 k -= 1;
4091 while (k > 0)
4092 {
4093 k -= 1;
4094 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4095 vector_type, init, elts[k]);
4096 }
4097 }
4098 else
4099 {
4100 /* First time round, duplicate ELTS to fill the
4101 required number of vectors. */
4102 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4103 number_of_vectors, *vec_oprnds);
4104 break;
4105 }
4106 vec_oprnds->quick_push (init);
4107
4108 number_of_places_left_in_vector = nunits;
4109 elts.new_vector (vector_type, nunits, 1);
4110 elts.quick_grow (nunits);
4111 constant_p = true;
4112 }
4113 }
4114 if (ctor_seq != NULL)
4115 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4116 }
4117
4118 /* For a statement STMT_INFO taking part in a reduction operation return
4119 the stmt_vec_info that the meta information is stored on. */
4120
4121 stmt_vec_info
4122 info_for_reduction (stmt_vec_info stmt_info)
4123 {
4124 stmt_info = vect_orig_stmt (stmt_info);
4125 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4126 if (!is_a <gphi *> (stmt_info->stmt))
4127 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4128 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4129 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4130 {
4131 if (gimple_phi_num_args (phi) == 1)
4132 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4133 }
4134 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4135 {
4136 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4137 stmt_vec_info info
4138 = stmt_info->vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4139 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4140 stmt_info = info;
4141 }
4142 return stmt_info;
4143 }
4144
4145 /* Function vect_create_epilog_for_reduction
4146
4147 Create code at the loop-epilog to finalize the result of a reduction
4148 computation.
4149
4150 STMT_INFO is the scalar reduction stmt that is being vectorized.
4151 SLP_NODE is an SLP node containing a group of reduction statements. The
4152 first one in this group is STMT_INFO.
4153 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE.
4156
4157 This function:
4158 1. Completes the reduction def-use cycles.
4159 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4160 by calling the function specified by REDUC_FN if available, or by
4161 other means (whole-vector shifts or a scalar loop).
4162 The function also creates a new phi node at the loop exit to preserve
4163 loop-closed form, as illustrated below.
4164
4165 The flow at the entry to this function:
4166
4167 loop:
4168 vec_def = phi <vec_init, null> # REDUCTION_PHI
4169 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4170 s_loop = scalar_stmt # (scalar) STMT_INFO
4171 loop_exit:
4172 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4173 use <s_out0>
4174 use <s_out0>
4175
4176 The above is transformed by this function into:
4177
4178 loop:
4179 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4180 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4181 s_loop = scalar_stmt # (scalar) STMT_INFO
4182 loop_exit:
4183 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4184 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4185 v_out2 = reduce <v_out1>
4186 s_out3 = extract_field <v_out2, 0>
4187 s_out4 = adjust_result <s_out3>
4188 use <s_out4>
4189 use <s_out4>
4190 */
4191
4192 static void
4193 vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
4194 slp_tree slp_node,
4195 slp_instance slp_node_instance)
4196 {
4197 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
4198 gcc_assert (reduc_info->is_reduc_info);
4199 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4200 /* For double reductions we need to get at the inner loop reduction
4201 stmt which has the meta info attached. Our stmt_info is that of the
4202 loop-closed PHI of the inner loop which we remember as
4203 def for the reduction PHI generation. */
4204 bool double_reduc = false;
4205 stmt_vec_info rdef_info = stmt_info;
4206 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4207 {
4208 gcc_assert (!slp_node);
4209 double_reduc = true;
4210 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4211 (stmt_info->stmt, 0));
4212 stmt_info = vect_stmt_to_vectorize (stmt_info);
4213 }
4214 gphi *reduc_def_stmt
4215 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4216 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4217 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4218 tree neutral_op = NULL_TREE;
4219 if (slp_node)
4220 neutral_op
4221 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
4222 REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4223 stmt_vec_info prev_phi_info;
4224 tree vectype;
4225 machine_mode mode;
4226 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4227 basic_block exit_bb;
4228 tree scalar_dest;
4229 tree scalar_type;
4230 gimple *new_phi = NULL, *phi;
4231 stmt_vec_info phi_info;
4232 gimple_stmt_iterator exit_gsi;
4233 tree vec_dest;
4234 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4235 gimple *epilog_stmt = NULL;
4236 gimple *exit_phi;
4237 tree bitsize;
4238 tree def;
4239 tree orig_name, scalar_result;
4240 imm_use_iterator imm_iter, phi_imm_iter;
4241 use_operand_p use_p, phi_use_p;
4242 gimple *use_stmt;
4243 bool nested_in_vect_loop = false;
4244 auto_vec<gimple *> new_phis;
4245 int j, i;
4246 auto_vec<tree> scalar_results;
4247 unsigned int group_size = 1, k;
4248 auto_vec<gimple *> phis;
4249 bool slp_reduc = false;
4250 bool direct_slp_reduc;
4251 tree new_phi_result;
4252 tree induction_index = NULL_TREE;
4253
4254 if (slp_node)
4255 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4256
4257 if (nested_in_vect_loop_p (loop, stmt_info))
4258 {
4259 outer_loop = loop;
4260 loop = loop->inner;
4261 nested_in_vect_loop = true;
4262 gcc_assert (!slp_node);
4263 }
4264 gcc_assert (!nested_in_vect_loop || double_reduc);
4265
4266 vectype = STMT_VINFO_VECTYPE (stmt_info);
4267 gcc_assert (vectype);
4268 mode = TYPE_MODE (vectype);
4269
4270 tree initial_def = NULL;
4271 tree induc_val = NULL_TREE;
4272 tree adjustment_def = NULL;
4273 if (slp_node)
4274 ;
4275 else
4276 {
4277 /* Get at the scalar def before the loop, that defines the initial value
4278 of the reduction variable. */
4279 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4280 loop_preheader_edge (loop));
4281 /* Optimize: for induction condition reduction, if we can't use zero
4282 for induc_val, use initial_def. */
4283 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4284 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4285 else if (double_reduc)
4286 ;
4287 else if (nested_in_vect_loop)
4288 ;
4289 else
4290 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4291 }
4292
4293 unsigned vec_num;
4294 int ncopies;
4295 if (slp_node)
4296 {
4297 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4298 ncopies = 1;
4299 }
4300 else
4301 {
4302 vec_num = 1;
4303 ncopies = 0;
4304 phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
4305 do
4306 {
4307 ncopies++;
4308 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4309 }
4310 while (phi_info);
4311 }
4312
4313 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4314 which is updated with the current index of the loop for every match of
4315 the original loop's cond_expr (VEC_STMT). This results in a vector
4316 containing the last time the condition passed for that vector lane.
4317 The first match will be a 1 to allow 0 to be used for non-matching
4318 indexes. If there are no matches at all then the vector will be all
4319 zeroes. */
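  /* For example, with VF = 4 the index vector starts as {1, 2, 3, 4} and is
     bumped by 4 each vector iteration; if the condition last held in scalar
     iterations 2 and 5 (counting from 0), the final vector is {0, 6, 3, 0}:
     lane 1 recorded index 6, lane 2 recorded index 3, and the other lanes
     never matched. */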
4320 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4321 {
4322 tree indx_before_incr, indx_after_incr;
4323 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4324
4325 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4326 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4327
4328 int scalar_precision
4329 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4330 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4331 tree cr_index_vector_type = build_vector_type
4332 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4333
4334 /* First we create a simple vector induction variable which starts
4335 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4336 vector size (STEP). */
4337
4338 /* Create a {1,2,3,...} vector. */
4339 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4340
4341 /* Create a vector of the step value. */
4342 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4343 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4344
4345 /* Create an induction variable. */
4346 gimple_stmt_iterator incr_gsi;
4347 bool insert_after;
4348 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4349 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4350 insert_after, &indx_before_incr, &indx_after_incr);
4351
4352 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4353 filled with zeros (VEC_ZERO). */
4354
4355 /* Create a vector of 0s. */
4356 tree zero = build_zero_cst (cr_index_scalar_type);
4357 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4358
4359 /* Create a vector phi node. */
4360 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4361 new_phi = create_phi_node (new_phi_tree, loop->header);
4362 loop_vinfo->add_stmt (new_phi);
4363 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4364 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4365
4366 /* Now take the condition from the loop's original cond_expr
4367 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4368 every match uses values from the induction variable
4369 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4370 (NEW_PHI_TREE).
4371 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4372 the new cond_expr (INDEX_COND_EXPR). */
4373
4374 /* Duplicate the condition from vec_stmt. */
4375 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4376
4377 /* Create a conditional, where the condition is taken from vec_stmt
4378 (CCOMPARE). The then and else values mirror the main VEC_COND_EXPR:
4379 the reduction phi corresponds to NEW_PHI_TREE and the new values
4380 correspond to INDEX_BEFORE_INCR. */
4381 gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) >= 1);
4382 tree index_cond_expr;
4383 if (STMT_VINFO_REDUC_IDX (stmt_info) == 2)
4384 index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4385 ccompare, indx_before_incr, new_phi_tree);
4386 else
4387 index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4388 ccompare, new_phi_tree, indx_before_incr);
4389 induction_index = make_ssa_name (cr_index_vector_type);
4390 gimple *index_condition = gimple_build_assign (induction_index,
4391 index_cond_expr);
4392 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4393 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4394 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4395
4396 /* Update the phi with the vec cond. */
4397 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4398 loop_latch_edge (loop), UNKNOWN_LOCATION);
4399 }
4400
4401 /* 2. Create epilog code.
4402 The reduction epilog code operates across the elements of the vector
4403 of partial results computed by the vectorized loop.
4404 The reduction epilog code consists of:
4405
4406 step 1: compute the scalar result in a vector (v_out2)
4407 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4408 step 3: adjust the scalar result (s_out3) if needed.
4409
4410 Step 1 can be accomplished using one of the following three schemes:
4411 (scheme 1) using reduc_fn, if available.
4412 (scheme 2) using whole-vector shifts, if available.
4413 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4414 combined.
4415
4416 The overall epilog code looks like this:
4417
4418 s_out0 = phi <s_loop> # original EXIT_PHI
4419 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4420 v_out2 = reduce <v_out1> # step 1
4421 s_out3 = extract_field <v_out2, 0> # step 2
4422 s_out4 = adjust_result <s_out3> # step 3
4423
4424 (step 3 is optional, and steps 1 and 2 may be combined).
4425 Lastly, the uses of s_out0 are replaced by s_out4. */
4426
4427
4428 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4429 v_out1 = phi <VECT_DEF>
4430 Store them in NEW_PHIS. */
4431 if (double_reduc)
4432 loop = outer_loop;
4433 exit_bb = single_exit (loop)->dest;
4434 prev_phi_info = NULL;
4435 new_phis.create (slp_node ? vec_num : ncopies);
4436 for (unsigned i = 0; i < vec_num; i++)
4437 {
4438 if (slp_node)
4439 def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
4440 else
4441 def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
4442 for (j = 0; j < ncopies; j++)
4443 {
4444 tree new_def = copy_ssa_name (def);
4445 phi = create_phi_node (new_def, exit_bb);
4446 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4447 if (j == 0)
4448 new_phis.quick_push (phi);
4449 else
4450 {
4451 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4452 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4453 }
4454
4455 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4456 prev_phi_info = phi_info;
4457 }
4458 }
4459
4460 exit_gsi = gsi_after_labels (exit_bb);
4461
4462 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4463 (i.e. when reduc_fn is not available) and in the final adjustment
4464 code (if needed). Also get the original scalar reduction variable as
4465 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4466 represents a reduction pattern), the tree-code and scalar-def are
4467 taken from the original stmt that the pattern-stmt (STMT) replaces.
4468 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4469 are taken from STMT. */
4470
4471 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4472 if (orig_stmt_info != stmt_info)
4473 {
4474 /* Reduction pattern */
4475 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4476 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4477 }
4478
4479 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4480 scalar_type = TREE_TYPE (scalar_dest);
4481 scalar_results.create (group_size);
4482 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4483 bitsize = TYPE_SIZE (scalar_type);
4484
4485 /* SLP reduction without reduction chain, e.g.,
4486 # a1 = phi <a2, a0>
4487 # b1 = phi <b2, b0>
4488 a2 = operation (a1)
4489 b2 = operation (b1) */
4490 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4491
4492 /* True if we should implement SLP_REDUC using native reduction operations
4493 instead of scalar operations. */
4494 direct_slp_reduc = (reduc_fn != IFN_LAST
4495 && slp_reduc
4496 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4497
4498 /* In case of reduction chain, e.g.,
4499 # a1 = phi <a3, a0>
4500 a2 = operation (a1)
4501 a3 = operation (a2),
4502
4503 we may end up with more than one vector result. Here we reduce them to
4504 one vector. */
4505 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4506 {
4507 tree first_vect = PHI_RESULT (new_phis[0]);
4508 gassign *new_vec_stmt = NULL;
4509 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4510 for (k = 1; k < new_phis.length (); k++)
4511 {
4512 gimple *next_phi = new_phis[k];
4513 tree second_vect = PHI_RESULT (next_phi);
4514 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4515 new_vec_stmt = gimple_build_assign (tem, code,
4516 first_vect, second_vect);
4517 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4518 first_vect = tem;
4519 }
4520
4521 new_phi_result = first_vect;
4522 if (new_vec_stmt)
4523 {
4524 new_phis.truncate (0);
4525 new_phis.safe_push (new_vec_stmt);
4526 }
4527 }
4528 /* Likewise if we couldn't use a single def-use cycle. */
4529 else if (ncopies > 1)
4530 {
4531 gcc_assert (new_phis.length () == 1);
4532 tree first_vect = PHI_RESULT (new_phis[0]);
4533 gassign *new_vec_stmt = NULL;
4534 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4535 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4536 for (int k = 1; k < ncopies; ++k)
4537 {
4538 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4539 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4540 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4541 new_vec_stmt = gimple_build_assign (tem, code,
4542 first_vect, second_vect);
4543 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4544 first_vect = tem;
4545 }
4546 new_phi_result = first_vect;
4547 new_phis.truncate (0);
4548 new_phis.safe_push (new_vec_stmt);
4549 }
4550 else
4551 new_phi_result = PHI_RESULT (new_phis[0]);
4552
4553 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4554 && reduc_fn != IFN_LAST)
4555 {
4556 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4557 various data values where the condition matched and another vector
4558 (INDUCTION_INDEX) containing all the indexes of those matches. We
4559 need to extract the last matching index (which will be the index with
4560 highest value) and use this to index into the data vector.
4561 For the case where there were no matches, the data vector will contain
4562 all default values and the index vector will be all zeros. */
4563
4564 /* Get various versions of the type of the vector of indexes. */
4565 tree index_vec_type = TREE_TYPE (induction_index);
4566 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4567 tree index_scalar_type = TREE_TYPE (index_vec_type);
4568 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4569 (index_vec_type);
4570
4571 /* Get an unsigned integer version of the type of the data vector. */
4572 int scalar_precision
4573 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4574 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4575 tree vectype_unsigned = build_vector_type
4576 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4577
4578 /* First we need to create a vector (ZERO_VEC) of zeros and another
4579 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4580 can create using a MAX reduction and then expanding.
4581 In the case where the loop never made any matches, the max index will
4582 be zero. */
4583
4584 /* Vector of {0, 0, 0,...}. */
4585 tree zero_vec = make_ssa_name (vectype);
4586 tree zero_vec_rhs = build_zero_cst (vectype);
4587 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4588 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4589
4590 /* Find maximum value from the vector of found indexes. */
4591 tree max_index = make_ssa_name (index_scalar_type);
4592 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4593 1, induction_index);
4594 gimple_call_set_lhs (max_index_stmt, max_index);
4595 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4596
4597 /* Vector of {max_index, max_index, max_index,...}. */
4598 tree max_index_vec = make_ssa_name (index_vec_type);
4599 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4600 max_index);
4601 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4602 max_index_vec_rhs);
4603 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4604
4605 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4606 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4607 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4608 otherwise. Only one value should match, resulting in a vector
4609 (VEC_COND) with one data value and the rest zeros.
4610 In the case where the loop never made any matches, every index will
4611 match, resulting in a vector with all data values (which will all be
4612 the default value). */
4613
4614 /* Compare the max index vector to the vector of found indexes to find
4615 the position of the max value. */
4616 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4617 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4618 induction_index,
4619 max_index_vec);
4620 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4621
4622 /* Use the compare to choose either values from the data vector or
4623 zero. */
4624 tree vec_cond = make_ssa_name (vectype);
4625 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4626 vec_compare, new_phi_result,
4627 zero_vec);
4628 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4629
4630 /* Finally we need to extract the data value from the vector (VEC_COND)
4631 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4632 reduction, but because this doesn't exist, we can use a MAX reduction
4633 instead. The data value might be signed or a float so we need to cast
4634 it first.
4635 In the case where the loop never made any matches, the data values are
4636 all identical, and so will reduce down correctly. */
4637
4638 /* Make the matched data values unsigned. */
4639 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4640 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4641 vec_cond);
4642 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4643 VIEW_CONVERT_EXPR,
4644 vec_cond_cast_rhs);
4645 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4646
4647 /* Reduce down to a scalar value. */
4648 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4649 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4650 1, vec_cond_cast);
4651 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4652 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4653
4654 /* Convert the reduced value back to the result type and set as the
4655 result. */
4656 gimple_seq stmts = NULL;
4657 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4658 data_reduc);
4659 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4660 scalar_results.safe_push (new_temp);
4661 }
4662 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4663 && reduc_fn == IFN_LAST)
4664 {
4665 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4666 idx = 0;
4667 idx_val = induction_index[0];
4668 val = data_reduc[0];
4669 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4670 if (induction_index[i] > idx_val)
4671 val = data_reduc[i], idx_val = induction_index[i];
4672 return val; */
4673
4674 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4675 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4676 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4677 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4678 /* Enforced by vectorizable_reduction, which ensures we have target
4679 support before allowing a conditional reduction on variable-length
4680 vectors. */
4681 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4682 tree idx_val = NULL_TREE, val = NULL_TREE;
4683 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4684 {
4685 tree old_idx_val = idx_val;
4686 tree old_val = val;
4687 idx_val = make_ssa_name (idx_eltype);
4688 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4689 build3 (BIT_FIELD_REF, idx_eltype,
4690 induction_index,
4691 bitsize_int (el_size),
4692 bitsize_int (off)));
4693 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4694 val = make_ssa_name (data_eltype);
4695 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4696 build3 (BIT_FIELD_REF,
4697 data_eltype,
4698 new_phi_result,
4699 bitsize_int (el_size),
4700 bitsize_int (off)));
4701 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4702 if (off != 0)
4703 {
4704 tree new_idx_val = idx_val;
4705 if (off != v_size - el_size)
4706 {
4707 new_idx_val = make_ssa_name (idx_eltype);
4708 epilog_stmt = gimple_build_assign (new_idx_val,
4709 MAX_EXPR, idx_val,
4710 old_idx_val);
4711 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4712 }
4713 tree new_val = make_ssa_name (data_eltype);
4714 epilog_stmt = gimple_build_assign (new_val,
4715 COND_EXPR,
4716 build2 (GT_EXPR,
4717 boolean_type_node,
4718 idx_val,
4719 old_idx_val),
4720 val, old_val);
4721 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4722 idx_val = new_idx_val;
4723 val = new_val;
4724 }
4725 }
4726 /* Convert the reduced value back to the result type and set as the
4727 result. */
4728 gimple_seq stmts = NULL;
4729 val = gimple_convert (&stmts, scalar_type, val);
4730 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4731 scalar_results.safe_push (val);
4732 }
4733
4734 /* 2.3 Create the reduction code, using one of the three schemes described
4735 above. In SLP we simply need to extract all the elements from the
4736 vector (without reducing them), so we use scalar shifts. */
4737 else if (reduc_fn != IFN_LAST && !slp_reduc)
4738 {
4739 tree tmp;
4740 tree vec_elem_type;
4741
4742 /* Case 1: Create:
4743 v_out2 = reduc_expr <v_out1> */
4744
4745 if (dump_enabled_p ())
4746 dump_printf_loc (MSG_NOTE, vect_location,
4747 "Reduce using direct vector reduction.\n");
4748
4749 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4750 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4751 {
4752 tree tmp_dest
4753 = vect_create_destination_var (scalar_dest, vec_elem_type);
4754 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4755 new_phi_result);
4756 gimple_set_lhs (epilog_stmt, tmp_dest);
4757 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4758 gimple_set_lhs (epilog_stmt, new_temp);
4759 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4760
4761 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4762 new_temp);
4763 }
4764 else
4765 {
4766 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4767 new_phi_result);
4768 gimple_set_lhs (epilog_stmt, new_scalar_dest);
4769 }
4770
4771 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4772 gimple_set_lhs (epilog_stmt, new_temp);
4773 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4774
4775 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4776 && induc_val)
4777 {
4778 /* Earlier we set the initial value to be a vector of induc_val
4779 values. Check the result and if it is induc_val then replace
4780 with the original initial value, unless induc_val is
4781 the same as initial_def already. */
4782 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
4783 induc_val);
4784
4785 tmp = make_ssa_name (new_scalar_dest);
4786 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4787 initial_def, new_temp);
4788 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4789 new_temp = tmp;
4790 }
4791
4792 scalar_results.safe_push (new_temp);
4793 }
4794 else if (direct_slp_reduc)
4795 {
4796 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
4797 with the elements for other SLP statements replaced with the
4798 neutral value. We can then do a normal reduction on each vector. */
4799
4800 /* Enforced by vectorizable_reduction. */
4801 gcc_assert (new_phis.length () == 1);
4802 gcc_assert (pow2p_hwi (group_size));
4803
4804 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
4805 vec<stmt_vec_info> orig_phis
4806 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
4807 gimple_seq seq = NULL;
4808
4809 /* Build a vector {0, 1, 2, ...}, with the same number of elements
4810 and the same element size as VECTYPE. */
4811 tree index = build_index_vector (vectype, 0, 1);
4812 tree index_type = TREE_TYPE (index);
4813 tree index_elt_type = TREE_TYPE (index_type);
4814 tree mask_type = build_same_sized_truth_vector_type (index_type);
4815
4816 /* Create a vector that, for each element, identifies which of
4817 the REDUC_GROUP_SIZE results should use it. */
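      /* E.g. for GROUP_SIZE = 2 and eight elements, INDEX & 1 yields
	 {0, 1, 0, 1, 0, 1, 0, 1}, so result 0 is reduced over the even lanes
	 and result 1 over the odd lanes. */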
4818 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
4819 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
4820 build_vector_from_val (index_type, index_mask));
4821
4822 /* Get a neutral vector value. This is simply a splat of the neutral
4823 scalar value if we have one, otherwise the initial scalar value
4824 is itself a neutral value. */
4825 tree vector_identity = NULL_TREE;
4826 if (neutral_op)
4827 vector_identity = gimple_build_vector_from_val (&seq, vectype,
4828 neutral_op);
4829 for (unsigned int i = 0; i < group_size; ++i)
4830 {
4831 /* If there's no universal neutral value, we can use the
4832 initial scalar value from the original PHI. This is used
4833 for MIN and MAX reduction, for example. */
4834 if (!neutral_op)
4835 {
4836 tree scalar_value
4837 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
4838 loop_preheader_edge (loop));
4839 vector_identity = gimple_build_vector_from_val (&seq, vectype,
4840 scalar_value);
4841 }
4842
4843 /* Calculate the equivalent of:
4844
4845 sel[j] = (index[j] == i);
4846
4847 which selects the elements of NEW_PHI_RESULT that should
4848 be included in the result. */
4849 tree compare_val = build_int_cst (index_elt_type, i);
4850 compare_val = build_vector_from_val (index_type, compare_val);
4851 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
4852 index, compare_val);
4853
4854 /* Calculate the equivalent of:
4855
4856 vec = sel ? new_phi_result : vector_identity;
4857
4858 VEC is now suitable for a full vector reduction. */
4859 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
4860 sel, new_phi_result, vector_identity);
4861
4862 /* Do the reduction and convert it to the appropriate type. */
4863 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
4864 TREE_TYPE (vectype), vec);
4865 scalar = gimple_convert (&seq, scalar_type, scalar);
4866 scalar_results.safe_push (scalar);
4867 }
4868 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
4869 }
4870 else
4871 {
4872 bool reduce_with_shift;
4873 tree vec_temp;
4874
4875 /* See if the target wants to do the final (shift) reduction
4876 in a vector mode of smaller size and first reduce upper/lower
4877 halves against each other. */
4878 enum machine_mode mode1 = mode;
4879 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
4880 unsigned sz1 = sz;
4881 if (!slp_reduc
4882 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
4883 sz1 = GET_MODE_SIZE (mode1).to_constant ();
4884
4885 tree vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
4886 reduce_with_shift = have_whole_vector_shift (mode1);
4887 if (!VECTOR_MODE_P (mode1))
4888 reduce_with_shift = false;
4889 else
4890 {
4891 optab optab = optab_for_tree_code (code, vectype1, optab_default);
4892 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
4893 reduce_with_shift = false;
4894 }
4895
4896 /* First reduce the vector to the desired vector size on which we should
4897 do the shift reduction, by combining upper and lower halves. */
4898 new_temp = new_phi_result;
4899 while (sz > sz1)
4900 {
4901 gcc_assert (!slp_reduc);
4902 sz /= 2;
4903 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
4904
4905 /* The target has to make sure we support lowpart/highpart
4906 extraction, either via direct vector extract or through
4907 integer mode punning. */
4908 tree dst1, dst2;
4909 if (convert_optab_handler (vec_extract_optab,
4910 TYPE_MODE (TREE_TYPE (new_temp)),
4911 TYPE_MODE (vectype1))
4912 != CODE_FOR_nothing)
4913 {
4914 /* Extract sub-vectors directly once vec_extract becomes
4915 a conversion optab. */
4916 dst1 = make_ssa_name (vectype1);
4917 epilog_stmt
4918 = gimple_build_assign (dst1, BIT_FIELD_REF,
4919 build3 (BIT_FIELD_REF, vectype1,
4920 new_temp, TYPE_SIZE (vectype1),
4921 bitsize_int (0)));
4922 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4923 dst2 = make_ssa_name (vectype1);
4924 epilog_stmt
4925 = gimple_build_assign (dst2, BIT_FIELD_REF,
4926 build3 (BIT_FIELD_REF, vectype1,
4927 new_temp, TYPE_SIZE (vectype1),
4928 bitsize_int (sz * BITS_PER_UNIT)));
4929 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4930 }
4931 else
4932 {
4933 /* Extract via punning to appropriately sized integer mode
4934 vector. */
4935 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
4936 1);
4937 tree etype = build_vector_type (eltype, 2);
4938 gcc_assert (convert_optab_handler (vec_extract_optab,
4939 TYPE_MODE (etype),
4940 TYPE_MODE (eltype))
4941 != CODE_FOR_nothing);
4942 tree tem = make_ssa_name (etype);
4943 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
4944 build1 (VIEW_CONVERT_EXPR,
4945 etype, new_temp));
4946 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4947 new_temp = tem;
4948 tem = make_ssa_name (eltype);
4949 epilog_stmt
4950 = gimple_build_assign (tem, BIT_FIELD_REF,
4951 build3 (BIT_FIELD_REF, eltype,
4952 new_temp, TYPE_SIZE (eltype),
4953 bitsize_int (0)));
4954 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4955 dst1 = make_ssa_name (vectype1);
4956 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
4957 build1 (VIEW_CONVERT_EXPR,
4958 vectype1, tem));
4959 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4960 tem = make_ssa_name (eltype);
4961 epilog_stmt
4962 = gimple_build_assign (tem, BIT_FIELD_REF,
4963 build3 (BIT_FIELD_REF, eltype,
4964 new_temp, TYPE_SIZE (eltype),
4965 bitsize_int (sz * BITS_PER_UNIT)));
4966 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4967 dst2 = make_ssa_name (vectype1);
4968 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
4969 build1 (VIEW_CONVERT_EXPR,
4970 vectype1, tem));
4971 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4972 }
4973
4974 new_temp = make_ssa_name (vectype1);
4975 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
4976 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4977 }
4978
4979 if (reduce_with_shift && !slp_reduc)
4980 {
4981 int element_bitsize = tree_to_uhwi (bitsize);
4982 /* Enforced by vectorizable_reduction, which disallows SLP reductions
4983 for variable-length vectors and also requires direct target support
4984 for loop reductions. */
4985 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
4986 int nelements = vec_size_in_bits / element_bitsize;
4987 vec_perm_builder sel;
4988 vec_perm_indices indices;
4989
4990 int elt_offset;
4991
4992 tree zero_vec = build_zero_cst (vectype1);
4993 /* Case 2: Create:
4994 for (offset = nelements/2; offset >= 1; offset/=2)
4995 {
4996 Create: va' = vec_shift <va, offset>
4997 Create: va = vop <va, va'>
4998 } */
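/* For example, for a PLUS reduction with nelements == 4 and
   va == { a, b, c, d }:
     offset 2: va' = { c, d, 0, 0 },   va = { a+c, b+d, c, d }
     offset 1: va' = { b+d, c, d, 0 }, va = { a+b+c+d, ... }
   after which element 0 of VA holds the reduction result; the zero
   elements shifted in from ZERO_VEC do not affect it.  */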
4999
5000 tree rhs;
5001
5002 if (dump_enabled_p ())
5003 dump_printf_loc (MSG_NOTE, vect_location,
5004 "Reduce using vector shifts\n");
5005
5006 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5007 for (elt_offset = nelements / 2;
5008 elt_offset >= 1;
5009 elt_offset /= 2)
5010 {
5011 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5012 indices.new_vector (sel, 2, nelements);
5013 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5014 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5015 new_temp, zero_vec, mask);
5016 new_name = make_ssa_name (vec_dest, epilog_stmt);
5017 gimple_assign_set_lhs (epilog_stmt, new_name);
5018 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5019
5020 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5021 new_temp);
5022 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5023 gimple_assign_set_lhs (epilog_stmt, new_temp);
5024 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5025 }
5026
5027 /* 2.4 Extract the final scalar result. Create:
5028 s_out3 = extract_field <v_out2, bitpos> */
5029
5030 if (dump_enabled_p ())
5031 dump_printf_loc (MSG_NOTE, vect_location,
5032 "extract scalar result\n");
5033
5034 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5035 bitsize, bitsize_zero_node);
5036 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5037 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5038 gimple_assign_set_lhs (epilog_stmt, new_temp);
5039 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5040 scalar_results.safe_push (new_temp);
5041 }
5042 else
5043 {
5044 /* Case 3: Create:
5045 s = extract_field <v_out2, 0>
5046 for (offset = element_size;
5047 offset < vector_size;
5048 offset += element_size;)
5049 {
5050 Create: s' = extract_field <v_out2, offset>
5051 Create: s = op <s, s'> // For non SLP cases
5052 } */
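/* For example, for a single vector v_out2 = { a, b, c, d } in the
   non-SLP case this computes s = a; s = op (s, b); s = op (s, c);
   s = op (s, d).  */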
5053
5054 if (dump_enabled_p ())
5055 dump_printf_loc (MSG_NOTE, vect_location,
5056 "Reduce using scalar code.\n");
5057
5058 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5059 int element_bitsize = tree_to_uhwi (bitsize);
5060 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5061 {
5062 int bit_offset;
5063 if (gimple_code (new_phi) == GIMPLE_PHI)
5064 vec_temp = PHI_RESULT (new_phi);
5065 else
5066 vec_temp = gimple_assign_lhs (new_phi);
5067 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5068 bitsize_zero_node);
5069 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5070 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5071 gimple_assign_set_lhs (epilog_stmt, new_temp);
5072 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5073
5074 /* In SLP we don't need to apply reduction operation, so we just
5075 collect s' values in SCALAR_RESULTS. */
5076 if (slp_reduc)
5077 scalar_results.safe_push (new_temp);
5078
5079 for (bit_offset = element_bitsize;
5080 bit_offset < vec_size_in_bits;
5081 bit_offset += element_bitsize)
5082 {
5083 tree bitpos = bitsize_int (bit_offset);
5084 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5085 bitsize, bitpos);
5086
5087 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5088 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5089 gimple_assign_set_lhs (epilog_stmt, new_name);
5090 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5091
5092 if (slp_reduc)
5093 {
5094 /* In SLP we don't need to apply reduction operation, so
5095 we just collect s' values in SCALAR_RESULTS. */
5096 new_temp = new_name;
5097 scalar_results.safe_push (new_name);
5098 }
5099 else
5100 {
5101 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5102 new_name, new_temp);
5103 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5104 gimple_assign_set_lhs (epilog_stmt, new_temp);
5105 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5106 }
5107 }
5108 }
5109
5110 /* The only case in which we need to reduce scalar results in SLP is
5111 unrolling. If the size of SCALAR_RESULTS is greater than
5112 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5113 REDUC_GROUP_SIZE. */
5114 if (slp_reduc)
5115 {
5116 tree res, first_res, new_res;
5117 gimple *new_stmt;
5118
5119 /* Reduce multiple scalar results in case of SLP unrolling. */
5120 for (j = group_size; scalar_results.iterate (j, &res);
5121 j++)
5122 {
5123 first_res = scalar_results[j % group_size];
5124 new_stmt = gimple_build_assign (new_scalar_dest, code,
5125 first_res, res);
5126 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5127 gimple_assign_set_lhs (new_stmt, new_res);
5128 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5129 scalar_results[j % group_size] = new_res;
5130 }
5131 }
5132 else
5133 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5134 scalar_results.safe_push (new_temp);
5135 }
5136
5137 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5138 && induc_val)
5139 {
5140 /* Earlier we set the initial value to be a vector of induc_val
5141 values. Check the result, and if it is induc_val then replace
5142 it with the original initial value, unless induc_val is
5143 already the same as initial_def. */
5144 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5145 induc_val);
5146
5147 tree tmp = make_ssa_name (new_scalar_dest);
5148 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5149 initial_def, new_temp);
5150 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5151 scalar_results[0] = tmp;
5152 }
5153 }
5154
5155 /* 2.5 Adjust the final result by the initial value of the reduction
5156 variable. (When such adjustment is not needed, then
5157 'adjustment_def' is zero). For example, if code is PLUS we create:
5158 new_temp = loop_exit_def + adjustment_def */
5159
5160 if (adjustment_def)
5161 {
5162 gcc_assert (!slp_reduc);
5163 gimple_seq stmts = NULL;
5164 if (nested_in_vect_loop)
5165 {
5166 new_phi = new_phis[0];
5167 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5168 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5169 new_temp = gimple_build (&stmts, code, vectype,
5170 PHI_RESULT (new_phi), adjustment_def);
5171 }
5172 else
5173 {
5174 new_temp = scalar_results[0];
5175 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5176 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5177 new_temp = gimple_build (&stmts, code, scalar_type,
5178 new_temp, adjustment_def);
5179 }
5180
5181 epilog_stmt = gimple_seq_last_stmt (stmts);
5182 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5183 if (nested_in_vect_loop)
5184 {
5185 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5186 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5187 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5188
5189 if (!double_reduc)
5190 scalar_results.quick_push (new_temp);
5191 else
5192 scalar_results[0] = new_temp;
5193 }
5194 else
5195 scalar_results[0] = new_temp;
5196
5197 new_phis[0] = epilog_stmt;
5198 }
5199
5200 if (double_reduc)
5201 loop = loop->inner;
5202
5203 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5204 phis with new adjusted scalar results, i.e., replace use <s_out0>
5205 with use <s_out4>.
5206
5207 Transform:
5208 loop_exit:
5209 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5210 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5211 v_out2 = reduce <v_out1>
5212 s_out3 = extract_field <v_out2, 0>
5213 s_out4 = adjust_result <s_out3>
5214 use <s_out0>
5215 use <s_out0>
5216
5217 into:
5218
5219 loop_exit:
5220 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5221 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5222 v_out2 = reduce <v_out1>
5223 s_out3 = extract_field <v_out2, 0>
5224 s_out4 = adjust_result <s_out3>
5225 use <s_out4>
5226 use <s_out4> */
5227
5228
5229 /* In an SLP reduction chain we reduce vector results into one vector if
5230 necessary, hence here we set REDUC_GROUP_SIZE to 1. SCALAR_DEST is the
5231 LHS of the last stmt in the reduction chain, since we are looking for
5232 the loop exit phi node. */
5233 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5234 {
5235 stmt_vec_info dest_stmt_info
5236 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5237 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5238 group_size = 1;
5239 }
5240
5241 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5242 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5243 Therefore, we need to match SCALAR_RESULTS with the corresponding statements.
5244 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5245 correspond to the first vector stmt, etc.
5246 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5247 if (group_size > new_phis.length ())
5248 gcc_assert (!(group_size % new_phis.length ()));
5249
5250 for (k = 0; k < group_size; k++)
5251 {
5252 if (slp_reduc)
5253 {
5254 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5255
5256 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5257 /* SLP statements can't participate in patterns. */
5258 gcc_assert (!orig_stmt_info);
5259 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5260 }
5261
5262 if (nested_in_vect_loop)
5263 {
5264 if (double_reduc)
5265 loop = outer_loop;
5266 else
5267 gcc_unreachable ();
5268 }
5269
5270 phis.create (3);
5271 /* Find the loop-closed-use at the loop exit of the original scalar
5272 result. (The reduction result is expected to have two immediate uses,
5273 one at the latch block, and one at the loop exit). For double
5274 reductions we are looking for exit phis of the outer loop. */
5275 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5276 {
5277 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5278 {
5279 if (!is_gimple_debug (USE_STMT (use_p)))
5280 phis.safe_push (USE_STMT (use_p));
5281 }
5282 else
5283 {
5284 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5285 {
5286 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5287
5288 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5289 {
5290 if (!flow_bb_inside_loop_p (loop,
5291 gimple_bb (USE_STMT (phi_use_p)))
5292 && !is_gimple_debug (USE_STMT (phi_use_p)))
5293 phis.safe_push (USE_STMT (phi_use_p));
5294 }
5295 }
5296 }
5297 }
5298
5299 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5300 {
5301 /* Replace the uses: */
5302 orig_name = PHI_RESULT (exit_phi);
5303 scalar_result = scalar_results[k];
5304 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5305 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5306 SET_USE (use_p, scalar_result);
5307 }
5308
5309 phis.release ();
5310 }
5311 }
5312
5313 /* Return a vector of type VECTYPE that is equal to the vector select
5314 operation "MASK ? VEC : IDENTITY". Insert the select statements
5315 before GSI. */
5316
5317 static tree
5318 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5319 tree vec, tree identity)
5320 {
5321 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5322 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5323 mask, vec, identity);
5324 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5325 return cond;
5326 }
5327
5328 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5329 order, starting with LHS. Insert the extraction statements before GSI and
5330 associate the new scalar SSA names with variable SCALAR_DEST.
5331 Return the SSA name for the result. */
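/* For example, given LHS == l and VECTOR_RHS == { x0, x1, x2, x3 }, the
   returned SSA name holds (((l CODE x0) CODE x1) CODE x2) CODE x3.  */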
5332
5333 static tree
5334 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5335 tree_code code, tree lhs, tree vector_rhs)
5336 {
5337 tree vectype = TREE_TYPE (vector_rhs);
5338 tree scalar_type = TREE_TYPE (vectype);
5339 tree bitsize = TYPE_SIZE (scalar_type);
5340 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5341 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5342
5343 for (unsigned HOST_WIDE_INT bit_offset = 0;
5344 bit_offset < vec_size_in_bits;
5345 bit_offset += element_bitsize)
5346 {
5347 tree bitpos = bitsize_int (bit_offset);
5348 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5349 bitsize, bitpos);
5350
5351 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5352 rhs = make_ssa_name (scalar_dest, stmt);
5353 gimple_assign_set_lhs (stmt, rhs);
5354 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5355
5356 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5357 tree new_name = make_ssa_name (scalar_dest, stmt);
5358 gimple_assign_set_lhs (stmt, new_name);
5359 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5360 lhs = new_name;
5361 }
5362 return lhs;
5363 }
5364
5365 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5366 type of the vector input. */
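/* Currently only IFN_FOLD_LEFT_PLUS has a masked equivalent here
   (IFN_MASK_FOLD_LEFT_PLUS), which takes the loop mask as an additional
   operand.  */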
5367
5368 static internal_fn
5369 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5370 {
5371 internal_fn mask_reduc_fn;
5372
5373 switch (reduc_fn)
5374 {
5375 case IFN_FOLD_LEFT_PLUS:
5376 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5377 break;
5378
5379 default:
5380 return IFN_LAST;
5381 }
5382
5383 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5384 OPTIMIZE_FOR_SPEED))
5385 return mask_reduc_fn;
5386 return IFN_LAST;
5387 }
5388
5389 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5390 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5391 statement. CODE is the operation performed by STMT_INFO and OPS are
5392 its scalar operands. REDUC_INDEX is the index of the operand in
5393 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5394 implements in-order reduction, or IFN_LAST if we should open-code it.
5395 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5396 that should be used to control the operation in a fully-masked loop. */
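/* A fold-left reduction accumulates the vector elements strictly in their
   original left-to-right order, i.e. it computes
   (...((init CODE x0) CODE x1)... CODE xn-1), which preserves the scalar
   evaluation order for operations that must not be reassociated.  */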
5397
5398 static bool
5399 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5400 gimple_stmt_iterator *gsi,
5401 stmt_vec_info *vec_stmt, slp_tree slp_node,
5402 gimple *reduc_def_stmt,
5403 tree_code code, internal_fn reduc_fn,
5404 tree ops[3], tree vectype_in,
5405 int reduc_index, vec_loop_masks *masks)
5406 {
5407 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5408 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5409 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5410 stmt_vec_info new_stmt_info = NULL;
5411 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5412
5413 int ncopies;
5414 if (slp_node)
5415 ncopies = 1;
5416 else
5417 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5418
5419 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5420 gcc_assert (ncopies == 1);
5421 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5422
5423 if (slp_node)
5424 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5425 TYPE_VECTOR_SUBPARTS (vectype_in)));
5426
5427 tree op0 = ops[1 - reduc_index];
5428
5429 int group_size = 1;
5430 stmt_vec_info scalar_dest_def_info;
5431 auto_vec<tree> vec_oprnds0;
5432 if (slp_node)
5433 {
5434 auto_vec<vec<tree> > vec_defs (2);
5435 vect_get_slp_defs (slp_node, &vec_defs);
5436 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5437 vec_defs[0].release ();
5438 vec_defs[1].release ();
5439 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5440 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5441 }
5442 else
5443 {
5444 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5445 vec_oprnds0.create (1);
5446 vec_oprnds0.quick_push (loop_vec_def0);
5447 scalar_dest_def_info = stmt_info;
5448 }
5449
5450 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5451 tree scalar_type = TREE_TYPE (scalar_dest);
5452 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5453
5454 int vec_num = vec_oprnds0.length ();
5455 gcc_assert (vec_num == 1 || slp_node);
5456 tree vec_elem_type = TREE_TYPE (vectype_out);
5457 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5458
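  /* In a fully-masked loop, merge_with_identity below replaces the
     inactive lanes of each input vector with this identity value before
     they enter the reduction.  */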
5459 tree vector_identity = NULL_TREE;
5460 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5461 vector_identity = build_zero_cst (vectype_out);
5462
5463 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5464 int i;
5465 tree def0;
5466 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5467 {
5468 gimple *new_stmt;
5469 tree mask = NULL_TREE;
5470 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5471 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5472
5473 /* Handle MINUS by adding the negative. */
5474 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5475 {
5476 tree negated = make_ssa_name (vectype_out);
5477 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5478 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5479 def0 = negated;
5480 }
5481
5482 if (mask && mask_reduc_fn == IFN_LAST)
5483 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5484 vector_identity);
5485
5486 /* On the first iteration the input is simply the scalar phi
5487 result, and for subsequent iterations it is the output of
5488 the preceding operation. */
5489 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5490 {
5491 if (mask && mask_reduc_fn != IFN_LAST)
5492 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5493 def0, mask);
5494 else
5495 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5496 def0);
5497 /* For chained SLP reductions the output of the previous reduction
5498 operation serves as the input of the next. For the final statement
5499 the output cannot be a temporary - we reuse the original
5500 scalar destination of the last statement. */
5501 if (i != vec_num - 1)
5502 {
5503 gimple_set_lhs (new_stmt, scalar_dest_var);
5504 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5505 gimple_set_lhs (new_stmt, reduc_var);
5506 }
5507 }
5508 else
5509 {
5510 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5511 reduc_var, def0);
5512 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5513 /* Remove the statement, so that we can use the same code paths
5514 as for statements that we've just created. */
5515 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5516 gsi_remove (&tmp_gsi, true);
5517 }
5518
5519 if (i == vec_num - 1)
5520 {
5521 gimple_set_lhs (new_stmt, scalar_dest);
5522 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5523 new_stmt);
5524 }
5525 else
5526 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5527 new_stmt, gsi);
5528
5529 if (slp_node)
5530 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5531 }
5532
5533 if (!slp_node)
5534 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5535
5536 return true;
5537 }
5538
5539 /* Function is_nonwrapping_integer_induction.
5540
5541 Check if STMT_VINFO (which is part of loop LOOP) is an induction that
5542 both increments and does not overflow. */
5543
5544 static bool
5545 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5546 {
5547 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5548 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5549 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5550 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5551 widest_int ni, max_loop_value, lhs_max;
5552 wi::overflow_type overflow = wi::OVF_NONE;
5553
5554 /* Make sure the loop is integer based. */
5555 if (TREE_CODE (base) != INTEGER_CST
5556 || TREE_CODE (step) != INTEGER_CST)
5557 return false;
5558
5559 /* Check that the max size of the loop will not wrap. */
5560
5561 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5562 return true;
5563
5564 if (! max_stmt_executions (loop, &ni))
5565 return false;
5566
5567 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5568 &overflow);
5569 if (overflow)
5570 return false;
5571
5572 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5573 TYPE_SIGN (lhs_type), &overflow);
5574 if (overflow)
5575 return false;
5576
5577 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5578 <= TYPE_PRECISION (lhs_type));
5579 }
5580
5581 /* Check if masking can be supported by inserting a conditional expression.
5582 CODE is the code for the operation. COND_FN is the conditional internal
5583 function, if it exists. VECTYPE_IN is the type of the vector input. */
5584 static bool
5585 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5586 tree vectype_in)
5587 {
5588 if (cond_fn != IFN_LAST
5589 && direct_internal_fn_supported_p (cond_fn, vectype_in,
5590 OPTIMIZE_FOR_SPEED))
5591 return false;
5592
5593 switch (code)
5594 {
5595 case DOT_PROD_EXPR:
5596 case SAD_EXPR:
5597 return true;
5598
5599 default:
5600 return false;
5601 }
5602 }
5603
5604 /* Insert a conditional expression to enable masked vectorization. CODE is the
5605 code for the operation. VOP is the array of operands. MASK is the loop
5606 mask. GSI is a statement iterator used to place the new conditional
5607 expression. */
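/* For DOT_PROD_EXPR the inactive lanes of the multiplied operand are
   replaced by zero, so their products contribute nothing to the sum;
   for SAD_EXPR the inactive lanes of VOP[1] are replaced by VOP[0],
   making the absolute difference zero for those lanes.  */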
5608 static void
5609 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5610 gimple_stmt_iterator *gsi)
5611 {
5612 switch (code)
5613 {
5614 case DOT_PROD_EXPR:
5615 {
5616 tree vectype = TREE_TYPE (vop[1]);
5617 tree zero = build_zero_cst (vectype);
5618 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5619 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5620 mask, vop[1], zero);
5621 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5622 vop[1] = masked_op1;
5623 break;
5624 }
5625
5626 case SAD_EXPR:
5627 {
5628 tree vectype = TREE_TYPE (vop[1]);
5629 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5630 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5631 mask, vop[1], vop[0]);
5632 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5633 vop[1] = masked_op1;
5634 break;
5635 }
5636
5637 default:
5638 gcc_unreachable ();
5639 }
5640 }
5641
5642 /* Function vectorizable_reduction.
5643
5644 Check if STMT_INFO performs a reduction operation that can be vectorized.
5645 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5646 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5647 Return true if STMT_INFO is vectorizable in this way.
5648
5649 This function also handles reduction idioms (patterns) that have been
5650 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5651 may be of this form:
5652 X = pattern_expr (arg0, arg1, ..., X)
5653 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5654 sequence that had been detected and replaced by the pattern-stmt
5655 (STMT_INFO).
5656
5657 This function also handles reduction of condition expressions, for example:
5658 for (int i = 0; i < N; i++)
5659 if (a[i] < value)
5660 last = a[i];
5661 This is handled by vectorising the loop and creating an additional vector
5662 containing the loop indexes for which "a[i] < value" was true. In the
5663 function epilogue this is reduced to a single max value and then used to
5664 index into the vector of results.
5665
5666 In some cases of reduction patterns, the type of the reduction variable X is
5667 different than the type of the other arguments of STMT_INFO.
5668 In such cases, the vectype that is used when transforming STMT_INFO into
5669 a vector stmt is different than the vectype that is used to determine the
5670 vectorization factor, because it consists of a different number of elements
5671 than the actual number of elements that are being operated upon in parallel.
5672
5673 For example, consider an accumulation of shorts into an int accumulator.
5674 On some targets it's possible to vectorize this pattern operating on 8
5675 shorts at a time (hence, the vectype for purposes of determining the
5676 vectorization factor should be V8HI); on the other hand, the vectype that
5677 is used to create the vector form is actually V4SI (the type of the result).
5678
5679 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5680 indicates what is the actual level of parallelism (V8HI in the example), so
5681 that the right vectorization factor would be derived. This vectype
5682 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5683 be used to create the vectorized stmt. The right vectype for the vectorized
5684 stmt is obtained from the type of the result X:
5685 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5686
5687 This means that, contrary to "regular" reductions (or "regular" stmts in
5688 general), the following equation:
5689 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5690 does *NOT* necessarily hold for reduction patterns. */
5691
5692 bool
5693 vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
5694 slp_instance slp_node_instance,
5695 stmt_vector_for_cost *cost_vec)
5696 {
5697 tree scalar_dest;
5698 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5699 tree vectype_in = NULL_TREE;
5700 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5701 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5702 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
5703 stmt_vec_info cond_stmt_vinfo = NULL;
5704 tree scalar_type;
5705 int i;
5706 int ncopies;
5707 bool single_defuse_cycle = false;
5708 bool nested_cycle = false;
5709 bool double_reduc = false;
5710 int vec_num;
5711 tree tem;
5712 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5713 tree cond_reduc_val = NULL_TREE;
5714
5715 /* Make sure it was already recognized as a reduction computation. */
5716 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5717 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
5718 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
5719 return false;
5720
5721 /* The stmt we store reduction analysis meta on. */
5722 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
5723 reduc_info->is_reduc_info = true;
5724
5725 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5726 {
5727 if (is_a <gphi *> (stmt_info->stmt))
5728 /* Analysis for double-reduction is done on the outer
5729 loop PHI, nested cycles have no further restrictions. */
5730 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
5731 else
5732 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5733 return true;
5734 }
5735
5736 stmt_vec_info orig_stmt_of_analysis = stmt_info;
5737 stmt_vec_info phi_info = stmt_info;
5738 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
5739 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5740 {
5741 if (!is_a <gphi *> (stmt_info->stmt))
5742 {
5743 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5744 return true;
5745 }
5746 if (slp_node)
5747 {
5748 slp_node_instance->reduc_phis = slp_node;
5749 /* ??? We're leaving slp_node to point to the PHIs; we only
5750 need it to get at the number of vector stmts, which wasn't
5751 yet initialized for the instance root. */
5752 }
5753 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5754 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
5755 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
5756 {
5757 use_operand_p use_p;
5758 gimple *use_stmt;
5759 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
5760 &use_p, &use_stmt);
5761 gcc_assert (res);
5762 phi_info = loop_vinfo->lookup_stmt (use_stmt);
5763 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
5764 }
5765 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
5766 element. */
5767 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5768 {
5769 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
5770 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5771 }
5772 }
5773 /* PHIs should not participate in patterns. */
5774 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
5775
5776 if (nested_in_vect_loop_p (loop, stmt_info))
5777 {
5778 loop = loop->inner;
5779 nested_cycle = true;
5780 }
5781
5782 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5783 gcc_assert (slp_node
5784 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
5785
5786 /* 1. Is vectorizable reduction? */
5787 /* Not supportable if the reduction variable is used in the loop, unless
5788 it's a reduction chain. */
5789 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5790 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5791 return false;
5792
5793 /* Reductions that are not used even in an enclosing outer-loop,
5794 are expected to be "live" (used out of the loop). */
5795 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5796 && !STMT_VINFO_LIVE_P (stmt_info))
5797 return false;
5798
5799 /* 2. Has this been recognized as a reduction pattern?
5800
5801 Check if STMT represents a pattern that has been recognized
5802 in earlier analysis stages. For stmts that represent a pattern,
5803 the STMT_VINFO_RELATED_STMT field records the last stmt in
5804 the original sequence that constitutes the pattern. */
5805
5806 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5807 if (orig_stmt_info)
5808 {
5809 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5810 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5811 }
5812
5813 /* 3. Check the operands of the operation. The first operands are defined
5814 inside the loop body. The last operand is the reduction variable,
5815 which is defined by the loop-header-phi. */
5816
5817 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5818 enum tree_code code = gimple_assign_rhs_code (stmt);
5819 bool lane_reduc_code_p
5820 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
5821 int op_type = TREE_CODE_LENGTH (code);
5822
5823 scalar_dest = gimple_assign_lhs (stmt);
5824 scalar_type = TREE_TYPE (scalar_dest);
5825 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5826 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5827 return false;
5828
5829 /* Do not try to vectorize bit-precision reductions. */
5830 if (!type_has_mode_precision_p (scalar_type))
5831 return false;
5832
5833 /* All uses but the last are expected to be defined in the loop.
5834 The last use is the reduction variable. In case of nested cycle this
5835 assumption is not true: we use reduc_index to record the index of the
5836 reduction variable. */
5837 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
5838
5839 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
5840 and compute the reduction chain length. */
5841 tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
5842 loop_latch_edge (loop));
5843 unsigned reduc_chain_length = 0;
5844 bool only_slp_reduc_chain = true;
5845 while (reduc_def != PHI_RESULT (reduc_def_phi))
5846 {
5847 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
5848 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
5849 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
5850 {
5851 if (dump_enabled_p ())
5852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5853 "reduction chain broken by patterns.\n");
5854 return false;
5855 }
5856 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
5857 only_slp_reduc_chain = false;
5858 /* ??? For epilogue generation live members of the chain need
5859 to point back to the PHI via their original stmt for
5860 info_for_reduction to work. */
5861 if (STMT_VINFO_LIVE_P (vdef))
5862 STMT_VINFO_REDUC_DEF (def) = phi_info;
5863 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
5864 reduc_chain_length++;
5865 }
5866
5867 reduc_def = PHI_RESULT (reduc_def_phi);
5868 for (i = 0; i < op_type; i++)
5869 {
5870 tree op = gimple_op (stmt, i + 1);
5871 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5872 if (i == 0 && code == COND_EXPR)
5873 continue;
5874
5875 stmt_vec_info def_stmt_info;
5876 enum vect_def_type dt;
5877 if (!vect_is_simple_use (op, loop_vinfo, &dt, &tem,
5878 &def_stmt_info))
5879 {
5880 if (dump_enabled_p ())
5881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5882 "use not simple.\n");
5883 return false;
5884 }
5885 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
5886 continue;
5887
5888 /* There should be only one cycle def in the stmt, the one
5889 leading to reduc_def. */
5890 if (VECTORIZABLE_CYCLE_DEF (dt))
5891 return false;
5892
5893 /* To properly compute ncopies we are interested in the widest
5894 non-reduction input type in case we're looking at a widening
5895 accumulation that we later handle in vect_transform_reduction. */
5896 if (lane_reduc_code_p
5897 && tem
5898 && (!vectype_in
5899 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
5900 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
5901 vectype_in = tem;
5902
5903 if (code == COND_EXPR)
5904 {
5905 /* Record how the non-reduction-def value of COND_EXPR is defined. */
5906 if (dt == vect_constant_def)
5907 {
5908 cond_reduc_dt = dt;
5909 cond_reduc_val = op;
5910 }
5911 if (dt == vect_induction_def
5912 && def_stmt_info
5913 && is_nonwrapping_integer_induction (def_stmt_info, loop))
5914 {
5915 cond_reduc_dt = dt;
5916 cond_stmt_vinfo = def_stmt_info;
5917 }
5918 }
5919 }
5920 if (!vectype_in)
5921 vectype_in = vectype_out;
5922 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
5923
5924 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
5925 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
5926 /* If we have a condition reduction, see if we can simplify it further. */
5927 if (v_reduc_type == COND_REDUCTION)
5928 {
5929 if (slp_node)
5930 return false;
5931
5932 /* When the condition uses the reduction value in the condition, fail. */
5933 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
5934 {
5935 if (dump_enabled_p ())
5936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5937 "condition depends on previous iteration\n");
5938 return false;
5939 }
5940
5941 if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
5942 vectype_in, OPTIMIZE_FOR_SPEED))
5943 {
5944 if (dump_enabled_p ())
5945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5946 "optimizing condition reduction with"
5947 " FOLD_EXTRACT_LAST.\n");
5948 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
5949 }
5950 else if (cond_reduc_dt == vect_induction_def)
5951 {
5952 tree base
5953 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
5954 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
5955
5956 gcc_assert (TREE_CODE (base) == INTEGER_CST
5957 && TREE_CODE (step) == INTEGER_CST);
5958 cond_reduc_val = NULL_TREE;
5959 enum tree_code cond_reduc_op_code = ERROR_MARK;
5960 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
5961 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
5962 ;
5963 /* Find a suitable value: below base for MAX_EXPR, above base for
5964 MIN_EXPR; for now punt if base is the minimum value of the type
5965 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
5966 else if (tree_int_cst_sgn (step) == -1)
5967 {
5968 cond_reduc_op_code = MIN_EXPR;
5969 if (tree_int_cst_sgn (base) == -1)
5970 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
5971 else if (tree_int_cst_lt (base,
5972 TYPE_MAX_VALUE (TREE_TYPE (base))))
5973 cond_reduc_val
5974 = int_const_binop (PLUS_EXPR, base, integer_one_node);
5975 }
5976 else
5977 {
5978 cond_reduc_op_code = MAX_EXPR;
5979 if (tree_int_cst_sgn (base) == 1)
5980 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
5981 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
5982 base))
5983 cond_reduc_val
5984 = int_const_binop (MINUS_EXPR, base, integer_one_node);
5985 }
5986 if (cond_reduc_val)
5987 {
5988 if (dump_enabled_p ())
5989 dump_printf_loc (MSG_NOTE, vect_location,
5990 "condition expression based on "
5991 "integer induction.\n");
5992 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
5993 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
5994 = cond_reduc_val;
5995 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
5996 }
5997 }
5998 else if (cond_reduc_dt == vect_constant_def)
5999 {
6000 enum vect_def_type cond_initial_dt;
6001 tree cond_initial_val
6002 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6003
6004 gcc_assert (cond_reduc_val != NULL_TREE);
6005 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6006 if (cond_initial_dt == vect_constant_def
6007 && types_compatible_p (TREE_TYPE (cond_initial_val),
6008 TREE_TYPE (cond_reduc_val)))
6009 {
6010 tree e = fold_binary (LE_EXPR, boolean_type_node,
6011 cond_initial_val, cond_reduc_val);
6012 if (e && (integer_onep (e) || integer_zerop (e)))
6013 {
6014 if (dump_enabled_p ())
6015 dump_printf_loc (MSG_NOTE, vect_location,
6016 "condition expression based on "
6017 "compile time constant.\n");
6018 /* Record reduction code at analysis stage. */
6019 STMT_VINFO_REDUC_CODE (reduc_info)
6020 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6021 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6022 }
6023 }
6024 }
6025 }
6026
6027 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6028 /* We changed STMT to be the first stmt in the reduction chain, hence
6029 we check that in this case the first element in the chain is STMT. */
6030 gcc_assert (REDUC_GROUP_FIRST_ELEMENT (STMT_VINFO_REDUC_DEF (phi_info))
6031 == vect_orig_stmt (stmt_info));
6032
6033 if (STMT_VINFO_LIVE_P (phi_info))
6034 return false;
6035
6036 if (slp_node)
6037 ncopies = 1;
6038 else
6039 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6040
6041 gcc_assert (ncopies >= 1);
6042
6043 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6044
6045 if (nested_cycle)
6046 {
6047 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6048 == vect_double_reduction_def);
6049 double_reduc = true;
6050 }
6051
6052 /* 4.2. Check support for the epilog operation.
6053
6054 If STMT represents a reduction pattern, then the type of the
6055 reduction variable may be different than the type of the rest
6056 of the arguments. For example, consider the case of accumulation
6057 of shorts into an int accumulator; the original code:
6058 S1: int_a = (int) short_a;
6059 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6060
6061 was replaced with:
6062 STMT: int_acc = widen_sum <short_a, int_acc>
6063
6064 This means that:
6065 1. The tree-code that is used to create the vector operation in the
6066 epilog code (that reduces the partial results) is not the
6067 tree-code of STMT, but is rather the tree-code of the original
6068 stmt from the pattern that STMT is replacing. I.e, in the example
6069 above we want to use 'widen_sum' in the loop, but 'plus' in the
6070 epilog.
6071 2. The type (mode) we use to check available target support
6072 for the vector operation to be created in the *epilog*, is
6073 determined by the type of the reduction variable (in the example
6074 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6075 However the type (mode) we use to check available target support
6076 for the vector operation to be created *inside the loop*, is
6077 determined by the type of the other arguments to STMT (in the
6078 example we'd check this: optab_handler (widen_sum_optab,
6079 vect_short_mode)).
6080
6081 This is contrary to "regular" reductions, in which the types of all
6082 the arguments are the same as the type of the reduction variable.
6083 For "regular" reductions we can therefore use the same vector type
6084 (and also the same tree-code) when generating the epilog code and
6085 when generating the code inside the loop. */
6086
6087 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6088 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6089
6090 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6091 if (reduction_type == TREE_CODE_REDUCTION)
6092 {
6093 /* Check whether it's ok to change the order of the computation.
6094 Generally, when vectorizing a reduction we change the order of the
6095 computation. This may change the behavior of the program in some
6096 cases, so we need to check that this is ok. One exception is when
6097 vectorizing an outer-loop: the inner-loop is executed sequentially,
6098 and therefore vectorizing reductions in the inner-loop during
6099 outer-loop vectorization is safe. */
6100 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6101 {
6102 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6103 is not directly used in stmt. */
6104 if (!only_slp_reduc_chain
6105 && reduc_chain_length != 1)
6106 {
6107 if (dump_enabled_p ())
6108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6109 "in-order reduction chain without SLP.\n");
6110 return false;
6111 }
6112 STMT_VINFO_REDUC_TYPE (reduc_info)
6113 = reduction_type = FOLD_LEFT_REDUCTION;
6114 }
6115 else if (!commutative_tree_code (orig_code)
6116 || !associative_tree_code (orig_code))
6117 {
6118 if (dump_enabled_p ())
6119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6120 "reduction: not commutative/associative");
6121 return false;
6122 }
6123 }
6124
6125 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6126 && ncopies > 1)
6127 {
6128 if (dump_enabled_p ())
6129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6130 "multiple types in double reduction or condition "
6131 "reduction or fold-left reduction.\n");
6132 return false;
6133 }
6134
6135 internal_fn reduc_fn = IFN_LAST;
6136 if (reduction_type == TREE_CODE_REDUCTION
6137 || reduction_type == FOLD_LEFT_REDUCTION
6138 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6139 || reduction_type == CONST_COND_REDUCTION)
6140 {
6141 if (reduction_type == FOLD_LEFT_REDUCTION
6142 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6143 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6144 {
6145 if (reduc_fn != IFN_LAST
6146 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6147 OPTIMIZE_FOR_SPEED))
6148 {
6149 if (dump_enabled_p ())
6150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6151 "reduc op not supported by target.\n");
6152
6153 reduc_fn = IFN_LAST;
6154 }
6155 }
6156 else
6157 {
6158 if (!nested_cycle || double_reduc)
6159 {
6160 if (dump_enabled_p ())
6161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6162 "no reduc code for scalar code.\n");
6163
6164 return false;
6165 }
6166 }
6167 }
6168 else if (reduction_type == COND_REDUCTION)
6169 {
6170 int scalar_precision
6171 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6172 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6173 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6174 nunits_out);
6175
6176 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6177 OPTIMIZE_FOR_SPEED))
6178 reduc_fn = IFN_REDUC_MAX;
6179 }
6180 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6181
6182 if (reduction_type != EXTRACT_LAST_REDUCTION
6183 && (!nested_cycle || double_reduc)
6184 && reduc_fn == IFN_LAST
6185 && !nunits_out.is_constant ())
6186 {
6187 if (dump_enabled_p ())
6188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6189 "missing target support for reduction on"
6190 " variable-length vectors.\n");
6191 return false;
6192 }
6193
6194 /* For SLP reductions, see if there is a neutral value we can use. */
6195 tree neutral_op = NULL_TREE;
6196 if (slp_node)
6197 neutral_op = neutral_op_for_slp_reduction
6198 (slp_node_instance->reduc_phis, orig_code,
6199 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6200
6201 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6202 {
6203 /* We can't support in-order reductions of code such as this:
6204
6205 for (int i = 0; i < n1; ++i)
6206 for (int j = 0; j < n2; ++j)
6207 l += a[j];
6208
6209 since GCC effectively transforms the loop when vectorizing:
6210
6211 for (int i = 0; i < n1 / VF; ++i)
6212 for (int j = 0; j < n2; ++j)
6213 for (int k = 0; k < VF; ++k)
6214 l += a[j];
6215
6216 which is a reassociation of the original operation. */
6217 if (dump_enabled_p ())
6218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6219 "in-order double reduction not supported.\n");
6220
6221 return false;
6222 }
6223
6224 if (reduction_type == FOLD_LEFT_REDUCTION
6225 && slp_node
6226 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6227 {
6228 /* We cannot use in-order reductions in this case because there is
6229 an implicit reassociation of the operations involved. */
6230 if (dump_enabled_p ())
6231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6232 "in-order unchained SLP reductions not supported.\n");
6233 return false;
6234 }
6235
6236 /* For double reductions, and for SLP reductions with a neutral value,
6237 we construct a variable-length initial vector by loading a vector
6238 full of the neutral value and then shift-and-inserting the start
6239 values into the low-numbered elements. */
6240 if ((double_reduc || neutral_op)
6241 && !nunits_out.is_constant ()
6242 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6243 vectype_out, OPTIMIZE_FOR_SPEED))
6244 {
6245 if (dump_enabled_p ())
6246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6247 "reduction on variable-length vectors requires"
6248 " target support for a vector-shift-and-insert"
6249 " operation.\n");
6250 return false;
6251 }
6252
6253 /* Check extra constraints for variable-length unchained SLP reductions. */
6254 if (STMT_SLP_TYPE (stmt_info)
6255 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6256 && !nunits_out.is_constant ())
6257 {
6258 /* We checked above that we could build the initial vector when
6259 there's a neutral element value. Check here for the case in
6260 which each SLP statement has its own initial value and in which
6261 that value needs to be repeated for every instance of the
6262 statement within the initial vector. */
6263 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6264 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6265 if (!neutral_op
6266 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6267 elt_mode))
6268 {
6269 if (dump_enabled_p ())
6270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6271 "unsupported form of SLP reduction for"
6272 " variable-length vectors: cannot build"
6273 " initial vector.\n");
6274 return false;
6275 }
6276 /* The epilogue code relies on the number of elements being a multiple
6277 of the group size. The duplicate-and-interleave approach to setting
6278 up the initial vector does too. */
6279 if (!multiple_p (nunits_out, group_size))
6280 {
6281 if (dump_enabled_p ())
6282 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6283 "unsupported form of SLP reduction for"
6284 " variable-length vectors: the vector size"
6285 " is not a multiple of the number of results.\n");
6286 return false;
6287 }
6288 }
6289
6290 if (reduction_type == COND_REDUCTION)
6291 {
6292 widest_int ni;
6293
6294 if (! max_loop_iterations (loop, &ni))
6295 {
6296 if (dump_enabled_p ())
6297 dump_printf_loc (MSG_NOTE, vect_location,
6298 "loop count not known, cannot create cond "
6299 "reduction.\n");
6300 return false;
6301 }
6302 /* Convert backedges to iterations. */
6303 ni += 1;
6304
6305 /* The additional index will be the same type as the condition. Check
6306 that the loop iteration count fits into this type less one (because
6307 we'll use up the zero slot for when there are no matches). */
6308 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6309 if (wi::geu_p (ni, wi::to_widest (max_index)))
6310 {
6311 if (dump_enabled_p ())
6312 dump_printf_loc (MSG_NOTE, vect_location,
6313 "loop size is greater than data size.\n");
6314 return false;
6315 }
6316 }
6317
6318 /* In case the vectorization factor (VF) is bigger than the number
6319 of elements that we can fit in a vectype (nunits), we have to generate
6320 more than one vector stmt - i.e. we need to "unroll" the
6321 vector stmt by a factor of VF/nunits. For more details see the
6322 documentation in vectorizable_operation. */
6323
6324 /* If the reduction is used in an outer loop we need to generate
6325 VF intermediate results, like so (e.g. for ncopies=2):
6326 r0 = phi (init, r0)
6327 r1 = phi (init, r1)
6328 r0 = x0 + r0;
6329 r1 = x1 + r1;
6330 (i.e. we generate VF results in 2 registers).
6331 In this case we have a separate def-use cycle for each copy, and therefore
6332 for each copy we get the vector def for the reduction variable from the
6333 respective phi node created for this copy.
6334
6335 Otherwise (the reduction is unused in the loop nest), we can combine
6336 together intermediate results, like so (e.g. for ncopies=2):
6337 r = phi (init, r)
6338 r = x0 + r;
6339 r = x1 + r;
6340 (i.e. we generate VF/2 results in a single register).
6341 In this case for each copy we get the vector def for the reduction variable
6342 from the vectorized reduction operation generated in the previous iteration.
6343
6344 This only works when we see both the reduction PHI and its only consumer
6345 in vectorizable_reduction and there are no intermediate stmts
6346 participating. */
6347 if (ncopies > 1
6348 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6349 && reduc_chain_length == 1)
6350 single_defuse_cycle = true;
6351
6352 if (single_defuse_cycle || lane_reduc_code_p)
6353 {
6354 gcc_assert (code != COND_EXPR);
6355
6356 /* 4. Supportable by target? */
6357 bool ok = true;
6358
6359 /* 4.1. check support for the operation in the loop */
6360 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6361 if (!optab)
6362 {
6363 if (dump_enabled_p ())
6364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6365 "no optab.\n");
6366 ok = false;
6367 }
6368
6369 machine_mode vec_mode = TYPE_MODE (vectype_in);
6370 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6371 {
6372 if (dump_enabled_p ())
6373 dump_printf (MSG_NOTE, "op not supported by target.\n");
6374 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6375 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6376 ok = false;
6377 else
6378 if (dump_enabled_p ())
6379 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6380 }
6381
6382 /* Worthwhile without SIMD support? */
6383 if (ok
6384 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6385 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6386 {
6387 if (dump_enabled_p ())
6388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6389 "not worthwhile without SIMD support.\n");
6390 ok = false;
6391 }
6392
6393 /* Lane-reducing operations have to go through vect_transform_reduction.
6394 For the other cases try without the single-cycle optimization. */
6395 if (!ok)
6396 {
6397 if (lane_reduc_code_p)
6398 return false;
6399 else
6400 single_defuse_cycle = false;
6401 }
6402 }
6403 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6404
6405 /* If the reduction stmt is one of the patterns that have lane
6406 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6407 if ((ncopies > 1 && ! single_defuse_cycle)
6408 && lane_reduc_code_p)
6409 {
6410 if (dump_enabled_p ())
6411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6412 "multi def-use cycle not possible for lane-reducing "
6413 "reduction operation\n");
6414 return false;
6415 }
6416
6417 if (slp_node)
6418 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6419 else
6420 vec_num = 1;
6421
6422 vect_model_reduction_cost (stmt_info, reduc_fn, reduction_type, ncopies,
6423 cost_vec);
6424 if (dump_enabled_p ()
6425 && reduction_type == FOLD_LEFT_REDUCTION)
6426 dump_printf_loc (MSG_NOTE, vect_location,
6427 "using an in-order (fold-left) reduction.\n");
6428 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
6429 /* All but single defuse-cycle optimized, lane-reducing and fold-left
6430 reductions go through their own vectorizable_* routines. */
6431 if (!single_defuse_cycle
6432 && code != DOT_PROD_EXPR
6433 && code != WIDEN_SUM_EXPR
6434 && code != SAD_EXPR
6435 && reduction_type != FOLD_LEFT_REDUCTION)
6436 {
6437 STMT_VINFO_DEF_TYPE (stmt_info) = vect_internal_def;
6438 STMT_VINFO_DEF_TYPE (vect_orig_stmt (stmt_info)) = vect_internal_def;
6439 }
6440 else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6441 {
6442 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6443 internal_fn cond_fn = get_conditional_internal_fn (code);
6444
6445 if (reduction_type != FOLD_LEFT_REDUCTION
6446 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6447 && (cond_fn == IFN_LAST
6448 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6449 OPTIMIZE_FOR_SPEED)))
6450 {
6451 if (dump_enabled_p ())
6452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6453 "can't use a fully-masked loop because no"
6454 " conditional operation is available.\n");
6455 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6456 }
6457 else
6458 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6459 vectype_in, NULL);
6460 }
6461 return true;
6462 }
6463
6464 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6465 value. */
6466
6467 bool
6468 vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6469 stmt_vec_info *vec_stmt, slp_tree slp_node)
6470 {
6471 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6472 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6473 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6474 int i;
6475 int ncopies;
6476 int j;
6477 int vec_num;
6478
6479 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6480 gcc_assert (reduc_info->is_reduc_info);
6481
6482 if (nested_in_vect_loop_p (loop, stmt_info))
6483 {
6484 loop = loop->inner;
6485 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
6486 }
6487
6488 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6489 enum tree_code code = gimple_assign_rhs_code (stmt);
6490 int op_type = TREE_CODE_LENGTH (code);
6491
6492 /* Flatten RHS. */
6493 tree ops[3];
6494 switch (get_gimple_rhs_class (code))
6495 {
6496 case GIMPLE_TERNARY_RHS:
6497 ops[2] = gimple_assign_rhs3 (stmt);
6498 /* Fall thru. */
6499 case GIMPLE_BINARY_RHS:
6500 ops[0] = gimple_assign_rhs1 (stmt);
6501 ops[1] = gimple_assign_rhs2 (stmt);
6502 break;
6503 default:
6504 gcc_unreachable ();
6505 }
6506
6507 /* All uses but the last are expected to be defined in the loop.
6508 The last use is the reduction variable. In case of a nested cycle this
6509 assumption does not hold: we use reduc_index to record the index of the
6510 reduction variable. */
6511 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
6512 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6513 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
6514 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6515
6516 if (slp_node)
6517 {
6518 ncopies = 1;
6519 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6520 }
6521 else
6522 {
6523 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6524 vec_num = 1;
6525 }
6526
6527 internal_fn cond_fn = get_conditional_internal_fn (code);
6528 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6529 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6530
6531 /* Transform. */
6532 stmt_vec_info new_stmt_info = NULL;
6533 stmt_vec_info prev_stmt_info;
6534 tree new_temp = NULL_TREE;
6535 auto_vec<tree> vec_oprnds0;
6536 auto_vec<tree> vec_oprnds1;
6537 auto_vec<tree> vec_oprnds2;
6538 tree def0;
6539
6540 if (dump_enabled_p ())
6541 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6542
6543 /* FORNOW: Multiple types are not supported for condition. */
6544 if (code == COND_EXPR)
6545 gcc_assert (ncopies == 1);
6546
6547 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6548
6549 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6550 if (reduction_type == FOLD_LEFT_REDUCTION)
6551 {
6552 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6553 return vectorize_fold_left_reduction
6554 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6555 reduc_fn, ops, vectype_in, reduc_index, masks);
6556 }
6557
6558 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
6559 gcc_assert (single_defuse_cycle
6560 || code == DOT_PROD_EXPR
6561 || code == WIDEN_SUM_EXPR
6562 || code == SAD_EXPR);
6563
6564 /* Create the destination vector */
6565 tree scalar_dest = gimple_assign_lhs (stmt);
6566 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6567
6568 prev_stmt_info = NULL;
6569 if (!slp_node)
6570 {
6571 vec_oprnds0.create (1);
6572 vec_oprnds1.create (1);
6573 if (op_type == ternary_op)
6574 vec_oprnds2.create (1);
6575 }
6576
6577 for (j = 0; j < ncopies; j++)
6578 {
6579 /* Handle uses. */
6580 if (j == 0)
6581 {
6582 if (slp_node)
6583 {
6584 /* Get vec defs for all the operands except the reduction index,
6585 ensuring the ordering of the ops in the vector is kept. */
6586 auto_vec<vec<tree>, 3> vec_defs;
6587 vect_get_slp_defs (slp_node, &vec_defs);
6588 vec_oprnds0.safe_splice (vec_defs[0]);
6589 vec_defs[0].release ();
6590 vec_oprnds1.safe_splice (vec_defs[1]);
6591 vec_defs[1].release ();
6592 if (op_type == ternary_op)
6593 {
6594 vec_oprnds2.safe_splice (vec_defs[2]);
6595 vec_defs[2].release ();
6596 }
6597 }
6598 else
6599 {
6600 vec_oprnds0.quick_push
6601 (vect_get_vec_def_for_operand (ops[0], stmt_info));
6602 vec_oprnds1.quick_push
6603 (vect_get_vec_def_for_operand (ops[1], stmt_info));
6604 if (op_type == ternary_op)
6605 vec_oprnds2.quick_push
6606 (vect_get_vec_def_for_operand (ops[2], stmt_info));
6607 }
6608 }
6609 else
6610 {
6611 if (!slp_node)
6612 {
6613 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6614
6615 if (single_defuse_cycle && reduc_index == 0)
6616 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
6617 else
6618 vec_oprnds0[0]
6619 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6620 vec_oprnds0[0]);
6621 if (single_defuse_cycle && reduc_index == 1)
6622 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
6623 else
6624 vec_oprnds1[0]
6625 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6626 vec_oprnds1[0]);
6627 if (op_type == ternary_op)
6628 {
6629 if (single_defuse_cycle && reduc_index == 2)
6630 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
6631 else
6632 vec_oprnds2[0]
6633 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6634 vec_oprnds2[0]);
6635 }
6636 }
6637 }
6638
6639 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6640 {
6641 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6642 if (masked_loop_p && !mask_by_cond_expr)
6643 {
6644 /* Make sure that the reduction accumulator is vop[0]. */
6645 if (reduc_index == 1)
6646 {
6647 gcc_assert (commutative_tree_code (code));
6648 std::swap (vop[0], vop[1]);
6649 }
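/* Build MASK ? VOP[0] CODE VOP[1] : VOP[0] via the conditional internal
function, so lanes disabled by the loop mask simply keep the current
accumulator value (the "else" operand is VOP[0]). */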
6650 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
6651 vectype_in, i * ncopies + j);
6652 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
6653 vop[0], vop[1],
6654 vop[0]);
6655 new_temp = make_ssa_name (vec_dest, call);
6656 gimple_call_set_lhs (call, new_temp);
6657 gimple_call_set_nothrow (call, true);
6658 new_stmt_info
6659 = vect_finish_stmt_generation (stmt_info, call, gsi);
6660 }
6661 else
6662 {
6663 if (op_type == ternary_op)
6664 vop[2] = vec_oprnds2[i];
6665
6666 if (masked_loop_p && mask_by_cond_expr)
6667 {
6668 tree mask = vect_get_loop_mask (gsi, masks,
6669 vec_num * ncopies,
6670 vectype_in, i * ncopies + j);
6671 build_vect_cond_expr (code, vop, mask, gsi);
6672 }
6673
6674 gassign *new_stmt = gimple_build_assign (vec_dest, code,
6675 vop[0], vop[1], vop[2]);
6676 new_temp = make_ssa_name (vec_dest, new_stmt);
6677 gimple_assign_set_lhs (new_stmt, new_temp);
6678 new_stmt_info
6679 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
6680 }
6681
6682 if (slp_node)
6683 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
6684 }
6685
6686 if (slp_node || single_defuse_cycle)
6687 continue;
6688
6689 if (j == 0)
6690 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6691 else
6692 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6693
6694 prev_stmt_info = new_stmt_info;
6695 }
6696
6697 if (single_defuse_cycle && !slp_node)
6698 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6699
6700 return true;
6701 }
6702
6703 /* Transform phase of a cycle PHI. */
6704
6705 bool
6706 vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
6707 slp_tree slp_node, slp_instance slp_node_instance)
6708 {
6709 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6710 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6711 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6712 int i;
6713 int ncopies;
6714 stmt_vec_info prev_phi_info;
6715 int j;
6716 bool nested_cycle = false;
6717 int vec_num;
6718
6719 if (nested_in_vect_loop_p (loop, stmt_info))
6720 {
6721 loop = loop->inner;
6722 nested_cycle = true;
6723 }
6724
6725 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6726 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6727 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6728 gcc_assert (reduc_info->is_reduc_info);
6729
6730 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
6731 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
6732 /* Leave the scalar phi in place. */
6733 return true;
6734
6735 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6736 /* For a nested cycle we do not fill the above. */
6737 if (!vectype_in)
6738 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6739 gcc_assert (vectype_in);
6740
6741 if (slp_node)
6742 {
6743 /* The size vect_schedule_slp_instance computes is off for us. */
6744 vec_num = vect_get_num_vectors
6745 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6746 * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
6747 ncopies = 1;
6748 }
6749 else
6750 {
6751 vec_num = 1;
6752 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6753 }
6754
6755 /* Check whether we should use a single PHI node and accumulate
6756 vectors to one before the backedge. */
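/* With a single def-use cycle the vectorized partial results feed each
other inside the loop body, so a single accumulator PHI is enough even
when ncopies would otherwise be greater than one. */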
6757 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
6758 ncopies = 1;
6759
6760 /* Create the destination vector */
6761 gphi *phi = as_a <gphi *> (stmt_info->stmt);
6762 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
6763 vectype_out);
6764
6765 /* Get the loop-entry arguments. */
6766 tree vec_initial_def;
6767 auto_vec<tree> vec_initial_defs;
6768 if (slp_node)
6769 {
6770 vec_initial_defs.reserve (vec_num);
6771 gcc_assert (slp_node == slp_node_instance->reduc_phis);
6772 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
6773 tree neutral_op
6774 = neutral_op_for_slp_reduction (slp_node,
6775 STMT_VINFO_REDUC_CODE (reduc_info),
6776 first != NULL);
6777 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
6778 &vec_initial_defs, vec_num,
6779 first != NULL, neutral_op);
6780 }
6781 else
6782 {
6783 /* Get at the scalar def before the loop, that defines the initial
6784 value of the reduction variable. */
6785 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
6786 loop_preheader_edge (loop));
6787 /* Optimize: if initial_def is for REDUC_MAX and smaller than the base,
6788 and we can't use zero for induc_val, use initial_def. Similarly
6789 for REDUC_MIN with initial_def larger than the base. */
6790 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6791 {
6792 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6793 if (TREE_CODE (initial_def) == INTEGER_CST
6794 && !integer_zerop (induc_val)
6795 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
6796 && tree_int_cst_lt (initial_def, induc_val))
6797 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
6798 && tree_int_cst_lt (induc_val, initial_def))))
6799 {
6800 induc_val = initial_def;
6801 /* Communicate to the epilogue generation that we used
6802 the initial_def. */
6803 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
6804 }
6805 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
6806 }
6807 else if (nested_cycle)
6808 {
6809 /* Do not use an adjustment def as that case is not supported
6810 correctly if ncopies is not one. */
6811 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
6812 reduc_stmt_info);
6813 }
6814 else
6815 {
6816 tree adjustment_def = NULL_TREE;
6817 tree *adjustment_defp = &adjustment_def;
6818 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
6819 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6820 adjustment_defp = NULL;
6821 vec_initial_def
6822 = get_initial_def_for_reduction (reduc_stmt_info, code,
6823 initial_def, adjustment_defp);
6824 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
6825 }
6826 vec_initial_defs.create (1);
6827 vec_initial_defs.quick_push (vec_initial_def);
6828 }
6829
6830 /* Generate the reduction PHIs upfront. */
6831 prev_phi_info = NULL;
6832 for (i = 0; i < vec_num; i++)
6833 {
6834 tree vec_init_def = vec_initial_defs[i];
6835 for (j = 0; j < ncopies; j++)
6836 {
6837 /* Create the reduction-phi that defines the reduction
6838 operand. */
6839 gphi *new_phi = create_phi_node (vec_dest, loop->header);
6840 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6841
6842 /* Set the loop-entry arg of the reduction-phi. */
6843 if (j != 0 && nested_cycle)
6844 vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6845 vec_init_def);
6846 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
6847 UNKNOWN_LOCATION);
6848
6849 /* The loop-latch arg is set in epilogue processing. */
6850
6851 if (slp_node)
6852 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6853 else
6854 {
6855 if (j == 0)
6856 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
6857 else
6858 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6859 prev_phi_info = new_phi_info;
6860 }
6861 }
6862 }
6863
6864 return true;
6865 }
6866
6867 /* Vectorizes LC PHIs. */
6868
6869 bool
6870 vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
6871 slp_tree slp_node)
6872 {
6873 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6874 if (!loop_vinfo
6875 || !is_a <gphi *> (stmt_info->stmt)
6876 || gimple_phi_num_args (stmt_info->stmt) != 1)
6877 return false;
6878
6879 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
6880 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
6881 return false;
6882
6883 if (!vec_stmt) /* transformation not required. */
6884 {
6885 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
6886 return true;
6887 }
6888
6889 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6890 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
6891 basic_block bb = gimple_bb (stmt_info->stmt);
6892 edge e = single_pred_edge (bb);
6893 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
6894 vec<tree> vec_oprnds = vNULL;
6895 vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
6896 stmt_info, &vec_oprnds, NULL, slp_node);
6897 if (slp_node)
6898 {
6899 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6900 gcc_assert (vec_oprnds.length () == vec_num);
6901 for (unsigned i = 0; i < vec_num; i++)
6902 {
6903 /* Create the vectorized LC PHI node. */
6904 gphi *new_phi = create_phi_node (vec_dest, bb);
6905 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
6906 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6907 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6908 }
6909 }
6910 else
6911 {
6912 unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
6913 stmt_vec_info prev_phi_info = NULL;
6914 for (unsigned i = 0; i < ncopies; i++)
6915 {
6916 if (i != 0)
6917 vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
6918 /* Create the vectorized LC PHI node. */
6919 gphi *new_phi = create_phi_node (vec_dest, bb);
6920 add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
6921 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6922 if (i == 0)
6923 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
6924 else
6925 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6926 prev_phi_info = new_phi_info;
6927 }
6928 }
6929 vec_oprnds.release ();
6930
6931 return true;
6932 }
6933
6934
6935 /* Function vect_min_worthwhile_factor.
6936
6937 For a loop where we could vectorize the operation indicated by CODE,
6938 return the minimum vectorization factor that makes it worthwhile
6939 to use generic vectors. */
6940 static unsigned int
6941 vect_min_worthwhile_factor (enum tree_code code)
6942 {
6943 switch (code)
6944 {
6945 case PLUS_EXPR:
6946 case MINUS_EXPR:
6947 case NEGATE_EXPR:
6948 return 4;
6949
6950 case BIT_AND_EXPR:
6951 case BIT_IOR_EXPR:
6952 case BIT_XOR_EXPR:
6953 case BIT_NOT_EXPR:
6954 return 2;
6955
6956 default:
6957 return INT_MAX;
6958 }
6959 }
6960
6961 /* Return true if VINFO indicates we are doing loop vectorization and if
6962 it is worth decomposing CODE operations into scalar operations for
6963 that loop's vectorization factor. */
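/* For example, with a constant vectorization factor of 4 a PLUS_EXPR is
considered worthwhile (4 >= 4), whereas with a factor of 2 only the
bitwise operations listed above qualify. */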
6964
6965 bool
6966 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6967 {
6968 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6969 unsigned HOST_WIDE_INT value;
6970 return (loop_vinfo
6971 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
6972 && value >= vect_min_worthwhile_factor (code));
6973 }
6974
6975 /* Function vectorizable_induction
6976
6977 Check if STMT_INFO performs an induction computation that can be vectorized.
6978 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6979 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6980 Return true if STMT_INFO is vectorizable in this way. */
6981
6982 bool
6983 vectorizable_induction (stmt_vec_info stmt_info,
6984 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6985 stmt_vec_info *vec_stmt, slp_tree slp_node,
6986 stmt_vector_for_cost *cost_vec)
6987 {
6988 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6989 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6990 unsigned ncopies;
6991 bool nested_in_vect_loop = false;
6992 class loop *iv_loop;
6993 tree vec_def;
6994 edge pe = loop_preheader_edge (loop);
6995 basic_block new_bb;
6996 tree new_vec, vec_init, vec_step, t;
6997 tree new_name;
6998 gimple *new_stmt;
6999 gphi *induction_phi;
7000 tree induc_def, vec_dest;
7001 tree init_expr, step_expr;
7002 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7003 unsigned i;
7004 tree expr;
7005 gimple_seq stmts;
7006 imm_use_iterator imm_iter;
7007 use_operand_p use_p;
7008 gimple *exit_phi;
7009 edge latch_e;
7010 tree loop_arg;
7011 gimple_stmt_iterator si;
7012
7013 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7014 if (!phi)
7015 return false;
7016
7017 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7018 return false;
7019
7020 /* Make sure it was recognized as induction computation. */
7021 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7022 return false;
7023
7024 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7025 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7026
7027 if (slp_node)
7028 ncopies = 1;
7029 else
7030 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7031 gcc_assert (ncopies >= 1);
7032
7033 /* FORNOW. These restrictions should be relaxed. */
7034 if (nested_in_vect_loop_p (loop, stmt_info))
7035 {
7036 imm_use_iterator imm_iter;
7037 use_operand_p use_p;
7038 gimple *exit_phi;
7039 edge latch_e;
7040 tree loop_arg;
7041
7042 if (ncopies > 1)
7043 {
7044 if (dump_enabled_p ())
7045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7046 "multiple types in nested loop.\n");
7047 return false;
7048 }
7049
7050 /* FORNOW: outer loop induction with SLP not supported. */
7051 if (STMT_SLP_TYPE (stmt_info))
7052 return false;
7053
7054 exit_phi = NULL;
7055 latch_e = loop_latch_edge (loop->inner);
7056 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7057 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7058 {
7059 gimple *use_stmt = USE_STMT (use_p);
7060 if (is_gimple_debug (use_stmt))
7061 continue;
7062
7063 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7064 {
7065 exit_phi = use_stmt;
7066 break;
7067 }
7068 }
7069 if (exit_phi)
7070 {
7071 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7072 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7073 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7074 {
7075 if (dump_enabled_p ())
7076 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7077 "inner-loop induction only used outside "
7078 "of the outer vectorized loop.\n");
7079 return false;
7080 }
7081 }
7082
7083 nested_in_vect_loop = true;
7084 iv_loop = loop->inner;
7085 }
7086 else
7087 iv_loop = loop;
7088 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7089
7090 if (slp_node && !nunits.is_constant ())
7091 {
7092 /* The current SLP code creates the initial value element-by-element. */
7093 if (dump_enabled_p ())
7094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7095 "SLP induction not supported for variable-length"
7096 " vectors.\n");
7097 return false;
7098 }
7099
7100 if (!vec_stmt) /* transformation not required. */
7101 {
7102 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7103 DUMP_VECT_SCOPE ("vectorizable_induction");
7104 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7105 return true;
7106 }
7107
7108 /* Transform. */
7109
7110 /* Compute a vector variable, initialized with the first VF values of
7111 the induction variable. E.g., for an iv with IV_PHI='X' and
7112 evolution S, for a vector of 4 units, we want to compute:
7113 [X, X + S, X + 2*S, X + 3*S]. */
7114
7115 if (dump_enabled_p ())
7116 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7117
7118 latch_e = loop_latch_edge (iv_loop);
7119 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7120
7121 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7122 gcc_assert (step_expr != NULL_TREE);
7123 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7124
7125 pe = loop_preheader_edge (iv_loop);
7126 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7127 loop_preheader_edge (iv_loop));
7128
7129 stmts = NULL;
7130 if (!nested_in_vect_loop)
7131 {
7132 /* Convert the initial value to the IV update type. */
7133 tree new_type = TREE_TYPE (step_expr);
7134 init_expr = gimple_convert (&stmts, new_type, init_expr);
7135
7136 /* If we are using the loop mask to "peel" for alignment then we need
7137 to adjust the start value here. */
7138 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7139 if (skip_niters != NULL_TREE)
7140 {
7141 if (FLOAT_TYPE_P (vectype))
7142 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7143 skip_niters);
7144 else
7145 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7146 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7147 skip_niters, step_expr);
7148 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7149 init_expr, skip_step);
7150 }
7151 }
7152
7153 if (stmts)
7154 {
7155 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7156 gcc_assert (!new_bb);
7157 }
7158
7159 /* Find the first insertion point in the BB. */
7160 basic_block bb = gimple_bb (phi);
7161 si = gsi_after_labels (bb);
7162
7163 /* For SLP induction we have to generate several IVs as for example
7164 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7165 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7166 [VF*S, VF*S, VF*S, VF*S] for all. */
7167 if (slp_node)
7168 {
7169 /* Enforced above. */
7170 unsigned int const_nunits = nunits.to_constant ();
7171
7172 /* Generate [VF*S, VF*S, ... ]. */
7173 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7174 {
7175 expr = build_int_cst (integer_type_node, vf);
7176 expr = fold_convert (TREE_TYPE (step_expr), expr);
7177 }
7178 else
7179 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7180 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7181 expr, step_expr);
7182 if (! CONSTANT_CLASS_P (new_name))
7183 new_name = vect_init_vector (stmt_info, new_name,
7184 TREE_TYPE (step_expr), NULL);
7185 new_vec = build_vector_from_val (step_vectype, new_name);
7186 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7187
7188 /* Now generate the IVs. */
7189 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7190 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7191 unsigned elts = const_nunits * nvects;
7192 unsigned nivs = least_common_multiple (group_size,
7193 const_nunits) / const_nunits;
7194 gcc_assert (elts % group_size == 0);
7195 tree elt = init_expr;
7196 unsigned ivn;
7197 for (ivn = 0; ivn < nivs; ++ivn)
7198 {
7199 tree_vector_builder elts (step_vectype, const_nunits, 1);
7200 stmts = NULL;
7201 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7202 {
7203 if (ivn*const_nunits + eltn >= group_size
7204 && (ivn * const_nunits + eltn) % group_size == 0)
7205 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7206 elt, step_expr);
7207 elts.quick_push (elt);
7208 }
7209 vec_init = gimple_build_vector (&stmts, &elts);
7210 vec_init = gimple_convert (&stmts, vectype, vec_init);
7211 if (stmts)
7212 {
7213 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7214 gcc_assert (!new_bb);
7215 }
7216
7217 /* Create the induction-phi that defines the induction-operand. */
7218 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7219 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7220 stmt_vec_info induction_phi_info
7221 = loop_vinfo->add_stmt (induction_phi);
7222 induc_def = PHI_RESULT (induction_phi);
7223
7224 /* Create the iv update inside the loop */
7225 gimple_seq stmts = NULL;
7226 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7227 vec_def = gimple_build (&stmts,
7228 PLUS_EXPR, step_vectype, vec_def, vec_step);
7229 vec_def = gimple_convert (&stmts, vectype, vec_def);
7230 loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
7231 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7232
7233 /* Set the arguments of the phi node: */
7234 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7235 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7236 UNKNOWN_LOCATION);
7237
7238 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7239 }
7240
7241 /* Re-use IVs when we can. */
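/* The remaining vector stmts are obtained by adding a step of VF'*S
(with VF' = least_common_multiple (group_size, nunits) / group_size)
to the IVs generated above, instead of creating new PHIs for them. */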
7242 if (ivn < nvects)
7243 {
7244 unsigned vfp
7245 = least_common_multiple (group_size, const_nunits) / group_size;
7246 /* Generate [VF'*S, VF'*S, ... ]. */
7247 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7248 {
7249 expr = build_int_cst (integer_type_node, vfp);
7250 expr = fold_convert (TREE_TYPE (step_expr), expr);
7251 }
7252 else
7253 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7254 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7255 expr, step_expr);
7256 if (! CONSTANT_CLASS_P (new_name))
7257 new_name = vect_init_vector (stmt_info, new_name,
7258 TREE_TYPE (step_expr), NULL);
7259 new_vec = build_vector_from_val (step_vectype, new_name);
7260 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7261 for (; ivn < nvects; ++ivn)
7262 {
7263 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7264 tree def;
7265 if (gimple_code (iv) == GIMPLE_PHI)
7266 def = gimple_phi_result (iv);
7267 else
7268 def = gimple_assign_lhs (iv);
7269 gimple_seq stmts = NULL;
7270 def = gimple_convert (&stmts, step_vectype, def);
7271 def = gimple_build (&stmts,
7272 PLUS_EXPR, step_vectype, def, vec_step);
7273 def = gimple_convert (&stmts, vectype, def);
7274 if (gimple_code (iv) == GIMPLE_PHI)
7275 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7276 else
7277 {
7278 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7279 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7280 }
7281 SLP_TREE_VEC_STMTS (slp_node).quick_push
7282 (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
7283 }
7284 }
7285
7286 return true;
7287 }
7288
7289 /* Create the vector that holds the initial_value of the induction. */
7290 if (nested_in_vect_loop)
7291 {
7292 /* iv_loop is nested in the loop to be vectorized. init_expr has already
7293 been created during vectorization of previous stmts. We obtain it
7294 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7295 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7296 /* If the initial value is not of proper type, convert it. */
7297 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7298 {
7299 new_stmt
7300 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7301 vect_simple_var,
7302 "vec_iv_"),
7303 VIEW_CONVERT_EXPR,
7304 build1 (VIEW_CONVERT_EXPR, vectype,
7305 vec_init));
7306 vec_init = gimple_assign_lhs (new_stmt);
7307 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7308 new_stmt);
7309 gcc_assert (!new_bb);
7310 loop_vinfo->add_stmt (new_stmt);
7311 }
7312 }
7313 else
7314 {
7315 /* iv_loop is the loop to be vectorized. Create:
7316 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7317 stmts = NULL;
7318 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7319
7320 unsigned HOST_WIDE_INT const_nunits;
7321 if (nunits.is_constant (&const_nunits))
7322 {
7323 tree_vector_builder elts (step_vectype, const_nunits, 1);
7324 elts.quick_push (new_name);
7325 for (i = 1; i < const_nunits; i++)
7326 {
7327 /* Create: new_name_i = new_name + step_expr */
7328 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7329 new_name, step_expr);
7330 elts.quick_push (new_name);
7331 }
7332 /* Create a vector from [new_name_0, new_name_1, ...,
7333 new_name_nunits-1] */
7334 vec_init = gimple_build_vector (&stmts, &elts);
7335 }
7336 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7337 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7338 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7339 new_name, step_expr);
7340 else
7341 {
7342 /* Build:
7343 [base, base, base, ...]
7344 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7345 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7346 gcc_assert (flag_associative_math);
7347 tree index = build_index_vector (step_vectype, 0, 1);
7348 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7349 new_name);
7350 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7351 step_expr);
7352 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7353 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7354 vec_init, step_vec);
7355 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7356 vec_init, base_vec);
7357 }
7358 vec_init = gimple_convert (&stmts, vectype, vec_init);
7359
7360 if (stmts)
7361 {
7362 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7363 gcc_assert (!new_bb);
7364 }
7365 }
7366
7367
7368 /* Create the vector that holds the step of the induction. */
7369 if (nested_in_vect_loop)
7370 /* iv_loop is nested in the loop to be vectorized. Generate:
7371 vec_step = [S, S, S, S] */
7372 new_name = step_expr;
7373 else
7374 {
7375 /* iv_loop is the loop to be vectorized. Generate:
7376 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7377 gimple_seq seq = NULL;
7378 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7379 {
7380 expr = build_int_cst (integer_type_node, vf);
7381 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7382 }
7383 else
7384 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7385 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7386 expr, step_expr);
7387 if (seq)
7388 {
7389 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7390 gcc_assert (!new_bb);
7391 }
7392 }
7393
7394 t = unshare_expr (new_name);
7395 gcc_assert (CONSTANT_CLASS_P (new_name)
7396 || TREE_CODE (new_name) == SSA_NAME);
7397 new_vec = build_vector_from_val (step_vectype, t);
7398 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7399
7400
7401 /* Create the following def-use cycle:
7402 loop prolog:
7403 vec_init = ...
7404 vec_step = ...
7405 loop:
7406 vec_iv = PHI <vec_init, vec_loop>
7407 ...
7408 STMT
7409 ...
7410 vec_loop = vec_iv + vec_step; */
7411
7412 /* Create the induction-phi that defines the induction-operand. */
7413 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7414 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7415 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7416 induc_def = PHI_RESULT (induction_phi);
7417
7418 /* Create the iv update inside the loop */
7419 stmts = NULL;
7420 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7421 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7422 vec_def = gimple_convert (&stmts, vectype, vec_def);
7423 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7424 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7425 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7426
7427 /* Set the arguments of the phi node: */
7428 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7429 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7430 UNKNOWN_LOCATION);
7431
7432 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7433
7434 /* In case the vectorization factor (VF) is bigger than the number
7435 of elements that we can fit in a vectype (nunits), we have to generate
7436 more than one vector stmt, i.e. we need to "unroll" the
7437 vector stmt by a factor of VF/nunits. For more details see the
7438 documentation in vectorizable_operation. */
7439
7440 if (ncopies > 1)
7441 {
7442 gimple_seq seq = NULL;
7443 stmt_vec_info prev_stmt_vinfo;
7444 /* FORNOW. This restriction should be relaxed. */
7445 gcc_assert (!nested_in_vect_loop);
7446
7447 /* Create the vector that holds the step of the induction. */
7448 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7449 {
7450 expr = build_int_cst (integer_type_node, nunits);
7451 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7452 }
7453 else
7454 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7455 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7456 expr, step_expr);
7457 if (seq)
7458 {
7459 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7460 gcc_assert (!new_bb);
7461 }
7462
7463 t = unshare_expr (new_name);
7464 gcc_assert (CONSTANT_CLASS_P (new_name)
7465 || TREE_CODE (new_name) == SSA_NAME);
7466 new_vec = build_vector_from_val (step_vectype, t);
7467 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7468
7469 vec_def = induc_def;
7470 prev_stmt_vinfo = induction_phi_info;
7471 for (i = 1; i < ncopies; i++)
7472 {
7473 /* vec_i = vec_prev + vec_step */
7474 gimple_seq stmts = NULL;
7475 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7476 vec_def = gimple_build (&stmts,
7477 PLUS_EXPR, step_vectype, vec_def, vec_step);
7478 vec_def = gimple_convert (&stmts, vectype, vec_def);
7479
7480 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7481 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7482 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7483 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7484 prev_stmt_vinfo = new_stmt_info;
7485 }
7486 }
7487
7488 if (nested_in_vect_loop)
7489 {
7490 /* Find the loop-closed exit-phi of the induction, and record
7491 the final vector of induction results: */
7492 exit_phi = NULL;
7493 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7494 {
7495 gimple *use_stmt = USE_STMT (use_p);
7496 if (is_gimple_debug (use_stmt))
7497 continue;
7498
7499 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7500 {
7501 exit_phi = use_stmt;
7502 break;
7503 }
7504 }
7505 if (exit_phi)
7506 {
7507 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7508 /* FORNOW. Currently not supporting the case that an inner-loop induction
7509 is not used in the outer-loop (i.e. only outside the outer-loop). */
7510 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7511 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7512
7513 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7514 if (dump_enabled_p ())
7515 dump_printf_loc (MSG_NOTE, vect_location,
7516 "vector of inductions after inner-loop:%G",
7517 new_stmt);
7518 }
7519 }
7520
7521
7522 if (dump_enabled_p ())
7523 dump_printf_loc (MSG_NOTE, vect_location,
7524 "transform induction: created def-use cycle: %G%G",
7525 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7526
7527 return true;
7528 }
7529
7530 /* Function vectorizable_live_operation.
7531
7532 STMT_INFO computes a value that is used outside the loop. Check if
7533 it can be supported. */
7534
7535 bool
7536 vectorizable_live_operation (stmt_vec_info stmt_info,
7537 gimple_stmt_iterator *gsi,
7538 slp_tree slp_node, slp_instance slp_node_instance,
7539 int slp_index, bool vec_stmt_p,
7540 stmt_vector_for_cost *)
7541 {
7542 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7543 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7544 imm_use_iterator imm_iter;
7545 tree lhs, lhs_type, bitsize, vec_bitsize;
7546 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7547 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7548 int ncopies;
7549 gimple *use_stmt;
7550 auto_vec<tree> vec_oprnds;
7551 int vec_entry = 0;
7552 poly_uint64 vec_index = 0;
7553
7554 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7555
7556 /* If a stmt of a reduction is live, vectorize it via
7557 vect_create_epilog_for_reduction. vectorizable_reduction assessed
7558 validity so just trigger the transform here. */
7559 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
7560 {
7561 if (!vec_stmt_p)
7562 return true;
7563 if (slp_node)
7564 {
7565 /* For reduction chains the meta-info is attached to
7566 the group leader. */
7567 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7568 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7569 /* For SLP reductions we vectorize the epilogue for
7570 all involved stmts together. */
7571 else if (slp_index != 0)
7572 return true;
7573 }
7574 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7575 gcc_assert (reduc_info->is_reduc_info);
7576 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
7577 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
7578 return true;
7579 vect_create_epilog_for_reduction (stmt_info, slp_node,
7580 slp_node_instance);
7581 return true;
7582 }
7583
7584 /* FORNOW. CHECKME. */
7585 if (nested_in_vect_loop_p (loop, stmt_info))
7586 return false;
7587
7588 /* If STMT is not relevant and it is a simple assignment and its inputs are
7589 invariant then it can remain in place, unvectorized. The original last
7590 scalar value that it computes will be used. */
7591 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7592 {
7593 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7594 if (dump_enabled_p ())
7595 dump_printf_loc (MSG_NOTE, vect_location,
7596 "statement is simple and uses invariant. Leaving in "
7597 "place.\n");
7598 return true;
7599 }
7600
7601 if (slp_node)
7602 ncopies = 1;
7603 else
7604 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7605
7606 if (slp_node)
7607 {
7608 gcc_assert (slp_index >= 0);
7609
7610 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7611 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7612
7613 /* Get the last occurrence of the scalar index from the concatenation of
7614 all the slp vectors. Calculate which slp vector it is and the index
7615 within. */
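/* For example, with a group of 2 scalar stmts and two SLP vectors of
4 lanes each (8 lanes in total), the last occurrence of scalar index 1
is at pos = 2 * 4 - 2 + 1 = 7, i.e. lane 3 of vector 1. */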
7616 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7617
7618 /* Calculate which vector contains the result, and which lane of
7619 that vector we need. */
7620 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7621 {
7622 if (dump_enabled_p ())
7623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7624 "Cannot determine which vector holds the"
7625 " final result.\n");
7626 return false;
7627 }
7628 }
7629
7630 if (!vec_stmt_p)
7631 {
7632 /* No transformation required. */
7633 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7634 {
7635 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7636 OPTIMIZE_FOR_SPEED))
7637 {
7638 if (dump_enabled_p ())
7639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7640 "can't use a fully-masked loop because "
7641 "the target doesn't support extract last "
7642 "reduction.\n");
7643 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7644 }
7645 else if (slp_node)
7646 {
7647 if (dump_enabled_p ())
7648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7649 "can't use a fully-masked loop because an "
7650 "SLP statement is live after the loop.\n");
7651 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7652 }
7653 else if (ncopies > 1)
7654 {
7655 if (dump_enabled_p ())
7656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7657 "can't use a fully-masked loop because"
7658 " ncopies is greater than 1.\n");
7659 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7660 }
7661 else
7662 {
7663 gcc_assert (ncopies == 1 && !slp_node);
7664 vect_record_loop_mask (loop_vinfo,
7665 &LOOP_VINFO_MASKS (loop_vinfo),
7666 1, vectype, NULL);
7667 }
7668 }
7669 return true;
7670 }
7671
7672 /* Use the lhs of the original scalar statement. */
7673 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7674
7675 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7676 : gimple_get_lhs (stmt);
7677 lhs_type = TREE_TYPE (lhs);
7678
7679 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7680 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7681 : TYPE_SIZE (TREE_TYPE (vectype)));
7682 vec_bitsize = TYPE_SIZE (vectype);
7683
7684 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7685 tree vec_lhs, bitstart;
7686 if (slp_node)
7687 {
7688 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7689
7690 /* Get the correct slp vectorized stmt. */
7691 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7692 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7693 vec_lhs = gimple_phi_result (phi);
7694 else
7695 vec_lhs = gimple_get_lhs (vec_stmt);
7696
7697 /* Get entry to use. */
7698 bitstart = bitsize_int (vec_index);
7699 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7700 }
7701 else
7702 {
7703 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7704 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7705 gcc_checking_assert (ncopies == 1
7706 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7707
7708 /* For multiple copies, get the last copy. */
7709 for (int i = 1; i < ncopies; ++i)
7710 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7711
7712 /* Get the last lane in the vector. */
7713 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7714 }
7715
7716 gimple_seq stmts = NULL;
7717 tree new_tree;
7718 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7719 {
7720 /* Emit:
7721
7722 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7723
7724 where VEC_LHS is the vectorized live-out result and MASK is
7725 the loop mask for the final iteration. */
7726 gcc_assert (ncopies == 1 && !slp_node);
7727 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7728 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7729 1, vectype, 0);
7730 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7731 scalar_type, mask, vec_lhs);
7732
7733 /* Convert the extracted vector element to the required scalar type. */
7734 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7735 }
7736 else
7737 {
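/* Extract the lane starting at BITSTART (counted in bits) from VEC_LHS
with a BIT_FIELD_REF and convert the result to the scalar type of the
original LHS. */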
7738 tree bftype = TREE_TYPE (vectype);
7739 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7740 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7741 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7742 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7743 &stmts, true, NULL_TREE);
7744 }
7745
7746 if (stmts)
7747 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7748
7749 /* Replace the use of lhs with the newly computed result. If the use stmt is a
7750 single-arg PHI, just replace all uses of the PHI result. This is necessary
7751 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
7752 use_operand_p use_p;
7753 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7754 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7755 && !is_gimple_debug (use_stmt))
7756 {
7757 if (gimple_code (use_stmt) == GIMPLE_PHI
7758 && gimple_phi_num_args (use_stmt) == 1)
7759 {
7760 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7761 }
7762 else
7763 {
7764 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7765 SET_USE (use_p, new_tree);
7766 }
7767 update_stmt (use_stmt);
7768 }
7769
7770 return true;
7771 }
7772
7773 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
7774
7775 static void
7776 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
7777 {
7778 ssa_op_iter op_iter;
7779 imm_use_iterator imm_iter;
7780 def_operand_p def_p;
7781 gimple *ustmt;
7782
7783 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
7784 {
7785 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7786 {
7787 basic_block bb;
7788
7789 if (!is_gimple_debug (ustmt))
7790 continue;
7791
7792 bb = gimple_bb (ustmt);
7793
7794 if (!flow_bb_inside_loop_p (loop, bb))
7795 {
7796 if (gimple_debug_bind_p (ustmt))
7797 {
7798 if (dump_enabled_p ())
7799 dump_printf_loc (MSG_NOTE, vect_location,
7800 "killing debug use\n");
7801
7802 gimple_debug_bind_reset_value (ustmt);
7803 update_stmt (ustmt);
7804 }
7805 else
7806 gcc_unreachable ();
7807 }
7808 }
7809 }
7810 }
7811
7812 /* Given loop represented by LOOP_VINFO, return true if computation of
7813 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7814 otherwise. */
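/* For example, if the loop IV is a 32-bit unsigned int and NITERSM1 is
0xffffffff, NITERS (= NITERSM1 + 1) wraps to 0; the widest_int compare
below then fails and we correctly report a possible overflow. */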
7815
7816 static bool
7817 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7818 {
7819 /* Constant case. */
7820 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7821 {
7822 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7823 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7824
7825 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7826 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7827 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7828 return true;
7829 }
7830
7831 widest_int max;
7832 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7833 /* Check the upper bound of loop niters. */
7834 if (get_max_loop_iterations (loop, &max))
7835 {
7836 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7837 signop sgn = TYPE_SIGN (type);
7838 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7839 if (max < type_max)
7840 return true;
7841 }
7842 return false;
7843 }
7844
7845 /* Return a mask type with half the number of elements as TYPE. */
7846
7847 tree
7848 vect_halve_mask_nunits (vec_info *vinfo, tree type)
7849 {
7850 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
7851 return build_truth_vector_type (nunits, vinfo->vector_size);
7852 }
7853
7854 /* Return a mask type with twice as many elements as TYPE. */
7855
7856 tree
7857 vect_double_mask_nunits (vec_info *vinfo, tree type)
7858 {
7859 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
7860 return build_truth_vector_type (nunits, vinfo->vector_size);
7861 }
7862
7863 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
7864 contain a sequence of NVECTORS masks that each control a vector of type
7865 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
7866 these vector masks with the vector version of SCALAR_MASK. */
7867
7868 void
7869 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
7870 unsigned int nvectors, tree vectype, tree scalar_mask)
7871 {
7872 gcc_assert (nvectors != 0);
7873 if (masks->length () < nvectors)
7874 masks->safe_grow_cleared (nvectors);
7875 rgroup_masks *rgm = &(*masks)[nvectors - 1];
7876 /* The number of scalars per iteration and the number of vectors are
7877 both compile-time constants. */
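/* For example, with NVECTORS == 2, a VECTYPE of 8 elements and a
vectorization factor of 4, every scalar iteration needs
2 * 8 / 4 == 4 mask elements. */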
7878 unsigned int nscalars_per_iter
7879 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
7880 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
7881
7882 if (scalar_mask)
7883 {
7884 scalar_cond_masked_key cond (scalar_mask, nvectors);
7885 loop_vinfo->scalar_cond_masked_set.add (cond);
7886 }
7887
7888 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
7889 {
7890 rgm->max_nscalars_per_iter = nscalars_per_iter;
7891 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
7892 }
7893 }
7894
7895 /* Given a complete set of masks MASKS, extract mask number INDEX
7896 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
7897 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
7898
7899 See the comment above vec_loop_masks for more details about the mask
7900 arrangement. */
7901
7902 tree
7903 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
7904 unsigned int nvectors, tree vectype, unsigned int index)
7905 {
7906 rgroup_masks *rgm = &(*masks)[nvectors - 1];
7907 tree mask_type = rgm->mask_type;
7908
7909 /* Populate the rgroup's mask array, if this is the first time we've
7910 used it. */
7911 if (rgm->masks.is_empty ())
7912 {
7913 rgm->masks.safe_grow_cleared (nvectors);
7914 for (unsigned int i = 0; i < nvectors; ++i)
7915 {
7916 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
7917 /* Provide a dummy definition until the real one is available. */
7918 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
7919 rgm->masks[i] = mask;
7920 }
7921 }
7922
7923 tree mask = rgm->masks[index];
7924 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
7925 TYPE_VECTOR_SUBPARTS (vectype)))
7926 {
7927 /* A loop mask for data type X can be reused for data type Y
7928 if X has N times more elements than Y and if Y's elements
7929 are N times bigger than X's. In this case each sequence
7930 of N elements in the loop mask will be all-zero or all-one.
7931 We can then view-convert the mask so that each sequence of
7932 N elements is replaced by a single element. */
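/* For example, a mask computed for eight 16-bit elements can be reused
for four 32-bit elements: each pair of mask elements is known to be
all-zero or all-one and is view-converted into one wider mask element. */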
7933 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
7934 TYPE_VECTOR_SUBPARTS (vectype)));
7935 gimple_seq seq = NULL;
7936 mask_type = build_same_sized_truth_vector_type (vectype);
7937 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
7938 if (seq)
7939 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
7940 }
7941 return mask;
7942 }
7943
7944 /* Scale profiling counters by estimation for LOOP which is vectorized
7945 by factor VF. */
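/* For example, if the vectorized loop is estimated to iterate
new_est_niter == 3 times, the exit edge gets probability 1/4 and the
header count is scaled accordingly. */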
7946
7947 static void
7948 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
7949 {
7950 edge preheader = loop_preheader_edge (loop);
7951 /* Reduce loop iterations by the vectorization factor. */
7952 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7953 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
7954
7955 if (freq_h.nonzero_p ())
7956 {
7957 profile_probability p;
7958
7959 /* Avoid dropping the loop body profile counter to 0 because of a zero
7960 count in the loop's preheader. */
7961 if (!(freq_e == profile_count::zero ()))
7962 freq_e = freq_e.force_nonzero ();
7963 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7964 scale_loop_frequencies (loop, p);
7965 }
7966
7967 edge exit_e = single_exit (loop);
7968 exit_e->probability = profile_probability::always ()
7969 .apply_scale (1, new_est_niter + 1);
7970
7971 edge exit_l = single_pred_edge (loop->latch);
7972 profile_probability prob = exit_l->probability;
7973 exit_l->probability = exit_e->probability.invert ();
7974 if (prob.initialized_p () && exit_l->probability.initialized_p ())
7975 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7976 }
7977
7978 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
7979 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
7980 stmt_vec_info. */
7981
7982 static void
7983 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7984 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
7985 {
7986 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7987 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7988
7989 if (dump_enabled_p ())
7990 dump_printf_loc (MSG_NOTE, vect_location,
7991 "------>vectorizing statement: %G", stmt_info->stmt);
7992
7993 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7994 vect_loop_kill_debug_uses (loop, stmt_info);
7995
7996 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7997 && !STMT_VINFO_LIVE_P (stmt_info))
7998 return;
7999
8000 if (STMT_VINFO_VECTYPE (stmt_info))
8001 {
8002 poly_uint64 nunits
8003 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8004 if (!STMT_SLP_TYPE (stmt_info)
8005 && maybe_ne (nunits, vf)
8006 && dump_enabled_p ())
8007 /* For SLP, VF is set according to the unrolling factor and not
8008 to the vector size, hence for SLP this print is not valid. */
8009 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8010 }
8011
8012 /* Pure SLP statements have already been vectorized. We still need
8013 to apply loop vectorization to hybrid SLP statements. */
8014 if (PURE_SLP_STMT (stmt_info))
8015 return;
8016
8017 if (dump_enabled_p ())
8018 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8019
8020 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8021 *seen_store = stmt_info;
8022 }
8023
8024 /* Helper function to pass to simplify_replace_tree to enable replacing trees
8025 in the hash_map with their corresponding values. */
8026
8027 static tree
8028 find_in_mapping (tree t, void *context)
8029 {
8030 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8031
8032 tree *value = mapping->get (t);
8033 return value ? *value : t;
8034 }
8035
8036 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8037 original loop that has now been vectorized.
8038
8039 The inits of the data_references need to be advanced with the number of
8040 iterations of the main loop. This has been computed in vect_do_peeling and
8041 is stored in parameter ADVANCE. We first restore the data_references'
8042 initial offsets to the values recorded in ORIG_DRS_INIT.
8043
8044 Since the loop_vec_info of this EPILOGUE was constructed for the original
8045 loop, its stmt_vec_infos all point to the original statements. These need
8046 to be updated to point to their corresponding copies as well as the SSA_NAMES
8047 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8048
8049 The data_references' connections also need to be updated. Their
8050 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
8051 stmt_vec_infos, their statements need to point to their corresponding copies,
8052 if they are gather loads or scatter stores then their reference needs to be
8053 updated to point to its corresponding copy and finally we set
8054 'base_misaligned' to false as we have already peeled for alignment in the
8055 prologue of the main loop. */
8056
8057 static void
8058 update_epilogue_loop_vinfo (class loop *epilogue, tree advance,
8059 drs_init_vec &orig_drs_init)
8060 {
8061 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8062 auto_vec<gimple *> stmt_worklist;
8063 hash_map<tree,tree> mapping;
8064 gimple *orig_stmt, *new_stmt;
8065 gimple_stmt_iterator epilogue_gsi;
8066 gphi_iterator epilogue_phi_gsi;
8067 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8068 basic_block *epilogue_bbs = get_loop_body (epilogue);
8069
8070 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8071
8072 /* Restore the original data_references' offsets, from before the previous
8073 loop and its prologue. */
8074 std::pair<data_reference*, tree> *dr_init;
8075 unsigned i;
8076 for (i = 0; orig_drs_init.iterate (i, &dr_init); i++)
8077 DR_OFFSET (dr_init->first) = dr_init->second;
8078
8079 /* Advance data_reference's with the number of iterations of the previous
8080 loop and its prologue. */
8081 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8082
8083
8084 /* The EPILOGUE loop is a copy of the original loop so they share the same
8085 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8086 point to the copied statements. We also create a mapping of all LHSs in
8087 the original loop and all the LHSs in the EPILOGUE and create worklists to
8088 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8089 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8090 {
8091 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8092 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8093 {
8094 new_stmt = epilogue_phi_gsi.phi ();
8095
8096 gcc_assert (gimple_uid (new_stmt) > 0);
8097 stmt_vinfo
8098 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8099
8100 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8101 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8102
8103 mapping.put (gimple_phi_result (orig_stmt),
8104 gimple_phi_result (new_stmt));
8105 /* PHI nodes cannot have patterns or related statements. */
8106 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8107 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8108 }
8109
8110 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8111 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8112 {
8113 new_stmt = gsi_stmt (epilogue_gsi);
8114
8115 gcc_assert (gimple_uid (new_stmt) > 0);
8116 stmt_vinfo
8117 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8118
8119 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8120 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8121
8122 if (tree old_lhs = gimple_get_lhs (orig_stmt))
8123 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8124
8125 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8126 {
8127 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8128 for (gimple_stmt_iterator gsi = gsi_start (seq);
8129 !gsi_end_p (gsi); gsi_next (&gsi))
8130 stmt_worklist.safe_push (gsi_stmt (gsi));
8131 }
8132
8133 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8134 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8135 {
8136 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8137 stmt_worklist.safe_push (stmt);
8138 /* Set BB such that the assert in
8139 'get_initial_def_for_reduction' is able to determine that
8140 the BB of the related stmt is inside this loop. */
8141 gimple_set_bb (stmt,
8142 gimple_bb (new_stmt));
8143 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8144 gcc_assert (related_vinfo == NULL
8145 || related_vinfo == stmt_vinfo);
8146 }
8147 }
8148 }
8149
8150 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8151 using the original main loop and thus need to be updated to refer to the
8152 cloned variables used in the epilogue. */
8153 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8154 {
8155 gimple *stmt = stmt_worklist[i];
8156 tree *new_op;
8157
8158 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8159 {
8160 tree op = gimple_op (stmt, j);
8161 if ((new_op = mapping.get(op)))
8162 gimple_set_op (stmt, j, *new_op);
8163 else
8164 {
8165 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8166 &find_in_mapping, &mapping);
8167 gimple_set_op (stmt, j, op);
8168 }
8169 }
8170 }
8171
8172 struct data_reference *dr;
8173 vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
8174 FOR_EACH_VEC_ELT (datarefs, i, dr)
8175 {
8176 orig_stmt = DR_STMT (dr);
8177 gcc_assert (gimple_uid (orig_stmt) > 0);
8178 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8179 /* Data references for gather loads and scatter stores do not use the
8180 updated offset we set using ADVANCE. Instead we have to make sure the
8181 references in the data references point to the corresponding copies of
8182 the originals in the epilogue. */
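/* Illustration (made-up GIMPLE): if DR_REF of a gather load is a[_7] and
   _7 is an SSA name defined in the main loop, the replacement below
   rewrites the reference to use _7's epilogue copy recorded in MAPPING.  */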
8183 if (STMT_VINFO_GATHER_SCATTER_P (stmt_vinfo))
8184 {
8185 DR_REF (dr)
8186 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8187 &find_in_mapping, &mapping);
8188 DR_BASE_ADDRESS (dr)
8189 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8190 &find_in_mapping, &mapping);
8191 }
8192 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8193 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8194 /* The vector size of the epilogue is smaller than that of the main loop,
8195 so the alignment requirement is either the same or lower. This means
8196 the DR will by definition be aligned. */
8197 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8198 }
8199
8200 epilogue_vinfo->shared->datarefs_copy.release ();
8201 epilogue_vinfo->shared->save_datarefs ();
8202 }
8203
8204 /* Function vect_transform_loop.
8205
8206 The analysis phase has determined that the loop is vectorizable.
8207 Vectorize the loop - create vectorized stmts to replace the scalar
8208 stmts in the loop, and update the loop exit condition.
8209 Returns the scalar epilogue loop, if any. */
8210
8211 class loop *
8212 vect_transform_loop (loop_vec_info loop_vinfo)
8213 {
8214 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8215 class loop *epilogue = NULL;
8216 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8217 int nbbs = loop->num_nodes;
8218 int i;
8219 tree niters_vector = NULL_TREE;
8220 tree step_vector = NULL_TREE;
8221 tree niters_vector_mult_vf = NULL_TREE;
8222 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8223 unsigned int lowest_vf = constant_lower_bound (vf);
8224 gimple *stmt;
8225 bool check_profitability = false;
8226 unsigned int th;
8227
8228 DUMP_VECT_SCOPE ("vec_transform_loop");
8229
8230 loop_vinfo->shared->check_datarefs ();
8231
8232 /* Use the more conservative vectorization threshold. If the number
8233 of iterations is constant assume the cost check has been performed
8234 by our caller. If the threshold makes all loops profitable that
8235 run at least the (estimated) vectorization factor number of times,
8236 checking is pointless, too. */
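/* For example (illustrative numbers): with a cost-model threshold of 10,
   an estimated vectorization factor of 4 and an unknown trip count, we
   cannot tell statically whether the loop runs the 10 iterations needed to
   be profitable, so CHECK_PROFITABILITY is set and the runtime check is
   folded into the versioning or peeling guards created below.  With a
   threshold of 3 no check is needed, since any loop entering the vector
   code runs at least ~4 scalar iterations anyway.  */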
8237 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8238 if (th >= vect_vf_for_cost (loop_vinfo)
8239 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8240 {
8241 if (dump_enabled_p ())
8242 dump_printf_loc (MSG_NOTE, vect_location,
8243 "Profitability threshold is %d loop iterations.\n",
8244 th);
8245 check_profitability = true;
8246 }
8247
8248 /* Make sure there exists a single-predecessor exit bb. Do this before
8249 versioning. */
8250 edge e = single_exit (loop);
8251 if (! single_pred_p (e->dest))
8252 {
8253 split_loop_exit_edge (e, true);
8254 if (dump_enabled_p ())
8255 dump_printf (MSG_NOTE, "split exit edge\n");
8256 }
8257
8258 /* Version the loop first, if required, so the profitability check
8259 comes first. */
8260
8261 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8262 {
8263 class loop *sloop
8264 = vect_loop_versioning (loop_vinfo);
8265 sloop->force_vectorize = false;
8266 check_profitability = false;
8267 }
8268
8269 /* Make sure there exists a single-predecessor exit bb also on the
8270 scalar loop copy. Do this after versioning but before peeling,
8271 so the CFG structure is fine for both the scalar and the if-converted
8272 loop, letting slpeel_duplicate_current_defs_from_edges see matched
8273 loop-closed PHI nodes on the exit. */
8274 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8275 {
8276 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8277 if (! single_pred_p (e->dest))
8278 {
8279 split_loop_exit_edge (e, true);
8280 if (dump_enabled_p ())
8281 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8282 }
8283 }
8284
8285 tree niters = vect_build_loop_niters (loop_vinfo);
8286 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8287 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8288 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8289 tree advance;
8290 drs_init_vec orig_drs_init;
8291
8292 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8293 &step_vector, &niters_vector_mult_vf, th,
8294 check_profitability, niters_no_overflow,
8295 &advance, orig_drs_init);
8296
8297 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8298 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8299 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8300 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8301
8302 if (niters_vector == NULL_TREE)
8303 {
8304 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8305 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8306 && known_eq (lowest_vf, vf))
8307 {
8308 niters_vector
8309 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8310 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8311 step_vector = build_one_cst (TREE_TYPE (niters));
8312 }
8313 else
8314 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8315 &step_vector, niters_no_overflow);
8316 }
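/* Worked example (illustrative): with a known NITERS of 1003 and a constant
   VF of 4, the code above sets NITERS_VECTOR to 1003 / 4 = 250 and
   STEP_VECTOR to 1; the remaining 3 scalar iterations are handled by the
   epilogue loop created by vect_do_peeling above.  */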
8317
8318 /* 1) Make sure the loop header has exactly two entries
8319 2) Make sure we have a preheader basic block. */
8320
8321 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8322
8323 split_edge (loop_preheader_edge (loop));
8324
8325 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8326 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8327 /* This will deal with any possible peeling. */
8328 vect_prepare_for_masked_peels (loop_vinfo);
8329
8330 /* Schedule the SLP instances first, then handle loop vectorization
8331 below. */
8332 if (!loop_vinfo->slp_instances.is_empty ())
8333 {
8334 DUMP_VECT_SCOPE ("scheduling SLP instances");
8335 vect_schedule_slp (loop_vinfo);
8336 }
8337
8338 /* FORNOW: the vectorizer supports only loops whose body consists
8339 of one basic block (header + empty latch). When the vectorizer
8340 supports more involved loop forms, the order in which the BBs are
8341 traversed will need to be reconsidered. */
8342
8343 for (i = 0; i < nbbs; i++)
8344 {
8345 basic_block bb = bbs[i];
8346 stmt_vec_info stmt_info;
8347
8348 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8349 gsi_next (&si))
8350 {
8351 gphi *phi = si.phi ();
8352 if (dump_enabled_p ())
8353 dump_printf_loc (MSG_NOTE, vect_location,
8354 "------>vectorizing phi: %G", phi);
8355 stmt_info = loop_vinfo->lookup_stmt (phi);
8356 if (!stmt_info)
8357 continue;
8358
8359 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8360 vect_loop_kill_debug_uses (loop, stmt_info);
8361
8362 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8363 && !STMT_VINFO_LIVE_P (stmt_info))
8364 continue;
8365
8366 if (STMT_VINFO_VECTYPE (stmt_info)
8367 && (maybe_ne
8368 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8369 && dump_enabled_p ())
8370 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8371
8372 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8373 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8374 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8375 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8376 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8377 && ! PURE_SLP_STMT (stmt_info))
8378 {
8379 if (dump_enabled_p ())
8380 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8381 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8382 }
8383 }
8384
8385 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8386 !gsi_end_p (si);)
8387 {
8388 stmt = gsi_stmt (si);
8389 /* During vectorization remove existing clobber stmts. */
8390 if (gimple_clobber_p (stmt))
8391 {
8392 unlink_stmt_vdef (stmt);
8393 gsi_remove (&si, true);
8394 release_defs (stmt);
8395 }
8396 else
8397 {
8398 stmt_info = loop_vinfo->lookup_stmt (stmt);
8399
8400 /* vector stmts created in the outer-loop during vectorization of
8401 stmts in an inner-loop may not have a stmt_info, and do not
8402 need to be vectorized. */
8403 stmt_vec_info seen_store = NULL;
8404 if (stmt_info)
8405 {
8406 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8407 {
8408 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8409 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8410 !gsi_end_p (subsi); gsi_next (&subsi))
8411 {
8412 stmt_vec_info pat_stmt_info
8413 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8414 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8415 &si, &seen_store);
8416 }
8417 stmt_vec_info pat_stmt_info
8418 = STMT_VINFO_RELATED_STMT (stmt_info);
8419 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8420 &seen_store);
8421 }
8422 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8423 &seen_store);
8424 }
8425 gsi_next (&si);
8426 if (seen_store)
8427 {
8428 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8429 /* Interleaving. The vectorization of the
8430 interleaving chain was completed - free
8431 all the stores in the chain. */
8432 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8433 else
8434 /* Free the attached stmt_vec_info and remove the stmt. */
8435 loop_vinfo->remove_stmt (stmt_info);
8436 }
8437 }
8438 }
8439
8440 /* Stub out scalar statements that must not survive vectorization.
8441 Doing this here helps with grouped statements, or statements that
8442 are involved in patterns. */
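/* For example (made-up GIMPLE): a scalar statement such as
     _5 = .MASK_LOAD (ptr_1, 8B, mask_2);
   whose LHS is not a vector is replaced by the dummy assignment
     _5 = 0;
   so that no scalar .MASK_LOAD call survives vectorization.  */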
8443 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8444 !gsi_end_p (gsi); gsi_next (&gsi))
8445 {
8446 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8447 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8448 {
8449 tree lhs = gimple_get_lhs (call);
8450 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8451 {
8452 tree zero = build_zero_cst (TREE_TYPE (lhs));
8453 gimple *new_stmt = gimple_build_assign (lhs, zero);
8454 gsi_replace (&gsi, new_stmt, true);
8455 }
8456 }
8457 }
8458 } /* BBs in loop */
8459
8460 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8461 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8462 if (integer_onep (step_vector))
8463 niters_no_overflow = true;
8464 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8465 niters_vector_mult_vf, !niters_no_overflow);
8466
8467 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8468 scale_profile_for_vect_loop (loop, assumed_vf);
8469
8470 /* True if the final iteration might not handle a full vector's
8471 worth of scalar iterations. */
8472 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8473 /* The minimum number of iterations performed by the epilogue. This
8474 is 1 when peeling for gaps because we always need a final scalar
8475 iteration. */
8476 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8477 /* +1 to convert latch counts to loop iteration counts,
8478 -min_epilogue_iters to remove iterations that cannot be performed
8479 by the vector code. */
8480 int bias_for_lowest = 1 - min_epilogue_iters;
8481 int bias_for_assumed = bias_for_lowest;
8482 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8483 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8484 {
8485 /* When the amount of peeling is known at compile time, the first
8486 iteration will have exactly alignment_npeels active elements.
8487 In the worst case it will have at least one. */
8488 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8489 bias_for_lowest += lowest_vf - min_first_active;
8490 bias_for_assumed += assumed_vf - min_first_active;
8491 }
8492 /* In these calculations the "- 1" converts loop iteration counts
8493 back to latch counts. */
8494 if (loop->any_upper_bound)
8495 loop->nb_iterations_upper_bound
8496 = (final_iter_may_be_partial
8497 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8498 lowest_vf) - 1
8499 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8500 lowest_vf) - 1);
8501 if (loop->any_likely_upper_bound)
8502 loop->nb_iterations_likely_upper_bound
8503 = (final_iter_may_be_partial
8504 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8505 + bias_for_lowest, lowest_vf) - 1
8506 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8507 + bias_for_lowest, lowest_vf) - 1);
8508 if (loop->any_estimate)
8509 loop->nb_iterations_estimate
8510 = (final_iter_may_be_partial
8511 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8512 assumed_vf) - 1
8513 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8514 assumed_vf) - 1);
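/* Worked example (illustrative numbers): if the scalar loop had an upper
   bound of 999 latch iterations (at most 1000 iterations), LOWEST_VF is 4
   and MIN_EPILOGUE_ITERS is 0, then BIAS_FOR_LOWEST is 1 and the new bound
   is (999 + 1) / 4 - 1 = 249 latch iterations, i.e. at most 250 iterations
   of the vector loop.  */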
8515
8516 if (dump_enabled_p ())
8517 {
8518 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8519 {
8520 dump_printf_loc (MSG_NOTE, vect_location,
8521 "LOOP VECTORIZED\n");
8522 if (loop->inner)
8523 dump_printf_loc (MSG_NOTE, vect_location,
8524 "OUTER LOOP VECTORIZED\n");
8525 dump_printf (MSG_NOTE, "\n");
8526 }
8527 else
8528 {
8529 dump_printf_loc (MSG_NOTE, vect_location,
8530 "LOOP EPILOGUE VECTORIZED (VS=");
8531 dump_dec (MSG_NOTE, loop_vinfo->vector_size);
8532 dump_printf (MSG_NOTE, ")\n");
8533 }
8534 }
8535
8536 /* Loops vectorized with a variable factor won't benefit from
8537 unrolling/peeling. */
8538 if (!vf.is_constant ())
8539 {
8540 loop->unroll = 1;
8541 if (dump_enabled_p ())
8542 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8543 " variable-length vectorization factor\n");
8544 }
8545 /* Free SLP instances here because otherwise stmt reference counting
8546 won't work. */
8547 slp_instance instance;
8548 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8549 vect_free_slp_instance (instance, true);
8550 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8551 /* Clear the safelen field since its value is invalid after vectorization,
8552 as the vectorized loop can have loop-carried dependencies. */
8553 loop->safelen = 0;
8554
8555 if (epilogue)
8556 {
8557 update_epilogue_loop_vinfo (epilogue, advance, orig_drs_init);
8558
8559 epilogue->simduid = loop->simduid;
8560 epilogue->force_vectorize = loop->force_vectorize;
8561 epilogue->safelen = loop->safelen;
8562 epilogue->dont_vectorize = false;
8563 }
8564
8565 return epilogue;
8566 }
8567
8568 /* The code below tries to perform a simple optimization - reverting
8569 if-conversion for masked stores: if the mask of a store is zero, do not
8570 perform the store and, if possible, skip the producers of the stored values.
8571 For example,
8572 for (i=0; i<n; i++)
8573 if (c[i])
8574 {
8575 p1[i] += 1;
8576 p2[i] = p3[i] +2;
8577 }
8578 this transformation will produce the following semi-hammock:
8579
8580 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8581 {
8582 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8583 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8584 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8585 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8586 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8587 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8588 }
8589 */
8590
8591 void
8592 optimize_mask_stores (class loop *loop)
8593 {
8594 basic_block *bbs = get_loop_body (loop);
8595 unsigned nbbs = loop->num_nodes;
8596 unsigned i;
8597 basic_block bb;
8598 class loop *bb_loop;
8599 gimple_stmt_iterator gsi;
8600 gimple *stmt;
8601 auto_vec<gimple *> worklist;
8602 auto_purge_vect_location sentinel;
8603
8604 vect_location = find_loop_location (loop);
8605 /* Pick up all masked stores in loop if any. */
8606 for (i = 0; i < nbbs; i++)
8607 {
8608 bb = bbs[i];
8609 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8610 gsi_next (&gsi))
8611 {
8612 stmt = gsi_stmt (gsi);
8613 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8614 worklist.safe_push (stmt);
8615 }
8616 }
8617
8618 free (bbs);
8619 if (worklist.is_empty ())
8620 return;
8621
8622 /* Loop has masked stores. */
8623 while (!worklist.is_empty ())
8624 {
8625 gimple *last, *last_store;
8626 edge e, efalse;
8627 tree mask;
8628 basic_block store_bb, join_bb;
8629 gimple_stmt_iterator gsi_to;
8630 tree vdef, new_vdef;
8631 gphi *phi;
8632 tree vectype;
8633 tree zero;
8634
8635 last = worklist.pop ();
8636 mask = gimple_call_arg (last, 2);
8637 bb = gimple_bb (last);
8638 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
8639 to the same loop as if_bb, which can be different from LOOP when a
8640 two-level loop nest is vectorized and the mask store belongs to the
8641 inner one. */
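/* Illustrative shape of the resulting CFG (using the names in the code
   below):

       bb:  ... if (mask == { 0, ... })
        | \
        |  store_bb   <- the masked stores (and, where possible, their
        | /               value producers) are sunk here
       join_bb:  rest of the original block

   The true edge of the comparison skips STORE_BB entirely when the mask
   is all zero.  */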
8642 e = split_block (bb, last);
8643 bb_loop = bb->loop_father;
8644 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8645 join_bb = e->dest;
8646 store_bb = create_empty_bb (bb);
8647 add_bb_to_loop (store_bb, bb_loop);
8648 e->flags = EDGE_TRUE_VALUE;
8649 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8650 /* Put STORE_BB to likely part. */
8651 efalse->probability = profile_probability::unlikely ();
8652 store_bb->count = efalse->count ();
8653 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8654 if (dom_info_available_p (CDI_DOMINATORS))
8655 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8656 if (dump_enabled_p ())
8657 dump_printf_loc (MSG_NOTE, vect_location,
8658 "Create new block %d to sink mask stores.",
8659 store_bb->index);
8660 /* Create vector comparison with boolean result. */
8661 vectype = TREE_TYPE (mask);
8662 zero = build_zero_cst (vectype);
8663 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8664 gsi = gsi_last_bb (bb);
8665 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8666 /* Create new PHI node for vdef of the last masked store:
8667 .MEM_2 = VDEF <.MEM_1>
8668 will be converted to
8669 .MEM.3 = VDEF <.MEM_1>
8670 and new PHI node will be created in join bb
8671 .MEM_2 = PHI <.MEM_1, .MEM_3>
8672 */
8673 vdef = gimple_vdef (last);
8674 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8675 gimple_set_vdef (last, new_vdef);
8676 phi = create_phi_node (vdef, join_bb);
8677 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8678
8679 /* Put all masked stores with the same mask to STORE_BB if possible. */
8680 while (true)
8681 {
8682 gimple_stmt_iterator gsi_from;
8683 gimple *stmt1 = NULL;
8684
8685 /* Move masked store to STORE_BB. */
8686 last_store = last;
8687 gsi = gsi_for_stmt (last);
8688 gsi_from = gsi;
8689 /* Shift GSI to the previous stmt for further traversal. */
8690 gsi_prev (&gsi);
8691 gsi_to = gsi_start_bb (store_bb);
8692 gsi_move_before (&gsi_from, &gsi_to);
8693 /* Set GSI_TO to the start of the now non-empty STORE_BB. */
8694 gsi_to = gsi_start_bb (store_bb);
8695 if (dump_enabled_p ())
8696 dump_printf_loc (MSG_NOTE, vect_location,
8697 "Move stmt to created bb\n%G", last);
8698 /* Move all stored value producers if possible. */
8699 while (!gsi_end_p (gsi))
8700 {
8701 tree lhs;
8702 imm_use_iterator imm_iter;
8703 use_operand_p use_p;
8704 bool res;
8705
8706 /* Skip debug statements. */
8707 if (is_gimple_debug (gsi_stmt (gsi)))
8708 {
8709 gsi_prev (&gsi);
8710 continue;
8711 }
8712 stmt1 = gsi_stmt (gsi);
8713 /* Do not consider statements writing to memory or having a
8714 volatile operand. */
8715 if (gimple_vdef (stmt1)
8716 || gimple_has_volatile_ops (stmt1))
8717 break;
8718 gsi_from = gsi;
8719 gsi_prev (&gsi);
8720 lhs = gimple_get_lhs (stmt1);
8721 if (!lhs)
8722 break;
8723
8724 /* LHS of vectorized stmt must be SSA_NAME. */
8725 if (TREE_CODE (lhs) != SSA_NAME)
8726 break;
8727
8728 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8729 {
8730 /* Remove dead scalar statement. */
8731 if (has_zero_uses (lhs))
8732 {
8733 gsi_remove (&gsi_from, true);
8734 continue;
8735 }
8736 }
8737
8738 /* Check that LHS does not have uses outside of STORE_BB. */
8739 res = true;
8740 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8741 {
8742 gimple *use_stmt;
8743 use_stmt = USE_STMT (use_p);
8744 if (is_gimple_debug (use_stmt))
8745 continue;
8746 if (gimple_bb (use_stmt) != store_bb)
8747 {
8748 res = false;
8749 break;
8750 }
8751 }
8752 if (!res)
8753 break;
8754
8755 if (gimple_vuse (stmt1)
8756 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8757 break;
8758
8759 /* Can move STMT1 to STORE_BB. */
8760 if (dump_enabled_p ())
8761 dump_printf_loc (MSG_NOTE, vect_location,
8762 "Move stmt to created bb\n%G", stmt1);
8763 gsi_move_before (&gsi_from, &gsi_to);
8764 /* Shift GSI_TO for further insertion. */
8765 gsi_prev (&gsi_to);
8766 }
8767 /* Put other masked stores with the same mask to STORE_BB. */
8768 if (worklist.is_empty ()
8769 || gimple_call_arg (worklist.last (), 2) != mask
8770 || worklist.last () != stmt1)
8771 break;
8772 last = worklist.pop ();
8773 }
8774 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8775 }
8776 }
8777
8778 /* Decide whether it is possible to use a zero-based induction variable
8779 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
8780 return the value that the induction variable must be able to hold
8781 in order to ensure that the loop ends with an all-false mask.
8782 Return -1 otherwise. */
8783 widest_int
8784 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
8785 {
8786 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8787 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8788 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
8789
8790 /* Calculate the value that the induction variable must be able
8791 to hit in order to ensure that we end the loop with an all-false mask.
8792 This involves adding the maximum number of inactive trailing scalar
8793 iterations. */
8794 widest_int iv_limit = -1;
8795 if (max_loop_iterations (loop, &iv_limit))
8796 {
8797 if (niters_skip)
8798 {
8799 /* Add the maximum number of skipped iterations to the
8800 maximum iteration count. */
8801 if (TREE_CODE (niters_skip) == INTEGER_CST)
8802 iv_limit += wi::to_widest (niters_skip);
8803 else
8804 iv_limit += max_vf - 1;
8805 }
8806 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
8807 /* Make a conservatively-correct assumption. */
8808 iv_limit += max_vf - 1;
8809
8810 /* IV_LIMIT is the maximum number of latch iterations, which is also
8811 the maximum in-range IV value. Round this value down to the previous
8812 vector alignment boundary and then add an extra full iteration. */
8813 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8814 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
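/* Worked example (illustrative): with VF = 4 (whose known alignment is 4),
   MAX_VF = 4 and a maximum of 1002 latch iterations, this computes
   (1002 & -4) + 4 = 1000 + 4 = 1004 as the largest value the IV must be
   able to hold.  */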
8815 }
8816 return iv_limit;
8817 }
8818