1 /* Loop Vectorization
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
56
57 /* Loop Vectorization Pass.
58
59 This pass tries to vectorize loops.
60
61 For example, the vectorizer transforms the following simple loop:
62
63 short a[N]; short b[N]; short c[N]; int i;
64
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
67 }
68
69 as if it was manually vectorized by rewriting the source code into:
70
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
75
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
81 }
82
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
94
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
100
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
105
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
116
117 For example, say stmt S1 was vectorized into stmt VS1:
118
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
122
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
127
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
132
133 Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
135
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different sizes of vectors will, for now, need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
143
144 Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
150
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
153 */
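
/* For illustration only (not code used by this pass; the names below are
   made up): the manually "vectorized" form sketched above can also be
   written with GCC's generic vector extensions.  This sketch assumes a
   target with 128-bit vectors and N a multiple of 8; the vectorizer itself
   additionally handles remainders, aliasing and alignment, which are
   ignored here.

     #define N 1024
     typedef short v8hi __attribute__ ((vector_size (16)));   // 8 x short
     short a[N], b[N], c[N];

     void
     add_vectorized (void)
     {
       v8hi *pa = (v8hi *) a, *pb = (v8hi *) b, *pc = (v8hi *) c;
       for (int i = 0; i < N / 8; i++)
         pa[i] = pb[i] + pc[i];     // one 8-wide addition per iteration
     }
   */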
154
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
158
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
162
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
185
186 if (stmt_vectype)
187 {
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case in which a vectype has already been set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return opt_result::success ();
203 }
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
209
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
213 {
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
220
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
223 {
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
226
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
230 {
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
239 }
240
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
248 }
249
250 return opt_result::success ();
251 }
252
253 /* Function vect_determine_vectorization_factor
254
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4-byte elements,
258 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
259 elements can fit in a single vector register.
260
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
265
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
270 }
271
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
275 }
276 */
277
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
280 {
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
290
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
292
293 for (i = 0; i < nbbs; i++)
294 {
295 basic_block bb = bbs[i];
296
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
299 {
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
305
306 gcc_assert (stmt_info);
307
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
310 {
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
313
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
318
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
326
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
330
331 if (dump_enabled_p ())
332 {
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
336 }
337
338 vect_update_max_nunits (&vectorization_factor, vectype);
339 }
340 }
341
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
344 {
345 if (is_gimple_debug (gsi_stmt (si)))
346 continue;
347 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
348 opt_result res
349 = vect_determine_vf_for_stmt (loop_vinfo,
350 stmt_info, &vectorization_factor);
351 if (!res)
352 return res;
353 }
354 }
355
356 /* TODO: Analyze cost. Decide if worth while to vectorize. */
357 if (dump_enabled_p ())
358 {
359 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
360 dump_dec (MSG_NOTE, vectorization_factor);
361 dump_printf (MSG_NOTE, "\n");
362 }
363
364 if (known_le (vectorization_factor, 1U))
365 return opt_result::failure_at (vect_location,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
368 return opt_result::success ();
369 }
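
/* A concrete sketch of the strip-mining described above, written as plain C
   (illustrative only; the names are made up).  Assume 4-byte ints and
   16-byte vectors, so VF == 4.  The inner j-loop stands in for a single
   vector statement, and the trailing loop is the scalar epilogue handling
   the n % 4 leftover iterations (peeling and partial vectors are decided
   later, not in this function).

     void
     add (int *restrict a, int *restrict b, int *restrict c, int n)
     {
       int i;
       for (i = 0; i + 4 <= n; i += 4)   // vectorized body, VF == 4
         for (int j = 0; j < 4; j++)     // conceptually one vector add
           a[i + j] = b[i + j] + c[i + j];
       for (; i < n; i++)                // scalar epilogue
         a[i] = b[i] + c[i];
     }
   */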
370
371
372 /* Function vect_is_simple_iv_evolution.
373
374 FORNOW: A simple evolution of an induction variable in the loop is
375 considered a polynomial evolution. */
376
377 static bool
378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
379 tree * step)
380 {
381 tree init_expr;
382 tree step_expr;
383 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
384 basic_block bb;
385
386 /* When there is no evolution in this loop, the evolution function
387 is not "simple". */
388 if (evolution_part == NULL_TREE)
389 return false;
390
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part))
394 return false;
395
396 step_expr = evolution_part;
397 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
398
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
401 step_expr, init_expr);
402
403 *init = init_expr;
404 *step = step_expr;
405
406 if (TREE_CODE (step_expr) != INTEGER_CST
407 && (TREE_CODE (step_expr) != SSA_NAME
408 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
409 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
412 || !flag_associative_math)))
413 && (TREE_CODE (step_expr) != REAL_CST
414 || !flag_associative_math))
415 {
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "step unknown.\n");
419 return false;
420 }
421
422 return true;
423 }
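
/* Example of the distinction made above, using the usual chrec dump
   notation (a sketch, not dump output copied verbatim).  For

     for (i = 0; i < n; i++)
       p = p + 4;

   analyze_scalar_evolution gives p an access function of the form
   {p_0, +, 4}_1, whose evolution part is the constant 4, so the function
   above returns true with *INIT == p_0 and *STEP == 4.  For p = p + i the
   step itself is a chrec ({0, +, 1}_1), i.e. the evolution has degree >= 2,
   and the tree_is_chrec check rejects it.  */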
424
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
428
429 outer1:
430 x_1 = PHI <x_4(outer2), ...>;
431 ...
432
433 inner:
434 x_2 = PHI <x_1(outer1), ...>;
435 ...
436 x_3 = ...;
437 ...
438
439 outer2:
440 x_4 = PHI <x_3(inner)>;
441 ...
442
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
445
446 static bool
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
448 {
449 use_operand_p use_p;
450 ssa_op_iter op_iter;
451 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
452 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
453 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
454 return true;
455 return false;
456 }
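
/* Source-level sketch of the double reduction shape spelled out in SSA form
   above (illustrative):

     sum = 0;
     for (i = 0; i < N; i++)        // outer loop
       for (j = 0; j < M; j++)      // inner loop
         sum += a[i][j];

   When the outer loop is analyzed, its header PHI for sum plays the role of
   x_1 (a vect_double_reduction_def) and the inner-loop header PHI plays the
   role of x_2, which is what this predicate recognizes.  */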
457
458 /* Function vect_analyze_scalar_cycles_1.
459
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
463 enclosing LOOP). */
464
465 static void
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
467 {
468 basic_block bb = loop->header;
469 tree init, step;
470 auto_vec<stmt_vec_info, 64> worklist;
471 gphi_iterator gsi;
472 bool double_reduc, reduc_chain;
473
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
475
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified; therefore, this order must not be
478 changed. */
479 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
480 {
481 gphi *phi = gsi.phi ();
482 tree access_fn = NULL;
483 tree def = PHI_RESULT (phi);
484 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
485
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
488
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def))
492 continue;
493
494 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
495
496 /* Analyze the evolution function. */
497 access_fn = analyze_scalar_evolution (loop, def);
498 if (access_fn)
499 {
500 STRIP_NOPS (access_fn);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location,
503 "Access function of PHI: %T\n", access_fn);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
505 = initial_condition_in_loop_num (access_fn, loop->num);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
507 = evolution_part_in_loop_num (access_fn, loop->num);
508 }
509
510 if (!access_fn
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
512 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
513 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
514 && TREE_CODE (step) != INTEGER_CST))
515 {
516 worklist.safe_push (stmt_vinfo);
517 continue;
518 }
519
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
521 != NULL_TREE);
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
523
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
527 }
528
529
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist.length () > 0)
532 {
533 stmt_vec_info stmt_vinfo = worklist.pop ();
534 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
535 tree def = PHI_RESULT (phi);
536
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
539
540 gcc_assert (!virtual_operand_p (def)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
542
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
545 &reduc_chain);
546 if (reduc_stmt_info)
547 {
548 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
550 if (double_reduc)
551 {
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "Detected double reduction.\n");
555
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
558 }
559 else
560 {
561 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
562 {
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE, vect_location,
565 "Detected vectorizable nested cycle.\n");
566
567 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
568 }
569 else
570 {
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected reduction.\n");
574
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
577 /* Store the reduction cycles for possible vectorization in
578 loop-aware SLP if it was not detected as reduction
579 chain. */
580 if (! reduc_chain)
581 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
582 (reduc_stmt_info);
583 }
584 }
585 }
586 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 "Unknown def-use cycle pattern.\n");
590 }
591 }
592
593
594 /* Function vect_analyze_scalar_cycles.
595
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
599 We do that for the loop represented by LOOP_VINFO, and also for its
600 inner-loop, if one exists.
601 Examples for scalar cycles:
602
603 Example1: reduction:
604
605 loop1:
606 for (i=0; i<N; i++)
607 sum += a[i];
608
609 Example2: induction:
610
611 loop2:
612 for (i=0; i<N; i++)
613 a[i] = i; */
614
615 static void
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
617 {
618 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
619
620 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
621
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623 Reductions in such an inner-loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
627 vectorizing them.
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
630
631 if (loop->inner)
632 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
633 }
634
635 /* Transfer group and reduction information from STMT_INFO to its
636 pattern stmt. */
637
638 static void
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
640 {
641 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
642 stmt_vec_info stmtp;
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
645 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
646 do
647 {
648 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
650 == STMT_VINFO_DEF_TYPE (stmt_info));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
652 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
653 if (stmt_info)
654 REDUC_GROUP_NEXT_ELEMENT (stmtp)
655 = STMT_VINFO_RELATED_STMT (stmt_info);
656 }
657 while (stmt_info);
658 }
659
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
661
662 static void
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
664 {
665 stmt_vec_info first;
666 unsigned i;
667
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
669 {
670 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
671 while (next)
672 {
673 if ((STMT_VINFO_IN_PATTERN_P (next)
674 != STMT_VINFO_IN_PATTERN_P (first))
675 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
676 break;
677 next = REDUC_GROUP_NEXT_ELEMENT (next);
678 }
679 /* If all reduction chain members are well-formed patterns adjust
680 the group to group the pattern stmts instead. */
681 if (! next
682 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
683 {
684 if (STMT_VINFO_IN_PATTERN_P (first))
685 {
686 vect_fixup_reduc_chain (first);
687 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
688 = STMT_VINFO_RELATED_STMT (first);
689 }
690 }
691 /* If not all stmt in the chain are patterns or if we failed
692 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
693 it as regular reduction instead. */
694 else
695 {
696 stmt_vec_info vinfo = first;
697 stmt_vec_info last = NULL;
698 while (vinfo)
699 {
700 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
701 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
702 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
703 last = vinfo;
704 vinfo = next;
705 }
706 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
707 = vect_internal_def;
708 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
709 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
710 --i;
711 }
712 }
713 }
714
715 /* Function vect_get_loop_niters.
716
717 Determine how many times the loop is executed and place that count
718 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
719 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
720 niter information holds in ASSUMPTIONS.
721
722 Return the loop exit condition. */
723
724
725 static gcond *
726 vect_get_loop_niters (class loop *loop, tree *assumptions,
727 tree *number_of_iterations, tree *number_of_iterationsm1)
728 {
729 edge exit = single_exit (loop);
730 class tree_niter_desc niter_desc;
731 tree niter_assumptions, niter, may_be_zero;
732 gcond *cond = get_loop_exit_condition (loop);
733
734 *assumptions = boolean_true_node;
735 *number_of_iterationsm1 = chrec_dont_know;
736 *number_of_iterations = chrec_dont_know;
737 DUMP_VECT_SCOPE ("get_loop_niters");
738
739 if (!exit)
740 return cond;
741
742 may_be_zero = NULL_TREE;
743 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
744 || chrec_contains_undetermined (niter_desc.niter))
745 return cond;
746
747 niter_assumptions = niter_desc.assumptions;
748 may_be_zero = niter_desc.may_be_zero;
749 niter = niter_desc.niter;
750
751 if (may_be_zero && integer_zerop (may_be_zero))
752 may_be_zero = NULL_TREE;
753
754 if (may_be_zero)
755 {
756 if (COMPARISON_CLASS_P (may_be_zero))
757 {
758 /* Try to combine may_be_zero with assumptions; this can simplify
759 the computation of the niter expression. */
760 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
761 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
762 niter_assumptions,
763 fold_build1 (TRUTH_NOT_EXPR,
764 boolean_type_node,
765 may_be_zero));
766 else
767 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
768 build_int_cst (TREE_TYPE (niter), 0),
769 rewrite_to_non_trapping_overflow (niter));
770
771 may_be_zero = NULL_TREE;
772 }
773 else if (integer_nonzerop (may_be_zero))
774 {
775 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
776 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
777 return cond;
778 }
779 else
780 return cond;
781 }
782
783 *assumptions = niter_assumptions;
784 *number_of_iterationsm1 = niter;
785
786 /* We want the number of loop header executions which is the number
787 of latch executions plus one.
788 ??? For UINT_MAX latch executions this number overflows to zero
789 for loops like do { n++; } while (n != 0); */
790 if (niter && !chrec_contains_undetermined (niter))
791 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
792 build_int_cst (TREE_TYPE (niter), 1));
793 *number_of_iterations = niter;
794
795 return cond;
796 }
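
/* Concrete instance of the values computed above (illustrative): for a
   countable loop whose body executes 16 times in the do-while shape the
   vectorizer requires, the latch executes 15 times, so
   *NUMBER_OF_ITERATIONSM1 is 15 and *NUMBER_OF_ITERATIONS is 16.  The ???
   note covers the corner case do { n++; } while (n != 0) starting from
   n == 0: the latch executes UINT_MAX times and adding 1 wraps the
   header-execution count to zero.  */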
797
798 /* Function bb_in_loop_p
799
800 Used as predicate for dfs order traversal of the loop bbs. */
801
802 static bool
803 bb_in_loop_p (const_basic_block bb, const void *data)
804 {
805 const class loop *const loop = (const class loop *)data;
806 if (flow_bb_inside_loop_p (loop, bb))
807 return true;
808 return false;
809 }
810
811
812 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
813 stmt_vec_info structs for all the stmts in LOOP_IN. */
814
815 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
816 : vec_info (vec_info::loop, init_cost (loop_in), shared),
817 loop (loop_in),
818 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
819 num_itersm1 (NULL_TREE),
820 num_iters (NULL_TREE),
821 num_iters_unchanged (NULL_TREE),
822 num_iters_assumptions (NULL_TREE),
823 th (0),
824 versioning_threshold (0),
825 vectorization_factor (0),
826 max_vectorization_factor (0),
827 mask_skip_niters (NULL_TREE),
828 rgroup_compare_type (NULL_TREE),
829 simd_if_cond (NULL_TREE),
830 unaligned_dr (NULL),
831 peeling_for_alignment (0),
832 ptr_mask (0),
833 ivexpr_map (NULL),
834 scan_map (NULL),
835 slp_unrolling_factor (1),
836 single_scalar_iteration_cost (0),
837 vec_outside_cost (0),
838 vec_inside_cost (0),
839 vectorizable (false),
840 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
841 using_partial_vectors_p (false),
842 epil_using_partial_vectors_p (false),
843 peeling_for_gaps (false),
844 peeling_for_niter (false),
845 no_data_dependencies (false),
846 has_mask_store (false),
847 scalar_loop_scaling (profile_probability::uninitialized ()),
848 scalar_loop (NULL),
849 orig_loop_info (NULL)
850 {
851 /* CHECKME: We want to visit all BBs before their successors (except for
852 latch blocks, for which this assertion wouldn't hold). In the simple
853 case of the loop forms we allow, a dfs order of the BBs would be the same
854 as a reverse postorder traversal, so we are safe. */
855
856 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
857 bbs, loop->num_nodes, loop);
858 gcc_assert (nbbs == loop->num_nodes);
859
860 for (unsigned int i = 0; i < nbbs; i++)
861 {
862 basic_block bb = bbs[i];
863 gimple_stmt_iterator si;
864
865 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
866 {
867 gimple *phi = gsi_stmt (si);
868 gimple_set_uid (phi, 0);
869 add_stmt (phi);
870 }
871
872 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
873 {
874 gimple *stmt = gsi_stmt (si);
875 gimple_set_uid (stmt, 0);
876 if (is_gimple_debug (stmt))
877 continue;
878 add_stmt (stmt);
879 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
880 third argument is the #pragma omp simd if (x) condition: when it is 0,
881 the loop shouldn't be vectorized; when it is a non-zero constant, it
882 should be vectorized normally; otherwise the loop is versioned, with
883 the vectorized copy used when the condition is non-zero at runtime. */
884 if (loop_in->simduid
885 && is_gimple_call (stmt)
886 && gimple_call_internal_p (stmt)
887 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
888 && gimple_call_num_args (stmt) >= 3
889 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
890 && (loop_in->simduid
891 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
892 {
893 tree arg = gimple_call_arg (stmt, 2);
894 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
895 simd_if_cond = arg;
896 else
897 gcc_assert (integer_nonzerop (arg));
898 }
899 }
900 }
901
902 epilogue_vinfos.create (6);
903 }
904
905 /* Free all levels of rgroup CONTROLS. */
906
907 void
908 release_vec_loop_controls (vec<rgroup_controls> *controls)
909 {
910 rgroup_controls *rgc;
911 unsigned int i;
912 FOR_EACH_VEC_ELT (*controls, i, rgc)
913 rgc->controls.release ();
914 controls->release ();
915 }
916
917 /* Free all memory used by the _loop_vec_info, as well as all the
918 stmt_vec_info structs of all the stmts in the loop. */
919
920 _loop_vec_info::~_loop_vec_info ()
921 {
922 free (bbs);
923
924 release_vec_loop_controls (&masks);
925 release_vec_loop_controls (&lens);
926 delete ivexpr_map;
927 delete scan_map;
928 epilogue_vinfos.release ();
929
930 loop->aux = NULL;
931 }
932
933 /* Return an invariant or register for EXPR and emit necessary
934 computations in the LOOP_VINFO loop preheader. */
935
936 tree
937 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
938 {
939 if (is_gimple_reg (expr)
940 || is_gimple_min_invariant (expr))
941 return expr;
942
943 if (! loop_vinfo->ivexpr_map)
944 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
945 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
946 if (! cached)
947 {
948 gimple_seq stmts = NULL;
949 cached = force_gimple_operand (unshare_expr (expr),
950 &stmts, true, NULL_TREE);
951 if (stmts)
952 {
953 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
954 gsi_insert_seq_on_edge_immediate (e, stmts);
955 }
956 }
957 return cached;
958 }
959
960 /* Return true if we can use CMP_TYPE as the comparison type to produce
961 all masks required to mask LOOP_VINFO. */
962
963 static bool
964 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
965 {
966 rgroup_controls *rgm;
967 unsigned int i;
968 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
969 if (rgm->type != NULL_TREE
970 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
971 cmp_type, rgm->type,
972 OPTIMIZE_FOR_SPEED))
973 return false;
974 return true;
975 }
976
977 /* Calculate the maximum number of scalars per iteration for every
978 rgroup in LOOP_VINFO. */
979
980 static unsigned int
981 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
982 {
983 unsigned int res = 1;
984 unsigned int i;
985 rgroup_controls *rgm;
986 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
987 res = MAX (res, rgm->max_nscalars_per_iter);
988 return res;
989 }
990
991 /* Calculate the minimum precision necessary to represent:
992
993 MAX_NITERS * FACTOR
994
995 as an unsigned integer, where MAX_NITERS is the maximum number of
996 loop header iterations for the original scalar form of LOOP_VINFO. */
997
998 static unsigned
999 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1000 {
1001 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1002
1003 /* Get the maximum number of iterations that is representable
1004 in the counter type. */
1005 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1006 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1007
1008 /* Get a more refined estimate for the number of iterations. */
1009 widest_int max_back_edges;
1010 if (max_loop_iterations (loop, &max_back_edges))
1011 max_ni = wi::smin (max_ni, max_back_edges + 1);
1012
1013 /* Work out how many bits we need to represent the limit. */
1014 return wi::min_precision (max_ni * factor, UNSIGNED);
1015 }
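
/* Worked example (hypothetical numbers): with a 32-bit unsigned niters type
   and no better bound from max_loop_iterations, MAX_NITERS is 2^32; for
   FACTOR == 2 the product is 2^33 and the function returns 34.  If instead
   the loop is known to run at most 1000 header iterations, MAX_NITERS is
   1000 and, for FACTOR == 2, the minimum precision of 2000 is 11 bits.  */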
1016
1017 /* True if the loop needs peeling or partial vectors when vectorized. */
1018
1019 static bool
1020 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1021 {
1022 unsigned HOST_WIDE_INT const_vf;
1023 HOST_WIDE_INT max_niter
1024 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1025
1026 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1027 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1028 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1029 (loop_vinfo));
1030
1031 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1032 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1033 {
1034 /* Work out the (constant) number of iterations that need to be
1035 peeled for reasons other than niters. */
1036 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1037 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1038 peel_niter += 1;
1039 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1040 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1041 return true;
1042 }
1043 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1044 /* ??? When peeling for gaps but not alignment, we could
1045 try to check whether the (variable) niters is known to be
1046 VF * N + 1. That's something of a niche case though. */
1047 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1048 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1049 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1050 < (unsigned) exact_log2 (const_vf))
1051 /* In case of versioning, check if the maximum number of
1052 iterations is greater than th. If they are identical,
1053 the epilogue is unnecessary. */
1054 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1055 || ((unsigned HOST_WIDE_INT) max_niter
1056 > (th / const_vf) * const_vf))))
1057 return true;
1058
1059 return false;
1060 }
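
/* Example (hypothetical numbers): with a compile-time niters of 100, no
   peeling for alignment or gaps, and VF == 16, 100 is not a multiple of 16,
   so this returns true and the remaining 4 iterations need an epilogue or
   partial vectors.  With niters == 96 the same configuration returns
   false.  */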
1061
1062 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1063 whether we can actually generate the masks required. Return true if so,
1064 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1065
1066 static bool
1067 vect_verify_full_masking (loop_vec_info loop_vinfo)
1068 {
1069 unsigned int min_ni_width;
1070 unsigned int max_nscalars_per_iter
1071 = vect_get_max_nscalars_per_iter (loop_vinfo);
1072
1073 /* Use a normal loop if there are no statements that need masking.
1074 This only happens in rare degenerate cases: it means that the loop
1075 has no loads, no stores, and no live-out values. */
1076 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1077 return false;
1078
1079 /* Work out how many bits we need to represent the limit. */
1080 min_ni_width
1081 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1082
1083 /* Find a scalar mode for which WHILE_ULT is supported. */
1084 opt_scalar_int_mode cmp_mode_iter;
1085 tree cmp_type = NULL_TREE;
1086 tree iv_type = NULL_TREE;
1087 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1088 unsigned int iv_precision = UINT_MAX;
1089
1090 if (iv_limit != -1)
1091 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1092 UNSIGNED);
1093
1094 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1095 {
1096 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1097 if (cmp_bits >= min_ni_width
1098 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1099 {
1100 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1101 if (this_type
1102 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1103 {
1104 /* Although we could stop as soon as we find a valid mode,
1105 there are at least two reasons why that's not always the
1106 best choice:
1107
1108 - An IV that's Pmode or wider is more likely to be reusable
1109 in address calculations than an IV that's narrower than
1110 Pmode.
1111
1112 - Doing the comparison in IV_PRECISION or wider allows
1113 a natural 0-based IV, whereas using a narrower comparison
1114 type requires mitigations against wrap-around.
1115
1116 Conversely, if the IV limit is variable, doing the comparison
1117 in a wider type than the original type can introduce
1118 unnecessary extensions, so picking the widest valid mode
1119 is not always a good choice either.
1120
1121 Here we prefer the first IV type that's Pmode or wider,
1122 and the first comparison type that's IV_PRECISION or wider.
1123 (The comparison type must be no wider than the IV type,
1124 to avoid extensions in the vector loop.)
1125
1126 ??? We might want to try continuing beyond Pmode for ILP32
1127 targets if CMP_BITS < IV_PRECISION. */
1128 iv_type = this_type;
1129 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1130 cmp_type = this_type;
1131 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1132 break;
1133 }
1134 }
1135 }
1136
1137 if (!cmp_type)
1138 return false;
1139
1140 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1141 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1142 return true;
1143 }
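
/* Conceptual shape of the fully-masked loop that this check enables
   (a sketch in the [i:VF] notation used at the top of this file, not
   generated GIMPLE):

     for (i = 0; i < niters; i += VF)
       {
         mask = WHILE_ULT (i, niters);  // lane L is active iff i + L < niters
         a[i:VF] = b[i:VF] + c[i:VF];   // loads and store predicated on mask
       }

   The final, partial iteration is handled by inactive mask lanes instead of
   a scalar epilogue, which is why every rgroup mask type must be producible
   from CMP_TYPE via IFN_WHILE_ULT.  */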
1144
1145 /* Check whether we can use vector access with length based on precision
1146 comparison. So far, to keep it simple, we only allow the case that the
1147 precision of the target supported length is larger than the precision
1148 required by loop niters. */
1149
1150 static bool
1151 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1152 {
1153 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1154 return false;
1155
1156 unsigned int max_nitems_per_iter = 1;
1157 unsigned int i;
1158 rgroup_controls *rgl;
1159 /* Find the maximum number of items per iteration for every rgroup. */
1160 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1161 {
1162 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1163 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1164 }
1165
1166 /* Work out how many bits we need to represent the length limit. */
1167 unsigned int min_ni_prec
1168 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1169
1170 /* Now use the maximum of the precisions below for one suitable IV type:
1171 - the IV's natural precision
1172 - the precision needed to hold: the maximum number of scalar
1173 iterations multiplied by the scale factor (min_ni_prec above)
1174 - the Pmode precision
1175
1176 If min_ni_prec is less than the precision of the current niters,
1177 we prefer to still use the niters type. Prefer to use Pmode or a
1178 wider IV to avoid narrow conversions. */
1179
1180 unsigned int ni_prec
1181 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1182 min_ni_prec = MAX (min_ni_prec, ni_prec);
1183 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1184
1185 tree iv_type = NULL_TREE;
1186 opt_scalar_int_mode tmode_iter;
1187 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1188 {
1189 scalar_mode tmode = tmode_iter.require ();
1190 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1191
1192 /* ??? Do we really want to construct one IV whose precision exceeds
1193 BITS_PER_WORD? */
1194 if (tbits > BITS_PER_WORD)
1195 break;
1196
1197 /* Find the first available standard integral type. */
1198 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1199 {
1200 iv_type = build_nonstandard_integer_type (tbits, true);
1201 break;
1202 }
1203 }
1204
1205 if (!iv_type)
1206 {
1207 if (dump_enabled_p ())
1208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1209 "can't vectorize with length-based partial vectors"
1210 " because there is no suitable iv type.\n");
1211 return false;
1212 }
1213
1214 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1215 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1216
1217 return true;
1218 }
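
/* Worked instance of the IV type selection above (hypothetical 64-bit
   target with Pmode == DImode and BITS_PER_WORD == 64): if the niters type
   is 32-bit unsigned and min_ni_prec comes out as 20 bits, the two MAX
   operations raise the requirement to 64 bits, and the mode scan then picks
   the first supported integer mode of at least that many bits, giving a
   64-bit unsigned IV type.  */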
1219
1220 /* Calculate the cost of one scalar iteration of the loop. */
1221 static void
1222 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1223 {
1224 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1225 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1226 int nbbs = loop->num_nodes, factor;
1227 int innerloop_iters, i;
1228
1229 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1230
1231 /* Gather costs for statements in the scalar loop. */
1232
1233 /* FORNOW. */
1234 innerloop_iters = 1;
1235 if (loop->inner)
1236 innerloop_iters = 50; /* FIXME */
1237
1238 for (i = 0; i < nbbs; i++)
1239 {
1240 gimple_stmt_iterator si;
1241 basic_block bb = bbs[i];
1242
1243 if (bb->loop_father == loop->inner)
1244 factor = innerloop_iters;
1245 else
1246 factor = 1;
1247
1248 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1249 {
1250 gimple *stmt = gsi_stmt (si);
1251 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1252
1253 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1254 continue;
1255
1256 /* Skip stmts that are not vectorized inside the loop. */
1257 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1258 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1259 && (!STMT_VINFO_LIVE_P (vstmt_info)
1260 || !VECTORIZABLE_CYCLE_DEF
1261 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1262 continue;
1263
1264 vect_cost_for_stmt kind;
1265 if (STMT_VINFO_DATA_REF (stmt_info))
1266 {
1267 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1268 kind = scalar_load;
1269 else
1270 kind = scalar_store;
1271 }
1272 else if (vect_nop_conversion_p (stmt_info))
1273 continue;
1274 else
1275 kind = scalar_stmt;
1276
1277 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1278 factor, kind, stmt_info, 0, vect_prologue);
1279 }
1280 }
1281
1282 /* Now accumulate cost. */
1283 void *target_cost_data = init_cost (loop);
1284 stmt_info_for_cost *si;
1285 int j;
1286 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1287 j, si)
1288 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1289 si->kind, si->stmt_info, si->vectype,
1290 si->misalign, vect_body);
1291 unsigned dummy, body_cost = 0;
1292 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1293 destroy_cost_data (target_cost_data);
1294 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1295 }
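
/* Example of the accounting above (made-up statements; the actual costs are
   left to the target hooks): a single-bb loop body with one load, one add
   and one store records one scalar_load, one scalar_stmt and one
   scalar_store, each with factor 1; statements sitting in an inner loop
   would instead be recorded with factor 50 (the FIXME above).  add_stmt_cost
   and finish_cost then turn those counts into
   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST.  */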
1296
1297
1298 /* Function vect_analyze_loop_form_1.
1299
1300 Verify that certain CFG restrictions hold, including:
1301 - the loop has a pre-header
1302 - the loop has a single entry and exit
1303 - the loop exit condition is simple enough
1304 - the number of iterations can be analyzed, i.e., a countable loop. The
1305 niter could be analyzed under some assumptions. */
1306
1307 opt_result
1308 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1309 tree *assumptions, tree *number_of_iterationsm1,
1310 tree *number_of_iterations, gcond **inner_loop_cond)
1311 {
1312 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1313
1314 /* Different restrictions apply when we are considering an inner-most loop,
1315 vs. an outer (nested) loop.
1316 (FORNOW. May want to relax some of these restrictions in the future). */
1317
1318 if (!loop->inner)
1319 {
1320 /* Inner-most loop. We currently require that the number of BBs is
1321 exactly 2 (the header and latch). Vectorizable inner-most loops
1322 look like this:
1323
1324 (pre-header)
1325 |
1326 header <--------+
1327 | | |
1328 | +--> latch --+
1329 |
1330 (exit-bb) */
1331
1332 if (loop->num_nodes != 2)
1333 return opt_result::failure_at (vect_location,
1334 "not vectorized:"
1335 " control flow in loop.\n");
1336
1337 if (empty_block_p (loop->header))
1338 return opt_result::failure_at (vect_location,
1339 "not vectorized: empty loop.\n");
1340 }
1341 else
1342 {
1343 class loop *innerloop = loop->inner;
1344 edge entryedge;
1345
1346 /* Nested loop. We currently require that the loop is doubly-nested,
1347 contains a single inner loop, and the number of BBs is exactly 5.
1348 Vectorizable outer-loops look like this:
1349
1350 (pre-header)
1351 |
1352 header <---+
1353 | |
1354 inner-loop |
1355 | |
1356 tail ------+
1357 |
1358 (exit-bb)
1359
1360 The inner-loop has the properties expected of inner-most loops
1361 as described above. */
1362
1363 if ((loop->inner)->inner || (loop->inner)->next)
1364 return opt_result::failure_at (vect_location,
1365 "not vectorized:"
1366 " multiple nested loops.\n");
1367
1368 if (loop->num_nodes != 5)
1369 return opt_result::failure_at (vect_location,
1370 "not vectorized:"
1371 " control flow in loop.\n");
1372
1373 entryedge = loop_preheader_edge (innerloop);
1374 if (entryedge->src != loop->header
1375 || !single_exit (innerloop)
1376 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1377 return opt_result::failure_at (vect_location,
1378 "not vectorized:"
1379 " unsupported outerloop form.\n");
1380
1381 /* Analyze the inner-loop. */
1382 tree inner_niterm1, inner_niter, inner_assumptions;
1383 opt_result res
1384 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1385 &inner_assumptions, &inner_niterm1,
1386 &inner_niter, NULL);
1387 if (!res)
1388 {
1389 if (dump_enabled_p ())
1390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1391 "not vectorized: Bad inner loop.\n");
1392 return res;
1393 }
1394
1395 /* Don't support analyzing niter under assumptions for inner
1396 loop. */
1397 if (!integer_onep (inner_assumptions))
1398 return opt_result::failure_at (vect_location,
1399 "not vectorized: Bad inner loop.\n");
1400
1401 if (!expr_invariant_in_loop_p (loop, inner_niter))
1402 return opt_result::failure_at (vect_location,
1403 "not vectorized: inner-loop count not"
1404 " invariant.\n");
1405
1406 if (dump_enabled_p ())
1407 dump_printf_loc (MSG_NOTE, vect_location,
1408 "Considering outer-loop vectorization.\n");
1409 }
1410
1411 if (!single_exit (loop))
1412 return opt_result::failure_at (vect_location,
1413 "not vectorized: multiple exits.\n");
1414 if (EDGE_COUNT (loop->header->preds) != 2)
1415 return opt_result::failure_at (vect_location,
1416 "not vectorized:"
1417 " too many incoming edges.\n");
1418
1419 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1420 that the loop is represented as a do-while (with a proper if-guard
1421 before the loop if needed), where the loop header contains all the
1422 executable statements, and the latch is empty. */
1423 if (!empty_block_p (loop->latch)
1424 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1425 return opt_result::failure_at (vect_location,
1426 "not vectorized: latch block not empty.\n");
1427
1428 /* Make sure the exit is not abnormal. */
1429 edge e = single_exit (loop);
1430 if (e->flags & EDGE_ABNORMAL)
1431 return opt_result::failure_at (vect_location,
1432 "not vectorized:"
1433 " abnormal loop exit edge.\n");
1434
1435 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1436 number_of_iterationsm1);
1437 if (!*loop_cond)
1438 return opt_result::failure_at
1439 (vect_location,
1440 "not vectorized: complicated exit condition.\n");
1441
1442 if (integer_zerop (*assumptions)
1443 || !*number_of_iterations
1444 || chrec_contains_undetermined (*number_of_iterations))
1445 return opt_result::failure_at
1446 (*loop_cond,
1447 "not vectorized: number of iterations cannot be computed.\n");
1448
1449 if (integer_zerop (*number_of_iterations))
1450 return opt_result::failure_at
1451 (*loop_cond,
1452 "not vectorized: number of iterations = 0.\n");
1453
1454 return opt_result::success ();
1455 }
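
/* Source-level sketches of loops rejected by the checks above
   (illustrative):

     for (i = 0; i < n; i++)      // early break: more than two basic blocks
       {                          // and more than one exit
         if (a[i] == key)
           break;
         b[i] = 0;
       }

     for (i = 0; i < n; i++)      // conditional store: control flow in the
       if (c[i])                  // loop body
         b[i] = 0;

   The second form can become vectorizable when if-conversion (tree-if-conv.c)
   collapses the body into a single basic block before this analysis runs.  */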
1456
1457 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1458
1459 opt_loop_vec_info
1460 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1461 {
1462 tree assumptions, number_of_iterations, number_of_iterationsm1;
1463 gcond *loop_cond, *inner_loop_cond = NULL;
1464
1465 opt_result res
1466 = vect_analyze_loop_form_1 (loop, &loop_cond,
1467 &assumptions, &number_of_iterationsm1,
1468 &number_of_iterations, &inner_loop_cond);
1469 if (!res)
1470 return opt_loop_vec_info::propagate_failure (res);
1471
1472 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1473 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1474 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1475 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1476 if (!integer_onep (assumptions))
1477 {
1478 /* We consider vectorizing this loop by versioning it under
1479 some assumptions. In order to do this, we need to clear the
1480 existing information computed by the scev and niter analyzers. */
1481 scev_reset_htab ();
1482 free_numbers_of_iterations_estimates (loop);
1483 /* Also set a flag for this loop so that the following scev and niter
1484 analyses are done under the assumptions. */
1485 loop_constraint_set (loop, LOOP_C_FINITE);
1486 /* Also record the assumptions for versioning. */
1487 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1488 }
1489
1490 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1491 {
1492 if (dump_enabled_p ())
1493 {
1494 dump_printf_loc (MSG_NOTE, vect_location,
1495 "Symbolic number of iterations is ");
1496 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1497 dump_printf (MSG_NOTE, "\n");
1498 }
1499 }
1500
1501 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1502 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1503 if (inner_loop_cond)
1504 {
1505 stmt_vec_info inner_loop_cond_info
1506 = loop_vinfo->lookup_stmt (inner_loop_cond);
1507 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1508 }
1509
1510 gcc_assert (!loop->aux);
1511 loop->aux = loop_vinfo;
1512 return opt_loop_vec_info::success (loop_vinfo);
1513 }
1514
1515
1516
1517 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1518 statements, update the vectorization factor. */
1519
1520 static void
1521 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1522 {
1523 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1524 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1525 int nbbs = loop->num_nodes;
1526 poly_uint64 vectorization_factor;
1527 int i;
1528
1529 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1530
1531 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1532 gcc_assert (known_ne (vectorization_factor, 0U));
1533
1534 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1535 vectorization factor of the loop is the unrolling factor required by
1536 the SLP instances. If that unrolling factor is 1, we say that we
1537 perform pure SLP on the loop; cross-iteration parallelism is not
1538 exploited. */
1539 bool only_slp_in_loop = true;
1540 for (i = 0; i < nbbs; i++)
1541 {
1542 basic_block bb = bbs[i];
1543 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1544 gsi_next (&si))
1545 {
1546 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1547 if (!stmt_info)
1548 continue;
1549 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1550 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1551 && !PURE_SLP_STMT (stmt_info))
1552 /* STMT needs both SLP and loop-based vectorization. */
1553 only_slp_in_loop = false;
1554 }
1555 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1556 gsi_next (&si))
1557 {
1558 if (is_gimple_debug (gsi_stmt (si)))
1559 continue;
1560 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1561 stmt_info = vect_stmt_to_vectorize (stmt_info);
1562 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1563 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1564 && !PURE_SLP_STMT (stmt_info))
1565 /* STMT needs both SLP and loop-based vectorization. */
1566 only_slp_in_loop = false;
1567 }
1568 }
1569
1570 if (only_slp_in_loop)
1571 {
1572 if (dump_enabled_p ())
1573 dump_printf_loc (MSG_NOTE, vect_location,
1574 "Loop contains only SLP stmts\n");
1575 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1576 }
1577 else
1578 {
1579 if (dump_enabled_p ())
1580 dump_printf_loc (MSG_NOTE, vect_location,
1581 "Loop contains SLP and non-SLP stmts\n");
1582 /* Both the vectorization factor and unroll factor have the form
1583 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1584 so they must have a common multiple. */
1585 vectorization_factor
1586 = force_common_multiple (vectorization_factor,
1587 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1588 }
1589
1590 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1591 if (dump_enabled_p ())
1592 {
1593 dump_printf_loc (MSG_NOTE, vect_location,
1594 "Updating vectorization factor to ");
1595 dump_dec (MSG_NOTE, vectorization_factor);
1596 dump_printf (MSG_NOTE, ".\n");
1597 }
1598 }
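
/* Example of the adjustment above (hypothetical factors): if the loop-based
   vectorization factor is 4 and an SLP instance requires an unrolling factor
   of 8, force_common_multiple yields 8; factors of 4 and 6 would combine to
   12.  When the loop contains only SLP stmts, the VF simply becomes the SLP
   unrolling factor.  */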
1599
1600 /* Return true if STMT_INFO describes a double reduction phi and if
1601 the other phi in the reduction is also relevant for vectorization.
1602 This rejects cases such as:
1603
1604 outer1:
1605 x_1 = PHI <x_3(outer2), ...>;
1606 ...
1607
1608 inner:
1609 x_2 = ...;
1610 ...
1611
1612 outer2:
1613 x_3 = PHI <x_2(inner)>;
1614
1615 if nothing in x_2 or elsewhere makes x_1 relevant. */
1616
1617 static bool
1618 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1619 {
1620 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1621 return false;
1622
1623 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1624 }
1625
1626 /* Function vect_analyze_loop_operations.
1627
1628 Scan the loop stmts and make sure they are all vectorizable. */
1629
1630 static opt_result
1631 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1632 {
1633 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1634 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1635 int nbbs = loop->num_nodes;
1636 int i;
1637 stmt_vec_info stmt_info;
1638 bool need_to_vectorize = false;
1639 bool ok;
1640
1641 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1642
1643 auto_vec<stmt_info_for_cost> cost_vec;
1644
1645 for (i = 0; i < nbbs; i++)
1646 {
1647 basic_block bb = bbs[i];
1648
1649 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1650 gsi_next (&si))
1651 {
1652 gphi *phi = si.phi ();
1653 ok = true;
1654
1655 stmt_info = loop_vinfo->lookup_stmt (phi);
1656 if (dump_enabled_p ())
1657 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1658 if (virtual_operand_p (gimple_phi_result (phi)))
1659 continue;
1660
1661 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1662 (i.e., a phi in the tail of the outer-loop). */
1663 if (! is_loop_header_bb_p (bb))
1664 {
1665 /* FORNOW: we currently don't support the case that these phis
1666 are not used in the outer loop (unless it is a double reduction,
1667 i.e., this phi is a vect_reduction_def), because this case
1668 requires us to actually do something here. */
1669 if (STMT_VINFO_LIVE_P (stmt_info)
1670 && !vect_active_double_reduction_p (stmt_info))
1671 return opt_result::failure_at (phi,
1672 "Unsupported loop-closed phi"
1673 " in outer-loop.\n");
1674
1675 /* If PHI is used in the outer loop, we check that its operand
1676 is defined in the inner loop. */
1677 if (STMT_VINFO_RELEVANT_P (stmt_info))
1678 {
1679 tree phi_op;
1680
1681 if (gimple_phi_num_args (phi) != 1)
1682 return opt_result::failure_at (phi, "unsupported phi");
1683
1684 phi_op = PHI_ARG_DEF (phi, 0);
1685 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1686 if (!op_def_info)
1687 return opt_result::failure_at (phi, "unsupported phi\n");
1688
1689 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1690 && (STMT_VINFO_RELEVANT (op_def_info)
1691 != vect_used_in_outer_by_reduction))
1692 return opt_result::failure_at (phi, "unsupported phi\n");
1693
1694 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1695 || (STMT_VINFO_DEF_TYPE (stmt_info)
1696 == vect_double_reduction_def))
1697 && !vectorizable_lc_phi (loop_vinfo,
1698 stmt_info, NULL, NULL))
1699 return opt_result::failure_at (phi, "unsupported phi\n");
1700 }
1701
1702 continue;
1703 }
1704
1705 gcc_assert (stmt_info);
1706
1707 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1708 || STMT_VINFO_LIVE_P (stmt_info))
1709 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1710 /* A scalar-dependence cycle that we don't support. */
1711 return opt_result::failure_at (phi,
1712 "not vectorized:"
1713 " scalar dependence cycle.\n");
1714
1715 if (STMT_VINFO_RELEVANT_P (stmt_info))
1716 {
1717 need_to_vectorize = true;
1718 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1719 && ! PURE_SLP_STMT (stmt_info))
1720 ok = vectorizable_induction (loop_vinfo,
1721 stmt_info, NULL, NULL,
1722 &cost_vec);
1723 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1724 || (STMT_VINFO_DEF_TYPE (stmt_info)
1725 == vect_double_reduction_def)
1726 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1727 && ! PURE_SLP_STMT (stmt_info))
1728 ok = vectorizable_reduction (loop_vinfo,
1729 stmt_info, NULL, NULL, &cost_vec);
1730 }
1731
1732 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1733 if (ok
1734 && STMT_VINFO_LIVE_P (stmt_info)
1735 && !PURE_SLP_STMT (stmt_info))
1736 ok = vectorizable_live_operation (loop_vinfo,
1737 stmt_info, NULL, NULL, NULL,
1738 -1, false, &cost_vec);
1739
1740 if (!ok)
1741 return opt_result::failure_at (phi,
1742 "not vectorized: relevant phi not "
1743 "supported: %G",
1744 static_cast <gimple *> (phi));
1745 }
1746
1747 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1748 gsi_next (&si))
1749 {
1750 gimple *stmt = gsi_stmt (si);
1751 if (!gimple_clobber_p (stmt)
1752 && !is_gimple_debug (stmt))
1753 {
1754 opt_result res
1755 = vect_analyze_stmt (loop_vinfo,
1756 loop_vinfo->lookup_stmt (stmt),
1757 &need_to_vectorize,
1758 NULL, NULL, &cost_vec);
1759 if (!res)
1760 return res;
1761 }
1762 }
1763 } /* bbs */
1764
1765 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1766
1767 /* All operations in the loop are either irrelevant (they deal with loop
1768 control, or are dead), or are only used outside the loop and can be moved
1769 out of the loop (e.g. invariants, inductions). The loop can be
1770 optimized away by scalar optimizations. We're better off not
1771 touching this loop. */
1772 if (!need_to_vectorize)
1773 {
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_NOTE, vect_location,
1776 "All the computation can be taken out of the loop.\n");
1777 return opt_result::failure_at
1778 (vect_location,
1779 "not vectorized: redundant loop. no profit to vectorize.\n");
1780 }
1781
1782 return opt_result::success ();
1783 }
1784
1785 /* Return true if we know that the iteration count is smaller than the
1786 vectorization factor. Return false if it isn't, or if we can't be sure
1787 either way. */
1788
1789 static bool
1790 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1791 {
1792 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1793
1794 HOST_WIDE_INT max_niter;
1795 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1796 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1797 else
1798 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1799
1800 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1801 return true;
1802
1803 return false;
1804 }
1805
1806 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1807 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1808 definitely no, or -1 if it's worth retrying. */
1809
1810 static int
1811 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1812 {
1813 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1814 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1815
1816 /* Only loops that can handle partially-populated vectors can have iteration
1817 counts less than the vectorization factor. */
1818 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1819 {
1820 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1821 {
1822 if (dump_enabled_p ())
1823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1824 "not vectorized: iteration count smaller than "
1825 "vectorization factor.\n");
1826 return 0;
1827 }
1828 }
1829
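/* Ask the cost model for the minimum number of iterations at which
vectorization becomes profitable: one threshold for the runtime check and
one based on the static estimate of the iteration count. */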
1830 int min_profitable_iters, min_profitable_estimate;
1831 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1832 &min_profitable_estimate);
1833
1834 if (min_profitable_iters < 0)
1835 {
1836 if (dump_enabled_p ())
1837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1838 "not vectorized: vectorization not profitable.\n");
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1841 "not vectorized: vector version will never be "
1842 "profitable.\n");
1843 return -1;
1844 }
1845
1846 int min_scalar_loop_bound = (param_min_vect_loop_bound
1847 * assumed_vf);
1848
1849 /* Use the cost model only if it is more conservative than the
1850 user-specified threshold. */
1851 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1852 min_profitable_iters);
1853
1854 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1855
1856 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1857 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1858 {
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861 "not vectorized: vectorization not profitable.\n");
1862 if (dump_enabled_p ())
1863 dump_printf_loc (MSG_NOTE, vect_location,
1864 "not vectorized: iteration count smaller than user "
1865 "specified loop bound parameter or minimum profitable "
1866 "iterations (whichever is more conservative).\n");
1867 return 0;
1868 }
1869
1870 /* The static profitability threshold min_profitable_estimate includes
1871 the cost of having to check at runtime whether the scalar loop
1872 should be used instead. If it turns out that we don't need or want
1873 such a check, the threshold we should use for the static estimate
1874 is simply the point at which the vector loop becomes more profitable
1875 than the scalar loop. */
1876 if (min_profitable_estimate > min_profitable_iters
1877 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1878 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1879 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1880 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1881 {
1882 if (dump_enabled_p ())
1883 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1884 " choice between the scalar and vector loops\n");
1885 min_profitable_estimate = min_profitable_iters;
1886 }
1887
1888 HOST_WIDE_INT estimated_niter;
1889
1890 /* If we are vectorizing an epilogue then we know the maximum number of
1891 scalar iterations it will cover is at least one lower than the
1892 vectorization factor of the main loop. */
1893 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1894 estimated_niter
1895 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1896 else
1897 {
1898 estimated_niter = estimated_stmt_executions_int (loop);
1899 if (estimated_niter == -1)
1900 estimated_niter = likely_max_stmt_executions_int (loop);
1901 }
1902 if (estimated_niter != -1
1903 && ((unsigned HOST_WIDE_INT) estimated_niter
1904 < MAX (th, (unsigned) min_profitable_estimate)))
1905 {
1906 if (dump_enabled_p ())
1907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1908 "not vectorized: estimated iteration count too "
1909 "small.\n");
1910 if (dump_enabled_p ())
1911 dump_printf_loc (MSG_NOTE, vect_location,
1912 "not vectorized: estimated iteration count smaller "
1913 "than specified loop bound parameter or minimum "
1914 "profitable iterations (whichever is more "
1915 "conservative).\n");
1916 return -1;
1917 }
1918
1919 return 1;
1920 }
1921
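/* Gather into DATAREFS the data references found in the basic blocks BBS
of LOOP and count the non-debug statements in *N_STMTS. In loops with a
safelen, calls to functions with simd clones are skipped when the call
itself contains no data references; any other statement whose data
reference cannot be analyzed causes failure, as does exceeding the
dependence-analysis limit on the number of data references. */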
1922 static opt_result
1923 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1924 vec<data_reference_p> *datarefs,
1925 unsigned int *n_stmts)
1926 {
1927 *n_stmts = 0;
1928 for (unsigned i = 0; i < loop->num_nodes; i++)
1929 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1930 !gsi_end_p (gsi); gsi_next (&gsi))
1931 {
1932 gimple *stmt = gsi_stmt (gsi);
1933 if (is_gimple_debug (stmt))
1934 continue;
1935 ++(*n_stmts);
1936 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1937 NULL, 0);
1938 if (!res)
1939 {
1940 if (is_gimple_call (stmt) && loop->safelen)
1941 {
1942 tree fndecl = gimple_call_fndecl (stmt), op;
1943 if (fndecl != NULL_TREE)
1944 {
1945 cgraph_node *node = cgraph_node::get (fndecl);
1946 if (node != NULL && node->simd_clones != NULL)
1947 {
1948 unsigned int j, n = gimple_call_num_args (stmt);
1949 for (j = 0; j < n; j++)
1950 {
1951 op = gimple_call_arg (stmt, j);
1952 if (DECL_P (op)
1953 || (REFERENCE_CLASS_P (op)
1954 && get_base_address (op)))
1955 break;
1956 }
1957 op = gimple_call_lhs (stmt);
1958 /* Ignore #pragma omp declare simd functions
1959 if they don't have data references in the
1960 call stmt itself. */
1961 if (j == n
1962 && !(op
1963 && (DECL_P (op)
1964 || (REFERENCE_CLASS_P (op)
1965 && get_base_address (op)))))
1966 continue;
1967 }
1968 }
1969 }
1970 return res;
1971 }
1972 /* If dependence analysis would give up due to the limit on the
1973 number of datarefs, stop here and fail fatally. */
1974 if (datarefs->length ()
1975 > (unsigned)param_loop_max_datarefs_for_datadeps)
1976 return opt_result::failure_at (stmt, "exceeded param "
1977 "loop-max-datarefs-for-datadeps\n");
1978 }
1979 return opt_result::success ();
1980 }
1981
1982 /* Look for SLP-only access groups and turn each individual access into its own
1983 group. */
1984 static void
1985 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1986 {
1987 unsigned int i;
1988 struct data_reference *dr;
1989
1990 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1991
1992 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1993 FOR_EACH_VEC_ELT (datarefs, i, dr)
1994 {
1995 gcc_assert (DR_REF (dr));
1996 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1997
1998 /* Check if the access is part of an interleaving chain. */
1999 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2000 {
2001 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2002 unsigned int group_size = DR_GROUP_SIZE (first_element);
2003
2004 /* Check for an SLP-only group. */
2005 if (!STMT_SLP_TYPE (stmt_info)
2006 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2007 {
2008 /* Dissolve the group. */
2009 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2010
2011 stmt_vec_info vinfo = first_element;
2012 while (vinfo)
2013 {
2014 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2015 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2016 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2017 DR_GROUP_SIZE (vinfo) = 1;
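/* Each access now forms its own single-element group; for non-strided
accesses the other GROUP_SIZE - 1 elements of the original group become
a gap. */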
2018 if (STMT_VINFO_STRIDED_P (first_element))
2019 DR_GROUP_GAP (vinfo) = 0;
2020 else
2021 DR_GROUP_GAP (vinfo) = group_size - 1;
2022 vinfo = next;
2023 }
2024 }
2025 }
2026 }
2027 }
2028
2029 /* Determine if operating on full vectors for LOOP_VINFO might leave
2030 some scalar iterations still to do. If so, decide how we should
2031 handle those scalar iterations. The possibilities are:
2032
2033 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2034 In this case:
2035
2036 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2037 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2038 LOOP_VINFO_PEELING_FOR_NITER == false
2039
2040 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2041 to handle the remaining scalar iterations. In this case:
2042
2043 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2044 LOOP_VINFO_PEELING_FOR_NITER == true
2045
2046 There are two choices:
2047
2048 (2a) Consider vectorizing the epilogue loop at the same VF as the
2049 main loop, but using partial vectors instead of full vectors.
2050 In this case:
2051
2052 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2053
2054 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2055 In this case:
2056
2057 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2058
2059 When FOR_EPILOGUE_P is true, make this determination based on the
2060 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2061 based on the assumption that LOOP_VINFO is the main loop. The caller
2062 has made sure that the number of iterations is set appropriately for
2063 this value of FOR_EPILOGUE_P. */
2064
2065 opt_result
2066 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2067 bool for_epilogue_p)
2068 {
2069 /* Determine whether there would be any scalar iterations left over. */
2070 bool need_peeling_or_partial_vectors_p
2071 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2072
2073 /* Decide whether to vectorize the loop with partial vectors. */
2074 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2075 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2076 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2077 && need_peeling_or_partial_vectors_p)
2078 {
2079 /* For partial-vector-usage=1, try to push the handling of partial
2080 vectors to the epilogue, with the main loop continuing to operate
2081 on full vectors.
2082
2083 ??? We could then end up failing to use partial vectors if we
2084 decide to peel iterations into a prologue, and if the main loop
2085 then ends up processing fewer than VF iterations. */
2086 if (param_vect_partial_vector_usage == 1
2087 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2088 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2089 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2090 else
2091 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2092 }
2093
2094 if (dump_enabled_p ())
2095 {
2096 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2097 dump_printf_loc (MSG_NOTE, vect_location,
2098 "operating on partial vectors%s.\n",
2099 for_epilogue_p ? " for epilogue loop" : "");
2100 else
2101 dump_printf_loc (MSG_NOTE, vect_location,
2102 "operating only on full vectors%s.\n",
2103 for_epilogue_p ? " for epilogue loop" : "");
2104 }
2105
2106 if (for_epilogue_p)
2107 {
2108 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2109 gcc_assert (orig_loop_vinfo);
2110 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2111 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2112 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2113 }
2114
2115 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2116 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2117 {
2118 /* Check that the loop processes at least one full vector. */
2119 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2120 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2121 if (known_lt (wi::to_widest (scalar_niters), vf))
2122 return opt_result::failure_at (vect_location,
2123 "loop does not have enough iterations"
2124 " to support vectorization.\n");
2125
2126 /* If we need to peel an extra epilogue iteration to handle data
2127 accesses with gaps, check that there are enough scalar iterations
2128 available.
2129
2130 The check above is redundant with this one when peeling for gaps,
2131 but the distinction is useful for diagnostics. */
2132 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2133 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2134 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2135 return opt_result::failure_at (vect_location,
2136 "loop does not have enough iterations"
2137 " to support peeling for gaps.\n");
2138 }
2139
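/* An epilogue loop (peeling for niters) is needed exactly when we do not
use partial vectors but some scalar iterations would be left over. */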
2140 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2141 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2142 && need_peeling_or_partial_vectors_p);
2143
2144 return opt_result::success ();
2145 }
2146
2147 /* Function vect_analyze_loop_2.
2148
2149 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2150 for it. The different analyses will record information in the
2151 loop_vec_info struct. */
2152 static opt_result
2153 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2154 {
2155 opt_result ok = opt_result::success ();
2156 int res;
2157 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2158 poly_uint64 min_vf = 2;
2159 loop_vec_info orig_loop_vinfo = NULL;
2160
2161 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2162 loop_vec_info of the first vectorized loop. */
2163 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2164 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2165 else
2166 orig_loop_vinfo = loop_vinfo;
2167 gcc_assert (orig_loop_vinfo);
2168
2169 /* The first group of checks is independent of the vector size. */
2170 fatal = true;
2171
2172 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2173 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2174 return opt_result::failure_at (vect_location,
2175 "not vectorized: simd if(0)\n");
2176
2177 /* Find all data references in the loop (which correspond to vdefs/vuses)
2178 and analyze their evolution in the loop. */
2179
2180 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2181
2182 /* Gather the data references and count stmts in the loop. */
2183 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2184 {
2185 opt_result res
2186 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2187 &LOOP_VINFO_DATAREFS (loop_vinfo),
2188 n_stmts);
2189 if (!res)
2190 {
2191 if (dump_enabled_p ())
2192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2193 "not vectorized: loop contains function "
2194 "calls or data references that cannot "
2195 "be analyzed\n");
2196 return res;
2197 }
2198 loop_vinfo->shared->save_datarefs ();
2199 }
2200 else
2201 loop_vinfo->shared->check_datarefs ();
2202
2203 /* Analyze the data references and also adjust the minimal
2204 vectorization factor according to the loads and stores. */
2205
2206 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2207 if (!ok)
2208 {
2209 if (dump_enabled_p ())
2210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2211 "bad data references.\n");
2212 return ok;
2213 }
2214
2215 /* Classify all cross-iteration scalar data-flow cycles.
2216 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2217 vect_analyze_scalar_cycles (loop_vinfo);
2218
2219 vect_pattern_recog (loop_vinfo);
2220
2221 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2222
2223 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2224 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2225
2226 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2227 if (!ok)
2228 {
2229 if (dump_enabled_p ())
2230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2231 "bad data access.\n");
2232 return ok;
2233 }
2234
2235 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2236
2237 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2238 if (!ok)
2239 {
2240 if (dump_enabled_p ())
2241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2242 "unexpected pattern.\n");
2243 return ok;
2244 }
2245
2246 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are not fatal. */
2247 fatal = false;
2248
2249 /* Analyze data dependences between the data-refs in the loop
2250 and adjust the maximum vectorization factor according to
2251 the dependences.
2252 FORNOW: fail at the first data dependence that we encounter. */
2253
2254 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2255 if (!ok)
2256 {
2257 if (dump_enabled_p ())
2258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2259 "bad data dependence.\n");
2260 return ok;
2261 }
2262 if (max_vf != MAX_VECTORIZATION_FACTOR
2263 && maybe_lt (max_vf, min_vf))
2264 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2265 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2266
2267 ok = vect_determine_vectorization_factor (loop_vinfo);
2268 if (!ok)
2269 {
2270 if (dump_enabled_p ())
2271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2272 "can't determine vectorization factor.\n");
2273 return ok;
2274 }
2275 if (max_vf != MAX_VECTORIZATION_FACTOR
2276 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2277 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2278
2279 /* Compute the scalar iteration cost. */
2280 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2281
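/* Remember the vectorization factor before any SLP adjustment so that it
can be restored if the analysis has to be retried without SLP. */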
2282 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2283
2284 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2285 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2286 if (!ok)
2287 return ok;
2288
2289 /* If there are any SLP instances mark them as pure_slp. */
2290 bool slp = vect_make_slp_decision (loop_vinfo);
2291 if (slp)
2292 {
2293 /* Find stmts that need to be both vectorized and SLPed. */
2294 vect_detect_hybrid_slp (loop_vinfo);
2295
2296 /* Update the vectorization factor based on the SLP decision. */
2297 vect_update_vf_for_slp (loop_vinfo);
2298
2299 /* Optimize the SLP graph with the vectorization factor fixed. */
2300 vect_optimize_slp (loop_vinfo);
2301 }
2302
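/* Also remember whether partial vectors are still an option; the retry
path below restores this before starting over. */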
2303 bool saved_can_use_partial_vectors_p
2304 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2305
2306 /* We don't expect to have to roll back to anything other than an empty
2307 set of rgroups. */
2308 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2309
2310 /* This is the point where we can re-start analysis with SLP forced off. */
2311 start_over:
2312
2313 /* Now the vectorization factor is final. */
2314 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2315 gcc_assert (known_ne (vectorization_factor, 0U));
2316
2317 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2318 {
2319 dump_printf_loc (MSG_NOTE, vect_location,
2320 "vectorization_factor = ");
2321 dump_dec (MSG_NOTE, vectorization_factor);
2322 dump_printf (MSG_NOTE, ", niters = %wd\n",
2323 LOOP_VINFO_INT_NITERS (loop_vinfo));
2324 }
2325
2326 /* Analyze the alignment of the data-refs in the loop.
2327 Fail if a data reference is found that cannot be vectorized. */
2328
2329 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2330 if (!ok)
2331 {
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334 "bad data alignment.\n");
2335 return ok;
2336 }
2337
2338 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2339 It is important to call pruning after vect_analyze_data_ref_accesses,
2340 since we use grouping information gathered by interleaving analysis. */
2341 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2342 if (!ok)
2343 return ok;
2344
2345 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2346 vectorization, since we do not want to add extra peeling or
2347 add versioning for alignment. */
2348 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2349 /* This pass will decide on using loop versioning and/or loop peeling in
2350 order to enhance the alignment of data references in the loop. */
2351 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2352 if (!ok)
2353 return ok;
2354
2355 if (slp)
2356 {
2357 /* Analyze operations in the SLP instances. Note this may
2358 remove unsupported SLP instances which makes the above
2359 SLP kind detection invalid. */
2360 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2361 vect_slp_analyze_operations (loop_vinfo);
2362 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2363 {
2364 ok = opt_result::failure_at (vect_location,
2365 "unsupported SLP instances\n");
2366 goto again;
2367 }
2368 }
2369
2370 /* Dissolve SLP-only groups. */
2371 vect_dissolve_slp_only_groups (loop_vinfo);
2372
2373 /* Scan all the remaining operations in the loop that are not subject
2374 to SLP and make sure they are vectorizable. */
2375 ok = vect_analyze_loop_operations (loop_vinfo);
2376 if (!ok)
2377 {
2378 if (dump_enabled_p ())
2379 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2380 "bad operation or unsupported loop bound.\n");
2381 return ok;
2382 }
2383
2384 /* For now, we don't expect to mix the masking and length approaches for one
2385 loop, so disable partial vectors if both are recorded. */
2386 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2387 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2388 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2389 {
2390 if (dump_enabled_p ())
2391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2392 "can't vectorize a loop with partial vectors"
2393 " because we don't expect to mix different"
2394 " approaches with partial vectors for the"
2395 " same loop.\n");
2396 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2397 }
2398
2399 /* If we still have the option of using partial vectors,
2400 check whether we can generate the necessary loop controls. */
2401 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2402 && !vect_verify_full_masking (loop_vinfo)
2403 && !vect_verify_loop_lens (loop_vinfo))
2404 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2405
2406 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2407 to be able to handle fewer than VF scalars, or needs to have a lower VF
2408 than the main loop. */
2409 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2410 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2411 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2412 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2413 return opt_result::failure_at (vect_location,
2414 "Vectorization factor too high for"
2415 " epilogue loop.\n");
2416
2417 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2418 assuming that the loop will be used as a main loop. We will redo
2419 this analysis later if we instead decide to use the loop as an
2420 epilogue loop. */
2421 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2422 if (!ok)
2423 return ok;
2424
2425 /* Check the costings of the loop make vectorizing worthwhile. */
2426 res = vect_analyze_loop_costing (loop_vinfo);
2427 if (res < 0)
2428 {
2429 ok = opt_result::failure_at (vect_location,
2430 "Loop costings may not be worthwhile.\n");
2431 goto again;
2432 }
2433 if (!res)
2434 return opt_result::failure_at (vect_location,
2435 "Loop costings not worthwhile.\n");
2436
2437 /* If an epilogue loop is required make sure we can create one. */
2438 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2439 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2440 {
2441 if (dump_enabled_p ())
2442 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2443 if (!vect_can_advance_ivs_p (loop_vinfo)
2444 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2445 single_exit (LOOP_VINFO_LOOP
2446 (loop_vinfo))))
2447 {
2448 ok = opt_result::failure_at (vect_location,
2449 "not vectorized: can't create required "
2450 "epilog loop\n");
2451 goto again;
2452 }
2453 }
2454
2455 /* During peeling, we need to check that the number of loop iterations is
2456 enough for both the peeled prolog loop and the vector loop. This check
2457 can be merged with the threshold check of loop versioning, so
2458 increase the threshold for this case if necessary.
2459
2460 If we are analyzing an epilogue we still want to check what its
2461 versioning threshold would be. If we decide to vectorize the epilogues we
2462 will want to use the lowest versioning threshold of all epilogues and main
2463 loop. This will enable us to enter a vectorized epilogue even when
2464 versioning the loop. We can't simply check whether the epilogue requires
2465 versioning though since we may have skipped some versioning checks when
2466 analyzing the epilogue. For instance, checks for alias versioning will be
2467 skipped when dealing with epilogues as we assume we already checked them
2468 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2469 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2470 {
2471 poly_uint64 niters_th = 0;
2472 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2473
2474 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2475 {
2476 /* Niters for peeled prolog loop. */
2477 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2478 {
2479 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2480 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2481 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2482 }
2483 else
2484 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2485 }
2486
2487 /* Niters for at least one iteration of vectorized loop. */
2488 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2489 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2490 /* One additional iteration because of peeling for gap. */
2491 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2492 niters_th += 1;
2493
2494 /* Use the same condition as vect_transform_loop to decide when to use
2495 the cost to determine a versioning threshold. */
2496 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2497 && ordered_p (th, niters_th))
2498 niters_th = ordered_max (poly_uint64 (th), niters_th);
2499
2500 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2501 }
2502
2503 gcc_assert (known_eq (vectorization_factor,
2504 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2505
2506 /* Ok to vectorize! */
2507 return opt_result::success ();
2508
2509 again:
2510 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2511 gcc_assert (!ok);
2512
2513 /* Try again with SLP forced off but if we didn't do any SLP there is
2514 no point in re-trying. */
2515 if (!slp)
2516 return ok;
2517
2518 /* If there are reduction chains re-trying will fail anyway. */
2519 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2520 return ok;
2521
2522 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2523 via interleaving or lane instructions. */
2524 slp_instance instance;
2525 slp_tree node;
2526 unsigned i, j;
2527 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2528 {
2529 stmt_vec_info vinfo;
2530 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2531 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2532 continue;
2533 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2534 unsigned int size = DR_GROUP_SIZE (vinfo);
2535 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2536 if (! vect_store_lanes_supported (vectype, size, false)
2537 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2538 && ! vect_grouped_store_supported (vectype, size))
2539 return opt_result::failure_at (vinfo->stmt,
2540 "unsupported grouped store\n");
2541 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2542 {
2543 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2544 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2545 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2546 size = DR_GROUP_SIZE (vinfo);
2547 vectype = STMT_VINFO_VECTYPE (vinfo);
2548 if (! vect_load_lanes_supported (vectype, size, false)
2549 && ! vect_grouped_load_supported (vectype, single_element_p,
2550 size))
2551 return opt_result::failure_at (vinfo->stmt,
2552 "unsupported grouped load\n");
2553 }
2554 }
2555
2556 if (dump_enabled_p ())
2557 dump_printf_loc (MSG_NOTE, vect_location,
2558 "re-trying with SLP disabled\n");
2559
2560 /* Roll back state appropriately. No SLP this time. */
2561 slp = false;
2562 /* Restore the vectorization factor to its value without SLP. */
2563 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2564 /* Free the SLP instances. */
2565 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2566 vect_free_slp_instance (instance);
2567 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2568 /* Reset SLP type to loop_vect on all stmts. */
2569 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2570 {
2571 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2572 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2573 !gsi_end_p (si); gsi_next (&si))
2574 {
2575 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2576 STMT_SLP_TYPE (stmt_info) = loop_vect;
2577 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2578 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2579 {
2580 /* vectorizable_reduction adjusts reduction stmt def-types,
2581 restore them to that of the PHI. */
2582 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2583 = STMT_VINFO_DEF_TYPE (stmt_info);
2584 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2585 (STMT_VINFO_REDUC_DEF (stmt_info)))
2586 = STMT_VINFO_DEF_TYPE (stmt_info);
2587 }
2588 }
2589 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2590 !gsi_end_p (si); gsi_next (&si))
2591 {
2592 if (is_gimple_debug (gsi_stmt (si)))
2593 continue;
2594 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2595 STMT_SLP_TYPE (stmt_info) = loop_vect;
2596 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2597 {
2598 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2599 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2600 STMT_SLP_TYPE (stmt_info) = loop_vect;
2601 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2602 !gsi_end_p (pi); gsi_next (&pi))
2603 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2604 = loop_vect;
2605 }
2606 }
2607 }
2608 /* Free optimized alias test DDRS. */
2609 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2610 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2611 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2612 /* Reset target cost data. */
2613 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2614 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2615 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2616 /* Reset accumulated rgroup information. */
2617 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2618 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2619 /* Reset assorted flags. */
2620 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2621 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2622 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2623 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2624 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2625 = saved_can_use_partial_vectors_p;
2626
2627 goto start_over;
2628 }
2629
2630 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2631 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2632 OLD_LOOP_VINFO is better unless something specifically indicates
2633 otherwise.
2634
2635 Note that this deliberately isn't a partial order. */
2636
2637 static bool
2638 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2639 loop_vec_info old_loop_vinfo)
2640 {
2641 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2642 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2643
2644 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2645 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2646
2647 /* Always prefer a VF of loop->simdlen over any other VF. */
2648 if (loop->simdlen)
2649 {
2650 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2651 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2652 if (new_simdlen_p != old_simdlen_p)
2653 return new_simdlen_p;
2654 }
2655
2656 /* Limit the VFs to what is likely to be the maximum number of iterations,
2657 to handle cases in which at least one loop_vinfo is fully-masked. */
2658 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2659 if (estimated_max_niter != -1)
2660 {
2661 if (known_le (estimated_max_niter, new_vf))
2662 new_vf = estimated_max_niter;
2663 if (known_le (estimated_max_niter, old_vf))
2664 old_vf = estimated_max_niter;
2665 }
2666
2667 /* Check whether the (fractional) cost per scalar iteration is lower
2668 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2669 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2670 * poly_widest_int (old_vf));
2671 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2672 * poly_widest_int (new_vf));
2673 if (maybe_lt (rel_old, rel_new))
2674 {
2675 /* When old_loop_vinfo uses a variable vectorization factor,
2676 we know that it has a lower cost for at least one runtime VF.
2677 However, we don't know how likely that VF is.
2678
2679 One option would be to compare the costs for the estimated VFs.
2680 The problem is that that can put too much pressure on the cost
2681 model. E.g. if the estimated VF is also the lowest possible VF,
2682 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2683 for the estimated VF, we'd then choose new_loop_vinfo even
2684 though (a) new_loop_vinfo might not actually be better than
2685 old_loop_vinfo for that VF and (b) it would be significantly
2686 worse at larger VFs.
2687
2688 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2689 no more expensive than old_loop_vinfo even after doubling the
2690 estimated old_loop_vinfo VF. For all but trivial loops, this
2691 ensures that we only pick new_loop_vinfo if it is significantly
2692 better than old_loop_vinfo at the estimated VF. */
2693 if (rel_new.is_constant ())
2694 return false;
2695
2696 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2697 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2698 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2699 * widest_int (old_estimated_vf));
2700 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2701 * widest_int (new_estimated_vf));
2702 return estimated_rel_new * 2 <= estimated_rel_old;
2703 }
2704 if (known_lt (rel_new, rel_old))
2705 return true;
2706
2707 /* If there's nothing to choose between the loop bodies, see whether
2708 there's a difference in the prologue and epilogue costs. */
2709 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2710 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2711
2712 return false;
2713 }
2714
2715 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2716 true if we should. */
2717
2718 static bool
2719 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2720 loop_vec_info old_loop_vinfo)
2721 {
2722 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2723 return false;
2724
2725 if (dump_enabled_p ())
2726 dump_printf_loc (MSG_NOTE, vect_location,
2727 "***** Preferring vector mode %s to vector mode %s\n",
2728 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2729 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2730 return true;
2731 }
2732
2733 /* Function vect_analyze_loop.
2734
2735 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2736 for it. The different analyses will record information in the
2737 loop_vec_info struct. */
2738 opt_loop_vec_info
2739 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2740 {
2741 auto_vector_modes vector_modes;
2742
2743 /* Autodetect first vector size we try. */
2744 unsigned int autovec_flags
2745 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2746 loop->simdlen != 0);
2747 unsigned int mode_i = 0;
2748
2749 DUMP_VECT_SCOPE ("analyze_loop_nest");
2750
2751 if (loop_outer (loop)
2752 && loop_vec_info_for_loop (loop_outer (loop))
2753 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2754 return opt_loop_vec_info::failure_at (vect_location,
2755 "outer-loop already vectorized.\n");
2756
2757 if (!find_loop_nest (loop, &shared->loop_nest))
2758 return opt_loop_vec_info::failure_at
2759 (vect_location,
2760 "not vectorized: loop nest containing two or more consecutive inner"
2761 " loops cannot be vectorized\n");
2762
2763 unsigned n_stmts = 0;
2764 machine_mode autodetected_vector_mode = VOIDmode;
2765 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2766 machine_mode next_vector_mode = VOIDmode;
2767 poly_uint64 lowest_th = 0;
2768 unsigned vectorized_loops = 0;
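/* Whether to compare the costs of all vector modes that succeed and keep
the cheapest one, rather than committing to the first mode that can be
vectorized. */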
2769 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2770 && !unlimited_cost_model (loop));
2771
2772 bool vect_epilogues = false;
2773 opt_result res = opt_result::success ();
2774 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2775 while (1)
2776 {
2777 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2778 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2779 if (!loop_vinfo)
2780 {
2781 if (dump_enabled_p ())
2782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2783 "bad loop form.\n");
2784 gcc_checking_assert (first_loop_vinfo == NULL);
2785 return loop_vinfo;
2786 }
2787 loop_vinfo->vector_mode = next_vector_mode;
2788
2789 bool fatal = false;
2790
2791 /* When pick_lowest_cost_p is true, we should in principle iterate
2792 over all the loop_vec_infos that LOOP_VINFO could replace and
2793 try to vectorize LOOP_VINFO under the same conditions.
2794 E.g. when trying to replace an epilogue loop, we should vectorize
2795 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2796 to replace the main loop, we should vectorize LOOP_VINFO as a main
2797 loop too.
2798
2799 However, autovectorize_vector_modes is usually sorted as follows:
2800
2801 - Modes that naturally produce lower VFs usually follow modes that
2802 naturally produce higher VFs.
2803
2804 - When modes naturally produce the same VF, maskable modes
2805 usually follow unmaskable ones, so that the maskable mode
2806 can be used to vectorize the epilogue of the unmaskable mode.
2807
2808 This order is preferred because it leads to the maximum
2809 epilogue vectorization opportunities. Targets should only use
2810 a different order if they want to make wide modes available while
2811 disparaging them relative to earlier, smaller modes. The assumption
2812 in that case is that the wider modes are more expensive in some
2813 way that isn't reflected directly in the costs.
2814
2815 There should therefore be few interesting cases in which
2816 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2817 treated as a standalone loop, and ends up being genuinely cheaper
2818 than FIRST_LOOP_VINFO. */
2819 if (vect_epilogues)
2820 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2821
2822 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2823 if (mode_i == 0)
2824 autodetected_vector_mode = loop_vinfo->vector_mode;
2825 if (dump_enabled_p ())
2826 {
2827 if (res)
2828 dump_printf_loc (MSG_NOTE, vect_location,
2829 "***** Analysis succeeded with vector mode %s\n",
2830 GET_MODE_NAME (loop_vinfo->vector_mode));
2831 else
2832 dump_printf_loc (MSG_NOTE, vect_location,
2833 "***** Analysis failed with vector mode %s\n",
2834 GET_MODE_NAME (loop_vinfo->vector_mode));
2835 }
2836
2837 loop->aux = NULL;
2838
2839 if (!fatal)
2840 while (mode_i < vector_modes.length ()
2841 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2842 {
2843 if (dump_enabled_p ())
2844 dump_printf_loc (MSG_NOTE, vect_location,
2845 "***** The result for vector mode %s would"
2846 " be the same\n",
2847 GET_MODE_NAME (vector_modes[mode_i]));
2848 mode_i += 1;
2849 }
2850
2851 if (res)
2852 {
2853 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2854 vectorized_loops++;
2855
2856 /* Once we hit the desired simdlen for the first time,
2857 discard any previous attempts. */
2858 if (simdlen
2859 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2860 {
2861 delete first_loop_vinfo;
2862 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2863 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2864 simdlen = 0;
2865 }
2866 else if (pick_lowest_cost_p && first_loop_vinfo)
2867 {
2868 /* Keep trying to roll back vectorization attempts while the
2869 loop_vec_infos they produced were worse than this one. */
2870 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2871 while (!vinfos.is_empty ()
2872 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2873 {
2874 gcc_assert (vect_epilogues);
2875 delete vinfos.pop ();
2876 }
2877 if (vinfos.is_empty ()
2878 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2879 {
2880 delete first_loop_vinfo;
2881 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2882 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2883 }
2884 }
2885
2886 if (first_loop_vinfo == NULL)
2887 {
2888 first_loop_vinfo = loop_vinfo;
2889 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2890 }
2891 else if (vect_epilogues
2892 /* For now only allow one epilogue loop. */
2893 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2894 {
2895 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2896 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2897 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2898 || maybe_ne (lowest_th, 0U));
2899 /* Keep track of the known smallest versioning
2900 threshold. */
2901 if (ordered_p (lowest_th, th))
2902 lowest_th = ordered_min (lowest_th, th);
2903 }
2904 else
2905 {
2906 delete loop_vinfo;
2907 loop_vinfo = opt_loop_vec_info::success (NULL);
2908 }
2909
2910 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2911 enabled, SIMDUID is not set, it is the innermost loop and we have
2912 either already found the loop's SIMDLEN or there was no SIMDLEN to
2913 begin with.
2914 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2915 vect_epilogues = (!simdlen
2916 && loop->inner == NULL
2917 && param_vect_epilogues_nomask
2918 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2919 && !loop->simduid
2920 /* For now only allow one epilogue loop, but allow
2921 pick_lowest_cost_p to replace it. */
2922 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2923 || pick_lowest_cost_p));
2924
2925 /* Commit to first_loop_vinfo if we have no reason to try
2926 alternatives. */
2927 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2928 break;
2929 }
2930 else
2931 {
2932 delete loop_vinfo;
2933 loop_vinfo = opt_loop_vec_info::success (NULL);
2934 if (fatal)
2935 {
2936 gcc_checking_assert (first_loop_vinfo == NULL);
2937 break;
2938 }
2939 }
2940
2941 /* Handle the case where the original loop can use partial
2942 vectorization, but we only want to adopt it for the epilogue.
2943 The retry should be in the same vector mode as the original. */
2944 if (vect_epilogues
2945 && loop_vinfo
2946 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
2947 {
2948 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2949 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
2950 if (dump_enabled_p ())
2951 dump_printf_loc (MSG_NOTE, vect_location,
2952 "***** Re-trying analysis with same vector mode"
2953 " %s for epilogue with partial vectors.\n",
2954 GET_MODE_NAME (loop_vinfo->vector_mode));
2955 continue;
2956 }
2957
2958 if (mode_i < vector_modes.length ()
2959 && VECTOR_MODE_P (autodetected_vector_mode)
2960 && (related_vector_mode (vector_modes[mode_i],
2961 GET_MODE_INNER (autodetected_vector_mode))
2962 == autodetected_vector_mode)
2963 && (related_vector_mode (autodetected_vector_mode,
2964 GET_MODE_INNER (vector_modes[mode_i]))
2965 == vector_modes[mode_i]))
2966 {
2967 if (dump_enabled_p ())
2968 dump_printf_loc (MSG_NOTE, vect_location,
2969 "***** Skipping vector mode %s, which would"
2970 " repeat the analysis for %s\n",
2971 GET_MODE_NAME (vector_modes[mode_i]),
2972 GET_MODE_NAME (autodetected_vector_mode));
2973 mode_i += 1;
2974 }
2975
2976 if (mode_i == vector_modes.length ()
2977 || autodetected_vector_mode == VOIDmode)
2978 break;
2979
2980 /* Try the next biggest vector size. */
2981 next_vector_mode = vector_modes[mode_i++];
2982 if (dump_enabled_p ())
2983 dump_printf_loc (MSG_NOTE, vect_location,
2984 "***** Re-trying analysis with vector mode %s\n",
2985 GET_MODE_NAME (next_vector_mode));
2986 }
2987
2988 if (first_loop_vinfo)
2989 {
2990 loop->aux = (loop_vec_info) first_loop_vinfo;
2991 if (dump_enabled_p ())
2992 dump_printf_loc (MSG_NOTE, vect_location,
2993 "***** Choosing vector mode %s\n",
2994 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2995 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2996 return first_loop_vinfo;
2997 }
2998
2999 return opt_loop_vec_info::propagate_failure (res);
3000 }
3001
3002 /* Return true if there is an in-order reduction function for CODE, storing
3003 it in *REDUC_FN if so. */
3004
3005 static bool
3006 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3007 {
3008 switch (code)
3009 {
3010 case PLUS_EXPR:
3011 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3012 return true;
3013
3014 default:
3015 return false;
3016 }
3017 }
3018
3019 /* Function reduction_fn_for_scalar_code
3020
3021 Input:
3022 CODE - the tree_code of a reduction operation.
3023
3024 Output:
3025 REDUC_FN - the corresponding internal function to be used to reduce the
3026 vector of partial results into a single scalar result, or IFN_LAST
3027 if the operation is a supported reduction operation, but does not have
3028 such an internal function.
3029
3030 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3031
3032 static bool
3033 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3034 {
3035 switch (code)
3036 {
3037 case MAX_EXPR:
3038 *reduc_fn = IFN_REDUC_MAX;
3039 return true;
3040
3041 case MIN_EXPR:
3042 *reduc_fn = IFN_REDUC_MIN;
3043 return true;
3044
3045 case PLUS_EXPR:
3046 *reduc_fn = IFN_REDUC_PLUS;
3047 return true;
3048
3049 case BIT_AND_EXPR:
3050 *reduc_fn = IFN_REDUC_AND;
3051 return true;
3052
3053 case BIT_IOR_EXPR:
3054 *reduc_fn = IFN_REDUC_IOR;
3055 return true;
3056
3057 case BIT_XOR_EXPR:
3058 *reduc_fn = IFN_REDUC_XOR;
3059 return true;
3060
3061 case MULT_EXPR:
3062 case MINUS_EXPR:
3063 *reduc_fn = IFN_LAST;
3064 return true;
3065
3066 default:
3067 return false;
3068 }
3069 }
3070
3071 /* If there is a neutral value X such that the SLP reduction SLP_NODE would not
3072 be affected by the introduction of additional X elements, return that X,
3073 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
3074 is the vector type that would hold element X. REDUC_CHAIN is true if
3075 the SLP statements perform a single reduction, false if each statement
3076 performs an independent reduction. */
3077
3078 static tree
3079 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
3080 tree_code code, bool reduc_chain)
3081 {
3082 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3083 stmt_vec_info stmt_vinfo = stmts[0];
3084 tree scalar_type = TREE_TYPE (vector_type);
3085 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
3086 gcc_assert (loop);
3087
3088 switch (code)
3089 {
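/* For these codes the accumulation is additive (or OR/XOR-like), so
extra zero elements leave the result unchanged. */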
3090 case WIDEN_SUM_EXPR:
3091 case DOT_PROD_EXPR:
3092 case SAD_EXPR:
3093 case PLUS_EXPR:
3094 case MINUS_EXPR:
3095 case BIT_IOR_EXPR:
3096 case BIT_XOR_EXPR:
3097 return build_zero_cst (scalar_type);
3098
3099 case MULT_EXPR:
3100 return build_one_cst (scalar_type);
3101
3102 case BIT_AND_EXPR:
3103 return build_all_ones_cst (scalar_type);
3104
3105 case MAX_EXPR:
3106 case MIN_EXPR:
3107 /* For MIN/MAX the initial values are neutral. A reduction chain
3108 has only a single initial value, so that value is neutral for
3109 all statements. */
3110 if (reduc_chain)
3111 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3112 loop_preheader_edge (loop));
3113 return NULL_TREE;
3114
3115 default:
3116 return NULL_TREE;
3117 }
3118 }
3119
3120 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3121 STMT is printed with a message MSG. */
3122
3123 static void
3124 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3125 {
3126 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3127 }
3128
3129 /* Return true if we need an in-order reduction for operation CODE
3130 on type TYPE. */
3132
3133 bool
3134 needs_fold_left_reduction_p (tree type, tree_code code)
3135 {
3136 /* CHECKME: check for !flag_finite_math_only too? */
3137 if (SCALAR_FLOAT_TYPE_P (type))
3138 switch (code)
3139 {
3140 case MIN_EXPR:
3141 case MAX_EXPR:
3142 return false;
3143
3144 default:
3145 return !flag_associative_math;
3146 }
3147
3148 if (INTEGRAL_TYPE_P (type))
3149 {
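/* Reassociating the reduction could introduce a trapping overflow, so an
in-order reduction is needed when the operation can trap on overflow. */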
3150 if (!operation_no_trapping_overflow (type, code))
3151 return true;
3152 return false;
3153 }
3154
3155 if (SAT_FIXED_POINT_TYPE_P (type))
3156 return true;
3157
3158 return false;
3159 }
3160
3161 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3162 has a handled computation expression. Store the main reduction
3163 operation in *CODE. */
3164
3165 static bool
3166 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3167 tree loop_arg, enum tree_code *code,
3168 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3169 {
3170 auto_bitmap visited;
3171 tree lookfor = PHI_RESULT (phi);
3172 ssa_op_iter curri;
3173 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3174 while (USE_FROM_PTR (curr) != loop_arg)
3175 curr = op_iter_next_use (&curri);
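/* The iterator of the initial PHI entry is exhausted up front so that
backtracking to it ends the walk instead of exploring the other PHI
arguments. */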
3176 curri.i = curri.numops;
3177 do
3178 {
3179 path.safe_push (std::make_pair (curri, curr));
3180 tree use = USE_FROM_PTR (curr);
3181 if (use == lookfor)
3182 break;
3183 gimple *def = SSA_NAME_DEF_STMT (use);
3184 if (gimple_nop_p (def)
3185 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3186 {
3187 pop:
3188 do
3189 {
3190 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3191 curri = x.first;
3192 curr = x.second;
3193 do
3194 curr = op_iter_next_use (&curri);
3195 /* Skip already visited or non-SSA operands (from iterating
3196 over PHI args). */
3197 while (curr != NULL_USE_OPERAND_P
3198 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3199 || ! bitmap_set_bit (visited,
3200 SSA_NAME_VERSION
3201 (USE_FROM_PTR (curr)))));
3202 }
3203 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3204 if (curr == NULL_USE_OPERAND_P)
3205 break;
3206 }
3207 else
3208 {
3209 if (gimple_code (def) == GIMPLE_PHI)
3210 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3211 else
3212 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3213 while (curr != NULL_USE_OPERAND_P
3214 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3215 || ! bitmap_set_bit (visited,
3216 SSA_NAME_VERSION
3217 (USE_FROM_PTR (curr)))))
3218 curr = op_iter_next_use (&curri);
3219 if (curr == NULL_USE_OPERAND_P)
3220 goto pop;
3221 }
3222 }
3223 while (1);
3224 if (dump_file && (dump_flags & TDF_DETAILS))
3225 {
3226 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3227 unsigned i;
3228 std::pair<ssa_op_iter, use_operand_p> *x;
3229 FOR_EACH_VEC_ELT (path, i, x)
3230 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3231 dump_printf (MSG_NOTE, "\n");
3232 }
3233
3234 /* Check whether the reduction path detected is valid. */
3235 bool fail = path.length () == 0;
3236 bool neg = false;
3237 int sign = -1;
3238 *code = ERROR_MARK;
3239 for (unsigned i = 1; i < path.length (); ++i)
3240 {
3241 gimple *use_stmt = USE_STMT (path[i].second);
3242 tree op = USE_FROM_PTR (path[i].second);
3243 if (! is_gimple_assign (use_stmt)
3244 /* The following makes sure we can compute the operand index
3245 easily, and it mostly disallows chaining via COND_EXPR condition
3246 operands. */
3247 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3248 && (gimple_num_ops (use_stmt) <= 2
3249 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3250 && (gimple_num_ops (use_stmt) <= 3
3251 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3252 {
3253 fail = true;
3254 break;
3255 }
3256 /* Check that the op is used in only a single stmt inside
3257 the loop. */
3258 imm_use_iterator imm_iter;
3259 gimple *op_use_stmt;
3260 unsigned cnt = 0;
3261 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3262 if (!is_gimple_debug (op_use_stmt)
3263 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
3264 {
3265 /* We want to allow x + x but not x < 1 ? x : 2. */
3266 if (is_gimple_assign (op_use_stmt)
3267 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3268 {
3269 use_operand_p use_p;
3270 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3271 cnt++;
3272 }
3273 else
3274 cnt++;
3275 }
3276 if (cnt != 1)
3277 {
3278 fail = true;
3279 break;
3280 }
3281 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3282 if (use_code == MINUS_EXPR)
3283 {
3284 use_code = PLUS_EXPR;
3285 /* Track whether we negate the reduction value each iteration. */
3286 if (gimple_assign_rhs2 (use_stmt) == op)
3287 neg = ! neg;
3288 }
3289 if (CONVERT_EXPR_CODE_P (use_code)
3290 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3291 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3292 ;
3293 else if (*code == ERROR_MARK)
3294 {
3295 *code = use_code;
3296 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3297 }
3298 else if (use_code != *code)
3299 {
3300 fail = true;
3301 break;
3302 }
3303 else if ((use_code == MIN_EXPR
3304 || use_code == MAX_EXPR)
3305 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3306 {
3307 fail = true;
3308 break;
3309 }
3310 }
3311 return ! fail && ! neg && *code != ERROR_MARK;
3312 }
3313
3314 bool
3315 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3316 tree loop_arg, enum tree_code code)
3317 {
3318 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3319 enum tree_code code_;
3320 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3321 && code_ == code);
3322 }
3323
3324
3325
3326 /* Function vect_is_simple_reduction
3327
3328 (1) Detect a cross-iteration def-use cycle that represents a simple
3329 reduction computation. We look for the following pattern:
3330
3331 loop_header:
3332 a1 = phi < a0, a2 >
3333 a3 = ...
3334 a2 = operation (a3, a1)
3335
3336 or
3337
3338 a3 = ...
3339 loop_header:
3340 a1 = phi < a0, a2 >
3341 a2 = operation (a3, a1)
3342
3343 such that:
3344 1. operation is commutative and associative and it is safe to
3345 change the order of the computation
3346 2. no uses for a2 in the loop (a2 is used out of the loop)
3347 3. no uses of a1 in the loop besides the reduction operation
3348 4. no uses of a1 outside the loop.
3349
3350 Conditions 1,4 are tested here.
3351 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3352
3353 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3354 nested cycles.
3355
3356 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3357 reductions:
3358
3359 a1 = phi < a0, a2 >
3360 inner loop (def of a3)
3361 a2 = phi < a3 >
3362
3363 (4) Detect condition expressions, i.e.:
3364 for (int i = 0; i < N; i++)
3365 if (a[i] < val)
3366 ret_val = a[i];
3367
3368 */
3369
3370 static stmt_vec_info
3371 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3372 bool *double_reduc, bool *reduc_chain_p)
3373 {
3374 gphi *phi = as_a <gphi *> (phi_info->stmt);
3375 gimple *phi_use_stmt = NULL;
3376 imm_use_iterator imm_iter;
3377 use_operand_p use_p;
3378
3379 *double_reduc = false;
3380 *reduc_chain_p = false;
3381 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3382
3383 tree phi_name = PHI_RESULT (phi);
3384 /* ??? If there are no uses of the PHI result the inner loop reduction
3385 won't be detected as possibly double-reduction by vectorizable_reduction
3386 because that tries to walk the PHI arg from the preheader edge which
3387 can be constant. See PR60382. */
3388 if (has_zero_uses (phi_name))
3389 return NULL;
3390 class loop *loop = (gimple_bb (phi))->loop_father;
3391 unsigned nphi_def_loop_uses = 0;
3392 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3393 {
3394 gimple *use_stmt = USE_STMT (use_p);
3395 if (is_gimple_debug (use_stmt))
3396 continue;
3397
3398 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3399 {
3400 if (dump_enabled_p ())
3401 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3402 "intermediate value used outside loop.\n");
3403
3404 return NULL;
3405 }
3406
3407 nphi_def_loop_uses++;
3408 phi_use_stmt = use_stmt;
3409 }
3410
3411 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3412 if (TREE_CODE (latch_def) != SSA_NAME)
3413 {
3414 if (dump_enabled_p ())
3415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3416 "reduction: not ssa_name: %T\n", latch_def);
3417 return NULL;
3418 }
3419
3420 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3421 if (!def_stmt_info
3422 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3423 return NULL;
3424
3425 bool nested_in_vect_loop
3426 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3427 unsigned nlatch_def_loop_uses = 0;
3428 auto_vec<gphi *, 3> lcphis;
3429 bool inner_loop_of_double_reduc = false;
3430 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3431 {
3432 gimple *use_stmt = USE_STMT (use_p);
3433 if (is_gimple_debug (use_stmt))
3434 continue;
3435 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3436 nlatch_def_loop_uses++;
3437 else
3438 {
3439 /* We can have more than one loop-closed PHI. */
3440 lcphis.safe_push (as_a <gphi *> (use_stmt));
3441 if (nested_in_vect_loop
3442 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3443 == vect_double_reduction_def))
3444 inner_loop_of_double_reduc = true;
3445 }
3446 }
3447
3448 /* If we are vectorizing an inner reduction, we execute it in the
3449 original order only when we are not dealing with a double
3450 reduction. */
3451 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3452 {
3453 if (dump_enabled_p ())
3454 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3455 "detected nested cycle: ");
3456 return def_stmt_info;
3457 }
3458
3459 /* If this isn't a nested cycle, or if the nested cycle reduction value
3460 is used outside of the inner loop, we cannot handle uses of the
3461 reduction value. */
3462 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3463 {
3464 if (dump_enabled_p ())
3465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3466 "reduction used in loop.\n");
3467 return NULL;
3468 }
3469
3470 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3471 defined in the inner loop. */
3472 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3473 {
3474 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3475 if (gimple_phi_num_args (def_stmt) != 1
3476 || TREE_CODE (op1) != SSA_NAME)
3477 {
3478 if (dump_enabled_p ())
3479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3480 "unsupported phi node definition.\n");
3481
3482 return NULL;
3483 }
3484
3485 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3486 if (gimple_bb (def1)
3487 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3488 && loop->inner
3489 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3490 && is_gimple_assign (def1)
3491 && is_a <gphi *> (phi_use_stmt)
3492 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3493 {
3494 if (dump_enabled_p ())
3495 report_vect_op (MSG_NOTE, def_stmt,
3496 "detected double reduction: ");
3497
3498 *double_reduc = true;
3499 return def_stmt_info;
3500 }
3501
3502 return NULL;
3503 }
3504
3505 /* Look for the expression computing latch_def from the loop PHI result. */
3506 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3507 enum tree_code code;
3508 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3509 path))
3510 {
3511 STMT_VINFO_REDUC_CODE (phi_info) = code;
3512 if (code == COND_EXPR && !nested_in_vect_loop)
3513 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3514
3515 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3516 reduction chain for which the additional restriction is that
3517 all operations in the chain are the same. */
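 /* For example, a reduction chain of group size four looks like

      sum = sum + a[4*i + 0];
      sum = sum + a[4*i + 1];
      sum = sum + a[4*i + 2];
      sum = sum + a[4*i + 3];

    where each statement feeds the next and all statements use the same
    operation code. */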
3518 auto_vec<stmt_vec_info, 8> reduc_chain;
3519 unsigned i;
3520 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3521 for (i = path.length () - 1; i >= 1; --i)
3522 {
3523 gimple *stmt = USE_STMT (path[i].second);
3524 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3525 STMT_VINFO_REDUC_IDX (stmt_info)
3526 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3527 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3528 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3529 && (i == 1 || i == path.length () - 1));
3530 if ((stmt_code != code && !leading_conversion)
3531 /* We can only handle the final value in epilogue
3532 generation for reduction chains. */
3533 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3534 is_slp_reduc = false;
3535 /* For reduction chains we support trailing/leading
3536 conversions. We do not store those in the actual chain. */
3537 if (leading_conversion)
3538 continue;
3539 reduc_chain.safe_push (stmt_info);
3540 }
3541 if (is_slp_reduc && reduc_chain.length () > 1)
3542 {
3543 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3544 {
3545 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3546 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3547 }
3548 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3549 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3550
3551 /* Save the chain for further analysis in SLP detection. */
3552 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3553 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3554
3555 *reduc_chain_p = true;
3556 if (dump_enabled_p ())
3557 dump_printf_loc (MSG_NOTE, vect_location,
3558 "reduction: detected reduction chain\n");
3559 }
3560 else if (dump_enabled_p ())
3561 dump_printf_loc (MSG_NOTE, vect_location,
3562 "reduction: detected reduction\n");
3563
3564 return def_stmt_info;
3565 }
3566
3567 if (dump_enabled_p ())
3568 dump_printf_loc (MSG_NOTE, vect_location,
3569 "reduction: unknown pattern\n");
3570
3571 return NULL;
3572 }
3573
3574 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3575 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3576 or -1 if not known. */
3577
3578 static int
3579 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3580 {
3581 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3582 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3583 {
3584 if (dump_enabled_p ())
3585 dump_printf_loc (MSG_NOTE, vect_location,
3586 "cost model: epilogue peel iters set to vf/2 "
3587 "because loop iterations are unknown .\n");
3588 return assumed_vf / 2;
3589 }
3590 else
3591 {
3592 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3593 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3594 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
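 /* For example (illustrative values only): with NITERS == 103,
    PEEL_ITERS_PROLOGUE == 3 and ASSUMED_VF == 8 this computes
    (103 - 3) % 8 == 4 epilogue iterations. */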
3595 /* If we need to peel for gaps but no epilogue peeling would otherwise
3596 be required, we have to peel VF iterations. */
3597 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3598 peel_iters_epilogue = assumed_vf;
3599 return peel_iters_epilogue;
3600 }
3601 }
3602
3603 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3604 int
3605 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3606 int *peel_iters_epilogue,
3607 stmt_vector_for_cost *scalar_cost_vec,
3608 stmt_vector_for_cost *prologue_cost_vec,
3609 stmt_vector_for_cost *epilogue_cost_vec)
3610 {
3611 int retval = 0;
3612
3613 *peel_iters_epilogue
3614 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3615
3616 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3617 {
3618 /* If peeled iterations are known but the number of scalar loop
3619 iterations is unknown, count a taken branch per peeled loop. */
3620 if (peel_iters_prologue > 0)
3621 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3622 NULL, NULL_TREE, 0, vect_prologue);
3623 if (*peel_iters_epilogue > 0)
3624 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3625 NULL, NULL_TREE, 0, vect_epilogue);
3626 }
3627
3628 stmt_info_for_cost *si;
3629 int j;
3630 if (peel_iters_prologue)
3631 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3632 retval += record_stmt_cost (prologue_cost_vec,
3633 si->count * peel_iters_prologue,
3634 si->kind, si->stmt_info, si->misalign,
3635 vect_prologue);
3636 if (*peel_iters_epilogue)
3637 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3638 retval += record_stmt_cost (epilogue_cost_vec,
3639 si->count * *peel_iters_epilogue,
3640 si->kind, si->stmt_info, si->misalign,
3641 vect_epilogue);
3642
3643 return retval;
3644 }
3645
3646 /* Function vect_estimate_min_profitable_iters
3647
3648 Return the number of iterations required for the vector version of the
3649 loop to be profitable relative to the cost of the scalar version of the
3650 loop.
3651
3652 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3653 of iterations for vectorization. -1 value means loop vectorization
3654 is not profitable. This returned value may be used for dynamic
3655 profitability check.
3656
3657 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3658 for static check against estimated number of iterations. */
3659
3660 static void
3661 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3662 int *ret_min_profitable_niters,
3663 int *ret_min_profitable_estimate)
3664 {
3665 int min_profitable_iters;
3666 int min_profitable_estimate;
3667 int peel_iters_prologue;
3668 int peel_iters_epilogue;
3669 unsigned vec_inside_cost = 0;
3670 int vec_outside_cost = 0;
3671 unsigned vec_prologue_cost = 0;
3672 unsigned vec_epilogue_cost = 0;
3673 int scalar_single_iter_cost = 0;
3674 int scalar_outside_cost = 0;
3675 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3676 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3677 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3678
3679 /* Cost model disabled. */
3680 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3681 {
3682 if (dump_enabled_p ())
3683 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3684 *ret_min_profitable_niters = 0;
3685 *ret_min_profitable_estimate = 0;
3686 return;
3687 }
3688
3689 /* Requires loop versioning tests to handle misalignment. */
3690 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3691 {
3692 /* FIXME: Make cost depend on complexity of individual check. */
3693 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3694 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3695 NULL, NULL_TREE, 0, vect_prologue);
3696 if (dump_enabled_p ())
3697 dump_printf (MSG_NOTE,
3698 "cost model: Adding cost of checks for loop "
3699 "versioning to treat misalignment.\n");
3700 }
3701
3702 /* Requires loop versioning with alias checks. */
3703 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3704 {
3705 /* FIXME: Make cost depend on complexity of individual check. */
3706 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3707 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3708 NULL, NULL_TREE, 0, vect_prologue);
3709 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3710 if (len)
3711 /* Count LEN - 1 ANDs and LEN comparisons. */
3712 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3713 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3714 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3715 if (len)
3716 {
3717 /* Count LEN - 1 ANDs and LEN comparisons. */
3718 unsigned int nstmts = len * 2 - 1;
3719 /* +1 for each bias that needs adding. */
3720 for (unsigned int i = 0; i < len; ++i)
3721 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3722 nstmts += 1;
3723 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3724 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3725 }
3726 if (dump_enabled_p ())
3727 dump_printf (MSG_NOTE,
3728 "cost model: Adding cost of checks for loop "
3729 "versioning aliasing.\n");
3730 }
3731
3732 /* Requires loop versioning with niter checks. */
3733 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3734 {
3735 /* FIXME: Make cost depend on complexity of individual check. */
3736 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3737 NULL, NULL_TREE, 0, vect_prologue);
3738 if (dump_enabled_p ())
3739 dump_printf (MSG_NOTE,
3740 "cost model: Adding cost of checks for loop "
3741 "versioning niters.\n");
3742 }
3743
3744 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3745 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3746 NULL, NULL_TREE, 0, vect_prologue);
3747
3748 /* Count statements in scalar loop. Using this as scalar cost for a single
3749 iteration for now.
3750
3751 TODO: Add outer loop support.
3752
3753 TODO: Consider assigning different costs to different scalar
3754 statements. */
3755
3756 scalar_single_iter_cost
3757 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3758
3759 /* Add additional cost for the peeled instructions in prologue and epilogue
3760 loop. (For fully-masked loops there will be no peeling.)
3761
3762 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3763 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3764
3765 TODO: Build an expression that represents peel_iters for prologue and
3766 epilogue to be used in a run-time test. */
3767
3768 bool prologue_need_br_taken_cost = false;
3769 bool prologue_need_br_not_taken_cost = false;
3770
3771 /* Calculate peel_iters_prologue. */
3772 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3773 peel_iters_prologue = 0;
3774 else if (npeel < 0)
3775 {
3776 peel_iters_prologue = assumed_vf / 2;
3777 if (dump_enabled_p ())
3778 dump_printf (MSG_NOTE, "cost model: "
3779 "prologue peel iters set to vf/2.\n");
3780
3781 /* If peeled iterations are unknown, count a taken branch and a not taken
3782 branch per peeled loop. Even if scalar loop iterations are known,
3783 vector iterations are not known since peeled prologue iterations are
3784 not known. Hence guards remain the same. */
3785 prologue_need_br_taken_cost = true;
3786 prologue_need_br_not_taken_cost = true;
3787 }
3788 else
3789 {
3790 peel_iters_prologue = npeel;
3791 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3792 /* If peeled iterations are known but the number of scalar loop
3793 iterations is unknown, count a taken branch per peeled loop. */
3794 prologue_need_br_taken_cost = true;
3795 }
3796
3797 bool epilogue_need_br_taken_cost = false;
3798 bool epilogue_need_br_not_taken_cost = false;
3799
3800 /* Calculate peel_iters_epilogue. */
3801 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3802 /* We need to peel exactly one iteration for gaps. */
3803 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3804 else if (npeel < 0)
3805 {
3806 /* If peeling for alignment is unknown, the loop bound of the main
3807 loop becomes unknown. */
3808 peel_iters_epilogue = assumed_vf / 2;
3809 if (dump_enabled_p ())
3810 dump_printf (MSG_NOTE, "cost model: "
3811 "epilogue peel iters set to vf/2 because "
3812 "peeling for alignment is unknown.\n");
3813
3814 /* See the same reason above in peel_iters_prologue calculation. */
3815 epilogue_need_br_taken_cost = true;
3816 epilogue_need_br_not_taken_cost = true;
3817 }
3818 else
3819 {
3820 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3821 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
3822 /* If peeled iterations are known but the number of scalar loop
3823 iterations is unknown, count a taken branch per peeled loop. */
3824 epilogue_need_br_taken_cost = true;
3825 }
3826
3827 stmt_info_for_cost *si;
3828 int j;
3829 /* Add costs associated with peel_iters_prologue. */
3830 if (peel_iters_prologue)
3831 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3832 {
3833 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3834 si->count * peel_iters_prologue, si->kind,
3835 si->stmt_info, si->vectype, si->misalign,
3836 vect_prologue);
3837 }
3838
3839 /* Add costs associated with peel_iters_epilogue. */
3840 if (peel_iters_epilogue)
3841 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3842 {
3843 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3844 si->count * peel_iters_epilogue, si->kind,
3845 si->stmt_info, si->vectype, si->misalign,
3846 vect_epilogue);
3847 }
3848
3849 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
3850
3851 if (prologue_need_br_taken_cost)
3852 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3853 NULL, NULL_TREE, 0, vect_prologue);
3854
3855 if (prologue_need_br_not_taken_cost)
3856 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3857 cond_branch_not_taken, NULL, NULL_TREE, 0,
3858 vect_prologue);
3859
3860 if (epilogue_need_br_taken_cost)
3861 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3862 NULL, NULL_TREE, 0, vect_epilogue);
3863
3864 if (epilogue_need_br_not_taken_cost)
3865 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
3866 cond_branch_not_taken, NULL, NULL_TREE, 0,
3867 vect_epilogue);
3868
3869 /* Take care of special costs for rgroup controls of partial vectors. */
3870 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3871 {
3872 /* Calculate how many masks we need to generate. */
3873 unsigned int num_masks = 0;
3874 rgroup_controls *rgm;
3875 unsigned int num_vectors_m1;
3876 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3877 if (rgm->type)
3878 num_masks += num_vectors_m1 + 1;
3879 gcc_assert (num_masks > 0);
3880
3881 /* In the worst case, we need to generate each mask in the prologue
3882 and in the loop body. One of the loop body mask instructions
3883 replaces the comparison in the scalar loop, and since we don't
3884 count the scalar comparison against the scalar body, we shouldn't
3885 count that vector instruction against the vector body either.
3886
3887 Sometimes we can use unpacks instead of generating prologue
3888 masks and sometimes the prologue mask will fold to a constant,
3889 so the actual prologue cost might be smaller. However, it's
3890 simpler and safer to use the worst-case cost; if this ends up
3891 being the tie-breaker between vectorizing or not, then it's
3892 probably better not to vectorize. */
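 /* Worked example with made-up numbers: if the mask rgroups in use need
    one and two vectors per iteration respectively, NUM_MASKS is
    1 + 2 == 3; we then cost three mask computations in the prologue and
    two in the body, the third body mask standing in for the removed
    scalar comparison. */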
3893 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
3894 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
3895 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
3896 vector_stmt, NULL, NULL_TREE, 0, vect_body);
3897 }
3898 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
3899 {
3900 /* Referring to the functions vect_set_loop_condition_partial_vectors
3901 and vect_set_loop_controls_directly, we need to generate each
3902 length in the prologue and in the loop body if required. Although
3903 there are some possible optimizations, we consider the worst case
3904 here. */
3905
3906 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
3907 bool need_iterate_p
3908 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3909 && !vect_known_niters_smaller_than_vf (loop_vinfo));
3910
3911 /* Calculate how many statements to be added. */
3912 unsigned int prologue_stmts = 0;
3913 unsigned int body_stmts = 0;
3914
3915 rgroup_controls *rgc;
3916 unsigned int num_vectors_m1;
3917 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
3918 if (rgc->type)
3919 {
3920 /* May need one SHIFT for nitems_total computation. */
3921 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
3922 if (nitems != 1 && !niters_known_p)
3923 prologue_stmts += 1;
3924
3925 /* May need one MAX and one MINUS for wrap around. */
3926 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
3927 prologue_stmts += 2;
3928
3929 /* Need one MAX and one MINUS for each batch limit except for
3930 the first one. */
3931 prologue_stmts += num_vectors_m1 * 2;
3932
3933 unsigned int num_vectors = num_vectors_m1 + 1;
3934
3935 /* Need to set up lengths in prologue, only one MIN required
3936 for each since start index is zero. */
3937 prologue_stmts += num_vectors;
3938
3939 /* Each may need two MINs and one MINUS to update lengths in the body
3940 for the next iteration. */
3941 if (need_iterate_p)
3942 body_stmts += 3 * num_vectors;
3943 }
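 /* Worked example with made-up numbers: a single rgroup (NUM_VECTORS_M1
    == 0) with more than one scalar per iteration, unknown niters and a
    possibly wrapping IV contributes 1 (SHIFT) + 2 (MAX/MINUS) + 0 + 1
    (MIN) == 4 prologue statements and, when we need to iterate,
    3 * 1 == 3 body statements. */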
3944
3945 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
3946 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3947 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
3948 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
3949 }
3950
3951 /* FORNOW: The scalar outside cost is incremented in one of the
3952 following ways:
3953
3954 1. The vectorizer checks for alignment and aliasing and generates
3955 a condition that allows dynamic vectorization. A cost model
3956 check is ANDED with the versioning condition. Hence scalar code
3957 path now has the added cost of the versioning check.
3958
3959 if (cost > th & versioning_check)
3960 jmp to vector code
3961
3962 Hence run-time scalar is incremented by not-taken branch cost.
3963
3964 2. The vectorizer then checks if a prologue is required. If the
3965 cost model check was not done before during versioning, it has to
3966 be done before the prologue check.
3967
3968 if (cost <= th)
3969 prologue = scalar_iters
3970 if (prologue == 0)
3971 jmp to vector code
3972 else
3973 execute prologue
3974 if (prologue == num_iters)
3975 go to exit
3976
3977 Hence the run-time scalar cost is incremented by a taken branch,
3978 plus a not-taken branch, plus a taken branch cost.
3979
3980 3. The vectorizer then checks if an epilogue is required. If the
3981 cost model check was not done before during prologue check, it
3982 has to be done with the epilogue check.
3983
3984 if (prologue == 0)
3985 jmp to vector code
3986 else
3987 execute prologue
3988 if (prologue == num_iters)
3989 go to exit
3990 vector code:
3991 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3992 jmp to epilogue
3993
3994 Hence the run-time scalar cost should be incremented by 2 taken
3995 branches.
3996
3997 TODO: The back end may reorder the BBs differently and reverse
3998 conditions/branch directions. Change the estimates below to
3999 something more reasonable. */
4000
4001 /* If the number of iterations is known and we do not do versioning, we can
4002 decide whether to vectorize at compile time. Hence the scalar version
4003 does not carry cost model guard costs. */
4004 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4005 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4006 {
4007 /* Cost model check occurs at versioning. */
4008 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4009 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4010 else
4011 {
4012 /* Cost model check occurs at prologue generation. */
4013 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4014 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4015 + vect_get_stmt_cost (cond_branch_not_taken);
4016 /* Cost model check occurs at epilogue generation. */
4017 else
4018 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4019 }
4020 }
4021
4022 /* Complete the target-specific cost calculations. */
4023 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4024 &vec_inside_cost, &vec_epilogue_cost);
4025
4026 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4027
4028 /* Stash the costs so that we can compare two loop_vec_infos. */
4029 loop_vinfo->vec_inside_cost = vec_inside_cost;
4030 loop_vinfo->vec_outside_cost = vec_outside_cost;
4031
4032 if (dump_enabled_p ())
4033 {
4034 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4035 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4036 vec_inside_cost);
4037 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4038 vec_prologue_cost);
4039 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4040 vec_epilogue_cost);
4041 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4042 scalar_single_iter_cost);
4043 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4044 scalar_outside_cost);
4045 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4046 vec_outside_cost);
4047 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4048 peel_iters_prologue);
4049 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4050 peel_iters_epilogue);
4051 }
4052
4053 /* Calculate number of iterations required to make the vector version
4054 profitable, relative to the loop bodies only. The following condition
4055 must hold true:
4056 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4057 where
4058 SIC = scalar iteration cost, VIC = vector iteration cost,
4059 VOC = vector outside cost, VF = vectorization factor,
4060 NPEEL = prologue iterations + epilogue iterations,
4061 SOC = scalar outside cost for run time cost model check. */
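 /* As an illustration with made-up numbers: SIC = 4, VIC = 8, VF = 4,
    VOC = 20, SOC = 6 and NPEEL = 0 give the condition
      4 * niters + 6 > 8 * (niters / 4) + 20
    which simplifies to 2 * niters > 14, i.e. the vector version starts
    to win at niters == 8. */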
4062
4063 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4064 - vec_inside_cost);
4065 if (saving_per_viter <= 0)
4066 {
4067 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4068 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4069 "vectorization did not happen for a simd loop");
4070
4071 if (dump_enabled_p ())
4072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4073 "cost model: the vector iteration cost = %d "
4074 "divided by the scalar iteration cost = %d "
4075 "is greater or equal to the vectorization factor = %d"
4076 ".\n",
4077 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4078 *ret_min_profitable_niters = -1;
4079 *ret_min_profitable_estimate = -1;
4080 return;
4081 }
4082
4083 /* ??? The "if" arm is written to handle all cases; see below for what
4084 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4085 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4086 {
4087 /* Rewriting the condition above in terms of the number of
4088 vector iterations (vniters) rather than the number of
4089 scalar iterations (niters) gives:
4090
4091 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4092
4093 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4094
4095 For integer N, X and Y when X > 0:
4096
4097 N * X > Y <==> N >= (Y /[floor] X) + 1. */
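 /* E.g. with made-up values X = 3 and Y = 10: N * 3 > 10 holds exactly
    when N >= 10/3 + 1 == 4. */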
4098 int outside_overhead = (vec_outside_cost
4099 - scalar_single_iter_cost * peel_iters_prologue
4100 - scalar_single_iter_cost * peel_iters_epilogue
4101 - scalar_outside_cost);
4102 /* We're only interested in cases that require at least one
4103 vector iteration. */
4104 int min_vec_niters = 1;
4105 if (outside_overhead > 0)
4106 min_vec_niters = outside_overhead / saving_per_viter + 1;
4107
4108 if (dump_enabled_p ())
4109 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4110 min_vec_niters);
4111
4112 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4113 {
4114 /* Now that we know the minimum number of vector iterations,
4115 find the minimum niters for which the scalar cost is larger:
4116
4117 SIC * niters > VIC * vniters + VOC - SOC
4118
4119 We know that the minimum niters is no more than
4120 vniters * VF + NPEEL, but it might be (and often is) less
4121 than that if a partial vector iteration is cheaper than the
4122 equivalent scalar code. */
4123 int threshold = (vec_inside_cost * min_vec_niters
4124 + vec_outside_cost
4125 - scalar_outside_cost);
4126 if (threshold <= 0)
4127 min_profitable_iters = 1;
4128 else
4129 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4130 }
4131 else
4132 /* Convert the number of vector iterations into a number of
4133 scalar iterations. */
4134 min_profitable_iters = (min_vec_niters * assumed_vf
4135 + peel_iters_prologue
4136 + peel_iters_epilogue);
4137 }
4138 else
4139 {
4140 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4141 * assumed_vf
4142 - vec_inside_cost * peel_iters_prologue
4143 - vec_inside_cost * peel_iters_epilogue);
4144 if (min_profitable_iters <= 0)
4145 min_profitable_iters = 0;
4146 else
4147 {
4148 min_profitable_iters /= saving_per_viter;
4149
4150 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4151 <= (((int) vec_inside_cost * min_profitable_iters)
4152 + (((int) vec_outside_cost - scalar_outside_cost)
4153 * assumed_vf)))
4154 min_profitable_iters++;
4155 }
4156 }
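 /* Worked example with made-up numbers: SIC = 4, VIC = 8, VF = 4,
    VOC = 24, SOC = 0 and no peeling give saving_per_viter == 8 and an
    initial min_profitable_iters of 24 * 4 / 8 == 12; the check above
    then sees 4 * 4 * 12 == 192 <= 8 * 12 + 24 * 4 == 192 and bumps the
    result to 13 so that the vector version strictly wins. */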
4157
4158 if (dump_enabled_p ())
4159 dump_printf (MSG_NOTE,
4160 " Calculated minimum iters for profitability: %d\n",
4161 min_profitable_iters);
4162
4163 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4164 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4165 /* We want the vectorized loop to execute at least once. */
4166 min_profitable_iters = assumed_vf + peel_iters_prologue;
4167 else if (min_profitable_iters < peel_iters_prologue)
4168 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4169 vectorized loop executes at least once. */
4170 min_profitable_iters = peel_iters_prologue;
4171
4172 if (dump_enabled_p ())
4173 dump_printf_loc (MSG_NOTE, vect_location,
4174 " Runtime profitability threshold = %d\n",
4175 min_profitable_iters);
4176
4177 *ret_min_profitable_niters = min_profitable_iters;
4178
4179 /* Calculate number of iterations required to make the vector version
4180 profitable, relative to the loop bodies only.
4181
4182 The non-vectorized variant costs SIC * niters and it must win over the
4183 vector variant on the expected loop trip count. The following condition must hold true:
4184 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
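 /* With the same made-up numbers as in the illustration above (SIC = 4,
    VIC = 8, VF = 4, VOC = 20, SOC = 6, NPEEL = 0) this reads
    4 * niters > 2 * niters + 26, so the static estimate requires at
    least 14 iterations. */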
4185
4186 if (vec_outside_cost <= 0)
4187 min_profitable_estimate = 0;
4188 /* ??? This "else if" arm is written to handle all cases; see below for
4189 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4190 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4191 {
4192 /* This is a repeat of the code above, but with + SOC rather
4193 than - SOC. */
4194 int outside_overhead = (vec_outside_cost
4195 - scalar_single_iter_cost * peel_iters_prologue
4196 - scalar_single_iter_cost * peel_iters_epilogue
4197 + scalar_outside_cost);
4198 int min_vec_niters = 1;
4199 if (outside_overhead > 0)
4200 min_vec_niters = outside_overhead / saving_per_viter + 1;
4201
4202 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4203 {
4204 int threshold = (vec_inside_cost * min_vec_niters
4205 + vec_outside_cost
4206 + scalar_outside_cost);
4207 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4208 }
4209 else
4210 min_profitable_estimate = (min_vec_niters * assumed_vf
4211 + peel_iters_prologue
4212 + peel_iters_epilogue);
4213 }
4214 else
4215 {
4216 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4217 * assumed_vf
4218 - vec_inside_cost * peel_iters_prologue
4219 - vec_inside_cost * peel_iters_epilogue)
4220 / ((scalar_single_iter_cost * assumed_vf)
4221 - vec_inside_cost);
4222 }
4223 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4224 if (dump_enabled_p ())
4225 dump_printf_loc (MSG_NOTE, vect_location,
4226 " Static estimate profitability threshold = %d\n",
4227 min_profitable_estimate);
4228
4229 *ret_min_profitable_estimate = min_profitable_estimate;
4230 }
4231
4232 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4233 vector elements (not bits) for a vector with NELT elements. */
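/* For example, with OFFSET == 2 and NELT == 8 the three encoded elements
   {2, 3, 4} describe the stepped series 2, 3, ..., 9, i.e. a shift down by
   two elements, with the two trailing lanes taken from the second vector
   operand of the permutation. */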
4234 static void
4235 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4236 vec_perm_builder *sel)
4237 {
4238 /* The encoding is a single stepped pattern. Any wrap-around is handled
4239 by vec_perm_indices. */
4240 sel->new_vector (nelt, 1, 3);
4241 for (unsigned int i = 0; i < 3; i++)
4242 sel->quick_push (i + offset);
4243 }
4244
4245 /* Checks whether the target supports whole-vector shifts for vectors of mode
4246 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4247 it supports vec_perm_const with masks for all necessary shift amounts. */
4248 static bool
4249 have_whole_vector_shift (machine_mode mode)
4250 {
4251 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4252 return true;
4253
4254 /* Variable-length vectors should be handled via the optab. */
4255 unsigned int nelt;
4256 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4257 return false;
4258
4259 vec_perm_builder sel;
4260 vec_perm_indices indices;
4261 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4262 {
4263 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4264 indices.new_vector (sel, 2, nelt);
4265 if (!can_vec_perm_const_p (mode, indices, false))
4266 return false;
4267 }
4268 return true;
4269 }
4270
4271 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4272 functions. A better design is needed to avoid maintenance issues. */
4273
4274 /* Function vect_model_reduction_cost.
4275
4276 Models cost for a reduction operation, including the vector ops
4277 generated within the strip-mine loop, the initial definition before
4278 the loop, and the epilogue code that must be generated. */
4279
4280 static void
4281 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4282 stmt_vec_info stmt_info, internal_fn reduc_fn,
4283 vect_reduction_type reduction_type,
4284 int ncopies, stmt_vector_for_cost *cost_vec)
4285 {
4286 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4287 enum tree_code code;
4288 optab optab;
4289 tree vectype;
4290 machine_mode mode;
4291 class loop *loop = NULL;
4292
4293 if (loop_vinfo)
4294 loop = LOOP_VINFO_LOOP (loop_vinfo);
4295
4296 /* Condition reductions generate two reductions in the loop. */
4297 if (reduction_type == COND_REDUCTION)
4298 ncopies *= 2;
4299
4300 vectype = STMT_VINFO_VECTYPE (stmt_info);
4301 mode = TYPE_MODE (vectype);
4302 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4303
4304 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4305
4306 if (reduction_type == EXTRACT_LAST_REDUCTION)
4307 /* No extra instructions are needed in the prologue. The loop body
4308 operations are costed in vectorizable_condition. */
4309 inside_cost = 0;
4310 else if (reduction_type == FOLD_LEFT_REDUCTION)
4311 {
4312 /* No extra instructions needed in the prologue. */
4313 prologue_cost = 0;
4314
4315 if (reduc_fn != IFN_LAST)
4316 /* Count one reduction-like operation per vector. */
4317 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4318 stmt_info, 0, vect_body);
4319 else
4320 {
4321 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4322 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4323 inside_cost = record_stmt_cost (cost_vec, nelements,
4324 vec_to_scalar, stmt_info, 0,
4325 vect_body);
4326 inside_cost += record_stmt_cost (cost_vec, nelements,
4327 scalar_stmt, stmt_info, 0,
4328 vect_body);
4329 }
4330 }
4331 else
4332 {
4333 /* Add in cost for initial definition.
4334 For cond reduction we have four vectors: initial index, step,
4335 initial result of the data reduction, initial value of the index
4336 reduction. */
4337 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4338 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4339 scalar_to_vec, stmt_info, 0,
4340 vect_prologue);
4341
4342 /* Cost of reduction op inside loop. */
4343 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4344 stmt_info, 0, vect_body);
4345 }
4346
4347 /* Determine cost of epilogue code.
4348
4349 We have a reduction operator that will reduce the vector in one statement.
4350 Also requires scalar extract. */
4351
4352 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4353 {
4354 if (reduc_fn != IFN_LAST)
4355 {
4356 if (reduction_type == COND_REDUCTION)
4357 {
4358 /* An EQ stmt and a COND_EXPR stmt. */
4359 epilogue_cost += record_stmt_cost (cost_vec, 2,
4360 vector_stmt, stmt_info, 0,
4361 vect_epilogue);
4362 /* Reduction of the max index and a reduction of the found
4363 values. */
4364 epilogue_cost += record_stmt_cost (cost_vec, 2,
4365 vec_to_scalar, stmt_info, 0,
4366 vect_epilogue);
4367 /* A broadcast of the max value. */
4368 epilogue_cost += record_stmt_cost (cost_vec, 1,
4369 scalar_to_vec, stmt_info, 0,
4370 vect_epilogue);
4371 }
4372 else
4373 {
4374 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4375 stmt_info, 0, vect_epilogue);
4376 epilogue_cost += record_stmt_cost (cost_vec, 1,
4377 vec_to_scalar, stmt_info, 0,
4378 vect_epilogue);
4379 }
4380 }
4381 else if (reduction_type == COND_REDUCTION)
4382 {
4383 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4384 /* Extraction of scalar elements. */
4385 epilogue_cost += record_stmt_cost (cost_vec,
4386 2 * estimated_nunits,
4387 vec_to_scalar, stmt_info, 0,
4388 vect_epilogue);
4389 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4390 epilogue_cost += record_stmt_cost (cost_vec,
4391 2 * estimated_nunits - 3,
4392 scalar_stmt, stmt_info, 0,
4393 vect_epilogue);
4394 }
4395 else if (reduction_type == EXTRACT_LAST_REDUCTION
4396 || reduction_type == FOLD_LEFT_REDUCTION)
4397 /* No extra instructions are needed in the epilogue. */
4398 ;
4399 else
4400 {
4401 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4402 tree bitsize =
4403 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4404 int element_bitsize = tree_to_uhwi (bitsize);
4405 int nelements = vec_size_in_bits / element_bitsize;
4406
4407 if (code == COND_EXPR)
4408 code = MAX_EXPR;
4409
4410 optab = optab_for_tree_code (code, vectype, optab_default);
4411
4412 /* We have a whole vector shift available. */
4413 if (optab != unknown_optab
4414 && VECTOR_MODE_P (mode)
4415 && optab_handler (optab, mode) != CODE_FOR_nothing
4416 && have_whole_vector_shift (mode))
4417 {
4418 /* Final reduction via vector shifts and the reduction operator.
4419 Also requires scalar extract. */
4420 epilogue_cost += record_stmt_cost (cost_vec,
4421 exact_log2 (nelements) * 2,
4422 vector_stmt, stmt_info, 0,
4423 vect_epilogue);
4424 epilogue_cost += record_stmt_cost (cost_vec, 1,
4425 vec_to_scalar, stmt_info, 0,
4426 vect_epilogue);
4427 }
4428 else
4429 /* Use extracts and reduction op for final reduction. For N
4430 elements, we have N extracts and N-1 reduction ops. */
4431 epilogue_cost += record_stmt_cost (cost_vec,
4432 nelements + nelements - 1,
4433 vector_stmt, stmt_info, 0,
4434 vect_epilogue);
4435 }
4436 }
4437
4438 if (dump_enabled_p ())
4439 dump_printf (MSG_NOTE,
4440 "vect_model_reduction_cost: inside_cost = %d, "
4441 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4442 prologue_cost, epilogue_cost);
4443 }
4444
4445
4446
4447 /* Function get_initial_def_for_reduction
4448
4449 Input:
4450 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4451 INIT_VAL - the initial value of the reduction variable
4452
4453 Output:
4454 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4455 of the reduction (used for adjusting the epilog - see below).
4456 Return a vector variable, initialized according to the operation that
4457 STMT_VINFO performs. This vector will be used as the initial value
4458 of the vector of partial results.
4459
4460 Option1 (adjust in epilog): Initialize the vector as follows:
4461 add/bit or/xor: [0,0,...,0,0]
4462 mult/bit and: [1,1,...,1,1]
4463 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4464 and when necessary (e.g. add/mult case) let the caller know
4465 that it needs to adjust the result by init_val.
4466
4467 Option2: Initialize the vector as follows:
4468 add/bit or/xor: [init_val,0,0,...,0]
4469 mult/bit and: [init_val,1,1,...,1]
4470 min/max/cond_expr: [init_val,init_val,...,init_val]
4471 and no adjustments are needed.
4472
4473 For example, for the following code:
4474
4475 s = init_val;
4476 for (i=0;i<n;i++)
4477 s = s + a[i];
4478
4479 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4480 For a vector of 4 units, we want to return either [0,0,0,init_val],
4481 or [0,0,0,0] and let the caller know that it needs to adjust
4482 the result at the end by 'init_val'.
4483
4484 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4485 is not NULL, because this way the initialization vector is simpler (same
4486 element in all entries), and Option2 otherwise.
4487
4488 A cost model should help decide between these two schemes. */
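/* As another illustration, for a product reduction (MULT_EXPR) and a vector
   of 4 units, Option1 returns [1,1,1,1] and reports an adjustment by
   INIT_VAL (when INIT_VAL is not already 1), while Option2 returns
   [init_val,1,1,1]. */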
4489
4490 static tree
4491 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4492 stmt_vec_info stmt_vinfo,
4493 enum tree_code code, tree init_val,
4494 tree *adjustment_def)
4495 {
4496 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4497 tree scalar_type = TREE_TYPE (init_val);
4498 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4499 tree def_for_init;
4500 tree init_def;
4501 REAL_VALUE_TYPE real_init_val = dconst0;
4502 int int_init_val = 0;
4503 gimple_seq stmts = NULL;
4504
4505 gcc_assert (vectype);
4506
4507 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4508 || SCALAR_FLOAT_TYPE_P (scalar_type));
4509
4510 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4511 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4512
4513 /* ADJUSTMENT_DEF is NULL when called from
4514 vect_create_epilog_for_reduction to vectorize double reduction. */
4515 if (adjustment_def)
4516 *adjustment_def = NULL;
4517
4518 switch (code)
4519 {
4520 case WIDEN_SUM_EXPR:
4521 case DOT_PROD_EXPR:
4522 case SAD_EXPR:
4523 case PLUS_EXPR:
4524 case MINUS_EXPR:
4525 case BIT_IOR_EXPR:
4526 case BIT_XOR_EXPR:
4527 case MULT_EXPR:
4528 case BIT_AND_EXPR:
4529 {
4530 if (code == MULT_EXPR)
4531 {
4532 real_init_val = dconst1;
4533 int_init_val = 1;
4534 }
4535
4536 if (code == BIT_AND_EXPR)
4537 int_init_val = -1;
4538
4539 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4540 def_for_init = build_real (scalar_type, real_init_val);
4541 else
4542 def_for_init = build_int_cst (scalar_type, int_init_val);
4543
4544 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4545 {
4546 /* Option1: the first element is '0' or '1' as well. */
4547 if (!operand_equal_p (def_for_init, init_val, 0))
4548 *adjustment_def = init_val;
4549 init_def = gimple_build_vector_from_val (&stmts, vectype,
4550 def_for_init);
4551 }
4552 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4553 {
4554 /* Option2 (variable length): the first element is INIT_VAL. */
4555 init_def = gimple_build_vector_from_val (&stmts, vectype,
4556 def_for_init);
4557 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4558 vectype, init_def, init_val);
4559 }
4560 else
4561 {
4562 /* Option2: the first element is INIT_VAL. */
4563 tree_vector_builder elts (vectype, 1, 2);
4564 elts.quick_push (init_val);
4565 elts.quick_push (def_for_init);
4566 init_def = gimple_build_vector (&stmts, &elts);
4567 }
4568 }
4569 break;
4570
4571 case MIN_EXPR:
4572 case MAX_EXPR:
4573 case COND_EXPR:
4574 {
4575 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4576 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4577 }
4578 break;
4579
4580 default:
4581 gcc_unreachable ();
4582 }
4583
4584 if (stmts)
4585 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4586 return init_def;
4587 }
4588
4589 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4590 NUMBER_OF_VECTORS is the number of vector defs to create.
4591 If NEUTRAL_OP is nonnull, introducing extra elements of that
4592 value will not change the result. */
4593
4594 static void
4595 get_initial_defs_for_reduction (vec_info *vinfo,
4596 slp_tree slp_node,
4597 vec<tree> *vec_oprnds,
4598 unsigned int number_of_vectors,
4599 bool reduc_chain, tree neutral_op)
4600 {
4601 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4602 stmt_vec_info stmt_vinfo = stmts[0];
4603 unsigned HOST_WIDE_INT nunits;
4604 unsigned j, number_of_places_left_in_vector;
4605 tree vector_type;
4606 unsigned int group_size = stmts.length ();
4607 unsigned int i;
4608 class loop *loop;
4609
4610 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4611
4612 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4613
4614 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4615 gcc_assert (loop);
4616 edge pe = loop_preheader_edge (loop);
4617
4618 gcc_assert (!reduc_chain || neutral_op);
4619
4620 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4621 created vectors. It is greater than 1 if unrolling is performed.
4622
4623 For example, we have two scalar operands, s1 and s2 (e.g., group of
4624 strided accesses of size two), while NUNITS is four (i.e., four scalars
4625 of this type can be packed in a vector). The output vector will contain
4626 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4627 will be 2).
4628
4629 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4630 vectors containing the operands.
4631
4632 For example, NUNITS is four as before, and the group size is 8
4633 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4634 {s5, s6, s7, s8}. */
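 /* For illustration, when NEUTRAL_OP is nonnull: with a group of two PLUS
    reductions s1 and s2 and a four-lane vector, the single initial vector
    built below is {init(s1), init(s2), 0, 0}, the trailing lanes being
    filled with the neutral value 0. */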
4635
4636 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4637 nunits = group_size;
4638
4639 number_of_places_left_in_vector = nunits;
4640 bool constant_p = true;
4641 tree_vector_builder elts (vector_type, nunits, 1);
4642 elts.quick_grow (nunits);
4643 gimple_seq ctor_seq = NULL;
4644 for (j = 0; j < nunits * number_of_vectors; ++j)
4645 {
4646 tree op;
4647 i = j % group_size;
4648 stmt_vinfo = stmts[i];
4649
4650 /* Get the def before the loop. In a reduction chain we have only one
4651 initial value; otherwise we have as many initial values as PHIs in the group. */
4652 if (reduc_chain)
4653 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4654 else if (((vec_oprnds->length () + 1) * nunits
4655 - number_of_places_left_in_vector >= group_size)
4656 && neutral_op)
4657 op = neutral_op;
4658 else
4659 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4660
4661 /* Create 'vect_ = {op0,op1,...,opn}'. */
4662 number_of_places_left_in_vector--;
4663 elts[nunits - number_of_places_left_in_vector - 1] = op;
4664 if (!CONSTANT_CLASS_P (op))
4665 constant_p = false;
4666
4667 if (number_of_places_left_in_vector == 0)
4668 {
4669 tree init;
4670 if (constant_p && !neutral_op
4671 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4672 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4673 /* Build the vector directly from ELTS. */
4674 init = gimple_build_vector (&ctor_seq, &elts);
4675 else if (neutral_op)
4676 {
4677 /* Build a vector of the neutral value and shift the
4678 other elements into place. */
4679 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4680 neutral_op);
4681 int k = nunits;
4682 while (k > 0 && elts[k - 1] == neutral_op)
4683 k -= 1;
4684 while (k > 0)
4685 {
4686 k -= 1;
4687 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4688 vector_type, init, elts[k]);
4689 }
4690 }
4691 else
4692 {
4693 /* First time round, duplicate ELTS to fill the
4694 required number of vectors. */
4695 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4696 number_of_vectors, *vec_oprnds);
4697 break;
4698 }
4699 vec_oprnds->quick_push (init);
4700
4701 number_of_places_left_in_vector = nunits;
4702 elts.new_vector (vector_type, nunits, 1);
4703 elts.quick_grow (nunits);
4704 constant_p = true;
4705 }
4706 }
4707 if (ctor_seq != NULL)
4708 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4709 }
4710
4711 /* For a statement STMT_INFO taking part in a reduction operation return
4712 the stmt_vec_info the meta information is stored on. */
4713
4714 stmt_vec_info
4715 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4716 {
4717 stmt_info = vect_orig_stmt (stmt_info);
4718 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4719 if (!is_a <gphi *> (stmt_info->stmt)
4720 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4721 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4722 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4723 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4724 {
4725 if (gimple_phi_num_args (phi) == 1)
4726 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4727 }
4728 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4729 {
4730 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4731 stmt_vec_info info
4732 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4733 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4734 stmt_info = info;
4735 }
4736 return stmt_info;
4737 }
4738
4739 /* Function vect_create_epilog_for_reduction
4740
4741 Create code at the loop-epilog to finalize the result of a reduction
4742 computation.
4743
4744 STMT_INFO is the scalar reduction stmt that is being vectorized.
4745 SLP_NODE is an SLP node containing a group of reduction statements. The
4746 first one in this group is STMT_INFO.
4747 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4748 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4749 (counting from 0)
4750
4751 This function:
4752 1. Completes the reduction def-use cycles.
4753 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4754 by calling the function specified by REDUC_FN if available, or by
4755 other means (whole-vector shifts or a scalar loop).
4756 The function also creates a new phi node at the loop exit to preserve
4757 loop-closed form, as illustrated below.
4758
4759 The flow at the entry to this function:
4760
4761 loop:
4762 vec_def = phi <vec_init, null> # REDUCTION_PHI
4763 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4764 s_loop = scalar_stmt # (scalar) STMT_INFO
4765 loop_exit:
4766 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4767 use <s_out0>
4768 use <s_out0>
4769
4770 The above is transformed by this function into:
4771
4772 loop:
4773 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4774 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4775 s_loop = scalar_stmt # (scalar) STMT_INFO
4776 loop_exit:
4777 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4778 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4779 v_out2 = reduce <v_out1>
4780 s_out3 = extract_field <v_out2, 0>
4781 s_out4 = adjust_result <s_out3>
4782 use <s_out4>
4783 use <s_out4>
4784 */
4785
4786 static void
4787 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4788 stmt_vec_info stmt_info,
4789 slp_tree slp_node,
4790 slp_instance slp_node_instance)
4791 {
4792 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4793 gcc_assert (reduc_info->is_reduc_info);
4794 /* For double reductions we need to get at the inner loop reduction
4795 stmt which has the meta info attached. Our stmt_info is that of the
4796 loop-closed PHI of the inner loop which we remember as
4797 def for the reduction PHI generation. */
4798 bool double_reduc = false;
4799 stmt_vec_info rdef_info = stmt_info;
4800 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4801 {
4802 gcc_assert (!slp_node);
4803 double_reduc = true;
4804 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4805 (stmt_info->stmt, 0));
4806 stmt_info = vect_stmt_to_vectorize (stmt_info);
4807 }
4808 gphi *reduc_def_stmt
4809 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4810 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4811 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4812 tree vectype;
4813 machine_mode mode;
4814 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4815 basic_block exit_bb;
4816 tree scalar_dest;
4817 tree scalar_type;
4818 gimple *new_phi = NULL, *phi;
4819 gimple_stmt_iterator exit_gsi;
4820 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4821 gimple *epilog_stmt = NULL;
4822 gimple *exit_phi;
4823 tree bitsize;
4824 tree def;
4825 tree orig_name, scalar_result;
4826 imm_use_iterator imm_iter, phi_imm_iter;
4827 use_operand_p use_p, phi_use_p;
4828 gimple *use_stmt;
4829 bool nested_in_vect_loop = false;
4830 auto_vec<gimple *> new_phis;
4831 int j, i;
4832 auto_vec<tree> scalar_results;
4833 unsigned int group_size = 1, k;
4834 auto_vec<gimple *> phis;
4835 bool slp_reduc = false;
4836 bool direct_slp_reduc;
4837 tree new_phi_result;
4838 tree induction_index = NULL_TREE;
4839
4840 if (slp_node)
4841 group_size = SLP_TREE_LANES (slp_node);
4842
4843 if (nested_in_vect_loop_p (loop, stmt_info))
4844 {
4845 outer_loop = loop;
4846 loop = loop->inner;
4847 nested_in_vect_loop = true;
4848 gcc_assert (!slp_node);
4849 }
4850 gcc_assert (!nested_in_vect_loop || double_reduc);
4851
4852 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4853 gcc_assert (vectype);
4854 mode = TYPE_MODE (vectype);
4855
4856 tree initial_def = NULL;
4857 tree induc_val = NULL_TREE;
4858 tree adjustment_def = NULL;
4859 if (slp_node)
4860 ;
4861 else
4862 {
4863 /* Get at the scalar def before the loop, that defines the initial value
4864 of the reduction variable. */
4865 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4866 loop_preheader_edge (loop));
4867 /* Optimize: for induction condition reduction, if we can't use zero
4868 for induc_val, use initial_def. */
4869 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4870 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4871 else if (double_reduc)
4872 ;
4873 else if (nested_in_vect_loop)
4874 ;
4875 else
4876 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4877 }
4878
4879 unsigned vec_num;
4880 int ncopies;
4881 if (slp_node)
4882 {
4883 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4884 ncopies = 1;
4885 }
4886 else
4887 {
4888 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
4889 vec_num = 1;
4890 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
4891 }
4892
4893 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4894 which is updated with the current index of the loop for every match of
4895 the original loop's cond_expr (VEC_STMT). This results in a vector
4896 containing the last time the condition passed for that vector lane.
4897 The first match will be a 1 to allow 0 to be used for non-matching
4898 indexes. If there are no matches at all then the vector will be all
4899 zeroes.
4900
4901 PR92772: This algorithm is broken for architectures that support
4902 masked vectors, but do not provide fold_extract_last. */
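/* For example (purely illustrative): with four lanes the index IV holds
   {1,2,3,4} in the first vector iteration and {5,6,7,8} in the second;
   a lane whose condition matched in both iterations records the value
   from the second one, while a lane that never matched keeps the
   initial 0. */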
4903 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4904 {
4905 auto_vec<std::pair<tree, bool>, 2> ccompares;
4906 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4907 cond_info = vect_stmt_to_vectorize (cond_info);
4908 while (cond_info != reduc_info)
4909 {
4910 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4911 {
4912 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
4913 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4914 ccompares.safe_push
4915 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4916 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4917 }
4918 cond_info
4919 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4920 1 + STMT_VINFO_REDUC_IDX
4921 (cond_info)));
4922 cond_info = vect_stmt_to_vectorize (cond_info);
4923 }
4924 gcc_assert (ccompares.length () != 0);
4925
4926 tree indx_before_incr, indx_after_incr;
4927 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4928 int scalar_precision
4929 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4930 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4931 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4932 (TYPE_MODE (vectype), cr_index_scalar_type,
4933 TYPE_VECTOR_SUBPARTS (vectype));
4934
4935 /* First we create a simple vector induction variable which starts
4936 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4937 vector size (STEP). */
4938
4939 /* Create a {1,2,3,...} vector. */
4940 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4941
4942 /* Create a vector of the step value. */
4943 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4944 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4945
4946 /* Create an induction variable. */
4947 gimple_stmt_iterator incr_gsi;
4948 bool insert_after;
4949 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4950 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4951 insert_after, &indx_before_incr, &indx_after_incr);
4952
4953 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4954 filled with zeros (VEC_ZERO). */
4955
4956 /* Create a vector of 0s. */
4957 tree zero = build_zero_cst (cr_index_scalar_type);
4958 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4959
4960 /* Create a vector phi node. */
4961 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4962 new_phi = create_phi_node (new_phi_tree, loop->header);
4963 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4964 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4965
4966 /* Now take the condition from the loop's original cond_exprs
4967 and produce new cond_exprs (INDEX_COND_EXPR) which for
4968 every match uses values from the induction variable
4969 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4970 (NEW_PHI_TREE).
4971 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4972 the new cond_expr (INDEX_COND_EXPR). */
4973 gimple_seq stmts = NULL;
4974 for (int i = ccompares.length () - 1; i != -1; --i)
4975 {
4976 tree ccompare = ccompares[i].first;
4977 if (ccompares[i].second)
4978 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4979 cr_index_vector_type,
4980 ccompare,
4981 indx_before_incr, new_phi_tree);
4982 else
4983 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4984 cr_index_vector_type,
4985 ccompare,
4986 new_phi_tree, indx_before_incr);
4987 }
4988 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4989
4990 /* Update the phi with the vec cond. */
4991 induction_index = new_phi_tree;
4992 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4993 loop_latch_edge (loop), UNKNOWN_LOCATION);
4994 }
4995
4996 /* 2. Create epilog code.
4997 The reduction epilog code operates across the elements of the vector
4998 of partial results computed by the vectorized loop.
4999 The reduction epilog code consists of:
5000
5001 step 1: compute the scalar result in a vector (v_out2)
5002 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5003 step 3: adjust the scalar result (s_out3) if needed.
5004
5005 Step 1 can be accomplished using one of the following three schemes:
5006 (scheme 1) using reduc_fn, if available.
5007 (scheme 2) using whole-vector shifts, if available.
5008 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5009 combined.
5010
5011 The overall epilog code looks like this:
5012
5013 s_out0 = phi <s_loop> # original EXIT_PHI
5014 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5015 v_out2 = reduce <v_out1> # step 1
5016 s_out3 = extract_field <v_out2, 0> # step 2
5017 s_out4 = adjust_result <s_out3> # step 3
5018
5019 (step 3 is optional, and steps 1 and 2 may be combined).
5020 Lastly, the uses of s_out0 are replaced by s_out4. */
5021
5022
5023 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5024 v_out1 = phi <VECT_DEF>
5025 Store them in NEW_PHIS. */
5026 if (double_reduc)
5027 loop = outer_loop;
5028 exit_bb = single_exit (loop)->dest;
5029 new_phis.create (slp_node ? vec_num : ncopies);
5030 for (unsigned i = 0; i < vec_num; i++)
5031 {
5032 if (slp_node)
5033 def = vect_get_slp_vect_def (slp_node, i);
5034 else
5035 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5036 for (j = 0; j < ncopies; j++)
5037 {
5038 tree new_def = copy_ssa_name (def);
5039 phi = create_phi_node (new_def, exit_bb);
5040 if (j == 0)
5041 new_phis.quick_push (phi);
5042 else
5043 {
5044 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5045 new_phis.quick_push (phi);
5046 }
5047
5048 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5049 }
5050 }
5051
5052 exit_gsi = gsi_after_labels (exit_bb);
5053
5054 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5055 (i.e. when reduc_fn is not available) and in the final adjustment
5056 code (if needed). Also get the original scalar reduction variable as
5057 defined in the loop. In case STMT is a "pattern-stmt" (i.e. it
5058 represents a reduction pattern), the tree-code and scalar-def are
5059 taken from the original stmt that the pattern-stmt (STMT) replaces.
5060 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5061 are taken from STMT. */
5062
5063 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5064 if (orig_stmt_info != stmt_info)
5065 {
5066 /* Reduction pattern */
5067 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5068 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5069 }
5070
5071 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
5072 scalar_type = TREE_TYPE (scalar_dest);
5073 scalar_results.create (group_size);
5074 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5075 bitsize = TYPE_SIZE (scalar_type);
5076
5077 /* SLP reduction without reduction chain, e.g.,
5078 # a1 = phi <a2, a0>
5079 # b1 = phi <b2, b0>
5080 a2 = operation (a1)
5081 b2 = operation (b1) */
5082 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5083
5084 /* True if we should implement SLP_REDUC using native reduction operations
5085 instead of scalar operations. */
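/* Note this only triggers for variable-length vectors (e.g. SVE), where
   TYPE_VECTOR_SUBPARTS is not a compile-time constant and the
   element-by-element extraction scheme below cannot be used. */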
5086 direct_slp_reduc = (reduc_fn != IFN_LAST
5087 && slp_reduc
5088 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5089
5090 /* In case of reduction chain, e.g.,
5091 # a1 = phi <a3, a0>
5092 a2 = operation (a1)
5093 a3 = operation (a2),
5094
5095 we may end up with more than one vector result. Here we reduce them to
5096 one vector. */
5097 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5098 {
5099 gimple_seq stmts = NULL;
5100 tree first_vect = PHI_RESULT (new_phis[0]);
5101 first_vect = gimple_convert (&stmts, vectype, first_vect);
5102 for (k = 1; k < new_phis.length (); k++)
5103 {
5104 gimple *next_phi = new_phis[k];
5105 tree second_vect = PHI_RESULT (next_phi);
5106 second_vect = gimple_convert (&stmts, vectype, second_vect);
5107 first_vect = gimple_build (&stmts, code, vectype,
5108 first_vect, second_vect);
5109 }
5110 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5111
5112 new_phi_result = first_vect;
5113 new_phis.truncate (0);
5114 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5115 }
5116 /* Likewise if we couldn't use a single defuse cycle. */
5117 else if (ncopies > 1)
5118 {
5119 gimple_seq stmts = NULL;
5120 tree first_vect = PHI_RESULT (new_phis[0]);
5121 first_vect = gimple_convert (&stmts, vectype, first_vect);
5122 for (int k = 1; k < ncopies; ++k)
5123 {
5124 tree second_vect = PHI_RESULT (new_phis[k]);
5125 second_vect = gimple_convert (&stmts, vectype, second_vect);
5126 first_vect = gimple_build (&stmts, code, vectype,
5127 first_vect, second_vect);
5128 }
5129 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5130 new_phi_result = first_vect;
5131 new_phis.truncate (0);
5132 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5133 }
5134 else
5135 new_phi_result = PHI_RESULT (new_phis[0]);
5136
5137 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5138 && reduc_fn != IFN_LAST)
5139 {
5140 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5141 various data values where the condition matched and another vector
5142 (INDUCTION_INDEX) containing all the indexes of those matches. We
5143 need to extract the last matching index (which will be the index with
5144 highest value) and use this to index into the data vector.
5145 For the case where there were no matches, the data vector will contain
5146 all default values and the index vector will be all zeros. */
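/* For example (hypothetical values): if INDUCTION_INDEX is {0, 0, 6, 0}
   and NEW_PHI_RESULT is {d0, d1, d2, d3}, the REDUC_MAX below yields 6,
   the comparison gives the mask {0, 0, 1, 0}, the VEC_COND keeps only d2,
   and the final unsigned MAX reduction extracts d2 as the scalar
   result. */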
5147
5148 /* Get various versions of the type of the vector of indexes. */
5149 tree index_vec_type = TREE_TYPE (induction_index);
5150 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5151 tree index_scalar_type = TREE_TYPE (index_vec_type);
5152 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5153
5154 /* Get an unsigned integer version of the type of the data vector. */
5155 int scalar_precision
5156 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5157 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5158 tree vectype_unsigned = build_vector_type
5159 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5160
5161 /* First we need to create a vector (ZERO_VEC) of zeros and another
5162 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5163 can create using a MAX reduction and then expanding.
5164 In the case where the loop never made any matches, the max index will
5165 be zero. */
5166
5167 /* Vector of {0, 0, 0,...}. */
5168 tree zero_vec = build_zero_cst (vectype);
5169
5170 gimple_seq stmts = NULL;
5171 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5172 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5173
5174 /* Find maximum value from the vector of found indexes. */
5175 tree max_index = make_ssa_name (index_scalar_type);
5176 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5177 1, induction_index);
5178 gimple_call_set_lhs (max_index_stmt, max_index);
5179 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5180
5181 /* Vector of {max_index, max_index, max_index,...}. */
5182 tree max_index_vec = make_ssa_name (index_vec_type);
5183 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5184 max_index);
5185 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5186 max_index_vec_rhs);
5187 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5188
5189 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5190 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5191 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5192 otherwise. Only one value should match, resulting in a vector
5193 (VEC_COND) with one data value and the rest zeros.
5194 In the case where the loop never made any matches, every index will
5195 match, resulting in a vector with all data values (which will all be
5196 the default value). */
5197
5198 /* Compare the max index vector to the vector of found indexes to find
5199 the position of the max value. */
5200 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5201 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5202 induction_index,
5203 max_index_vec);
5204 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5205
5206 /* Use the compare to choose either values from the data vector or
5207 zero. */
5208 tree vec_cond = make_ssa_name (vectype);
5209 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5210 vec_compare, new_phi_result,
5211 zero_vec);
5212 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5213
5214 /* Finally we need to extract the data value from the vector (VEC_COND)
5215 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5216 reduction, but because this doesn't exist, we can use a MAX reduction
5217 instead. The data value might be signed or a float so we need to cast
5218 it first.
5219 In the case where the loop never made any matches, the data values are
5220 all identical, and so will reduce down correctly. */
5221
5222 /* Make the matched data values unsigned. */
5223 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5224 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5225 vec_cond);
5226 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5227 VIEW_CONVERT_EXPR,
5228 vec_cond_cast_rhs);
5229 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5230
5231 /* Reduce down to a scalar value. */
5232 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5233 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5234 1, vec_cond_cast);
5235 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5236 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5237
5238 /* Convert the reduced value back to the result type and set as the
5239 result. */
5240 stmts = NULL;
5241 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5242 data_reduc);
5243 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5244 scalar_results.safe_push (new_temp);
5245 }
5246 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5247 && reduc_fn == IFN_LAST)
5248 {
5249 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5250 idx = 0;
5251 idx_val = induction_index[0];
5252 val = data_reduc[0];
5253 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5254 if (induction_index[i] > idx_val)
5255 val = data_reduc[i], idx_val = induction_index[i];
5256 return val; */
5257
5258 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5259 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5260 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5261 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5262 /* Enforced by vectorizable_reduction, which ensures we have target
5263 support before allowing a conditional reduction on variable-length
5264 vectors. */
5265 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5266 tree idx_val = NULL_TREE, val = NULL_TREE;
5267 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5268 {
5269 tree old_idx_val = idx_val;
5270 tree old_val = val;
5271 idx_val = make_ssa_name (idx_eltype);
5272 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5273 build3 (BIT_FIELD_REF, idx_eltype,
5274 induction_index,
5275 bitsize_int (el_size),
5276 bitsize_int (off)));
5277 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5278 val = make_ssa_name (data_eltype);
5279 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5280 build3 (BIT_FIELD_REF,
5281 data_eltype,
5282 new_phi_result,
5283 bitsize_int (el_size),
5284 bitsize_int (off)));
5285 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5286 if (off != 0)
5287 {
5288 tree new_idx_val = idx_val;
5289 if (off != v_size - el_size)
5290 {
5291 new_idx_val = make_ssa_name (idx_eltype);
5292 epilog_stmt = gimple_build_assign (new_idx_val,
5293 MAX_EXPR, idx_val,
5294 old_idx_val);
5295 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5296 }
5297 tree new_val = make_ssa_name (data_eltype);
5298 epilog_stmt = gimple_build_assign (new_val,
5299 COND_EXPR,
5300 build2 (GT_EXPR,
5301 boolean_type_node,
5302 idx_val,
5303 old_idx_val),
5304 val, old_val);
5305 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5306 idx_val = new_idx_val;
5307 val = new_val;
5308 }
5309 }
5310 /* Convert the reduced value back to the result type and set as the
5311 result. */
5312 gimple_seq stmts = NULL;
5313 val = gimple_convert (&stmts, scalar_type, val);
5314 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5315 scalar_results.safe_push (val);
5316 }
5317
5318 /* 2.3 Create the reduction code, using one of the three schemes described
5319 above. In SLP we simply need to extract all the elements from the
5320 vector (without reducing them), so we use scalar shifts. */
5321 else if (reduc_fn != IFN_LAST && !slp_reduc)
5322 {
5323 tree tmp;
5324 tree vec_elem_type;
5325
5326 /* Case 1: Create:
5327 v_out2 = reduc_expr <v_out1> */
5328
5329 if (dump_enabled_p ())
5330 dump_printf_loc (MSG_NOTE, vect_location,
5331 "Reduce using direct vector reduction.\n");
5332
5333 gimple_seq stmts = NULL;
5334 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5335 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5336 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5337 vec_elem_type, new_phi_result);
5338 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5339 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5340
5341 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5342 && induc_val)
5343 {
5344 /* Earlier we set the initial value to be a vector of induc_val
5345 values. Check the result and if it is induc_val then replace
5346 it with the original initial value, unless induc_val is
5347 the same as initial_def already. */
5348 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5349 induc_val);
5350
5351 tmp = make_ssa_name (new_scalar_dest);
5352 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5353 initial_def, new_temp);
5354 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5355 new_temp = tmp;
5356 }
5357
5358 scalar_results.safe_push (new_temp);
5359 }
5360 else if (direct_slp_reduc)
5361 {
5362 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5363 with the elements for other SLP statements replaced with the
5364 neutral value. We can then do a normal reduction on each vector. */
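/* Schematically, for REDUC_GROUP_SIZE == 2 and a partial-result vector
   {a0, b0, a1, b1, ...} the loop below builds {a0, N, a1, N, ...} for the
   first scalar result and {N, b0, N, b1, ...} for the second, where N is
   the neutral (or initial) value, and reduces each of them with
   REDUC_FN. */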
5365
5366 /* Enforced by vectorizable_reduction. */
5367 gcc_assert (new_phis.length () == 1);
5368 gcc_assert (pow2p_hwi (group_size));
5369
5370 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5371 vec<stmt_vec_info> orig_phis
5372 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5373 gimple_seq seq = NULL;
5374
5375 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5376 and the same element size as VECTYPE. */
5377 tree index = build_index_vector (vectype, 0, 1);
5378 tree index_type = TREE_TYPE (index);
5379 tree index_elt_type = TREE_TYPE (index_type);
5380 tree mask_type = truth_type_for (index_type);
5381
5382 /* Create a vector that, for each element, identifies which of
5383 the REDUC_GROUP_SIZE results should use it. */
5384 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5385 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5386 build_vector_from_val (index_type, index_mask));
5387
5388 /* Get a neutral vector value. This is simply a splat of the neutral
5389 scalar value if we have one, otherwise the initial scalar value
5390 is itself a neutral value. */
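/* E.g. 0 for PLUS and BIT_IOR, 1 for MULT; MIN and MAX have no universal
   neutral value, in which case the initial scalar value is used instead
   (see below). */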
5391 tree vector_identity = NULL_TREE;
5392 tree neutral_op = NULL_TREE;
5393 if (slp_node)
5394 {
5395 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5396 neutral_op
5397 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5398 vectype, code, first != NULL);
5399 }
5400 if (neutral_op)
5401 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5402 neutral_op);
5403 for (unsigned int i = 0; i < group_size; ++i)
5404 {
5405 /* If there's no universal neutral value, we can use the
5406 initial scalar value from the original PHI. This is used
5407 for MIN and MAX reduction, for example. */
5408 if (!neutral_op)
5409 {
5410 tree scalar_value
5411 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5412 loop_preheader_edge (loop));
5413 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5414 scalar_value);
5415 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5416 scalar_value);
5417 }
5418
5419 /* Calculate the equivalent of:
5420
5421 sel[j] = (index[j] == i);
5422
5423 which selects the elements of NEW_PHI_RESULT that should
5424 be included in the result. */
5425 tree compare_val = build_int_cst (index_elt_type, i);
5426 compare_val = build_vector_from_val (index_type, compare_val);
5427 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5428 index, compare_val);
5429
5430 /* Calculate the equivalent of:
5431
5432 vec = seq ? new_phi_result : vector_identity;
5433
5434 VEC is now suitable for a full vector reduction. */
5435 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5436 sel, new_phi_result, vector_identity);
5437
5438 /* Do the reduction and convert it to the appropriate type. */
5439 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5440 TREE_TYPE (vectype), vec);
5441 scalar = gimple_convert (&seq, scalar_type, scalar);
5442 scalar_results.safe_push (scalar);
5443 }
5444 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5445 }
5446 else
5447 {
5448 bool reduce_with_shift;
5449 tree vec_temp;
5450
5451 gcc_assert (slp_reduc || new_phis.length () == 1);
5452
5453 /* See if the target wants to do the final (shift) reduction
5454 in a vector mode of smaller size and first reduce upper/lower
5455 halves against each other. */
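/* For instance, a target may prefer to reduce a V4DImode accumulator by
   first combining its two V2DImode halves and only then doing the final
   shift/extract steps on the narrower vector. */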
5456 enum machine_mode mode1 = mode;
5457 tree stype = TREE_TYPE (vectype);
5458 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5459 unsigned nunits1 = nunits;
5460 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5461 && new_phis.length () == 1)
5462 {
5463 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5464 /* For SLP reductions we have to make sure lanes match up, but
5465 since we're doing an individual-element final reduction, reducing
5466 the vector width here is even more important.
5467 ??? We can also separate lanes with permutes; for the common
5468 case of a power-of-two group size, odd/even extracts would work. */
5469 if (slp_reduc && nunits != nunits1)
5470 {
5471 nunits1 = least_common_multiple (nunits1, group_size);
5472 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5473 }
5474 }
5475 if (!slp_reduc
5476 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5477 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5478
5479 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5480 stype, nunits1);
5481 reduce_with_shift = have_whole_vector_shift (mode1);
5482 if (!VECTOR_MODE_P (mode1))
5483 reduce_with_shift = false;
5484 else
5485 {
5486 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5487 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5488 reduce_with_shift = false;
5489 }
5490
5491 /* First reduce the vector to the desired vector size we should
5492 do shift reduction on by combining upper and lower halves. */
5493 new_temp = new_phi_result;
5494 while (nunits > nunits1)
5495 {
5496 nunits /= 2;
5497 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5498 stype, nunits);
5499 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5500
5501 /* The target has to make sure we support lowpart/highpart
5502 extraction, either via direct vector extract or through
5503 an integer mode punning. */
5504 tree dst1, dst2;
5505 if (convert_optab_handler (vec_extract_optab,
5506 TYPE_MODE (TREE_TYPE (new_temp)),
5507 TYPE_MODE (vectype1))
5508 != CODE_FOR_nothing)
5509 {
5510 /* Extract sub-vectors directly once vec_extract becomes
5511 a conversion optab. */
5512 dst1 = make_ssa_name (vectype1);
5513 epilog_stmt
5514 = gimple_build_assign (dst1, BIT_FIELD_REF,
5515 build3 (BIT_FIELD_REF, vectype1,
5516 new_temp, TYPE_SIZE (vectype1),
5517 bitsize_int (0)));
5518 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5519 dst2 = make_ssa_name (vectype1);
5520 epilog_stmt
5521 = gimple_build_assign (dst2, BIT_FIELD_REF,
5522 build3 (BIT_FIELD_REF, vectype1,
5523 new_temp, TYPE_SIZE (vectype1),
5524 bitsize_int (bitsize)));
5525 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5526 }
5527 else
5528 {
5529 /* Extract via punning to appropriately sized integer mode
5530 vector. */
5531 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5532 tree etype = build_vector_type (eltype, 2);
5533 gcc_assert (convert_optab_handler (vec_extract_optab,
5534 TYPE_MODE (etype),
5535 TYPE_MODE (eltype))
5536 != CODE_FOR_nothing);
5537 tree tem = make_ssa_name (etype);
5538 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5539 build1 (VIEW_CONVERT_EXPR,
5540 etype, new_temp));
5541 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5542 new_temp = tem;
5543 tem = make_ssa_name (eltype);
5544 epilog_stmt
5545 = gimple_build_assign (tem, BIT_FIELD_REF,
5546 build3 (BIT_FIELD_REF, eltype,
5547 new_temp, TYPE_SIZE (eltype),
5548 bitsize_int (0)));
5549 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5550 dst1 = make_ssa_name (vectype1);
5551 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5552 build1 (VIEW_CONVERT_EXPR,
5553 vectype1, tem));
5554 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5555 tem = make_ssa_name (eltype);
5556 epilog_stmt
5557 = gimple_build_assign (tem, BIT_FIELD_REF,
5558 build3 (BIT_FIELD_REF, eltype,
5559 new_temp, TYPE_SIZE (eltype),
5560 bitsize_int (bitsize)));
5561 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5562 dst2 = make_ssa_name (vectype1);
5563 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5564 build1 (VIEW_CONVERT_EXPR,
5565 vectype1, tem));
5566 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5567 }
5568
5569 new_temp = make_ssa_name (vectype1);
5570 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5571 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5572 new_phis[0] = epilog_stmt;
5573 }
5574
5575 if (reduce_with_shift && !slp_reduc)
5576 {
5577 int element_bitsize = tree_to_uhwi (bitsize);
5578 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5579 for variable-length vectors and also requires direct target support
5580 for loop reductions. */
5581 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5582 int nelements = vec_size_in_bits / element_bitsize;
5583 vec_perm_builder sel;
5584 vec_perm_indices indices;
5585
5586 int elt_offset;
5587
5588 tree zero_vec = build_zero_cst (vectype1);
5589 /* Case 2: Create:
5590 for (offset = nelements/2; offset >= 1; offset/=2)
5591 {
5592 Create: va' = vec_shift <va, offset>
5593 Create: va = vop <va, va'>
5594 } */
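/* E.g. for a four-element PLUS reduction this computes, schematically:
     va = {a,b,c,d}
     va = va + vec_shift <va, 2>   -> { a+c, b+d, ., . }
     va = va + vec_shift <va, 1>   -> { a+b+c+d, ., ., . }
   and the scalar result is then extracted from element 0 below. */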
5595
5596 tree rhs;
5597
5598 if (dump_enabled_p ())
5599 dump_printf_loc (MSG_NOTE, vect_location,
5600 "Reduce using vector shifts\n");
5601
5602 gimple_seq stmts = NULL;
5603 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5604 for (elt_offset = nelements / 2;
5605 elt_offset >= 1;
5606 elt_offset /= 2)
5607 {
5608 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5609 indices.new_vector (sel, 2, nelements);
5610 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5611 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5612 new_temp, zero_vec, mask);
5613 new_temp = gimple_build (&stmts, code,
5614 vectype1, new_name, new_temp);
5615 }
5616 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5617
5618 /* 2.4 Extract the final scalar result. Create:
5619 s_out3 = extract_field <v_out2, bitpos> */
5620
5621 if (dump_enabled_p ())
5622 dump_printf_loc (MSG_NOTE, vect_location,
5623 "extract scalar result\n");
5624
5625 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5626 bitsize, bitsize_zero_node);
5627 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5628 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5629 gimple_assign_set_lhs (epilog_stmt, new_temp);
5630 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5631 scalar_results.safe_push (new_temp);
5632 }
5633 else
5634 {
5635 /* Case 3: Create:
5636 s = extract_field <v_out2, 0>
5637 for (offset = element_size;
5638 offset < vector_size;
5639 offset += element_size;)
5640 {
5641 Create: s' = extract_field <v_out2, offset>
5642 Create: s = op <s, s'> // For non SLP cases
5643 } */
5644
5645 if (dump_enabled_p ())
5646 dump_printf_loc (MSG_NOTE, vect_location,
5647 "Reduce using scalar code.\n");
5648
5649 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5650 int element_bitsize = tree_to_uhwi (bitsize);
5651 tree compute_type = TREE_TYPE (vectype);
5652 gimple_seq stmts = NULL;
5653 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5654 {
5655 int bit_offset;
5656 if (gimple_code (new_phi) == GIMPLE_PHI)
5657 vec_temp = PHI_RESULT (new_phi);
5658 else
5659 vec_temp = gimple_assign_lhs (new_phi);
5660 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5661 vec_temp, bitsize, bitsize_zero_node);
5662
5663 /* In SLP we don't need to apply reduction operation, so we just
5664 collect s' values in SCALAR_RESULTS. */
5665 if (slp_reduc)
5666 scalar_results.safe_push (new_temp);
5667
5668 for (bit_offset = element_bitsize;
5669 bit_offset < vec_size_in_bits;
5670 bit_offset += element_bitsize)
5671 {
5672 tree bitpos = bitsize_int (bit_offset);
5673 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5674 compute_type, vec_temp,
5675 bitsize, bitpos);
5676 if (slp_reduc)
5677 {
5678 /* In SLP we don't need to apply reduction operation, so
5679 we just collect s' values in SCALAR_RESULTS. */
5680 new_temp = new_name;
5681 scalar_results.safe_push (new_name);
5682 }
5683 else
5684 new_temp = gimple_build (&stmts, code, compute_type,
5685 new_name, new_temp);
5686 }
5687 }
5688
5689 /* The only case where we need to reduce scalar results in SLP is
5690 unrolling. If the size of SCALAR_RESULTS is greater than
5691 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5692 REDUC_GROUP_SIZE. */
5693 if (slp_reduc)
5694 {
5695 tree res, first_res, new_res;
5696
5697 /* Reduce multiple scalar results in case of SLP unrolling. */
5698 for (j = group_size; scalar_results.iterate (j, &res);
5699 j++)
5700 {
5701 first_res = scalar_results[j % group_size];
5702 new_res = gimple_build (&stmts, code, compute_type,
5703 first_res, res);
5704 scalar_results[j % group_size] = new_res;
5705 }
5706 for (k = 0; k < group_size; k++)
5707 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5708 scalar_results[k]);
5709 }
5710 else
5711 {
5712 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5713 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5714 scalar_results.safe_push (new_temp);
5715 }
5716
5717 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5718 }
5719
5720 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5721 && induc_val)
5722 {
5723 /* Earlier we set the initial value to be a vector of induc_val
5724 values. Check the result and if it is induc_val then replace
5725 it with the original initial value, unless induc_val is
5726 the same as initial_def already. */
5727 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5728 induc_val);
5729
5730 tree tmp = make_ssa_name (new_scalar_dest);
5731 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5732 initial_def, new_temp);
5733 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5734 scalar_results[0] = tmp;
5735 }
5736 }
5737
5738 /* 2.5 Adjust the final result by the initial value of the reduction
5739 variable. (When such adjustment is not needed, then
5740 'adjustment_def' is zero). For example, if code is PLUS we create:
5741 new_temp = loop_exit_def + adjustment_def */
5742
5743 if (adjustment_def)
5744 {
5745 gcc_assert (!slp_reduc);
5746 gimple_seq stmts = NULL;
5747 if (nested_in_vect_loop)
5748 {
5749 new_phi = new_phis[0];
5750 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5751 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5752 new_temp = gimple_build (&stmts, code, vectype,
5753 PHI_RESULT (new_phi), adjustment_def);
5754 }
5755 else
5756 {
5757 new_temp = scalar_results[0];
5758 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5759 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5760 new_temp = gimple_build (&stmts, code, scalar_type,
5761 new_temp, adjustment_def);
5762 }
5763
5764 epilog_stmt = gimple_seq_last_stmt (stmts);
5765 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5766 if (nested_in_vect_loop)
5767 {
5768 if (!double_reduc)
5769 scalar_results.quick_push (new_temp);
5770 else
5771 scalar_results[0] = new_temp;
5772 }
5773 else
5774 scalar_results[0] = new_temp;
5775
5776 new_phis[0] = epilog_stmt;
5777 }
5778
5779 if (double_reduc)
5780 loop = loop->inner;
5781
5782 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5783 phis with new adjusted scalar results, i.e., replace use <s_out0>
5784 with use <s_out4>.
5785
5786 Transform:
5787 loop_exit:
5788 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5789 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5790 v_out2 = reduce <v_out1>
5791 s_out3 = extract_field <v_out2, 0>
5792 s_out4 = adjust_result <s_out3>
5793 use <s_out0>
5794 use <s_out0>
5795
5796 into:
5797
5798 loop_exit:
5799 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5800 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5801 v_out2 = reduce <v_out1>
5802 s_out3 = extract_field <v_out2, 0>
5803 s_out4 = adjust_result <s_out3>
5804 use <s_out4>
5805 use <s_out4> */
5806
5807
5808 /* In an SLP reduction chain we reduce vector results into one vector if
5809 necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5810 LHS of the last stmt in the reduction chain, since we are looking for
5811 the loop exit phi node. */
5812 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5813 {
5814 stmt_vec_info dest_stmt_info
5815 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5816 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5817 group_size = 1;
5818 }
5819
5820 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5821 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5822 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5823 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5824 correspond to the first vector stmt, etc.
5825 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5826 if (group_size > new_phis.length ())
5827 gcc_assert (!(group_size % new_phis.length ()));
5828
5829 for (k = 0; k < group_size; k++)
5830 {
5831 if (slp_reduc)
5832 {
5833 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5834
5835 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5836 /* SLP statements can't participate in patterns. */
5837 gcc_assert (!orig_stmt_info);
5838 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5839 }
5840
5841 if (nested_in_vect_loop)
5842 {
5843 if (double_reduc)
5844 loop = outer_loop;
5845 else
5846 gcc_unreachable ();
5847 }
5848
5849 phis.create (3);
5850 /* Find the loop-closed-use at the loop exit of the original scalar
5851 result. (The reduction result is expected to have two immediate uses,
5852 one at the latch block, and one at the loop exit). For double
5853 reductions we are looking for exit phis of the outer loop. */
5854 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5855 {
5856 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5857 {
5858 if (!is_gimple_debug (USE_STMT (use_p)))
5859 phis.safe_push (USE_STMT (use_p));
5860 }
5861 else
5862 {
5863 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5864 {
5865 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5866
5867 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5868 {
5869 if (!flow_bb_inside_loop_p (loop,
5870 gimple_bb (USE_STMT (phi_use_p)))
5871 && !is_gimple_debug (USE_STMT (phi_use_p)))
5872 phis.safe_push (USE_STMT (phi_use_p));
5873 }
5874 }
5875 }
5876 }
5877
5878 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5879 {
5880 /* Replace the uses: */
5881 orig_name = PHI_RESULT (exit_phi);
5882 scalar_result = scalar_results[k];
5883 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5884 {
5885 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5886 SET_USE (use_p, scalar_result);
5887 update_stmt (use_stmt);
5888 }
5889 }
5890
5891 phis.release ();
5892 }
5893 }
5894
5895 /* Return a vector of type VECTYPE that is equal to the vector select
5896 operation "MASK ? VEC : IDENTITY". Insert the select statements
5897 before GSI. */
5898
5899 static tree
5900 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5901 tree vec, tree identity)
5902 {
5903 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5904 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5905 mask, vec, identity);
5906 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5907 return cond;
5908 }
5909
5910 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5911 order, starting with LHS. Insert the extraction statements before GSI and
5912 associate the new scalar SSA names with variable SCALAR_DEST.
5913 Return the SSA name for the result. */
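/* For a four-element VECTOR_RHS this amounts to
     ((((LHS CODE v[0]) CODE v[1]) CODE v[2]) CODE v[3]),
   with one BIT_FIELD_REF extraction per element. */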
5914
5915 static tree
5916 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5917 tree_code code, tree lhs, tree vector_rhs)
5918 {
5919 tree vectype = TREE_TYPE (vector_rhs);
5920 tree scalar_type = TREE_TYPE (vectype);
5921 tree bitsize = TYPE_SIZE (scalar_type);
5922 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5923 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5924
5925 for (unsigned HOST_WIDE_INT bit_offset = 0;
5926 bit_offset < vec_size_in_bits;
5927 bit_offset += element_bitsize)
5928 {
5929 tree bitpos = bitsize_int (bit_offset);
5930 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5931 bitsize, bitpos);
5932
5933 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5934 rhs = make_ssa_name (scalar_dest, stmt);
5935 gimple_assign_set_lhs (stmt, rhs);
5936 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5937
5938 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5939 tree new_name = make_ssa_name (scalar_dest, stmt);
5940 gimple_assign_set_lhs (stmt, new_name);
5941 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5942 lhs = new_name;
5943 }
5944 return lhs;
5945 }
5946
5947 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5948 type of the vector input. */
5949
5950 static internal_fn
5951 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5952 {
5953 internal_fn mask_reduc_fn;
5954
5955 switch (reduc_fn)
5956 {
5957 case IFN_FOLD_LEFT_PLUS:
5958 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5959 break;
5960
5961 default:
5962 return IFN_LAST;
5963 }
5964
5965 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5966 OPTIMIZE_FOR_SPEED))
5967 return mask_reduc_fn;
5968 return IFN_LAST;
5969 }
5970
5971 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5972 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5973 statement. CODE is the operation performed by STMT_INFO and OPS are
5974 its scalar operands. REDUC_INDEX is the index of the operand in
5975 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5976 implements in-order reduction, or IFN_LAST if we should open-code it.
5977 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5978 that should be used to control the operation in a fully-masked loop. */
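/* Schematically, an in-order reduction computes
     res = init;
     for each vector def v: for each element e of v (in order):
       res = res CODE e;
   either via REDUC_FN / MASK_REDUC_FN or, if neither is available, via the
   open-coded expansion in vect_expand_fold_left. */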
5979
5980 static bool
5981 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
5982 stmt_vec_info stmt_info,
5983 gimple_stmt_iterator *gsi,
5984 gimple **vec_stmt, slp_tree slp_node,
5985 gimple *reduc_def_stmt,
5986 tree_code code, internal_fn reduc_fn,
5987 tree ops[3], tree vectype_in,
5988 int reduc_index, vec_loop_masks *masks)
5989 {
5990 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5991 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5992 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5993
5994 int ncopies;
5995 if (slp_node)
5996 ncopies = 1;
5997 else
5998 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5999
6000 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6001 gcc_assert (ncopies == 1);
6002 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6003
6004 if (slp_node)
6005 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6006 TYPE_VECTOR_SUBPARTS (vectype_in)));
6007
6008 tree op0 = ops[1 - reduc_index];
6009
6010 int group_size = 1;
6011 stmt_vec_info scalar_dest_def_info;
6012 auto_vec<tree> vec_oprnds0;
6013 if (slp_node)
6014 {
6015 auto_vec<vec<tree> > vec_defs (2);
6016 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6017 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6018 vec_defs[0].release ();
6019 vec_defs[1].release ();
6020 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6021 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6022 }
6023 else
6024 {
6025 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6026 op0, &vec_oprnds0);
6027 scalar_dest_def_info = stmt_info;
6028 }
6029
6030 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6031 tree scalar_type = TREE_TYPE (scalar_dest);
6032 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6033
6034 int vec_num = vec_oprnds0.length ();
6035 gcc_assert (vec_num == 1 || slp_node);
6036 tree vec_elem_type = TREE_TYPE (vectype_out);
6037 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6038
6039 tree vector_identity = NULL_TREE;
6040 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6041 vector_identity = build_zero_cst (vectype_out);
6042
6043 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6044 int i;
6045 tree def0;
6046 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6047 {
6048 gimple *new_stmt;
6049 tree mask = NULL_TREE;
6050 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6051 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6052
6053 /* Handle MINUS by adding the negative. */
6054 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6055 {
6056 tree negated = make_ssa_name (vectype_out);
6057 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6058 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6059 def0 = negated;
6060 }
6061
6062 if (mask && mask_reduc_fn == IFN_LAST)
6063 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6064 vector_identity);
6065
6066 /* On the first iteration the input is simply the scalar phi
6067 result, and for subsequent iterations it is the output of
6068 the preceding operation. */
6069 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6070 {
6071 if (mask && mask_reduc_fn != IFN_LAST)
6072 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6073 def0, mask);
6074 else
6075 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6076 def0);
6077 /* For chained SLP reductions the output of the previous reduction
6078 operation serves as the input of the next. For the final statement
6079 the output cannot be a temporary - we reuse the original
6080 scalar destination of the last statement. */
6081 if (i != vec_num - 1)
6082 {
6083 gimple_set_lhs (new_stmt, scalar_dest_var);
6084 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6085 gimple_set_lhs (new_stmt, reduc_var);
6086 }
6087 }
6088 else
6089 {
6090 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6091 reduc_var, def0);
6092 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6093 /* Remove the statement, so that we can use the same code paths
6094 as for statements that we've just created. */
6095 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6096 gsi_remove (&tmp_gsi, true);
6097 }
6098
6099 if (i == vec_num - 1)
6100 {
6101 gimple_set_lhs (new_stmt, scalar_dest);
6102 vect_finish_replace_stmt (loop_vinfo,
6103 scalar_dest_def_info,
6104 new_stmt);
6105 }
6106 else
6107 vect_finish_stmt_generation (loop_vinfo,
6108 scalar_dest_def_info,
6109 new_stmt, gsi);
6110
6111 if (slp_node)
6112 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6113 else
6114 {
6115 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6116 *vec_stmt = new_stmt;
6117 }
6118 }
6119
6120 return true;
6121 }
6122
6123 /* Function is_nonwrapping_integer_induction.
6124
6125 Check if STMT_VINFO (which is part of loop LOOP) is an induction that
6126 both increments and does not cause overflow. */
6127
6128 static bool
6129 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6130 {
6131 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6132 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6133 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6134 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6135 widest_int ni, max_loop_value, lhs_max;
6136 wi::overflow_type overflow = wi::OVF_NONE;
6137
6138 /* Make sure the loop is integer based. */
6139 if (TREE_CODE (base) != INTEGER_CST
6140 || TREE_CODE (step) != INTEGER_CST)
6141 return false;
6142
6143 /* Check that the max size of the loop will not wrap. */
6144
6145 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6146 return true;
6147
6148 if (! max_stmt_executions (loop, &ni))
6149 return false;
6150
6151 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6152 &overflow);
6153 if (overflow)
6154 return false;
6155
6156 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6157 TYPE_SIGN (lhs_type), &overflow);
6158 if (overflow)
6159 return false;
6160
6161 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6162 <= TYPE_PRECISION (lhs_type));
6163 }
6164
6165 /* Check if masking can be supported by inserting a conditional expression.
6166 CODE is the code for the operation. COND_FN is the conditional internal
6167 function, if it exists. VECTYPE_IN is the type of the vector input. */
6168 static bool
6169 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6170 tree vectype_in)
6171 {
6172 if (cond_fn != IFN_LAST
6173 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6174 OPTIMIZE_FOR_SPEED))
6175 return false;
6176
6177 switch (code)
6178 {
6179 case DOT_PROD_EXPR:
6180 case SAD_EXPR:
6181 return true;
6182
6183 default:
6184 return false;
6185 }
6186 }
6187
6188 /* Insert a conditional expression to enable masked vectorization. CODE is the
6189 code for the operation. VOP is the array of operands. MASK is the loop
6190 mask. GSI is a statement iterator used to place the new conditional
6191 expression. */
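/* For DOT_PROD_EXPR the inactive lanes of operand 1 are replaced by zero
   so that they contribute nothing to the accumulator; for SAD_EXPR they
   are replaced by operand 0 so that the absolute difference for those
   lanes is zero. */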
6192 static void
6193 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6194 gimple_stmt_iterator *gsi)
6195 {
6196 switch (code)
6197 {
6198 case DOT_PROD_EXPR:
6199 {
6200 tree vectype = TREE_TYPE (vop[1]);
6201 tree zero = build_zero_cst (vectype);
6202 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6203 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6204 mask, vop[1], zero);
6205 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6206 vop[1] = masked_op1;
6207 break;
6208 }
6209
6210 case SAD_EXPR:
6211 {
6212 tree vectype = TREE_TYPE (vop[1]);
6213 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6214 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6215 mask, vop[1], vop[0]);
6216 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6217 vop[1] = masked_op1;
6218 break;
6219 }
6220
6221 default:
6222 gcc_unreachable ();
6223 }
6224 }
6225
6226 /* Function vectorizable_reduction.
6227
6228 Check if STMT_INFO performs a reduction operation that can be vectorized.
6229 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6230 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6231 Return true if STMT_INFO is vectorizable in this way.
6232
6233 This function also handles reduction idioms (patterns) that have been
6234 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6235 may be of this form:
6236 X = pattern_expr (arg0, arg1, ..., X)
6237 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6238 sequence that had been detected and replaced by the pattern-stmt
6239 (STMT_INFO).
6240
6241 This function also handles reduction of condition expressions, for example:
6242 for (int i = 0; i < N; i++)
6243 if (a[i] < value)
6244 last = a[i];
6245 This is handled by vectorising the loop and creating an additional vector
6246 containing the loop indexes for which "a[i] < value" was true. In the
6247 function epilogue this is reduced to a single max value and then used to
6248 index into the vector of results.
6249
6250 In some cases of reduction patterns, the type of the reduction variable X is
6251 different than the type of the other arguments of STMT_INFO.
6252 In such cases, the vectype that is used when transforming STMT_INFO into
6253 a vector stmt is different than the vectype that is used to determine the
6254 vectorization factor, because it consists of a different number of elements
6255 than the actual number of elements that are being operated upon in parallel.
6256
6257 For example, consider an accumulation of shorts into an int accumulator.
6258 On some targets it's possible to vectorize this pattern operating on 8
6259 shorts at a time (hence, the vectype for purposes of determining the
6260 vectorization factor should be V8HI); on the other hand, the vectype that
6261 is used to create the vector form is actually V4SI (the type of the result).
6262
6263 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6264 indicates what is the actual level of parallelism (V8HI in the example), so
6265 that the right vectorization factor would be derived. This vectype
6266 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6267 be used to create the vectorized stmt. The right vectype for the vectorized
6268 stmt is obtained from the type of the result X:
6269 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6270
6271 This means that, contrary to "regular" reductions (or "regular" stmts in
6272 general), the following equation:
6273 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6274 does *NOT* necessarily hold for reduction patterns. */
6275
6276 bool
6277 vectorizable_reduction (loop_vec_info loop_vinfo,
6278 stmt_vec_info stmt_info, slp_tree slp_node,
6279 slp_instance slp_node_instance,
6280 stmt_vector_for_cost *cost_vec)
6281 {
6282 tree scalar_dest;
6283 tree vectype_in = NULL_TREE;
6284 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6285 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6286 stmt_vec_info cond_stmt_vinfo = NULL;
6287 tree scalar_type;
6288 int i;
6289 int ncopies;
6290 bool single_defuse_cycle = false;
6291 bool nested_cycle = false;
6292 bool double_reduc = false;
6293 int vec_num;
6294 tree tem;
6295 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6296 tree cond_reduc_val = NULL_TREE;
6297
6298 /* Make sure it was already recognized as a reduction computation. */
6299 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6300 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6301 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6302 return false;
6303
6304 /* The stmt we store reduction analysis meta on. */
6305 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6306 reduc_info->is_reduc_info = true;
6307
6308 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6309 {
6310 if (is_a <gphi *> (stmt_info->stmt))
6311 {
6312 if (slp_node)
6313 {
6314 /* We eventually need to set a vector type on invariant
6315 arguments. */
6316 unsigned j;
6317 slp_tree child;
6318 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6319 if (!vect_maybe_update_slp_op_vectype
6320 (child, SLP_TREE_VECTYPE (slp_node)))
6321 {
6322 if (dump_enabled_p ())
6323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6324 "incompatible vector types for "
6325 "invariants\n");
6326 return false;
6327 }
6328 }
6329 /* Analysis for double-reduction is done on the outer
6330 loop PHI, nested cycles have no further restrictions. */
6331 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6332 }
6333 else
6334 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6335 return true;
6336 }
6337
6338 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6339 stmt_vec_info phi_info = stmt_info;
6340 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6341 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6342 {
6343 if (!is_a <gphi *> (stmt_info->stmt))
6344 {
6345 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6346 return true;
6347 }
6348 if (slp_node)
6349 {
6350 slp_node_instance->reduc_phis = slp_node;
6351 /* ??? We're leaving slp_node to point to the PHIs; we only
6352 need it to get at the number of vector stmts, which wasn't
6353 yet initialized for the instance root. */
6354 }
6355 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6356 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6357 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6358 {
6359 use_operand_p use_p;
6360 gimple *use_stmt;
6361 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6362 &use_p, &use_stmt);
6363 gcc_assert (res);
6364 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6365 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6366 }
6367 }
6368
6369 /* PHIs should not participate in patterns. */
6370 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6371 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6372
6373 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6374 and compute the reduction chain length. Discover the real
6375 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6376 tree reduc_def
6377 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6378 loop_latch_edge
6379 (gimple_bb (reduc_def_phi)->loop_father));
6380 unsigned reduc_chain_length = 0;
6381 bool only_slp_reduc_chain = true;
6382 stmt_info = NULL;
6383 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6384 while (reduc_def != PHI_RESULT (reduc_def_phi))
6385 {
6386 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6387 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6388 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6389 {
6390 if (dump_enabled_p ())
6391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6392 "reduction chain broken by patterns.\n");
6393 return false;
6394 }
6395 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6396 only_slp_reduc_chain = false;
6397 /* ??? For epilogue generation live members of the chain need
6398 to point back to the PHI via their original stmt for
6399 info_for_reduction to work. */
6400 if (STMT_VINFO_LIVE_P (vdef))
6401 STMT_VINFO_REDUC_DEF (def) = phi_info;
6402 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6403 if (!assign)
6404 {
6405 if (dump_enabled_p ())
6406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6407 "reduction chain includes calls.\n");
6408 return false;
6409 }
6410 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6411 {
6412 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6413 TREE_TYPE (gimple_assign_rhs1 (assign))))
6414 {
6415 if (dump_enabled_p ())
6416 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6417 "conversion in the reduction chain.\n");
6418 return false;
6419 }
6420 }
6421 else if (!stmt_info)
6422 /* First non-conversion stmt. */
6423 stmt_info = vdef;
6424 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6425 reduc_chain_length++;
6426 if (!stmt_info && slp_node)
6427 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6428 }
6429 /* PHIs should not participate in patterns. */
6430 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6431
6432 if (nested_in_vect_loop_p (loop, stmt_info))
6433 {
6434 loop = loop->inner;
6435 nested_cycle = true;
6436 }
6437
6438 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6439 element. */
6440 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6441 {
6442 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6443 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6444 }
6445 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6446 gcc_assert (slp_node
6447 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6448
6449 /* 1. Is vectorizable reduction? */
6450 /* Not supportable if the reduction variable is used in the loop, unless
6451 it's a reduction chain. */
6452 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6453 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6454 return false;
6455
6456 /* Reductions that are not used even in an enclosing outer-loop
6457 are expected to be "live" (used out of the loop). */
6458 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6459 && !STMT_VINFO_LIVE_P (stmt_info))
6460 return false;
6461
6462 /* 2. Has this been recognized as a reduction pattern?
6463
6464 Check if STMT represents a pattern that has been recognized
6465 in earlier analysis stages. For stmts that represent a pattern,
6466 the STMT_VINFO_RELATED_STMT field records the last stmt in
6467 the original sequence that constitutes the pattern. */
6468
6469 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6470 if (orig_stmt_info)
6471 {
6472 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6473 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6474 }
6475
6476 /* 3. Check the operands of the operation. The first operands are defined
6477 inside the loop body. The last operand is the reduction variable,
6478 which is defined by the loop-header-phi. */
6479
6480 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6481 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6482 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6483 enum tree_code code = gimple_assign_rhs_code (stmt);
6484 bool lane_reduc_code_p
6485 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
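  /* Illustrative sketch (not taken from this function): a lane-reducing
     reduction such as DOT_PROD_EXPR typically comes from source like

       short a[N], b[N]; int sum;
       for (i = 0; i < N; i++)
         sum += (int) a[i] * (int) b[i];

     where a single vector statement accumulates several narrow input
     lanes into fewer, wider accumulator lanes.  */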
6486 int op_type = TREE_CODE_LENGTH (code);
6487
6488 scalar_dest = gimple_assign_lhs (stmt);
6489 scalar_type = TREE_TYPE (scalar_dest);
6490 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6491 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6492 return false;
6493
6494 /* Do not try to vectorize bit-precision reductions. */
6495 if (!type_has_mode_precision_p (scalar_type))
6496 return false;
6497
6498 /* For lane-reducing ops we're reducing the number of reduction PHIs,
6499 which means the only use of the PHI result may be in the lane-reducing operation. */
6500 if (lane_reduc_code_p
6501 && reduc_chain_length != 1
6502 && !only_slp_reduc_chain)
6503 {
6504 if (dump_enabled_p ())
6505 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6506 "lane-reducing reduction with extra stmts.\n");
6507 return false;
6508 }
6509
6510 /* All uses but the last are expected to be defined in the loop.
6511 The last use is the reduction variable. In case of nested cycle this
6512 assumption is not true: we use reduc_index to record the index of the
6513 reduction variable. */
6514 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6515 /* We need to skip an extra operand for COND_EXPRs with embedded
6516 comparison. */
6517 unsigned opno_adjust = 0;
6518 if (code == COND_EXPR
6519 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6520 opno_adjust = 1;
6521 for (i = 0; i < op_type; i++)
6522 {
6523 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6524 if (i == 0 && code == COND_EXPR)
6525 continue;
6526
6527 stmt_vec_info def_stmt_info;
6528 enum vect_def_type dt;
6529 tree op;
6530 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6531 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6532 &def_stmt_info))
6533 {
6534 if (dump_enabled_p ())
6535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6536 "use not simple.\n");
6537 return false;
6538 }
6539 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6540 continue;
6541
6542 /* There should be only one cycle def in the stmt, the one
6543 leading to reduc_def. */
6544 if (VECTORIZABLE_CYCLE_DEF (dt))
6545 return false;
6546
6547 /* To properly compute ncopies we are interested in the widest
6548 non-reduction input type in case we're looking at a widening
6549 accumulation that we later handle in vect_transform_reduction. */
6550 if (lane_reduc_code_p
6551 && tem
6552 && (!vectype_in
6553 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6554 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6555 vectype_in = tem;
6556
6557 if (code == COND_EXPR)
6558 {
6559 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6560 if (dt == vect_constant_def)
6561 {
6562 cond_reduc_dt = dt;
6563 cond_reduc_val = op;
6564 }
6565 if (dt == vect_induction_def
6566 && def_stmt_info
6567 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6568 {
6569 cond_reduc_dt = dt;
6570 cond_stmt_vinfo = def_stmt_info;
6571 }
6572 }
6573 }
6574 if (!vectype_in)
6575 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6576 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6577
6578 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6579 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6580 /* If we have a condition reduction, see if we can simplify it further. */
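  /* Illustrative example of a condition reduction (variable names made up):

       for (i = 0; i < N; i++)
         if (a[i] < min_val)
           last = i;

     the scalar result is selected by a condition rather than accumulated,
     so the epilogue roughly has to recover the value from the lane of the
     last iteration for which the condition held.  */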
6581 if (v_reduc_type == COND_REDUCTION)
6582 {
6583 if (slp_node)
6584 return false;
6585
6586 /* Fail when the condition of the COND_EXPR uses the reduction value itself. */
6587 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6588 {
6589 if (dump_enabled_p ())
6590 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6591 "condition depends on previous iteration\n");
6592 return false;
6593 }
6594
6595 if (reduc_chain_length == 1
6596 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6597 vectype_in, OPTIMIZE_FOR_SPEED))
6598 {
6599 if (dump_enabled_p ())
6600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6601 "optimizing condition reduction with"
6602 " FOLD_EXTRACT_LAST.\n");
6603 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6604 }
6605 else if (cond_reduc_dt == vect_induction_def)
6606 {
6607 tree base
6608 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6609 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6610
6611 gcc_assert (TREE_CODE (base) == INTEGER_CST
6612 && TREE_CODE (step) == INTEGER_CST);
6613 cond_reduc_val = NULL_TREE;
6614 enum tree_code cond_reduc_op_code = ERROR_MARK;
6615 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6616 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6617 ;
6618 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6619 above base; punt if base is the minimum value of the type for
6620 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6621 else if (tree_int_cst_sgn (step) == -1)
6622 {
6623 cond_reduc_op_code = MIN_EXPR;
6624 if (tree_int_cst_sgn (base) == -1)
6625 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6626 else if (tree_int_cst_lt (base,
6627 TYPE_MAX_VALUE (TREE_TYPE (base))))
6628 cond_reduc_val
6629 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6630 }
6631 else
6632 {
6633 cond_reduc_op_code = MAX_EXPR;
6634 if (tree_int_cst_sgn (base) == 1)
6635 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6636 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6637 base))
6638 cond_reduc_val
6639 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6640 }
6641 if (cond_reduc_val)
6642 {
6643 if (dump_enabled_p ())
6644 dump_printf_loc (MSG_NOTE, vect_location,
6645 "condition expression based on "
6646 "integer induction.\n");
6647 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6648 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6649 = cond_reduc_val;
6650 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6651 }
6652 }
6653 else if (cond_reduc_dt == vect_constant_def)
6654 {
6655 enum vect_def_type cond_initial_dt;
6656 tree cond_initial_val
6657 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6658
6659 gcc_assert (cond_reduc_val != NULL_TREE);
6660 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6661 if (cond_initial_dt == vect_constant_def
6662 && types_compatible_p (TREE_TYPE (cond_initial_val),
6663 TREE_TYPE (cond_reduc_val)))
6664 {
6665 tree e = fold_binary (LE_EXPR, boolean_type_node,
6666 cond_initial_val, cond_reduc_val);
6667 if (e && (integer_onep (e) || integer_zerop (e)))
6668 {
6669 if (dump_enabled_p ())
6670 dump_printf_loc (MSG_NOTE, vect_location,
6671 "condition expression based on "
6672 "compile time constant.\n");
6673 /* Record reduction code at analysis stage. */
6674 STMT_VINFO_REDUC_CODE (reduc_info)
6675 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6676 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6677 }
6678 }
6679 }
6680 }
6681
6682 if (STMT_VINFO_LIVE_P (phi_info))
6683 return false;
6684
6685 if (slp_node)
6686 ncopies = 1;
6687 else
6688 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6689
6690 gcc_assert (ncopies >= 1);
6691
6692 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6693
6694 if (nested_cycle)
6695 {
6696 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6697 == vect_double_reduction_def);
6698 double_reduc = true;
6699 }
6700
6701 /* 4.2. Check support for the epilog operation.
6702
6703 If STMT represents a reduction pattern, then the type of the
6704 reduction variable may be different than the type of the rest
6705 of the arguments. For example, consider the case of accumulation
6706 of shorts into an int accumulator; The original code:
6707 S1: int_a = (int) short_a;
6708 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6709
6710 was replaced with:
6711 STMT: int_acc = widen_sum <short_a, int_acc>
6712
6713 This means that:
6714 1. The tree-code that is used to create the vector operation in the
6715 epilog code (that reduces the partial results) is not the
6716 tree-code of STMT, but is rather the tree-code of the original
6717 stmt from the pattern that STMT is replacing. I.e, in the example
6718 above we want to use 'widen_sum' in the loop, but 'plus' in the
6719 epilog.
6720 2. The type (mode) we use to check available target support
6721 for the vector operation to be created in the *epilog*, is
6722 determined by the type of the reduction variable (in the example
6723 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6724 However the type (mode) we use to check available target support
6725 for the vector operation to be created *inside the loop*, is
6726 determined by the type of the other arguments to STMT (in the
6727 example we'd check this: optab_handler (widen_sum_optab,
6728 vect_short_mode)).
6729
6730 This is contrary to "regular" reductions, in which the types of all
6731 the arguments are the same as the type of the reduction variable.
6732 For "regular" reductions we can therefore use the same vector type
6733 (and also the same tree-code) when generating the epilog code and
6734 when generating the code inside the loop. */
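   As an illustrative sketch only (actual code generation is target
   dependent), the widen_sum example above roughly becomes

     loop (short element mode):  vec_acc = WIDEN_SUM <vec_short_a, vec_acc>
     epilog (int element mode):  int_res = REDUC_PLUS <vec_acc>

   i.e. the loop uses the pattern tree-code while the epilog reduces
   with the original PLUS.  */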
6735
6736 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6737 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6738
6739 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6740 if (reduction_type == TREE_CODE_REDUCTION)
6741 {
6742 /* Check whether it's ok to change the order of the computation.
6743 Generally, when vectorizing a reduction we change the order of the
6744 computation. This may change the behavior of the program in some
6745 cases, so we need to check that this is ok. One exception is when
6746 vectorizing an outer-loop: the inner-loop is executed sequentially,
6747 and therefore vectorizing reductions in the inner-loop during
6748 outer-loop vectorization is safe. */
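  /* For example (illustrative): a float accumulation like

       for (i = 0; i < n; i++)
         s += a[i];

     compiled without -ffast-math must preserve the original evaluation
     order, so it can only be vectorized as an in-order
     (FOLD_LEFT_REDUCTION) sequence; reassociating the additions could
     change the rounded result.  */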
6749 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6750 {
6751 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6752 is not directly used in stmt. */
6753 if (!only_slp_reduc_chain
6754 && reduc_chain_length != 1)
6755 {
6756 if (dump_enabled_p ())
6757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6758 "in-order reduction chain without SLP.\n");
6759 return false;
6760 }
6761 STMT_VINFO_REDUC_TYPE (reduc_info)
6762 = reduction_type = FOLD_LEFT_REDUCTION;
6763 }
6764 else if (!commutative_tree_code (orig_code)
6765 || !associative_tree_code (orig_code))
6766 {
6767 if (dump_enabled_p ())
6768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6769 "reduction: not commutative/associative");
6770 return false;
6771 }
6772 }
6773
6774 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6775 && ncopies > 1)
6776 {
6777 if (dump_enabled_p ())
6778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6779 "multiple types in double reduction or condition "
6780 "reduction or fold-left reduction.\n");
6781 return false;
6782 }
6783
6784 internal_fn reduc_fn = IFN_LAST;
6785 if (reduction_type == TREE_CODE_REDUCTION
6786 || reduction_type == FOLD_LEFT_REDUCTION
6787 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6788 || reduction_type == CONST_COND_REDUCTION)
6789 {
6790 if (reduction_type == FOLD_LEFT_REDUCTION
6791 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6792 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6793 {
6794 if (reduc_fn != IFN_LAST
6795 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6796 OPTIMIZE_FOR_SPEED))
6797 {
6798 if (dump_enabled_p ())
6799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6800 "reduc op not supported by target.\n");
6801
6802 reduc_fn = IFN_LAST;
6803 }
6804 }
6805 else
6806 {
6807 if (!nested_cycle || double_reduc)
6808 {
6809 if (dump_enabled_p ())
6810 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6811 "no reduc code for scalar code.\n");
6812
6813 return false;
6814 }
6815 }
6816 }
6817 else if (reduction_type == COND_REDUCTION)
6818 {
6819 int scalar_precision
6820 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6821 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6822 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6823 nunits_out);
6824
6825 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6826 OPTIMIZE_FOR_SPEED))
6827 reduc_fn = IFN_REDUC_MAX;
6828 }
6829 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6830
6831 if (reduction_type != EXTRACT_LAST_REDUCTION
6832 && (!nested_cycle || double_reduc)
6833 && reduc_fn == IFN_LAST
6834 && !nunits_out.is_constant ())
6835 {
6836 if (dump_enabled_p ())
6837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6838 "missing target support for reduction on"
6839 " variable-length vectors.\n");
6840 return false;
6841 }
6842
6843 /* For SLP reductions, see if there is a neutral value we can use. */
6844 tree neutral_op = NULL_TREE;
6845 if (slp_node)
6846 neutral_op = neutral_op_for_slp_reduction
6847 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6848 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
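  /* The neutral value is the identity element of the reduction operation,
     e.g. 0 for addition, 1 for multiplication and all-ones for bitwise
     AND (illustrative examples; see neutral_op_for_slp_reduction for the
     exact mapping).  */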
6849
6850 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6851 {
6852 /* We can't support in-order reductions of code such as this:
6853
6854 for (int i = 0; i < n1; ++i)
6855 for (int j = 0; j < n2; ++j)
6856 l += a[j];
6857
6858 since GCC effectively transforms the loop when vectorizing:
6859
6860 for (int i = 0; i < n1 / VF; ++i)
6861 for (int j = 0; j < n2; ++j)
6862 for (int k = 0; k < VF; ++k)
6863 l += a[j];
6864
6865 which is a reassociation of the original operation. */
6866 if (dump_enabled_p ())
6867 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6868 "in-order double reduction not supported.\n");
6869
6870 return false;
6871 }
6872
6873 if (reduction_type == FOLD_LEFT_REDUCTION
6874 && slp_node
6875 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6876 {
6877 /* We cannot use in-order reductions in this case because there is
6878 an implicit reassociation of the operations involved. */
6879 if (dump_enabled_p ())
6880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6881 "in-order unchained SLP reductions not supported.\n");
6882 return false;
6883 }
6884
6885 /* For double reductions, and for SLP reductions with a neutral value,
6886 we construct a variable-length initial vector by loading a vector
6887 full of the neutral value and then shift-and-inserting the start
6888 values into the low-numbered elements. */
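  /* Illustrative sketch for an add reduction with start value 'init':

       vec_init = { 0, 0, ..., 0 }                  (splat of the neutral value)
       vec_init = VEC_SHL_INSERT <vec_init, init>   ('init' lands in a low lane)

     so a single lane carries the start value and every other lane
     contributes the neutral element.  */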
6889 if ((double_reduc || neutral_op)
6890 && !nunits_out.is_constant ()
6891 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6892 vectype_out, OPTIMIZE_FOR_SPEED))
6893 {
6894 if (dump_enabled_p ())
6895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6896 "reduction on variable-length vectors requires"
6897 " target support for a vector-shift-and-insert"
6898 " operation.\n");
6899 return false;
6900 }
6901
6902 /* Check extra constraints for variable-length unchained SLP reductions. */
6903 if (STMT_SLP_TYPE (stmt_info)
6904 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6905 && !nunits_out.is_constant ())
6906 {
6907 /* We checked above that we could build the initial vector when
6908 there's a neutral element value. Check here for the case in
6909 which each SLP statement has its own initial value and in which
6910 that value needs to be repeated for every instance of the
6911 statement within the initial vector. */
6912 unsigned int group_size = SLP_TREE_LANES (slp_node);
6913 if (!neutral_op
6914 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6915 TREE_TYPE (vectype_out)))
6916 {
6917 if (dump_enabled_p ())
6918 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6919 "unsupported form of SLP reduction for"
6920 " variable-length vectors: cannot build"
6921 " initial vector.\n");
6922 return false;
6923 }
6924 /* The epilogue code relies on the number of elements being a multiple
6925 of the group size. The duplicate-and-interleave approach to setting
6926 up the initial vector does too. */
6927 if (!multiple_p (nunits_out, group_size))
6928 {
6929 if (dump_enabled_p ())
6930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6931 "unsupported form of SLP reduction for"
6932 " variable-length vectors: the vector size"
6933 " is not a multiple of the number of results.\n");
6934 return false;
6935 }
6936 }
6937
6938 if (reduction_type == COND_REDUCTION)
6939 {
6940 widest_int ni;
6941
6942 if (! max_loop_iterations (loop, &ni))
6943 {
6944 if (dump_enabled_p ())
6945 dump_printf_loc (MSG_NOTE, vect_location,
6946 "loop count not known, cannot create cond "
6947 "reduction.\n");
6948 return false;
6949 }
6950 /* Convert backedges to iterations. */
6951 ni += 1;
6952
6953 /* The additional index will be the same type as the condition. Check
6954 that the loop iteration count fits into this type less one (the
6955 zero slot is reserved for the case of no matches). */
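  /* E.g. (illustrative) for a 16-bit reduction type the unsigned index
     type holds values up to 65535, so loops with an iteration count of
     65535 or more are rejected below.  */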
6956 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6957 if (wi::geu_p (ni, wi::to_widest (max_index)))
6958 {
6959 if (dump_enabled_p ())
6960 dump_printf_loc (MSG_NOTE, vect_location,
6961 "loop size is greater than data size.\n");
6962 return false;
6963 }
6964 }
6965
6966 /* In case the vectorization factor (VF) is bigger than the number
6967 of elements that we can fit in a vectype (nunits), we have to generate
6968 more than one vector stmt, i.e. we need to "unroll" the
6969 vector stmt by a factor VF/nunits. For more details see documentation
6970 in vectorizable_operation. */
6971
6972 /* If the reduction is used in an outer loop we need to generate
6973 VF intermediate results, like so (e.g. for ncopies=2):
6974 r0 = phi (init, r0)
6975 r1 = phi (init, r1)
6976 r0 = x0 + r0;
6977 r1 = x1 + r1;
6978 (i.e. we generate VF results in 2 registers).
6979 In this case we have a separate def-use cycle for each copy, and therefore
6980 for each copy we get the vector def for the reduction variable from the
6981 respective phi node created for this copy.
6982
6983 Otherwise (the reduction is unused in the loop nest), we can combine
6984 together intermediate results, like so (e.g. for ncopies=2):
6985 r = phi (init, r)
6986 r = x0 + r;
6987 r = x1 + r;
6988 (i.e. we generate VF/2 results in a single register).
6989 In this case for each copy we get the vector def for the reduction variable
6990 from the vectorized reduction operation generated in the previous iteration.
6991
6992 This only works when we see both the reduction PHI and its only consumer
6993 in vectorizable_reduction and there are no intermediate stmts
6994 participating. */
6995 if (ncopies > 1
6996 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6997 && reduc_chain_length == 1)
6998 single_defuse_cycle = true;
6999
7000 if (single_defuse_cycle || lane_reduc_code_p)
7001 {
7002 gcc_assert (code != COND_EXPR);
7003
7004 /* 4. Supportable by target? */
7005 bool ok = true;
7006
7007 /* 4.1. check support for the operation in the loop */
7008 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
7009 if (!optab)
7010 {
7011 if (dump_enabled_p ())
7012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7013 "no optab.\n");
7014 ok = false;
7015 }
7016
7017 machine_mode vec_mode = TYPE_MODE (vectype_in);
7018 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7019 {
7020 if (dump_enabled_p ())
7021 dump_printf (MSG_NOTE, "op not supported by target.\n");
7022 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7023 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
7024 ok = false;
7025 else
7026 if (dump_enabled_p ())
7027 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7028 }
7029
7030 /* Worthwhile without SIMD support? */
7031 if (ok
7032 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
7033 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
7034 {
7035 if (dump_enabled_p ())
7036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7037 "not worthwhile without SIMD support.\n");
7038 ok = false;
7039 }
7040
7041 /* lane-reducing operations have to go through vect_transform_reduction.
7042 For the other cases try without the single cycle optimization. */
7043 if (!ok)
7044 {
7045 if (lane_reduc_code_p)
7046 return false;
7047 else
7048 single_defuse_cycle = false;
7049 }
7050 }
7051 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7052
7053 /* If the reduction stmt is one of the patterns that have lane
7054 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7055 if ((ncopies > 1 && ! single_defuse_cycle)
7056 && lane_reduc_code_p)
7057 {
7058 if (dump_enabled_p ())
7059 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7060 "multi def-use cycle not possible for lane-reducing "
7061 "reduction operation\n");
7062 return false;
7063 }
7064
7065 if (slp_node
7066 && !(!single_defuse_cycle
7067 && code != DOT_PROD_EXPR
7068 && code != WIDEN_SUM_EXPR
7069 && code != SAD_EXPR
7070 && reduction_type != FOLD_LEFT_REDUCTION))
7071 for (i = 0; i < op_type; i++)
7072 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7073 {
7074 if (dump_enabled_p ())
7075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7076 "incompatible vector types for invariants\n");
7077 return false;
7078 }
7079
7080 if (slp_node)
7081 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7082 else
7083 vec_num = 1;
7084
7085 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7086 reduction_type, ncopies, cost_vec);
7087 if (dump_enabled_p ()
7088 && reduction_type == FOLD_LEFT_REDUCTION)
7089 dump_printf_loc (MSG_NOTE, vect_location,
7090 "using an in-order (fold-left) reduction.\n");
7091 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7092 /* All reductions except single defuse-cycle optimized, lane-reducing and
7093 fold-left ones go through their own vectorizable_* routines. */
7094 if (!single_defuse_cycle
7095 && code != DOT_PROD_EXPR
7096 && code != WIDEN_SUM_EXPR
7097 && code != SAD_EXPR
7098 && reduction_type != FOLD_LEFT_REDUCTION)
7099 {
7100 stmt_vec_info tem
7101 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7102 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7103 {
7104 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7105 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7106 }
7107 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7108 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7109 }
7110 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7111 {
7112 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7113 internal_fn cond_fn = get_conditional_internal_fn (code);
7114
7115 if (reduction_type != FOLD_LEFT_REDUCTION
7116 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7117 && (cond_fn == IFN_LAST
7118 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7119 OPTIMIZE_FOR_SPEED)))
7120 {
7121 if (dump_enabled_p ())
7122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7123 "can't operate on partial vectors because"
7124 " no conditional operation is available.\n");
7125 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7126 }
7127 else if (reduction_type == FOLD_LEFT_REDUCTION
7128 && reduc_fn == IFN_LAST
7129 && !expand_vec_cond_expr_p (vectype_in,
7130 truth_type_for (vectype_in),
7131 SSA_NAME))
7132 {
7133 if (dump_enabled_p ())
7134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7135 "can't operate on partial vectors because"
7136 " no conditional operation is available.\n");
7137 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7138 }
7139 else
7140 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7141 vectype_in, NULL);
7142 }
7143 return true;
7144 }
7145
7146 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7147 value. */
7148
7149 bool
7150 vect_transform_reduction (loop_vec_info loop_vinfo,
7151 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7152 gimple **vec_stmt, slp_tree slp_node)
7153 {
7154 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7155 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7156 int i;
7157 int ncopies;
7158 int vec_num;
7159
7160 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7161 gcc_assert (reduc_info->is_reduc_info);
7162
7163 if (nested_in_vect_loop_p (loop, stmt_info))
7164 {
7165 loop = loop->inner;
7166 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7167 }
7168
7169 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7170 enum tree_code code = gimple_assign_rhs_code (stmt);
7171 int op_type = TREE_CODE_LENGTH (code);
7172
7173 /* Flatten RHS. */
7174 tree ops[3];
7175 switch (get_gimple_rhs_class (code))
7176 {
7177 case GIMPLE_TERNARY_RHS:
7178 ops[2] = gimple_assign_rhs3 (stmt);
7179 /* Fall thru. */
7180 case GIMPLE_BINARY_RHS:
7181 ops[0] = gimple_assign_rhs1 (stmt);
7182 ops[1] = gimple_assign_rhs2 (stmt);
7183 break;
7184 default:
7185 gcc_unreachable ();
7186 }
7187
7188 /* All uses but the last are expected to be defined in the loop.
7189 The last use is the reduction variable. In case of nested cycle this
7190 assumption is not true: we use reduc_index to record the index of the
7191 reduction variable. */
7192 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7193 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7194 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7195 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7196
7197 if (slp_node)
7198 {
7199 ncopies = 1;
7200 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7201 }
7202 else
7203 {
7204 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7205 vec_num = 1;
7206 }
7207
7208 internal_fn cond_fn = get_conditional_internal_fn (code);
7209 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7210 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
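  /* Illustrative sketch of the two masking strategies used below for a
     fully-masked loop:

       via conditional IFN (e.g. a PLUS_EXPR reduction):
         acc = IFN_COND_ADD (mask, acc, x, acc)
       via VEC_COND_EXPR (e.g. DOT_PROD_EXPR or SAD_EXPR):
         x' = mask ? x : 0;  acc = DOT_PROD <x', y, acc>

     in both cases masked-off lanes leave the accumulator unchanged.  */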
7211
7212 /* Transform. */
7213 tree new_temp = NULL_TREE;
7214 auto_vec<tree> vec_oprnds0;
7215 auto_vec<tree> vec_oprnds1;
7216 auto_vec<tree> vec_oprnds2;
7217 tree def0;
7218
7219 if (dump_enabled_p ())
7220 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7221
7222 /* FORNOW: Multiple types are not supported for condition. */
7223 if (code == COND_EXPR)
7224 gcc_assert (ncopies == 1);
7225
7226 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7227
7228 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7229 if (reduction_type == FOLD_LEFT_REDUCTION)
7230 {
7231 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7232 return vectorize_fold_left_reduction
7233 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7234 reduc_fn, ops, vectype_in, reduc_index, masks);
7235 }
7236
7237 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7238 gcc_assert (single_defuse_cycle
7239 || code == DOT_PROD_EXPR
7240 || code == WIDEN_SUM_EXPR
7241 || code == SAD_EXPR);
7242
7243 /* Create the destination vector */
7244 tree scalar_dest = gimple_assign_lhs (stmt);
7245 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7246
7247 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7248 single_defuse_cycle && reduc_index == 0
7249 ? NULL_TREE : ops[0], &vec_oprnds0,
7250 single_defuse_cycle && reduc_index == 1
7251 ? NULL_TREE : ops[1], &vec_oprnds1,
7252 op_type == ternary_op
7253 && !(single_defuse_cycle && reduc_index == 2)
7254 ? ops[2] : NULL_TREE, &vec_oprnds2);
7255 if (single_defuse_cycle)
7256 {
7257 gcc_assert (!slp_node);
7258 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7259 ops[reduc_index],
7260 reduc_index == 0 ? &vec_oprnds0
7261 : (reduc_index == 1 ? &vec_oprnds1
7262 : &vec_oprnds2));
7263 }
7264
7265 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7266 {
7267 gimple *new_stmt;
7268 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7269 if (masked_loop_p && !mask_by_cond_expr)
7270 {
7271 /* Make sure that the reduction accumulator is vop[0]. */
7272 if (reduc_index == 1)
7273 {
7274 gcc_assert (commutative_tree_code (code));
7275 std::swap (vop[0], vop[1]);
7276 }
7277 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7278 vectype_in, i);
7279 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7280 vop[0], vop[1], vop[0]);
7281 new_temp = make_ssa_name (vec_dest, call);
7282 gimple_call_set_lhs (call, new_temp);
7283 gimple_call_set_nothrow (call, true);
7284 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7285 new_stmt = call;
7286 }
7287 else
7288 {
7289 if (op_type == ternary_op)
7290 vop[2] = vec_oprnds2[i];
7291
7292 if (masked_loop_p && mask_by_cond_expr)
7293 {
7294 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7295 vectype_in, i);
7296 build_vect_cond_expr (code, vop, mask, gsi);
7297 }
7298
7299 new_stmt = gimple_build_assign (vec_dest, code,
7300 vop[0], vop[1], vop[2]);
7301 new_temp = make_ssa_name (vec_dest, new_stmt);
7302 gimple_assign_set_lhs (new_stmt, new_temp);
7303 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7304 }
7305
7306 if (slp_node)
7307 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7308 else if (single_defuse_cycle
7309 && i < ncopies - 1)
7310 {
7311 if (reduc_index == 0)
7312 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7313 else if (reduc_index == 1)
7314 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7315 else if (reduc_index == 2)
7316 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7317 }
7318 else
7319 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7320 }
7321
7322 if (!slp_node)
7323 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7324
7325 return true;
7326 }
7327
7328 /* Transform phase of a cycle PHI. */
7329
7330 bool
7331 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7332 stmt_vec_info stmt_info, gimple **vec_stmt,
7333 slp_tree slp_node, slp_instance slp_node_instance)
7334 {
7335 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7336 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7337 int i;
7338 int ncopies;
7339 int j;
7340 bool nested_cycle = false;
7341 int vec_num;
7342
7343 if (nested_in_vect_loop_p (loop, stmt_info))
7344 {
7345 loop = loop->inner;
7346 nested_cycle = true;
7347 }
7348
7349 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7350 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7351 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7352 gcc_assert (reduc_info->is_reduc_info);
7353
7354 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7355 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7356 /* Leave the scalar phi in place. */
7357 return true;
7358
7359 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7360 /* For a nested cycle vectype_in is not filled in above; use the PHI's vectype. */
7361 if (!vectype_in)
7362 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7363 gcc_assert (vectype_in);
7364
7365 if (slp_node)
7366 {
7367 /* The size vect_schedule_slp_instance computes is off for us. */
7368 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7369 * SLP_TREE_LANES (slp_node), vectype_in);
7370 ncopies = 1;
7371 }
7372 else
7373 {
7374 vec_num = 1;
7375 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7376 }
7377
7378 /* Check whether we should use a single PHI node and accumulate
7379 vectors to one before the backedge. */
7380 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7381 ncopies = 1;
7382
7383 /* Create the destination vector */
7384 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7385 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7386 vectype_out);
7387
7388 /* Get the loop-entry arguments. */
7389 tree vec_initial_def;
7390 auto_vec<tree> vec_initial_defs;
7391 if (slp_node)
7392 {
7393 vec_initial_defs.reserve (vec_num);
7394 if (nested_cycle)
7395 {
7396 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7397 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7398 &vec_initial_defs);
7399 }
7400 else
7401 {
7402 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7403 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7404 tree neutral_op
7405 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7406 STMT_VINFO_REDUC_CODE (reduc_info),
7407 first != NULL);
7408 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7409 &vec_initial_defs, vec_num,
7410 first != NULL, neutral_op);
7411 }
7412 }
7413 else
7414 {
7415 /* Get at the scalar def before the loop, that defines the initial
7416 value of the reduction variable. */
7417 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7418 loop_preheader_edge (loop));
7419 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7420 and we can't use zero for induc_val, use initial_def. Similarly
7421 for REDUC_MIN and initial_def larger than the base. */
7422 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7423 {
7424 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7425 if (TREE_CODE (initial_def) == INTEGER_CST
7426 && !integer_zerop (induc_val)
7427 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7428 && tree_int_cst_lt (initial_def, induc_val))
7429 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7430 && tree_int_cst_lt (induc_val, initial_def))))
7431 {
7432 induc_val = initial_def;
7433 /* Communicate we used the initial_def to epilogue
7434 generation. */
7435 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7436 }
7437 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7438 vec_initial_defs.create (ncopies);
7439 for (i = 0; i < ncopies; ++i)
7440 vec_initial_defs.quick_push (vec_initial_def);
7441 }
7442 else if (nested_cycle)
7443 {
7444 /* Do not use an adjustment def as that case is not supported
7445 correctly if ncopies is not one. */
7446 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7447 ncopies, initial_def,
7448 &vec_initial_defs);
7449 }
7450 else
7451 {
7452 tree adjustment_def = NULL_TREE;
7453 tree *adjustment_defp = &adjustment_def;
7454 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7455 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7456 adjustment_defp = NULL;
7457 vec_initial_def
7458 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7459 initial_def, adjustment_defp);
7460 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7461 vec_initial_defs.create (ncopies);
7462 for (i = 0; i < ncopies; ++i)
7463 vec_initial_defs.quick_push (vec_initial_def);
7464 }
7465 }
7466
7467 /* Generate the reduction PHIs upfront. */
7468 for (i = 0; i < vec_num; i++)
7469 {
7470 tree vec_init_def = vec_initial_defs[i];
7471 for (j = 0; j < ncopies; j++)
7472 {
7473 /* Create the reduction-phi that defines the reduction
7474 operand. */
7475 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7476
7477 /* Set the loop-entry arg of the reduction-phi. */
7478 if (j != 0 && nested_cycle)
7479 vec_init_def = vec_initial_defs[j];
7480 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7481 UNKNOWN_LOCATION);
7482
7483 /* The loop-latch arg is set in epilogue processing. */
7484
7485 if (slp_node)
7486 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7487 else
7488 {
7489 if (j == 0)
7490 *vec_stmt = new_phi;
7491 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7492 }
7493 }
7494 }
7495
7496 return true;
7497 }
7498
7499 /* Vectorizes LC PHIs. */
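/* A loop-closed (LC) PHI is a single-argument PHI in a block just
   outside the loop, e.g. (illustrative GIMPLE)

     x_4 = PHI <x_3(loop-exit-edge)>

   Vectorizing it amounts to creating a matching single-argument vector
   PHI for each vector def of its argument.  */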
7500
7501 bool
7502 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7503 stmt_vec_info stmt_info, gimple **vec_stmt,
7504 slp_tree slp_node)
7505 {
7506 if (!loop_vinfo
7507 || !is_a <gphi *> (stmt_info->stmt)
7508 || gimple_phi_num_args (stmt_info->stmt) != 1)
7509 return false;
7510
7511 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7512 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7513 return false;
7514
7515 if (!vec_stmt) /* transformation not required. */
7516 {
7517 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7518 return true;
7519 }
7520
7521 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7522 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7523 basic_block bb = gimple_bb (stmt_info->stmt);
7524 edge e = single_pred_edge (bb);
7525 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7526 auto_vec<tree> vec_oprnds;
7527 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7528 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7529 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7530 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7531 {
7532 /* Create the vectorized LC PHI node. */
7533 gphi *new_phi = create_phi_node (vec_dest, bb);
7534 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7535 if (slp_node)
7536 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7537 else
7538 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7539 }
7540 if (!slp_node)
7541 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7542
7543 return true;
7544 }
7545
7546 /* Vectorizes PHIs. */
7547
7548 bool
7549 vectorizable_phi (vec_info *,
7550 stmt_vec_info stmt_info, gimple **vec_stmt,
7551 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7552 {
7553 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7554 return false;
7555
7556 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7557 return false;
7558
7559 tree vectype = SLP_TREE_VECTYPE (slp_node);
7560
7561 if (!vec_stmt) /* transformation not required. */
7562 {
7563 slp_tree child;
7564 unsigned i;
7565 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7566 if (!child)
7567 {
7568 if (dump_enabled_p ())
7569 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7570 "PHI node with unvectorized backedge def\n");
7571 return false;
7572 }
7573 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7574 {
7575 if (dump_enabled_p ())
7576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7577 "incompatible vector types for invariants\n");
7578 return false;
7579 }
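      /* Cost the PHI as one vector_stmt per emitted vector copy in the
         loop body.  */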
7580 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7581 vector_stmt, stmt_info, vectype, 0, vect_body);
7582 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7583 return true;
7584 }
7585
7586 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7587 basic_block bb = gimple_bb (stmt_info->stmt);
7588 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7589 auto_vec<gphi *> new_phis;
7590 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7591 {
7592 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7593
7594 /* Skip not yet vectorized defs. */
7595 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7596 && SLP_TREE_VEC_STMTS (child).is_empty ())
7597 continue;
7598
7599 auto_vec<tree> vec_oprnds;
7600 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7601 if (!new_phis.exists ())
7602 {
7603 new_phis.create (vec_oprnds.length ());
7604 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7605 {
7606 /* Create the vectorized PHI node. */
7607 new_phis.quick_push (create_phi_node (vec_dest, bb));
7608 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7609 }
7610 }
7611 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7612 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7613 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7614 }
7615 /* We should have at least one already vectorized child. */
7616 gcc_assert (new_phis.exists ());
7617
7618 return true;
7619 }
7620
7621
7622 /* Function vect_min_worthwhile_factor.
7623
7624 For a loop where we could vectorize the operation indicated by CODE,
7625 return the minimum vectorization factor that makes it worthwhile
7626 to use generic vectors. */
7627 static unsigned int
7628 vect_min_worthwhile_factor (enum tree_code code)
7629 {
7630 switch (code)
7631 {
7632 case PLUS_EXPR:
7633 case MINUS_EXPR:
7634 case NEGATE_EXPR:
7635 return 4;
7636
7637 case BIT_AND_EXPR:
7638 case BIT_IOR_EXPR:
7639 case BIT_XOR_EXPR:
7640 case BIT_NOT_EXPR:
7641 return 2;
7642
7643 default:
7644 return INT_MAX;
7645 }
7646 }
7647
7648 /* Return true if VINFO indicates we are doing loop vectorization and if
7649 it is worth decomposing CODE operations into scalar operations for
7650 that loop's vectorization factor. */
7651
7652 bool
7653 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7654 {
7655 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7656 unsigned HOST_WIDE_INT value;
7657 return (loop_vinfo
7658 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7659 && value >= vect_min_worthwhile_factor (code));
7660 }
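/* For example (illustrative): with a constant vectorization factor of 4
   a PLUS_EXPR is considered worth decomposing into word-mode scalar
   operations even without real SIMD support, while bitwise operations
   already pay off at a factor of 2.  */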
7661
7662 /* Function vectorizable_induction
7663
7664 Check if STMT_INFO performs an induction computation that can be vectorized.
7665 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7666 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7667 Return true if STMT_INFO is vectorizable in this way. */
7668
7669 bool
7670 vectorizable_induction (loop_vec_info loop_vinfo,
7671 stmt_vec_info stmt_info,
7672 gimple **vec_stmt, slp_tree slp_node,
7673 stmt_vector_for_cost *cost_vec)
7674 {
7675 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7676 unsigned ncopies;
7677 bool nested_in_vect_loop = false;
7678 class loop *iv_loop;
7679 tree vec_def;
7680 edge pe = loop_preheader_edge (loop);
7681 basic_block new_bb;
7682 tree new_vec, vec_init, vec_step, t;
7683 tree new_name;
7684 gimple *new_stmt;
7685 gphi *induction_phi;
7686 tree induc_def, vec_dest;
7687 tree init_expr, step_expr;
7688 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7689 unsigned i;
7690 tree expr;
7691 gimple_stmt_iterator si;
7692
7693 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7694 if (!phi)
7695 return false;
7696
7697 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7698 return false;
7699
7700 /* Make sure it was recognized as induction computation. */
7701 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7702 return false;
7703
7704 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7705 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7706
7707 if (slp_node)
7708 ncopies = 1;
7709 else
7710 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7711 gcc_assert (ncopies >= 1);
7712
7713 /* FORNOW. These restrictions should be relaxed. */
7714 if (nested_in_vect_loop_p (loop, stmt_info))
7715 {
7716 imm_use_iterator imm_iter;
7717 use_operand_p use_p;
7718 gimple *exit_phi;
7719 edge latch_e;
7720 tree loop_arg;
7721
7722 if (ncopies > 1)
7723 {
7724 if (dump_enabled_p ())
7725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7726 "multiple types in nested loop.\n");
7727 return false;
7728 }
7729
7730 exit_phi = NULL;
7731 latch_e = loop_latch_edge (loop->inner);
7732 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7733 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7734 {
7735 gimple *use_stmt = USE_STMT (use_p);
7736 if (is_gimple_debug (use_stmt))
7737 continue;
7738
7739 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7740 {
7741 exit_phi = use_stmt;
7742 break;
7743 }
7744 }
7745 if (exit_phi)
7746 {
7747 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7748 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7749 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7750 {
7751 if (dump_enabled_p ())
7752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7753 "inner-loop induction only used outside "
7754 "of the outer vectorized loop.\n");
7755 return false;
7756 }
7757 }
7758
7759 nested_in_vect_loop = true;
7760 iv_loop = loop->inner;
7761 }
7762 else
7763 iv_loop = loop;
7764 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7765
7766 if (slp_node && !nunits.is_constant ())
7767 {
7768 /* The current SLP code creates the step value element-by-element. */
7769 if (dump_enabled_p ())
7770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7771 "SLP induction not supported for variable-length"
7772 " vectors.\n");
7773 return false;
7774 }
7775
7776 if (!vec_stmt) /* transformation not required. */
7777 {
7778 unsigned inside_cost = 0, prologue_cost = 0;
7779 if (slp_node)
7780 {
7781 /* We eventually need to set a vector type on invariant
7782 arguments. */
7783 unsigned j;
7784 slp_tree child;
7785 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7786 if (!vect_maybe_update_slp_op_vectype
7787 (child, SLP_TREE_VECTYPE (slp_node)))
7788 {
7789 if (dump_enabled_p ())
7790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7791 "incompatible vector types for "
7792 "invariants\n");
7793 return false;
7794 }
7795 /* loop cost for vec_loop. */
7796 inside_cost
7797 = record_stmt_cost (cost_vec,
7798 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7799 vector_stmt, stmt_info, 0, vect_body);
7800 /* prologue cost for vec_init (if not nested) and step. */
7801 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
7802 scalar_to_vec,
7803 stmt_info, 0, vect_prologue);
7804 }
7805 else /* if (!slp_node) */
7806 {
7807 /* loop cost for vec_loop. */
7808 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
7809 stmt_info, 0, vect_body);
7810 /* prologue cost for vec_init and vec_step. */
7811 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
7812 stmt_info, 0, vect_prologue);
7813 }
7814 if (dump_enabled_p ())
7815 dump_printf_loc (MSG_NOTE, vect_location,
7816 "vect_model_induction_cost: inside_cost = %d, "
7817 "prologue_cost = %d .\n", inside_cost,
7818 prologue_cost);
7819
7820 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7821 DUMP_VECT_SCOPE ("vectorizable_induction");
7822 return true;
7823 }
7824
7825 /* Transform. */
7826
7827 /* Compute a vector variable, initialized with the first VF values of
7828 the induction variable. E.g., for an iv with IV_PHI='X' and
7829 evolution S, for a vector of 4 units, we want to compute:
7830 [X, X + S, X + 2*S, X + 3*S]. */
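  /* Minimal illustrative example: for an IV with start value X and
     step S = 3, a VF of 4 gives the initial vector
       [X, X + 3, X + 6, X + 9]
     and a loop-body update that adds [12, 12, 12, 12] (i.e. VF * S)
     each vector iteration.  */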
7831
7832 if (dump_enabled_p ())
7833 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7834
7835 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7836 gcc_assert (step_expr != NULL_TREE);
7837 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7838
7839 pe = loop_preheader_edge (iv_loop);
7840 /* Find the first insertion point in the BB. */
7841 basic_block bb = gimple_bb (phi);
7842 si = gsi_after_labels (bb);
7843
7844 /* For SLP induction we have to generate several IVs as for example
7845 with group size 3 we need
7846 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
7847 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
7848 if (slp_node)
7849 {
7850 /* Enforced above. */
7851 unsigned int const_nunits = nunits.to_constant ();
7852
7853 /* The initial values are vectorized, but any lanes > group_size
7854 need adjustment. */
7855 slp_tree init_node
7856 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
7857
7858 /* Gather steps. Since we do not vectorize inductions as
7859 cycles we have to reconstruct the step from SCEV data. */
7860 unsigned group_size = SLP_TREE_LANES (slp_node);
7861 tree *steps = XALLOCAVEC (tree, group_size);
7862 tree *inits = XALLOCAVEC (tree, group_size);
7863 stmt_vec_info phi_info;
7864 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
7865 {
7866 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
7867 if (!init_node)
7868 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
7869 pe->dest_idx);
7870 }
7871
7872 /* Now generate the IVs. */
7873 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7874 gcc_assert ((const_nunits * nvects) % group_size == 0);
7875 unsigned nivs;
7876 if (nested_in_vect_loop)
7877 nivs = nvects;
7878 else
7879 {
7880 /* Compute the number of distinct IVs we need. First reduce
7881 group_size if it is a multiple of const_nunits so we get
7882 one IV for a group_size of 4 but const_nunits 2. */
7883 unsigned group_sizep = group_size;
7884 if (group_sizep % const_nunits == 0)
7885 group_sizep = group_sizep / const_nunits;
7886 nivs = least_common_multiple (group_sizep,
7887 const_nunits) / const_nunits;
7888 }
7889 tree stept = TREE_TYPE (step_vectype);
7890 tree lupdate_mul = NULL_TREE;
7891 if (!nested_in_vect_loop)
7892 {
7893 /* The number of iterations covered in one vector iteration. */
7894 unsigned lup_mul = (nvects * const_nunits) / group_size;
7895 lupdate_mul
7896 = build_vector_from_val (step_vectype,
7897 SCALAR_FLOAT_TYPE_P (stept)
7898 ? build_real_from_wide (stept, lup_mul,
7899 UNSIGNED)
7900 : build_int_cstu (stept, lup_mul));
7901 }
7902 tree peel_mul = NULL_TREE;
7903 gimple_seq init_stmts = NULL;
7904 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
7905 {
7906 if (SCALAR_FLOAT_TYPE_P (stept))
7907 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
7908 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
7909 else
7910 peel_mul = gimple_convert (&init_stmts, stept,
7911 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
7912 peel_mul = gimple_build_vector_from_val (&init_stmts,
7913 step_vectype, peel_mul);
7914 }
7915 unsigned ivn;
7916 auto_vec<tree> vec_steps;
7917 for (ivn = 0; ivn < nivs; ++ivn)
7918 {
7919 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
7920 tree_vector_builder init_elts (vectype, const_nunits, 1);
7921 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
7922 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7923 {
7924 /* The scalar steps of the IVs. */
7925 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
7926 step_elts.quick_push (elt);
7927 if (!init_node)
7928 {
7929 /* The scalar inits of the IVs if not vectorized. */
7930 elt = inits[(ivn*const_nunits + eltn) % group_size];
7931 init_elts.quick_push (elt);
7932 }
7933 /* The number of steps to add to the initial values. */
7934 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
7935 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
7936 ? build_real_from_wide (stept,
7937 mul_elt, UNSIGNED)
7938 : build_int_cstu (stept, mul_elt));
7939 }
7940 vec_step = gimple_build_vector (&init_stmts, &step_elts);
7941 vec_step = gimple_convert (&init_stmts, step_vectype, vec_step);
7942 vec_steps.safe_push (vec_step);
7943 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
7944 if (peel_mul)
7945 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
7946 step_mul, peel_mul);
7947 if (!init_node)
7948 vec_init = gimple_build_vector (&init_stmts, &init_elts);
7949
7950 /* Create the induction-phi that defines the induction-operand. */
7951 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
7952 "vec_iv_");
7953 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7954 induc_def = PHI_RESULT (induction_phi);
7955
7956 /* Create the iv update inside the loop */
7957 tree up = vec_step;
7958 if (lupdate_mul)
7959 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
7960 vec_step, lupdate_mul);
7961 gimple_seq stmts = NULL;
7962 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7963 vec_def = gimple_build (&stmts,
7964 PLUS_EXPR, step_vectype, vec_def, up);
7965 vec_def = gimple_convert (&stmts, vectype, vec_def);
7966 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7967 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7968 UNKNOWN_LOCATION);
7969
7970 if (init_node)
7971 vec_init = vect_get_slp_vect_def (init_node, ivn);
7972 if (!nested_in_vect_loop
7973 && !integer_zerop (step_mul))
7974 {
7975 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
7976 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
7977 vec_step, step_mul);
7978 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
7979 vec_def, up);
7980 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
7981 }
7982
7983 /* Set the arguments of the phi node: */
7984 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7985
7986 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7987 }
7988 if (!nested_in_vect_loop)
7989 {
7990 /* Fill up to the number of vectors we need for the whole group. */
7991 nivs = least_common_multiple (group_size,
7992 const_nunits) / const_nunits;
7993 for (; ivn < nivs; ++ivn)
7994 SLP_TREE_VEC_STMTS (slp_node)
7995 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
7996 }
7997
7998 /* Re-use IVs when we can. We are generating further vector
7999 stmts by adding VF' * stride to the IVs generated above. */
8000 if (ivn < nvects)
8001 {
8002 unsigned vfp
8003 = least_common_multiple (group_size, const_nunits) / group_size;
8004 tree lupdate_mul
8005 = build_vector_from_val (step_vectype,
8006 SCALAR_FLOAT_TYPE_P (stept)
8007 ? build_real_from_wide (stept,
8008 vfp, UNSIGNED)
8009 : build_int_cstu (stept, vfp));
8010 for (; ivn < nvects; ++ivn)
8011 {
8012 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8013 tree def = gimple_get_lhs (iv);
8014 if (ivn < 2*nivs)
8015 vec_steps[ivn - nivs]
8016 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8017 vec_steps[ivn - nivs], lupdate_mul);
8018 gimple_seq stmts = NULL;
8019 def = gimple_convert (&stmts, step_vectype, def);
8020 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8021 def, vec_steps[ivn % nivs]);
8022 def = gimple_convert (&stmts, vectype, def);
8023 if (gimple_code (iv) == GIMPLE_PHI)
8024 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8025 else
8026 {
8027 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8028 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8029 }
8030 SLP_TREE_VEC_STMTS (slp_node)
8031 .quick_push (SSA_NAME_DEF_STMT (def));
8032 }
8033 }
8034
8035 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8036 gcc_assert (!new_bb);
8037
8038 return true;
8039 }
8040
8041 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
8042 loop_preheader_edge (iv_loop));
8043
8044 gimple_seq stmts = NULL;
8045 if (!nested_in_vect_loop)
8046 {
8047 /* Convert the initial value to the IV update type. */
8048 tree new_type = TREE_TYPE (step_expr);
8049 init_expr = gimple_convert (&stmts, new_type, init_expr);
8050
8051 /* If we are using the loop mask to "peel" for alignment then we need
8052 to adjust the start value here. */
8053 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8054 if (skip_niters != NULL_TREE)
8055 {
8056 if (FLOAT_TYPE_P (vectype))
8057 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8058 skip_niters);
8059 else
8060 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8061 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8062 skip_niters, step_expr);
8063 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8064 init_expr, skip_step);
8065 }
8066 }
8067
8068 if (stmts)
8069 {
8070 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8071 gcc_assert (!new_bb);
8072 }
8073
8074 /* Create the vector that holds the initial_value of the induction. */
8075 if (nested_in_vect_loop)
8076 {
8077 /* iv_loop is nested in the loop to be vectorized. init_expr has already
8078 been created during vectorization of previous stmts. We obtain it
8079 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8080 auto_vec<tree> vec_inits;
8081 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8082 init_expr, &vec_inits);
8083 vec_init = vec_inits[0];
8084 /* If the initial value is not of proper type, convert it. */
8085 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8086 {
8087 new_stmt
8088 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8089 vect_simple_var,
8090 "vec_iv_"),
8091 VIEW_CONVERT_EXPR,
8092 build1 (VIEW_CONVERT_EXPR, vectype,
8093 vec_init));
8094 vec_init = gimple_assign_lhs (new_stmt);
8095 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8096 new_stmt);
8097 gcc_assert (!new_bb);
8098 }
8099 }
8100 else
8101 {
8102 /* iv_loop is the loop to be vectorized. Create:
8103 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8104 stmts = NULL;
8105 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8106
8107 unsigned HOST_WIDE_INT const_nunits;
8108 if (nunits.is_constant (&const_nunits))
8109 {
8110 tree_vector_builder elts (step_vectype, const_nunits, 1);
8111 elts.quick_push (new_name);
8112 for (i = 1; i < const_nunits; i++)
8113 {
8114 /* Create: new_name_i = new_name + step_expr */
8115 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8116 new_name, step_expr);
8117 elts.quick_push (new_name);
8118 }
8119 /* Create a vector from [new_name_0, new_name_1, ...,
8120 new_name_nunits-1] */
8121 vec_init = gimple_build_vector (&stmts, &elts);
8122 }
8123 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8124 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8125 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8126 new_name, step_expr);
8127 else
8128 {
8129 /* Build:
8130 [base, base, base, ...]
8131 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
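/* E.g. (purely illustrative) with base 1.0, step 0.5 and four elements this
   yields { 1.0, 1.0, 1.0, 1.0 } + { 0.0, 1.0, 2.0, 3.0 } * { 0.5, ... }
   == { 1.0, 1.5, 2.0, 2.5 }.  */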
8132 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8133 gcc_assert (flag_associative_math);
8134 tree index = build_index_vector (step_vectype, 0, 1);
8135 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8136 new_name);
8137 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8138 step_expr);
8139 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8140 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8141 vec_init, step_vec);
8142 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8143 vec_init, base_vec);
8144 }
8145 vec_init = gimple_convert (&stmts, vectype, vec_init);
8146
8147 if (stmts)
8148 {
8149 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8150 gcc_assert (!new_bb);
8151 }
8152 }
8153
8154
8155 /* Create the vector that holds the step of the induction. */
8156 if (nested_in_vect_loop)
8157 /* iv_loop is nested in the loop to be vectorized. Generate:
8158 vec_step = [S, S, S, S] */
8159 new_name = step_expr;
8160 else
8161 {
8162 /* iv_loop is the loop to be vectorized. Generate:
8163 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8164 gimple_seq seq = NULL;
8165 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8166 {
8167 expr = build_int_cst (integer_type_node, vf);
8168 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8169 }
8170 else
8171 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8172 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8173 expr, step_expr);
8174 if (seq)
8175 {
8176 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8177 gcc_assert (!new_bb);
8178 }
8179 }
8180
8181 t = unshare_expr (new_name);
8182 gcc_assert (CONSTANT_CLASS_P (new_name)
8183 || TREE_CODE (new_name) == SSA_NAME);
8184 new_vec = build_vector_from_val (step_vectype, t);
8185 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8186 new_vec, step_vectype, NULL);
8187
8188
8189 /* Create the following def-use cycle:
8190 loop prolog:
8191 vec_init = ...
8192 vec_step = ...
8193 loop:
8194 vec_iv = PHI <vec_init, vec_loop>
8195 ...
8196 STMT
8197 ...
8198 vec_loop = vec_iv + vec_step; */
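/* For example (assuming ncopies == 1 and a four-element vector, so
   VF == nunits == 4), a scalar IV with initial value X and step S gives
   vec_init == { X, X+S, X+2*S, X+3*S } and vec_step == { 4*S, 4*S, 4*S, 4*S },
   so each vector iteration advances every lane by four scalar iterations.  */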
8199
8200 /* Create the induction-phi that defines the induction-operand. */
8201 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8202 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8203 induc_def = PHI_RESULT (induction_phi);
8204
8205 /* Create the iv update inside the loop */
8206 stmts = NULL;
8207 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8208 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8209 vec_def = gimple_convert (&stmts, vectype, vec_def);
8210 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8211 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8212
8213 /* Set the arguments of the phi node: */
8214 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8215 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8216 UNKNOWN_LOCATION);
8217
8218 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8219 *vec_stmt = induction_phi;
8220
8221 /* In case that vectorization factor (VF) is bigger than the number
8222 of elements that we can fit in a vectype (nunits), we have to generate
8223 more than one vector stmt - i.e - we need to "unroll" the
8224 vector stmt by a factor VF/nunits. For more details see documentation
8225 in vectorizable_operation. */
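/* For example (hypothetical numbers), with VF == 8 and nunits == 4 we need
   ncopies == 2 vector stmts per scalar stmt; the second copy is obtained from
   the first by adding { 4*S, 4*S, 4*S, 4*S }, i.e. nunits * S per lane.  */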
8226
8227 if (ncopies > 1)
8228 {
8229 gimple_seq seq = NULL;
8230 /* FORNOW. This restriction should be relaxed. */
8231 gcc_assert (!nested_in_vect_loop);
8232
8233 /* Create the vector that holds the step of the induction. */
8234 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8235 {
8236 expr = build_int_cst (integer_type_node, nunits);
8237 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8238 }
8239 else
8240 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8241 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8242 expr, step_expr);
8243 if (seq)
8244 {
8245 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8246 gcc_assert (!new_bb);
8247 }
8248
8249 t = unshare_expr (new_name);
8250 gcc_assert (CONSTANT_CLASS_P (new_name)
8251 || TREE_CODE (new_name) == SSA_NAME);
8252 new_vec = build_vector_from_val (step_vectype, t);
8253 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8254 new_vec, step_vectype, NULL);
8255
8256 vec_def = induc_def;
8257 for (i = 1; i < ncopies; i++)
8258 {
8259 /* vec_i = vec_prev + vec_step */
8260 gimple_seq stmts = NULL;
8261 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8262 vec_def = gimple_build (&stmts,
8263 PLUS_EXPR, step_vectype, vec_def, vec_step);
8264 vec_def = gimple_convert (&stmts, vectype, vec_def);
8265
8266 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8267 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8268 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8269 }
8270 }
8271
8272 if (dump_enabled_p ())
8273 dump_printf_loc (MSG_NOTE, vect_location,
8274 "transform induction: created def-use cycle: %G%G",
8275 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8276
8277 return true;
8278 }
8279
8280 /* Function vectorizable_live_operation.
8281
8282 STMT_INFO computes a value that is used outside the loop. Check if
8283 it can be supported. */
8284
8285 bool
8286 vectorizable_live_operation (vec_info *vinfo,
8287 stmt_vec_info stmt_info,
8288 gimple_stmt_iterator *gsi,
8289 slp_tree slp_node, slp_instance slp_node_instance,
8290 int slp_index, bool vec_stmt_p,
8291 stmt_vector_for_cost *cost_vec)
8292 {
8293 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8294 imm_use_iterator imm_iter;
8295 tree lhs, lhs_type, bitsize, vec_bitsize;
8296 tree vectype = (slp_node
8297 ? SLP_TREE_VECTYPE (slp_node)
8298 : STMT_VINFO_VECTYPE (stmt_info));
8299 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8300 int ncopies;
8301 gimple *use_stmt;
8302 auto_vec<tree> vec_oprnds;
8303 int vec_entry = 0;
8304 poly_uint64 vec_index = 0;
8305
8306 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8307
8308 /* If a stmt of a reduction is live, vectorize it via
8309 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8310 validity so just trigger the transform here. */
8311 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8312 {
8313 if (!vec_stmt_p)
8314 return true;
8315 if (slp_node)
8316 {
8317 /* For reduction chains the meta-info is attached to
8318 the group leader. */
8319 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8320 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8321 /* For SLP reductions we vectorize the epilogue for
8322 all involved stmts together. */
8323 else if (slp_index != 0)
8324 return true;
8325 else
8326 /* For SLP reductions the meta-info is attached to
8327 the representative. */
8328 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8329 }
8330 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8331 gcc_assert (reduc_info->is_reduc_info);
8332 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8333 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8334 return true;
8335 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8336 slp_node_instance);
8337 return true;
8338 }
8339
8340 /* If STMT is not relevant and it is a simple assignment and its inputs are
8341 invariant then it can remain in place, unvectorized. The original last
8342 scalar value that it computes will be used. */
8343 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8344 {
8345 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8346 if (dump_enabled_p ())
8347 dump_printf_loc (MSG_NOTE, vect_location,
8348 "statement is simple and uses invariant. Leaving in "
8349 "place.\n");
8350 return true;
8351 }
8352
8353 if (slp_node)
8354 ncopies = 1;
8355 else
8356 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8357
8358 if (slp_node)
8359 {
8360 gcc_assert (slp_index >= 0);
8361
8362 /* Get the last occurrence of the scalar index from the concatenation of
8363 all the slp vectors. Calculate which slp vector it is and the index
8364 within. */
8365 int num_scalar = SLP_TREE_LANES (slp_node);
8366 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8367 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
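/* For example (hypothetical numbers), with 3 scalar lanes spread over
   2 vectors of 4 lanes each, slp_index 1 gives pos = 2*4 - 3 + 1 = 6,
   i.e. vec_entry 1 and vec_index 2 below.  */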
8368
8369 /* Calculate which vector contains the result, and which lane of
8370 that vector we need. */
8371 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8372 {
8373 if (dump_enabled_p ())
8374 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8375 "Cannot determine which vector holds the"
8376 " final result.\n");
8377 return false;
8378 }
8379 }
8380
8381 if (!vec_stmt_p)
8382 {
8383 /* No transformation required. */
8384 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8385 {
8386 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8387 OPTIMIZE_FOR_SPEED))
8388 {
8389 if (dump_enabled_p ())
8390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8391 "can't operate on partial vectors "
8392 "because the target doesn't support extract "
8393 "last reduction.\n");
8394 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8395 }
8396 else if (slp_node)
8397 {
8398 if (dump_enabled_p ())
8399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8400 "can't operate on partial vectors "
8401 "because an SLP statement is live after "
8402 "the loop.\n");
8403 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8404 }
8405 else if (ncopies > 1)
8406 {
8407 if (dump_enabled_p ())
8408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8409 "can't operate on partial vectors "
8410 "because ncopies is greater than 1.\n");
8411 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8412 }
8413 else
8414 {
8415 gcc_assert (ncopies == 1 && !slp_node);
8416 vect_record_loop_mask (loop_vinfo,
8417 &LOOP_VINFO_MASKS (loop_vinfo),
8418 1, vectype, NULL);
8419 }
8420 }
8421 /* ??? Enable for loop costing as well. */
8422 if (!loop_vinfo)
8423 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8424 0, vect_epilogue);
8425 return true;
8426 }
8427
8428 /* Use the lhs of the original scalar statement. */
8429 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8430 if (dump_enabled_p ())
8431 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8432 "stmt %G", stmt);
8433
8434 lhs = gimple_get_lhs (stmt);
8435 lhs_type = TREE_TYPE (lhs);
8436
8437 bitsize = vector_element_bits_tree (vectype);
8438 vec_bitsize = TYPE_SIZE (vectype);
8439
8440 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8441 tree vec_lhs, bitstart;
8442 gimple *vec_stmt;
8443 if (slp_node)
8444 {
8445 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8446
8447 /* Get the correct slp vectorized stmt. */
8448 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8449 vec_lhs = gimple_get_lhs (vec_stmt);
8450
8451 /* Get entry to use. */
8452 bitstart = bitsize_int (vec_index);
8453 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8454 }
8455 else
8456 {
8457 /* For multiple copies, get the last copy. */
8458 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8459 vec_lhs = gimple_get_lhs (vec_stmt);
8460
8461 /* Get the last lane in the vector. */
8462 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
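/* E.g. for a hypothetical 4 x 32-bit vector this selects bit offset
   128 - 32 == 96, i.e. the last lane.  */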
8463 }
8464
8465 if (loop_vinfo)
8466 {
8467 /* To ensure that VEC_LHS for the lane-extraction stmts satisfies the
8468 loop-closed PHI requirement, insert one PHI node for it. It looks like:
8469 loop;
8470 BB:
8471 # lhs' = PHI <lhs>
8472 ==>
8473 loop;
8474 BB:
8475 # vec_lhs' = PHI <vec_lhs>
8476 new_tree = lane_extract <vec_lhs', ...>;
8477 lhs' = new_tree; */
8478
8479 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8480 basic_block exit_bb = single_exit (loop)->dest;
8481 gcc_assert (single_pred_p (exit_bb));
8482
8483 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8484 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8485 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8486
8487 gimple_seq stmts = NULL;
8488 tree new_tree;
8489 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8490 {
8491 /* Emit:
8492
8493 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8494
8495 where VEC_LHS is the vectorized live-out result and MASK is
8496 the loop mask for the final iteration. */
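/* For instance, if the final loop mask had three active lanes,
   { -1, -1, -1, 0 }, EXTRACT_LAST would return the element in lane 2 of
   VEC_LHS, the last lane whose mask bit is set.  */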
8497 gcc_assert (ncopies == 1 && !slp_node);
8498 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8499 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8500 1, vectype, 0);
8501 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8502 mask, vec_lhs_phi);
8503
8504 /* Convert the extracted vector element to the scalar type. */
8505 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8506 }
8507 else
8508 {
8509 tree bftype = TREE_TYPE (vectype);
8510 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8511 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8512 new_tree = build3 (BIT_FIELD_REF, bftype,
8513 vec_lhs_phi, bitsize, bitstart);
8514 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8515 &stmts, true, NULL_TREE);
8516 }
8517
8518 if (stmts)
8519 {
8520 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8521 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8522
8523 /* Remove existing phi from lhs and create one copy from new_tree. */
8524 tree lhs_phi = NULL_TREE;
8525 gimple_stmt_iterator gsi;
8526 for (gsi = gsi_start_phis (exit_bb);
8527 !gsi_end_p (gsi); gsi_next (&gsi))
8528 {
8529 gimple *phi = gsi_stmt (gsi);
8530 if (gimple_phi_arg_def (phi, 0) == lhs)
8531 {
8532 remove_phi_node (&gsi, false);
8533 lhs_phi = gimple_phi_result (phi);
8534 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8535 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8536 break;
8537 }
8538 }
8539 }
8540
8541 /* Replace use of lhs with newly computed result. If the use stmt is a
8542 single arg PHI, just replace all uses of PHI result. It's necessary
8543 because lcssa PHI defining lhs may be before newly inserted stmt. */
8544 use_operand_p use_p;
8545 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8546 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8547 && !is_gimple_debug (use_stmt))
8548 {
8549 if (gimple_code (use_stmt) == GIMPLE_PHI
8550 && gimple_phi_num_args (use_stmt) == 1)
8551 {
8552 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8553 }
8554 else
8555 {
8556 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8557 SET_USE (use_p, new_tree);
8558 }
8559 update_stmt (use_stmt);
8560 }
8561 }
8562 else
8563 {
8564 /* For basic-block vectorization simply insert the lane-extraction. */
8565 tree bftype = TREE_TYPE (vectype);
8566 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8567 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8568 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8569 vec_lhs, bitsize, bitstart);
8570 gimple_seq stmts = NULL;
8571 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8572 &stmts, true, NULL_TREE);
8573 if (TREE_CODE (new_tree) == SSA_NAME
8574 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8575 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8576 if (is_a <gphi *> (vec_stmt))
8577 {
8578 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8579 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8580 }
8581 else
8582 {
8583 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8584 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8585 }
8586
8587 /* Replace use of lhs with newly computed result. If the use stmt is a
8588 single arg PHI, just replace all uses of PHI result. It's necessary
8589 because lcssa PHI defining lhs may be before newly inserted stmt. */
8590 use_operand_p use_p;
8591 stmt_vec_info use_stmt_info;
8592 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8593 if (!is_gimple_debug (use_stmt)
8594 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8595 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8596 {
8597 /* ??? This can happen when the live lane ends up being
8598 used in a vector construction code-generated by an
8599 external SLP node (and code-generation for that already
8600 happened). See gcc.dg/vect/bb-slp-47.c.
8601 Doing this is what would happen if that vector CTOR
8602 were not code-generated yet so it is not too bad.
8603 ??? In fact we'd likely want to avoid this situation
8604 in the first place. */
8605 if (TREE_CODE (new_tree) == SSA_NAME
8606 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8607 && gimple_code (use_stmt) != GIMPLE_PHI
8608 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8609 use_stmt))
8610 {
8611 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8612 gcc_assert (code == CONSTRUCTOR
8613 || code == VIEW_CONVERT_EXPR
8614 || CONVERT_EXPR_CODE_P (code));
8615 if (dump_enabled_p ())
8616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8617 "Using original scalar computation for "
8618 "live lane because use preceeds vector "
8619 "def\n");
8620 continue;
8621 }
8622 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8623 SET_USE (use_p, new_tree);
8624 update_stmt (use_stmt);
8625 }
8626 }
8627
8628 return true;
8629 }
8630
8631 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8632
8633 static void
8634 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8635 {
8636 ssa_op_iter op_iter;
8637 imm_use_iterator imm_iter;
8638 def_operand_p def_p;
8639 gimple *ustmt;
8640
8641 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8642 {
8643 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8644 {
8645 basic_block bb;
8646
8647 if (!is_gimple_debug (ustmt))
8648 continue;
8649
8650 bb = gimple_bb (ustmt);
8651
8652 if (!flow_bb_inside_loop_p (loop, bb))
8653 {
8654 if (gimple_debug_bind_p (ustmt))
8655 {
8656 if (dump_enabled_p ())
8657 dump_printf_loc (MSG_NOTE, vect_location,
8658 "killing debug use\n");
8659
8660 gimple_debug_bind_reset_value (ustmt);
8661 update_stmt (ustmt);
8662 }
8663 else
8664 gcc_unreachable ();
8665 }
8666 }
8667 }
8668 }
8669
8670 /* Given loop represented by LOOP_VINFO, return true if computation of
8671 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8672 otherwise. */
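/* For example, if NITERSM1 is the maximum value of its unsigned type, then
   NITERSM1 + 1 wraps to zero, i.e. the computation overflows and we must
   return false.  */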
8673
8674 static bool
8675 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8676 {
8677 /* Constant case. */
8678 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8679 {
8680 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8681 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8682
8683 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8684 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8685 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8686 return true;
8687 }
8688
8689 widest_int max;
8690 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8691 /* Check the upper bound of loop niters. */
8692 if (get_max_loop_iterations (loop, &max))
8693 {
8694 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8695 signop sgn = TYPE_SIGN (type);
8696 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8697 if (max < type_max)
8698 return true;
8699 }
8700 return false;
8701 }
8702
8703 /* Return a mask type with half the number of elements as OLD_TYPE,
8704 given that it should have mode NEW_MODE. */
8705
8706 tree
8707 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8708 {
8709 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8710 return build_truth_vector_type_for_mode (nunits, new_mode);
8711 }
8712
8713 /* Return a mask type with twice as many elements as OLD_TYPE,
8714 given that it should have mode NEW_MODE. */
8715
8716 tree
8717 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8718 {
8719 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8720 return build_truth_vector_type_for_mode (nunits, new_mode);
8721 }
8722
8723 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8724 contain a sequence of NVECTORS masks that each control a vector of type
8725 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8726 these vector masks with the vector version of SCALAR_MASK. */
8727
8728 void
8729 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8730 unsigned int nvectors, tree vectype, tree scalar_mask)
8731 {
8732 gcc_assert (nvectors != 0);
8733 if (masks->length () < nvectors)
8734 masks->safe_grow_cleared (nvectors, true);
8735 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8736 /* The number of scalars per iteration and the number of vectors are
8737 both compile-time constants. */
8738 unsigned int nscalars_per_iter
8739 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8740 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8741
8742 if (scalar_mask)
8743 {
8744 scalar_cond_masked_key cond (scalar_mask, nvectors);
8745 loop_vinfo->scalar_cond_masked_set.add (cond);
8746 }
8747
8748 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8749 {
8750 rgm->max_nscalars_per_iter = nscalars_per_iter;
8751 rgm->type = truth_type_for (vectype);
8752 rgm->factor = 1;
8753 }
8754 }
8755
8756 /* Given a complete set of masks MASKS, extract mask number INDEX
8757 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8758 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8759
8760 See the comment above vec_loop_masks for more details about the mask
8761 arrangement. */
8762
8763 tree
8764 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8765 unsigned int nvectors, tree vectype, unsigned int index)
8766 {
8767 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8768 tree mask_type = rgm->type;
8769
8770 /* Populate the rgroup's mask array, if this is the first time we've
8771 used it. */
8772 if (rgm->controls.is_empty ())
8773 {
8774 rgm->controls.safe_grow_cleared (nvectors, true);
8775 for (unsigned int i = 0; i < nvectors; ++i)
8776 {
8777 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8778 /* Provide a dummy definition until the real one is available. */
8779 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8780 rgm->controls[i] = mask;
8781 }
8782 }
8783
8784 tree mask = rgm->controls[index];
8785 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8786 TYPE_VECTOR_SUBPARTS (vectype)))
8787 {
8788 /* A loop mask for data type X can be reused for data type Y
8789 if X has N times more elements than Y and if Y's elements
8790 are N times bigger than X's. In this case each sequence
8791 of N elements in the loop mask will be all-zero or all-one.
8792 We can then view-convert the mask so that each sequence of
8793 N elements is replaced by a single element. */
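/* For example (hypothetical modes), an 8-element mask computed for
   8 x HImode data can control 4 x SImode data: every pair of mask elements
   is known to be all-zero or all-one, so the 8-element mask view-converts
   to the required 4-element mask.  */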
8794 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8795 TYPE_VECTOR_SUBPARTS (vectype)));
8796 gimple_seq seq = NULL;
8797 mask_type = truth_type_for (vectype);
8798 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8799 if (seq)
8800 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8801 }
8802 return mask;
8803 }
8804
8805 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
8806 lengths for controlling an operation on VECTYPE. The operation splits
8807 each element of VECTYPE into FACTOR separate subelements, measuring the
8808 length as a number of these subelements. */
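/* For instance (a hypothetical case), a V4SI operation that is emulated with
   byte-sized subelements would use FACTOR == 4, so a length of 8 subelements
   covers the first two SImode elements.  */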
8809
8810 void
8811 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8812 unsigned int nvectors, tree vectype, unsigned int factor)
8813 {
8814 gcc_assert (nvectors != 0);
8815 if (lens->length () < nvectors)
8816 lens->safe_grow_cleared (nvectors, true);
8817 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8818
8819 /* The number of scalars per iteration, the bytes occupied per scalar and
8820 the number of vectors are all compile-time constants. */
8821 unsigned int nscalars_per_iter
8822 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8823 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8824
8825 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
8826 {
8827 /* For now, we only support cases in which all loads and stores fall back
8828 to VnQI or none do. */
8829 gcc_assert (!rgl->max_nscalars_per_iter
8830 || (rgl->factor == 1 && factor == 1)
8831 || (rgl->max_nscalars_per_iter * rgl->factor
8832 == nscalars_per_iter * factor));
8833 rgl->max_nscalars_per_iter = nscalars_per_iter;
8834 rgl->type = vectype;
8835 rgl->factor = factor;
8836 }
8837 }
8838
8839 /* Given a complete set of lengths LENS, extract length number INDEX for an
8840 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
8841
8842 tree
8843 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
8844 unsigned int nvectors, unsigned int index)
8845 {
8846 rgroup_controls *rgl = &(*lens)[nvectors - 1];
8847
8848 /* Populate the rgroup's len array, if this is the first time we've
8849 used it. */
8850 if (rgl->controls.is_empty ())
8851 {
8852 rgl->controls.safe_grow_cleared (nvectors, true);
8853 for (unsigned int i = 0; i < nvectors; ++i)
8854 {
8855 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
8856 gcc_assert (len_type != NULL_TREE);
8857 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
8858
8859 /* Provide a dummy definition until the real one is available. */
8860 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
8861 rgl->controls[i] = len;
8862 }
8863 }
8864
8865 return rgl->controls[index];
8866 }
8867
8868 /* Scale profiling counters by estimation for LOOP which is vectorized
8869 by factor VF. */
8870
8871 static void
8872 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8873 {
8874 edge preheader = loop_preheader_edge (loop);
8875 /* Reduce loop iterations by the vectorization factor. */
8876 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8877 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8878
8879 if (freq_h.nonzero_p ())
8880 {
8881 profile_probability p;
8882
8883 /* Avoid dropping loop body profile counter to 0 because of zero count
8884 in loop's preheader. */
8885 if (!(freq_e == profile_count::zero ()))
8886 freq_e = freq_e.force_nonzero ();
8887 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8888 scale_loop_frequencies (loop, p);
8889 }
8890
8891 edge exit_e = single_exit (loop);
8892 exit_e->probability = profile_probability::always ()
8893 .apply_scale (1, new_est_niter + 1);
8894
8895 edge exit_l = single_pred_edge (loop->latch);
8896 profile_probability prob = exit_l->probability;
8897 exit_l->probability = exit_e->probability.invert ();
8898 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8899 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8900 }
8901
8902 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
8903 latch edge values originally defined by it. */
8904
8905 static void
8906 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
8907 stmt_vec_info def_stmt_info)
8908 {
8909 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
8910 if (!def || TREE_CODE (def) != SSA_NAME)
8911 return;
8912 stmt_vec_info phi_info;
8913 imm_use_iterator iter;
8914 use_operand_p use_p;
8915 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
8916 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
8917 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
8918 && (phi_info = loop_vinfo->lookup_stmt (phi))
8919 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
8920 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
8921 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
8922 {
8923 loop_p loop = gimple_bb (phi)->loop_father;
8924 edge e = loop_latch_edge (loop);
8925 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
8926 {
8927 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
8928 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
8929 gcc_assert (phi_defs.length () == latch_defs.length ());
8930 for (unsigned i = 0; i < phi_defs.length (); ++i)
8931 add_phi_arg (as_a <gphi *> (phi_defs[i]),
8932 gimple_get_lhs (latch_defs[i]), e,
8933 gimple_phi_arg_location (phi, e->dest_idx));
8934 }
8935 }
8936 }
8937
8938 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8939 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8940 stmt_vec_info. */
8941
8942 static void
8943 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8944 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8945 {
8946 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8947 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8948
8949 if (dump_enabled_p ())
8950 dump_printf_loc (MSG_NOTE, vect_location,
8951 "------>vectorizing statement: %G", stmt_info->stmt);
8952
8953 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8954 vect_loop_kill_debug_uses (loop, stmt_info);
8955
8956 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8957 && !STMT_VINFO_LIVE_P (stmt_info))
8958 return;
8959
8960 if (STMT_VINFO_VECTYPE (stmt_info))
8961 {
8962 poly_uint64 nunits
8963 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8964 if (!STMT_SLP_TYPE (stmt_info)
8965 && maybe_ne (nunits, vf)
8966 && dump_enabled_p ())
8967 /* For SLP VF is set according to unrolling factor, and not
8968 to vector size, hence for SLP this print is not valid. */
8969 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8970 }
8971
8972 /* Pure SLP statements have already been vectorized. We still need
8973 to apply loop vectorization to hybrid SLP statements. */
8974 if (PURE_SLP_STMT (stmt_info))
8975 return;
8976
8977 if (dump_enabled_p ())
8978 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8979
8980 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
8981 *seen_store = stmt_info;
8982 }
8983
8984 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
8985 in the hash_map with its corresponding values. */
8986
8987 static tree
8988 find_in_mapping (tree t, void *context)
8989 {
8990 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8991
8992 tree *value = mapping->get (t);
8993 return value ? *value : t;
8994 }
8995
8996 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8997 original loop that has now been vectorized.
8998
8999 The inits of the data_references need to be advanced with the number of
9000 iterations of the main loop. This has been computed in vect_do_peeling and
9001 is stored in parameter ADVANCE. We first restore the data_references
9002 initial offset with the values recored in ORIG_DRS_INIT.
9003
9004 Since the loop_vec_info of this EPILOGUE was constructed for the original
9005 loop, its stmt_vec_infos all point to the original statements. These need
9006 to be updated to point to their corresponding copies as well as the SSA_NAMES
9007 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9008
9009 The data_references' connections also need to be updated. Their
9010 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
9011 stmt_vec_infos, their statements need to point to their corresponding copy,
9012 if they are gather loads or scatter stores then their reference needs to be
9013 updated to point to its corresponding copy and finally we set
9014 'base_misaligned' to false as we have already peeled for alignment in the
9015 prologue of the main loop. */
9016
9017 static void
9018 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9019 {
9020 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9021 auto_vec<gimple *> stmt_worklist;
9022 hash_map<tree,tree> mapping;
9023 gimple *orig_stmt, *new_stmt;
9024 gimple_stmt_iterator epilogue_gsi;
9025 gphi_iterator epilogue_phi_gsi;
9026 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9027 basic_block *epilogue_bbs = get_loop_body (epilogue);
9028 unsigned i;
9029
9030 free (LOOP_VINFO_BBS (epilogue_vinfo));
9031 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9032
9033 /* Advance data_reference's with the number of iterations of the previous
9034 loop and its prologue. */
9035 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9036
9037
9038 /* The EPILOGUE loop is a copy of the original loop so they share the same
9039 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9040 point to the copied statements. We also create a mapping of all LHS' in
9041 the original loop and all the LHS' in the EPILOGUE and create worklists to
9042 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9043 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9044 {
9045 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9046 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9047 {
9048 new_stmt = epilogue_phi_gsi.phi ();
9049
9050 gcc_assert (gimple_uid (new_stmt) > 0);
9051 stmt_vinfo
9052 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9053
9054 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9055 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9056
9057 mapping.put (gimple_phi_result (orig_stmt),
9058 gimple_phi_result (new_stmt));
9059 /* PHI nodes cannot have patterns or related statements. */
9060 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9061 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9062 }
9063
9064 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9065 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9066 {
9067 new_stmt = gsi_stmt (epilogue_gsi);
9068 if (is_gimple_debug (new_stmt))
9069 continue;
9070
9071 gcc_assert (gimple_uid (new_stmt) > 0);
9072 stmt_vinfo
9073 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9074
9075 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9076 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9077
9078 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9079 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9080
9081 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9082 {
9083 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9084 for (gimple_stmt_iterator gsi = gsi_start (seq);
9085 !gsi_end_p (gsi); gsi_next (&gsi))
9086 stmt_worklist.safe_push (gsi_stmt (gsi));
9087 }
9088
9089 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9090 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9091 {
9092 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9093 stmt_worklist.safe_push (stmt);
9094 /* Set BB such that the assert in
9095 'get_initial_def_for_reduction' is able to determine that
9096 the BB of the related stmt is inside this loop. */
9097 gimple_set_bb (stmt,
9098 gimple_bb (new_stmt));
9099 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9100 gcc_assert (related_vinfo == NULL
9101 || related_vinfo == stmt_vinfo);
9102 }
9103 }
9104 }
9105
9106 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9107 using the original main loop and thus need to be updated to refer to the
9108 cloned variables used in the epilogue. */
9109 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9110 {
9111 gimple *stmt = stmt_worklist[i];
9112 tree *new_op;
9113
9114 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9115 {
9116 tree op = gimple_op (stmt, j);
9117 if ((new_op = mapping.get(op)))
9118 gimple_set_op (stmt, j, *new_op);
9119 else
9120 {
9121 /* PR92429: The last argument of simplify_replace_tree disables
9122 folding when replacing arguments. This is required as
9123 otherwise you might end up with different statements than the
9124 ones analyzed in vect_loop_analyze, leading to different
9125 vectorization. */
9126 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9127 &find_in_mapping, &mapping, false);
9128 gimple_set_op (stmt, j, op);
9129 }
9130 }
9131 }
9132
9133 struct data_reference *dr;
9134 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9135 FOR_EACH_VEC_ELT (datarefs, i, dr)
9136 {
9137 orig_stmt = DR_STMT (dr);
9138 gcc_assert (gimple_uid (orig_stmt) > 0);
9139 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9140 /* Data references for gather loads and scatter stores do not use the
9141 updated offset we set using ADVANCE. Instead we have to make sure the
9142 reference in the data references points to the corresponding copy of
9143 the original in the epilogue. */
9144 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9145 == VMAT_GATHER_SCATTER)
9146 {
9147 DR_REF (dr)
9148 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9149 &find_in_mapping, &mapping);
9150 DR_BASE_ADDRESS (dr)
9151 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9152 &find_in_mapping, &mapping);
9153 }
9154 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9155 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9156 /* The vector size of the epilogue is smaller than that of the main loop
9157 so the alignment is either the same or lower. This means the dr will
9158 by definition be aligned. */
9159 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9160 }
9161
9162 epilogue_vinfo->shared->datarefs_copy.release ();
9163 epilogue_vinfo->shared->save_datarefs ();
9164 }
9165
9166 /* Function vect_transform_loop.
9167
9168 The analysis phase has determined that the loop is vectorizable.
9169 Vectorize the loop - create vectorized stmts to replace the scalar
9170 stmts in the loop, and update the loop exit condition.
9171 Returns scalar epilogue loop if any. */
9172
9173 class loop *
9174 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9175 {
9176 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9177 class loop *epilogue = NULL;
9178 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9179 int nbbs = loop->num_nodes;
9180 int i;
9181 tree niters_vector = NULL_TREE;
9182 tree step_vector = NULL_TREE;
9183 tree niters_vector_mult_vf = NULL_TREE;
9184 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9185 unsigned int lowest_vf = constant_lower_bound (vf);
9186 gimple *stmt;
9187 bool check_profitability = false;
9188 unsigned int th;
9189
9190 DUMP_VECT_SCOPE ("vec_transform_loop");
9191
9192 loop_vinfo->shared->check_datarefs ();
9193
9194 /* Use the more conservative vectorization threshold. If the number
9195 of iterations is constant assume the cost check has been performed
9196 by our caller. If the threshold makes all loops profitable that
9197 run at least the (estimated) vectorization factor number of times
9198 checking is pointless, too. */
9199 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9200 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9201 {
9202 if (dump_enabled_p ())
9203 dump_printf_loc (MSG_NOTE, vect_location,
9204 "Profitability threshold is %d loop iterations.\n",
9205 th);
9206 check_profitability = true;
9207 }
9208
9209 /* Make sure there exists a single-predecessor exit bb. Do this before
9210 versioning. */
9211 edge e = single_exit (loop);
9212 if (! single_pred_p (e->dest))
9213 {
9214 split_loop_exit_edge (e, true);
9215 if (dump_enabled_p ())
9216 dump_printf (MSG_NOTE, "split exit edge\n");
9217 }
9218
9219 /* Version the loop first, if required, so the profitability check
9220 comes first. */
9221
9222 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9223 {
9224 class loop *sloop
9225 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9226 sloop->force_vectorize = false;
9227 check_profitability = false;
9228 }
9229
9230 /* Make sure there exists a single-predecessor exit bb also on the
9231 scalar loop copy. Do this after versioning but before peeling
9232 so CFG structure is fine for both scalar and if-converted loop
9233 to make slpeel_duplicate_current_defs_from_edges face matched
9234 loop closed PHI nodes on the exit. */
9235 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9236 {
9237 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9238 if (! single_pred_p (e->dest))
9239 {
9240 split_loop_exit_edge (e, true);
9241 if (dump_enabled_p ())
9242 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9243 }
9244 }
9245
9246 tree niters = vect_build_loop_niters (loop_vinfo);
9247 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9248 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9249 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9250 tree advance;
9251 drs_init_vec orig_drs_init;
9252
9253 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9254 &step_vector, &niters_vector_mult_vf, th,
9255 check_profitability, niters_no_overflow,
9256 &advance);
9257
9258 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9259 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9260 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9261 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9262
9263 if (niters_vector == NULL_TREE)
9264 {
9265 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9266 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9267 && known_eq (lowest_vf, vf))
9268 {
9269 niters_vector
9270 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9271 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9272 step_vector = build_one_cst (TREE_TYPE (niters));
9273 }
9274 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9275 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9276 &step_vector, niters_no_overflow);
9277 else
9278 /* vect_do_peeling subtracted the number of peeled prologue
9279 iterations from LOOP_VINFO_NITERS. */
9280 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9281 &niters_vector, &step_vector,
9282 niters_no_overflow);
9283 }
9284
9285 /* 1) Make sure the loop header has exactly two entries
9286 2) Make sure we have a preheader basic block. */
9287
9288 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9289
9290 split_edge (loop_preheader_edge (loop));
9291
9292 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9293 /* This will deal with any possible peeling. */
9294 vect_prepare_for_masked_peels (loop_vinfo);
9295
9296 /* Schedule the SLP instances first, then handle loop vectorization
9297 below. */
9298 if (!loop_vinfo->slp_instances.is_empty ())
9299 {
9300 DUMP_VECT_SCOPE ("scheduling SLP instances");
9301 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9302 }
9303
9304 /* FORNOW: the vectorizer supports only loops whose body consists
9305 of one basic block (header + empty latch). When the vectorizer
9306 supports more involved loop forms, the order in which the BBs are
9307 traversed will need to be reconsidered. */
9308
9309 for (i = 0; i < nbbs; i++)
9310 {
9311 basic_block bb = bbs[i];
9312 stmt_vec_info stmt_info;
9313
9314 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9315 gsi_next (&si))
9316 {
9317 gphi *phi = si.phi ();
9318 if (dump_enabled_p ())
9319 dump_printf_loc (MSG_NOTE, vect_location,
9320 "------>vectorizing phi: %G", phi);
9321 stmt_info = loop_vinfo->lookup_stmt (phi);
9322 if (!stmt_info)
9323 continue;
9324
9325 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9326 vect_loop_kill_debug_uses (loop, stmt_info);
9327
9328 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9329 && !STMT_VINFO_LIVE_P (stmt_info))
9330 continue;
9331
9332 if (STMT_VINFO_VECTYPE (stmt_info)
9333 && (maybe_ne
9334 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9335 && dump_enabled_p ())
9336 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9337
9338 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9339 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9340 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9341 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9342 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9343 && ! PURE_SLP_STMT (stmt_info))
9344 {
9345 if (dump_enabled_p ())
9346 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9347 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9348 }
9349 }
9350
9351 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9352 gsi_next (&si))
9353 {
9354 gphi *phi = si.phi ();
9355 stmt_info = loop_vinfo->lookup_stmt (phi);
9356 if (!stmt_info)
9357 continue;
9358
9359 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9360 && !STMT_VINFO_LIVE_P (stmt_info))
9361 continue;
9362
9363 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9364 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9365 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9366 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9367 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9368 && ! PURE_SLP_STMT (stmt_info))
9369 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9370 }
9371
9372 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9373 !gsi_end_p (si);)
9374 {
9375 stmt = gsi_stmt (si);
9376 /* During vectorization remove existing clobber stmts. */
9377 if (gimple_clobber_p (stmt))
9378 {
9379 unlink_stmt_vdef (stmt);
9380 gsi_remove (&si, true);
9381 release_defs (stmt);
9382 }
9383 else
9384 {
9385 /* Ignore vector stmts created in the outer loop. */
9386 stmt_info = loop_vinfo->lookup_stmt (stmt);
9387
9388 /* vector stmts created in the outer-loop during vectorization of
9389 stmts in an inner-loop may not have a stmt_info, and do not
9390 need to be vectorized. */
9391 stmt_vec_info seen_store = NULL;
9392 if (stmt_info)
9393 {
9394 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9395 {
9396 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9397 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9398 !gsi_end_p (subsi); gsi_next (&subsi))
9399 {
9400 stmt_vec_info pat_stmt_info
9401 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9402 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9403 &si, &seen_store);
9404 }
9405 stmt_vec_info pat_stmt_info
9406 = STMT_VINFO_RELATED_STMT (stmt_info);
9407 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
9408 &seen_store);
9409 maybe_set_vectorized_backedge_value (loop_vinfo,
9410 pat_stmt_info);
9411 }
9412 else
9413 {
9414 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9415 &seen_store);
9416 maybe_set_vectorized_backedge_value (loop_vinfo,
9417 stmt_info);
9418 }
9419 }
9420 gsi_next (&si);
9421 if (seen_store)
9422 {
9423 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9424 /* Interleaving. The vectorization of the
9425 interleaving chain was completed - free
9426 all the stores in the chain. */
9427 vect_remove_stores (loop_vinfo,
9428 DR_GROUP_FIRST_ELEMENT (seen_store));
9429 else
9430 /* Free the attached stmt_vec_info and remove the stmt. */
9431 loop_vinfo->remove_stmt (stmt_info);
9432 }
9433 }
9434 }
9435
9436 /* Stub out scalar statements that must not survive vectorization.
9437 Doing this here helps with grouped statements, or statements that
9438 are involved in patterns. */
9439 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9440 !gsi_end_p (gsi); gsi_next (&gsi))
9441 {
9442 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9443 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
9444 {
9445 tree lhs = gimple_get_lhs (call);
9446 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9447 {
9448 tree zero = build_zero_cst (TREE_TYPE (lhs));
9449 gimple *new_stmt = gimple_build_assign (lhs, zero);
9450 gsi_replace (&gsi, new_stmt, true);
9451 }
9452 }
9453 }
9454 } /* BBs in loop */
9455
9456 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9457 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9458 if (integer_onep (step_vector))
9459 niters_no_overflow = true;
9460 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9461 niters_vector_mult_vf, !niters_no_overflow);
9462
9463 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9464 scale_profile_for_vect_loop (loop, assumed_vf);
9465
9466 /* True if the final iteration might not handle a full vector's
9467 worth of scalar iterations. */
9468 bool final_iter_may_be_partial
9469 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9470 /* The minimum number of iterations performed by the epilogue. This
9471 is 1 when peeling for gaps because we always need a final scalar
9472 iteration. */
9473 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9474 /* +1 to convert latch counts to loop iteration counts,
9475 -min_epilogue_iters to remove iterations that cannot be performed
9476 by the vector code. */
9477 int bias_for_lowest = 1 - min_epilogue_iters;
9478 int bias_for_assumed = bias_for_lowest;
9479 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9480 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9481 {
9482 /* When the amount of peeling is known at compile time, the first
9483 iteration will have exactly alignment_npeels active elements.
9484 In the worst case it will have at least one. */
9485 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9486 bias_for_lowest += lowest_vf - min_first_active;
9487 bias_for_assumed += assumed_vf - min_first_active;
9488 }
9489 /* In these calculations the "- 1" converts loop iteration counts
9490 back to latch counts. */
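/* For example (hypothetical numbers), with lowest_vf == 4, no peeling for
   gaps or alignment and a known upper bound of 11 latch iterations
   (12 loop iterations), the vector loop gets
   udiv_floor (11 + 1, 4) - 1 == 2 latch iterations.  */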
9491 if (loop->any_upper_bound)
9492 loop->nb_iterations_upper_bound
9493 = (final_iter_may_be_partial
9494 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9495 lowest_vf) - 1
9496 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9497 lowest_vf) - 1);
9498 if (loop->any_likely_upper_bound)
9499 loop->nb_iterations_likely_upper_bound
9500 = (final_iter_may_be_partial
9501 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9502 + bias_for_lowest, lowest_vf) - 1
9503 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9504 + bias_for_lowest, lowest_vf) - 1);
9505 if (loop->any_estimate)
9506 loop->nb_iterations_estimate
9507 = (final_iter_may_be_partial
9508 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9509 assumed_vf) - 1
9510 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9511 assumed_vf) - 1);
9512
9513 if (dump_enabled_p ())
9514 {
9515 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9516 {
9517 dump_printf_loc (MSG_NOTE, vect_location,
9518 "LOOP VECTORIZED\n");
9519 if (loop->inner)
9520 dump_printf_loc (MSG_NOTE, vect_location,
9521 "OUTER LOOP VECTORIZED\n");
9522 dump_printf (MSG_NOTE, "\n");
9523 }
9524 else
9525 dump_printf_loc (MSG_NOTE, vect_location,
9526 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9527 GET_MODE_NAME (loop_vinfo->vector_mode));
9528 }
9529
9530 /* Loops vectorized with a variable factor won't benefit from
9531 unrolling/peeling. */
9532 if (!vf.is_constant ())
9533 {
9534 loop->unroll = 1;
9535 if (dump_enabled_p ())
9536 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9537 " variable-length vectorization factor\n");
9538 }
9539 /* Free SLP instances here because otherwise stmt reference counting
9540 won't work. */
9541 slp_instance instance;
9542 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9543 vect_free_slp_instance (instance);
9544 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9545 /* Clear the safelen field since its value is invalid after vectorization,
9546 as the vectorized loop can have loop-carried dependencies. */
9547 loop->safelen = 0;
9548
9549 if (epilogue)
9550 {
9551 update_epilogue_loop_vinfo (epilogue, advance);
9552
9553 epilogue->simduid = loop->simduid;
9554 epilogue->force_vectorize = loop->force_vectorize;
9555 epilogue->dont_vectorize = false;
9556 }
9557
9558 return epilogue;
9559 }
9560
9561 /* The code below tries to perform a simple optimization - reverting
9562 if-conversion for masked stores: if the mask of a store is zero, do not
9563 perform the store and, if possible, do not compute the stored values either.
9564 For example,
9565 for (i=0; i<n; i++)
9566 if (c[i])
9567 {
9568 p1[i] += 1;
9569 p2[i] = p3[i] +2;
9570 }
9571 this transformation will produce the following semi-hammock:
9572
9573 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9574 {
9575 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9576 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9577 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9578 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9579 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9580 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9581 }
9582 */
9583
9584 void
9585 optimize_mask_stores (class loop *loop)
9586 {
9587 basic_block *bbs = get_loop_body (loop);
9588 unsigned nbbs = loop->num_nodes;
9589 unsigned i;
9590 basic_block bb;
9591 class loop *bb_loop;
9592 gimple_stmt_iterator gsi;
9593 gimple *stmt;
9594 auto_vec<gimple *> worklist;
9595 auto_purge_vect_location sentinel;
9596
9597 vect_location = find_loop_location (loop);
9598 /* Pick up all masked stores in loop if any. */
9599 for (i = 0; i < nbbs; i++)
9600 {
9601 bb = bbs[i];
9602 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9603 gsi_next (&gsi))
9604 {
9605 stmt = gsi_stmt (gsi);
9606 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9607 worklist.safe_push (stmt);
9608 }
9609 }
9610
9611 free (bbs);
9612 if (worklist.is_empty ())
9613 return;
9614
9615 /* Loop has masked stores. */
9616 while (!worklist.is_empty ())
9617 {
9618 gimple *last, *last_store;
9619 edge e, efalse;
9620 tree mask;
9621 basic_block store_bb, join_bb;
9622 gimple_stmt_iterator gsi_to;
9623 tree vdef, new_vdef;
9624 gphi *phi;
9625 tree vectype;
9626 tree zero;
9627
9628 last = worklist.pop ();
9629 mask = gimple_call_arg (last, 2);
9630 bb = gimple_bb (last);
9631 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
9632 to the same loop as if_bb. This loop may differ from LOOP when a
9633 two-level loop nest is vectorized and the mask_store belongs to the
9634 inner one. */
9635 e = split_block (bb, last);
9636 bb_loop = bb->loop_father;
9637 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9638 join_bb = e->dest;
9639 store_bb = create_empty_bb (bb);
9640 add_bb_to_loop (store_bb, bb_loop);
9641 e->flags = EDGE_TRUE_VALUE;
9642 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9643 /* Put STORE_BB to likely part. */
9644 efalse->probability = profile_probability::unlikely ();
9645 store_bb->count = efalse->count ();
9646 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9647 if (dom_info_available_p (CDI_DOMINATORS))
9648 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9649 if (dump_enabled_p ())
9650 dump_printf_loc (MSG_NOTE, vect_location,
9651 "Create new block %d to sink mask stores.",
9652 store_bb->index);
9653 /* Create vector comparison with boolean result. */
9654 vectype = TREE_TYPE (mask);
9655 zero = build_zero_cst (vectype);
9656 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9657 gsi = gsi_last_bb (bb);
9658 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9659 /* Create a new PHI node for the vdef of the last masked store:
9660 .MEM_2 = VDEF <.MEM_1>
9661 will be converted to
9662 .MEM_3 = VDEF <.MEM_1>
9663 and a new PHI node will be created in the join bb:
9664 .MEM_2 = PHI <.MEM_1, .MEM_3>
9665 */
9666 vdef = gimple_vdef (last);
9667 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9668 gimple_set_vdef (last, new_vdef);
9669 phi = create_phi_node (vdef, join_bb);
9670 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9671
9672 /* Put all masked stores with the same mask to STORE_BB if possible. */
9673 while (true)
9674 {
9675 gimple_stmt_iterator gsi_from;
9676 gimple *stmt1 = NULL;
9677
9678 /* Move masked store to STORE_BB. */
9679 last_store = last;
9680 gsi = gsi_for_stmt (last);
9681 gsi_from = gsi;
9682 /* Shift GSI to the previous stmt for further traversal. */
9683 gsi_prev (&gsi);
9684 gsi_to = gsi_start_bb (store_bb);
9685 gsi_move_before (&gsi_from, &gsi_to);
9686 /* Set GSI_TO to the start of the now non-empty block. */
9687 gsi_to = gsi_start_bb (store_bb);
9688 if (dump_enabled_p ())
9689 dump_printf_loc (MSG_NOTE, vect_location,
9690 "Move stmt to created bb\n%G", last);
9691 /* Move all stored value producers if possible. */
9692 while (!gsi_end_p (gsi))
9693 {
9694 tree lhs;
9695 imm_use_iterator imm_iter;
9696 use_operand_p use_p;
9697 bool res;
9698
9699 /* Skip debug statements. */
9700 if (is_gimple_debug (gsi_stmt (gsi)))
9701 {
9702 gsi_prev (&gsi);
9703 continue;
9704 }
9705 stmt1 = gsi_stmt (gsi);
9706 /* Do not consider statements that write to memory or have a
9707 volatile operand. */
9708 if (gimple_vdef (stmt1)
9709 || gimple_has_volatile_ops (stmt1))
9710 break;
9711 gsi_from = gsi;
9712 gsi_prev (&gsi);
9713 lhs = gimple_get_lhs (stmt1);
9714 if (!lhs)
9715 break;
9716
9717 /* LHS of vectorized stmt must be SSA_NAME. */
9718 if (TREE_CODE (lhs) != SSA_NAME)
9719 break;
9720
9721 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9722 {
9723 /* Remove dead scalar statement. */
9724 if (has_zero_uses (lhs))
9725 {
9726 gsi_remove (&gsi_from, true);
9727 continue;
9728 }
9729 }
9730
9731 /* Check that LHS does not have uses outside of STORE_BB. */
9732 res = true;
9733 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9734 {
9735 gimple *use_stmt;
9736 use_stmt = USE_STMT (use_p);
9737 if (is_gimple_debug (use_stmt))
9738 continue;
9739 if (gimple_bb (use_stmt) != store_bb)
9740 {
9741 res = false;
9742 break;
9743 }
9744 }
9745 if (!res)
9746 break;
9747
9748 if (gimple_vuse (stmt1)
9749 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9750 break;
9751
9752 /* Can move STMT1 to STORE_BB. */
9753 if (dump_enabled_p ())
9754 dump_printf_loc (MSG_NOTE, vect_location,
9755 "Move stmt to created bb\n%G", stmt1);
9756 gsi_move_before (&gsi_from, &gsi_to);
9757 /* Shift GSI_TO for further insertion. */
9758 gsi_prev (&gsi_to);
9759 }
9760 /* Put other masked stores with the same mask to STORE_BB. */
9761 if (worklist.is_empty ()
9762 || gimple_call_arg (worklist.last (), 2) != mask
9763 || worklist.last () != stmt1)
9764 break;
9765 last = worklist.pop ();
9766 }
9767 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9768 }
9769 }
9770
9771 /* Decide whether it is possible to use a zero-based induction variable
9772 when vectorizing LOOP_VINFO with partial vectors. If it is, return
9773 the value that the induction variable must be able to hold in order
9774 to ensure that the rgroups eventually have no active vector elements.
9775 Return -1 otherwise. */
9776
9777 widest_int
9778 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
9779 {
9780 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9781 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9782 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9783
9784 /* Calculate the value that the induction variable must be able
9785 to hit in order to ensure that we end the loop with an all-false mask.
9786 This involves adding the maximum number of inactive trailing scalar
9787 iterations. */
9788 widest_int iv_limit = -1;
9789 if (max_loop_iterations (loop, &iv_limit))
9790 {
9791 if (niters_skip)
9792 {
9793 /* Add the maximum number of skipped iterations to the
9794 maximum iteration count. */
9795 if (TREE_CODE (niters_skip) == INTEGER_CST)
9796 iv_limit += wi::to_widest (niters_skip);
9797 else
9798 iv_limit += max_vf - 1;
9799 }
9800 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9801 /* Make a conservatively-correct assumption. */
9802 iv_limit += max_vf - 1;
9803
9804 /* IV_LIMIT is the maximum number of latch iterations, which is also
9805 the maximum in-range IV value. Round this value down to the previous
9806 vector alignment boundary and then add an extra full iteration. */
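/* (A hedged worked example: with a constant VF of 4, so that
   known_alignment (vf) == 4 and max_vf == 4, an IV_LIMIT of 11 latch
   iterations is rounded to (11 & -4) + 4 == 12.)  */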
9807 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9808 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9809 }
9810 return iv_limit;
9811 }
9812
9813 /* For the given rgroup_controls RGC, check whether an induction variable
9814 would ever hit a value that produces a set of all-false masks or zero
9815 lengths before wrapping around. Return true if it is possible to wrap
9816 around before hitting the desired value, otherwise return false. */
9817
9818 bool
9819 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
9820 {
9821 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
9822
9823 if (iv_limit == -1)
9824 return true;
9825
9826 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9827 unsigned int compare_precision = TYPE_PRECISION (compare_type);
9828 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
9829
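/* (A hedged example: with IV_LIMIT == 1000 and NITEMS == 4 the IV must be
   able to reach 4000, which needs 12 bits, so a 16-bit COMPARE_TYPE cannot
   wrap here while an 8-bit one could.)  */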
9830 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
9831 return true;
9832
9833 return false;
9834 }