[gcc.git] / gcc / tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
56
57 /* Loop Vectorization Pass.
58
59 This pass tries to vectorize loops.
60
61 For example, the vectorizer transforms the following simple loop:
62
63 short a[N]; short b[N]; short c[N]; int i;
64
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
67 }
68
69 as if it had been manually vectorized by rewriting the source code into:
70
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
75
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
81 }
82
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
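
   For instance (a purely illustrative example), in

     for (i=0; i<N; i++)
       a[i] = *(p + i) + a[i];

   both 'a[i]' (an ARRAY_REF) and '*(p + i)' (a pointer access) are
   consecutive data-refs in the above sense, whereas an access such as
   'a[2*i]' does not have the simple access pattern described here.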
94
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
100
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
105
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
116
117 For example, say stmt S1 was vectorized into stmt VS1:
118
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
122
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
127
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
132
133 Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
135
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different sizes of vectors will, for now, need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
143
144 Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
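
   For example, a hypothetical helper along those lines (an illustrative
   sketch only, not part of this file's API; CODE_FOR_nothing means the
   target has no instruction for the given optab/mode pair) could look like:

     static bool
     example_target_supports_vector_add_p (machine_mode vec_mode)
     {
       return optab_handler (add_optab, vec_mode) != CODE_FOR_nothing;
     }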
150
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
153 */
154
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
158
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
162
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
185
186 if (stmt_vectype)
187 {
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case when a vectype has already been set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return opt_result::success ();
203 }
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
209
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
213 {
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
220
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
223 {
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
226
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
230 {
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
239 }
240
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
248 }
249
250 return opt_result::success ();
251 }
252
253 /* Function vect_determine_vectorization_factor
254
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4-byte elements,
258 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
259 elements can fit in a single vector register.
260
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
265
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
270 }
271
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
275 }
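
   As a concrete illustration (a sketch in plain GNU C, assuming 4-byte
   ints and a 16-byte vector size, hence VF == 4; the real transformation
   works on GIMPLE, and the second loop below is the scalar epilogue that
   handles the N % VF leftover iterations):

     typedef int v4si __attribute__ ((vector_size (16)));

     void
     example_strip_mined (int *a, int *b, int *c, int n)
     {
       int i;
       for (i = 0; i + 4 <= n; i += 4)
         {
           v4si vb = *(v4si *) &b[i];
           v4si vc = *(v4si *) &c[i];
           *(v4si *) &a[i] = vb + vc;
         }
       for (; i < n; i++)
         a[i] = b[i] + c[i];
     }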
276 */
277
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
280 {
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
290
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
292
293 for (i = 0; i < nbbs; i++)
294 {
295 basic_block bb = bbs[i];
296
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
299 {
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
305
306 gcc_assert (stmt_info);
307
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
310 {
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
313
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
318
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
326
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
330
331 if (dump_enabled_p ())
332 {
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
336 }
337
338 vect_update_max_nunits (&vectorization_factor, vectype);
339 }
340 }
341
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
344 {
345 if (is_gimple_debug (gsi_stmt (si)))
346 continue;
347 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
348 opt_result res
349 = vect_determine_vf_for_stmt (loop_vinfo,
350 stmt_info, &vectorization_factor);
351 if (!res)
352 return res;
353 }
354 }
355
356 /* TODO: Analyze cost. Decide if worth while to vectorize. */
357 if (dump_enabled_p ())
358 {
359 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
360 dump_dec (MSG_NOTE, vectorization_factor);
361 dump_printf (MSG_NOTE, "\n");
362 }
363
364 if (known_le (vectorization_factor, 1U))
365 return opt_result::failure_at (vect_location,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
368 return opt_result::success ();
369 }
370
371
372 /* Function vect_is_simple_iv_evolution.
373
374 FORNOW: A simple evolution of an induction variable in the loop is
375 considered a polynomial evolution. */
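
/* For example (an illustrative case, not taken from the testsuite): for

     for (i = 0; i < n; i++)
       p = p + 4;

   the scalar evolution of P in the loop is the chrec {p_0, +, 4}_1, whose
   evolution part is the INTEGER_CST 4 and whose initial condition is p_0,
   so the evolution is "simple" in the sense checked below. */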
376
377 static bool
378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
379 tree * step)
380 {
381 tree init_expr;
382 tree step_expr;
383 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
384 basic_block bb;
385
386 /* When there is no evolution in this loop, the evolution function
387 is not "simple". */
388 if (evolution_part == NULL_TREE)
389 return false;
390
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part))
394 return false;
395
396 step_expr = evolution_part;
397 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
398
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
401 step_expr, init_expr);
402
403 *init = init_expr;
404 *step = step_expr;
405
406 if (TREE_CODE (step_expr) != INTEGER_CST
407 && (TREE_CODE (step_expr) != SSA_NAME
408 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
409 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
412 || !flag_associative_math)))
413 && (TREE_CODE (step_expr) != REAL_CST
414 || !flag_associative_math))
415 {
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "step unknown.\n");
419 return false;
420 }
421
422 return true;
423 }
424
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
428
429 outer1:
430 x_1 = PHI <x_4(outer2), ...>;
431 ...
432
433 inner:
434 x_2 = PHI <x_1(outer1), ...>;
435 ...
436 x_3 = ...;
437 ...
438
439 outer2:
440 x_4 = PHI <x_3(inner)>;
441 ...
442
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
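
/* At the source level, such a double reduction typically comes from a
   summation over a loop nest, e.g. (an illustrative example only):

     sum = 0;
     for (i = 0; i < n; i++)         // outer1/outer2 blocks above
       for (j = 0; j < m; j++)       // inner block above
         sum += a[i][j];

   where x_1, x_2, x_3 and x_4 are the SSA versions of SUM around the
   two loops. */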
445
446 static bool
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
448 {
449 use_operand_p use_p;
450 ssa_op_iter op_iter;
451 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
452 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
453 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
454 return true;
455 return false;
456 }
457
458 /* Function vect_analyze_scalar_cycles_1.
459
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
463 enclosing LOOP). */
464
465 static void
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
467 {
468 basic_block bb = loop->header;
469 tree init, step;
470 auto_vec<stmt_vec_info, 64> worklist;
471 gphi_iterator gsi;
472 bool double_reduc, reduc_chain;
473
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
475
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified; therefore, this order must not be
478 changed. */
479 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
480 {
481 gphi *phi = gsi.phi ();
482 tree access_fn = NULL;
483 tree def = PHI_RESULT (phi);
484 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
485
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
488
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def))
492 continue;
493
494 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
495
496 /* Analyze the evolution function. */
497 access_fn = analyze_scalar_evolution (loop, def);
498 if (access_fn)
499 {
500 STRIP_NOPS (access_fn);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location,
503 "Access function of PHI: %T\n", access_fn);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
505 = initial_condition_in_loop_num (access_fn, loop->num);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
507 = evolution_part_in_loop_num (access_fn, loop->num);
508 }
509
510 if (!access_fn
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
512 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
513 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
514 && TREE_CODE (step) != INTEGER_CST))
515 {
516 worklist.safe_push (stmt_vinfo);
517 continue;
518 }
519
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
521 != NULL_TREE);
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
523
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
527 }
528
529
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist.length () > 0)
532 {
533 stmt_vec_info stmt_vinfo = worklist.pop ();
534 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
535 tree def = PHI_RESULT (phi);
536
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
539
540 gcc_assert (!virtual_operand_p (def)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
542
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
545 &reduc_chain);
546 if (reduc_stmt_info)
547 {
548 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
550 if (double_reduc)
551 {
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "Detected double reduction.\n");
555
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
558 }
559 else
560 {
561 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
562 {
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE, vect_location,
565 "Detected vectorizable nested cycle.\n");
566
567 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
568 }
569 else
570 {
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected reduction.\n");
574
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
577 /* Store the reduction cycles for possible vectorization in
578 loop-aware SLP if it was not detected as reduction
579 chain. */
580 if (! reduc_chain)
581 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
582 (reduc_stmt_info);
583 }
584 }
585 }
586 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 "Unknown def-use cycle pattern.\n");
590 }
591 }
592
593
594 /* Function vect_analyze_scalar_cycles.
595
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
599 We do that for the loop represented by LOOP_VINFO, and also for its
600 inner-loop, if it exists.
601 Examples for scalar cycles:
602
603 Example1: reduction:
604
605 loop1:
606 for (i=0; i<N; i++)
607 sum += a[i];
608
609 Example2: induction:
610
611 loop2:
612 for (i=0; i<N; i++)
613 a[i] = i; */
614
615 static void
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
617 {
618 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
619
620 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
621
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623 Reductions in such an inner-loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
627 vectorizing them.
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
630
631 if (loop->inner)
632 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
633 }
634
635 /* Transfer group and reduction information from STMT_INFO to its
636 pattern stmt. */
637
638 static void
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
640 {
641 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
642 stmt_vec_info stmtp;
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
645 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
646 do
647 {
648 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
650 == STMT_VINFO_DEF_TYPE (stmt_info));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
652 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
653 if (stmt_info)
654 REDUC_GROUP_NEXT_ELEMENT (stmtp)
655 = STMT_VINFO_RELATED_STMT (stmt_info);
656 }
657 while (stmt_info);
658 }
659
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
661
662 static void
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
664 {
665 stmt_vec_info first;
666 unsigned i;
667
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
669 {
670 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
671 while (next)
672 {
673 if ((STMT_VINFO_IN_PATTERN_P (next)
674 != STMT_VINFO_IN_PATTERN_P (first))
675 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
676 break;
677 next = REDUC_GROUP_NEXT_ELEMENT (next);
678 }
679 /* If all reduction chain members are well-formed patterns adjust
680 the group to group the pattern stmts instead. */
681 if (! next
682 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
683 {
684 if (STMT_VINFO_IN_PATTERN_P (first))
685 {
686 vect_fixup_reduc_chain (first);
687 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
688 = STMT_VINFO_RELATED_STMT (first);
689 }
690 }
691 /* If not all stmt in the chain are patterns or if we failed
692 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
693 it as regular reduction instead. */
694 else
695 {
696 stmt_vec_info vinfo = first;
697 stmt_vec_info last = NULL;
698 while (vinfo)
699 {
700 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
701 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
702 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
703 last = vinfo;
704 vinfo = next;
705 }
706 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
707 = vect_internal_def;
708 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
709 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
710 --i;
711 }
712 }
713 }
714
715 /* Function vect_get_loop_niters.
716
717 Determine how many iterations the loop is executed and place it
718 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
719 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
720 niter information holds in ASSUMPTIONS.
721
722 Return the loop exit condition. */
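
/* For instance, for a simple counted loop such as

     for (i = 0; i < 100; i++) ...

   the latch runs 99 times, so NUMBER_OF_ITERATIONSM1 is 99 and
   NUMBER_OF_ITERATIONS (the number of header executions) is 100. */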
723
724
725 static gcond *
726 vect_get_loop_niters (class loop *loop, tree *assumptions,
727 tree *number_of_iterations, tree *number_of_iterationsm1)
728 {
729 edge exit = single_exit (loop);
730 class tree_niter_desc niter_desc;
731 tree niter_assumptions, niter, may_be_zero;
732 gcond *cond = get_loop_exit_condition (loop);
733
734 *assumptions = boolean_true_node;
735 *number_of_iterationsm1 = chrec_dont_know;
736 *number_of_iterations = chrec_dont_know;
737 DUMP_VECT_SCOPE ("get_loop_niters");
738
739 if (!exit)
740 return cond;
741
742 may_be_zero = NULL_TREE;
743 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
744 || chrec_contains_undetermined (niter_desc.niter))
745 return cond;
746
747 niter_assumptions = niter_desc.assumptions;
748 may_be_zero = niter_desc.may_be_zero;
749 niter = niter_desc.niter;
750
751 if (may_be_zero && integer_zerop (may_be_zero))
752 may_be_zero = NULL_TREE;
753
754 if (may_be_zero)
755 {
756 if (COMPARISON_CLASS_P (may_be_zero))
757 {
758 /* Try to combine may_be_zero with assumptions, this can simplify
759 computation of niter expression. */
760 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
761 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
762 niter_assumptions,
763 fold_build1 (TRUTH_NOT_EXPR,
764 boolean_type_node,
765 may_be_zero));
766 else
767 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
768 build_int_cst (TREE_TYPE (niter), 0),
769 rewrite_to_non_trapping_overflow (niter));
770
771 may_be_zero = NULL_TREE;
772 }
773 else if (integer_nonzerop (may_be_zero))
774 {
775 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
776 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
777 return cond;
778 }
779 else
780 return cond;
781 }
782
783 *assumptions = niter_assumptions;
784 *number_of_iterationsm1 = niter;
785
786 /* We want the number of loop header executions, which is the number
787 of latch executions plus one.
788 ??? For UINT_MAX latch executions this number overflows to zero
789 for loops like do { n++; } while (n != 0); */
790 if (niter && !chrec_contains_undetermined (niter))
791 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
792 build_int_cst (TREE_TYPE (niter), 1));
793 *number_of_iterations = niter;
794
795 return cond;
796 }
797
798 /* Function bb_in_loop_p
799
800 Used as predicate for dfs order traversal of the loop bbs. */
801
802 static bool
803 bb_in_loop_p (const_basic_block bb, const void *data)
804 {
805 const class loop *const loop = (const class loop *)data;
806 if (flow_bb_inside_loop_p (loop, bb))
807 return true;
808 return false;
809 }
810
811
812 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
813 stmt_vec_info structs for all the stmts in LOOP_IN. */
814
815 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
816 : vec_info (vec_info::loop, init_cost (loop_in), shared),
817 loop (loop_in),
818 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
819 num_itersm1 (NULL_TREE),
820 num_iters (NULL_TREE),
821 num_iters_unchanged (NULL_TREE),
822 num_iters_assumptions (NULL_TREE),
823 th (0),
824 versioning_threshold (0),
825 vectorization_factor (0),
826 max_vectorization_factor (0),
827 mask_skip_niters (NULL_TREE),
828 rgroup_compare_type (NULL_TREE),
829 simd_if_cond (NULL_TREE),
830 unaligned_dr (NULL),
831 peeling_for_alignment (0),
832 ptr_mask (0),
833 ivexpr_map (NULL),
834 scan_map (NULL),
835 slp_unrolling_factor (1),
836 single_scalar_iteration_cost (0),
837 vec_outside_cost (0),
838 vec_inside_cost (0),
839 vectorizable (false),
840 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
841 using_partial_vectors_p (false),
842 epil_using_partial_vectors_p (false),
843 peeling_for_gaps (false),
844 peeling_for_niter (false),
845 no_data_dependencies (false),
846 has_mask_store (false),
847 scalar_loop_scaling (profile_probability::uninitialized ()),
848 scalar_loop (NULL),
849 orig_loop_info (NULL)
850 {
851 /* CHECKME: We want to visit all BBs before their successors (except for
852 latch blocks, for which this assertion wouldn't hold). In the simple
853 case of the loop forms we allow, a dfs order of the BBs would be the same
854 as reversed postorder traversal, so we are safe. */
855
856 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
857 bbs, loop->num_nodes, loop);
858 gcc_assert (nbbs == loop->num_nodes);
859
860 for (unsigned int i = 0; i < nbbs; i++)
861 {
862 basic_block bb = bbs[i];
863 gimple_stmt_iterator si;
864
865 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
866 {
867 gimple *phi = gsi_stmt (si);
868 gimple_set_uid (phi, 0);
869 add_stmt (phi);
870 }
871
872 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
873 {
874 gimple *stmt = gsi_stmt (si);
875 gimple_set_uid (stmt, 0);
876 if (is_gimple_debug (stmt))
877 continue;
878 add_stmt (stmt);
879 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
880 third argument is the #pragma omp simd if (x) condition: when it is 0,
881 the loop shouldn't be vectorized; when it is a non-zero constant, it
882 should be vectorized normally; otherwise the loop is versioned, with
883 the vectorized copy used when the condition is non-zero at runtime. */
884 if (loop_in->simduid
885 && is_gimple_call (stmt)
886 && gimple_call_internal_p (stmt)
887 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
888 && gimple_call_num_args (stmt) >= 3
889 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
890 && (loop_in->simduid
891 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
892 {
893 tree arg = gimple_call_arg (stmt, 2);
894 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
895 simd_if_cond = arg;
896 else
897 gcc_assert (integer_nonzerop (arg));
898 }
899 }
900 }
901
902 epilogue_vinfos.create (6);
903 }
904
905 /* Free all levels of rgroup CONTROLS. */
906
907 void
908 release_vec_loop_controls (vec<rgroup_controls> *controls)
909 {
910 rgroup_controls *rgc;
911 unsigned int i;
912 FOR_EACH_VEC_ELT (*controls, i, rgc)
913 rgc->controls.release ();
914 controls->release ();
915 }
916
917 /* Free all memory used by the _loop_vec_info, as well as all the
918 stmt_vec_info structs of all the stmts in the loop. */
919
920 _loop_vec_info::~_loop_vec_info ()
921 {
922 free (bbs);
923
924 release_vec_loop_controls (&masks);
925 release_vec_loop_controls (&lens);
926 delete ivexpr_map;
927 delete scan_map;
928 epilogue_vinfos.release ();
929
930 loop->aux = NULL;
931 }
932
933 /* Return an invariant or register for EXPR and emit necessary
934 computations in the LOOP_VINFO loop preheader. */
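
/* For example (illustrative, with hypothetical SSA names): given EXPR
   n_5 + 7, this emits _t = n_5 + 7 on the preheader edge and returns _t,
   and a later call with an equal expression returns the same cached SSA
   name instead of emitting the computation again. */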
935
936 tree
937 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
938 {
939 if (is_gimple_reg (expr)
940 || is_gimple_min_invariant (expr))
941 return expr;
942
943 if (! loop_vinfo->ivexpr_map)
944 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
945 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
946 if (! cached)
947 {
948 gimple_seq stmts = NULL;
949 cached = force_gimple_operand (unshare_expr (expr),
950 &stmts, true, NULL_TREE);
951 if (stmts)
952 {
953 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
954 gsi_insert_seq_on_edge_immediate (e, stmts);
955 }
956 }
957 return cached;
958 }
959
960 /* Return true if we can use CMP_TYPE as the comparison type to produce
961 all masks required to mask LOOP_VINFO. */
962
963 static bool
964 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
965 {
966 rgroup_controls *rgm;
967 unsigned int i;
968 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
969 if (rgm->type != NULL_TREE
970 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
971 cmp_type, rgm->type,
972 OPTIMIZE_FOR_SPEED))
973 return false;
974 return true;
975 }
976
977 /* Calculate the maximum number of scalars per iteration across all
978 rgroups in LOOP_VINFO. */
979
980 static unsigned int
981 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
982 {
983 unsigned int res = 1;
984 unsigned int i;
985 rgroup_controls *rgm;
986 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
987 res = MAX (res, rgm->max_nscalars_per_iter);
988 return res;
989 }
990
991 /* Calculate the minimum precision necessary to represent:
992
993 MAX_NITERS * FACTOR
994
995 as an unsigned integer, where MAX_NITERS is the maximum number of
996 loop header iterations for the original scalar form of LOOP_VINFO. */
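
/* For example (purely illustrative numbers): if MAX_NITERS is known to be
   at most 1000 and FACTOR is 2, the product is 2000, which needs 11 bits
   as an unsigned value (2^11 == 2048 > 2000). */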
997
998 static unsigned
999 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1000 {
1001 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1002
1003 /* Get the maximum number of iterations that is representable
1004 in the counter type. */
1005 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1006 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1007
1008 /* Get a more refined estimate for the number of iterations. */
1009 widest_int max_back_edges;
1010 if (max_loop_iterations (loop, &max_back_edges))
1011 max_ni = wi::smin (max_ni, max_back_edges + 1);
1012
1013 /* Work out how many bits we need to represent the limit. */
1014 return wi::min_precision (max_ni * factor, UNSIGNED);
1015 }
1016
1017 /* True if the loop needs peeling or partial vectors when vectorized. */
1018
1019 static bool
1020 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1021 {
1022 unsigned HOST_WIDE_INT const_vf;
1023 HOST_WIDE_INT max_niter
1024 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1025
1026 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1027 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1028 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1029 (loop_vinfo));
1030
1031 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1032 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1033 {
1034 /* Work out the (constant) number of iterations that need to be
1035 peeled for reasons other than niters. */
1036 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1037 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1038 peel_niter += 1;
1039 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1040 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1041 return true;
1042 }
1043 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1044 /* ??? When peeling for gaps but not alignment, we could
1045 try to check whether the (variable) niters is known to be
1046 VF * N + 1. That's something of a niche case though. */
1047 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1048 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1049 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1050 < (unsigned) exact_log2 (const_vf))
1051 /* In case of versioning, check if the maximum number of
1052 iterations is greater than th. If they are identical,
1053 the epilogue is unnecessary. */
1054 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1055 || ((unsigned HOST_WIDE_INT) max_niter
1056 > (th / const_vf) * const_vf))))
1057 return true;
1058
1059 return false;
1060 }
1061
1062 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1063 whether we can actually generate the masks required. Return true if so,
1064 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
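
/* Conceptually, full masking turns

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   into something like the following sketch (WHILE_ULT produces a mask
   whose lane K is active iff i + K < n; MASK_LOAD/MASK_STORE only touch
   the active lanes, and the real internal-fn calls take more arguments):

     for (i = 0; i < n; i += VF)
       {
         mask = WHILE_ULT (i, n);
         va = MASK_LOAD (&b[i], mask) + MASK_LOAD (&c[i], mask);
         MASK_STORE (&a[i], mask, va);
       }

   so no scalar epilogue is needed; the comparison type chosen below is
   the scalar type used for I and N in the WHILE_ULT above. */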
1065
1066 static bool
1067 vect_verify_full_masking (loop_vec_info loop_vinfo)
1068 {
1069 unsigned int min_ni_width;
1070 unsigned int max_nscalars_per_iter
1071 = vect_get_max_nscalars_per_iter (loop_vinfo);
1072
1073 /* Use a normal loop if there are no statements that need masking.
1074 This only happens in rare degenerate cases: it means that the loop
1075 has no loads, no stores, and no live-out values. */
1076 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1077 return false;
1078
1079 /* Work out how many bits we need to represent the limit. */
1080 min_ni_width
1081 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1082
1083 /* Find a scalar mode for which WHILE_ULT is supported. */
1084 opt_scalar_int_mode cmp_mode_iter;
1085 tree cmp_type = NULL_TREE;
1086 tree iv_type = NULL_TREE;
1087 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1088 unsigned int iv_precision = UINT_MAX;
1089
1090 if (iv_limit != -1)
1091 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1092 UNSIGNED);
1093
1094 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1095 {
1096 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1097 if (cmp_bits >= min_ni_width
1098 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1099 {
1100 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1101 if (this_type
1102 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1103 {
1104 /* Although we could stop as soon as we find a valid mode,
1105 there are at least two reasons why that's not always the
1106 best choice:
1107
1108 - An IV that's Pmode or wider is more likely to be reusable
1109 in address calculations than an IV that's narrower than
1110 Pmode.
1111
1112 - Doing the comparison in IV_PRECISION or wider allows
1113 a natural 0-based IV, whereas using a narrower comparison
1114 type requires mitigations against wrap-around.
1115
1116 Conversely, if the IV limit is variable, doing the comparison
1117 in a wider type than the original type can introduce
1118 unnecessary extensions, so picking the widest valid mode
1119 is not always a good choice either.
1120
1121 Here we prefer the first IV type that's Pmode or wider,
1122 and the first comparison type that's IV_PRECISION or wider.
1123 (The comparison type must be no wider than the IV type,
1124 to avoid extensions in the vector loop.)
1125
1126 ??? We might want to try continuing beyond Pmode for ILP32
1127 targets if CMP_BITS < IV_PRECISION. */
1128 iv_type = this_type;
1129 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1130 cmp_type = this_type;
1131 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1132 break;
1133 }
1134 }
1135 }
1136
1137 if (!cmp_type)
1138 return false;
1139
1140 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1141 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1142 return true;
1143 }
1144
1145 /* Check whether we can use vector access with length based on precision
1146 comparison. So far, to keep it simple, we only allow the case that the
1147 precision of the target supported length is larger than the precision
1148 required by loop niters. */
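
/* Conceptually, length-based partial vectors handle a loop such as
   a[i] = b[i] + c[i] with a per-iteration length instead of a mask,
   roughly (a sketch only; the real IFN_LEN_* calls take additional
   arguments):

     for (i = 0; i < n; i += VF)
       {
         len = MIN (n - i, VF);
         LEN_STORE (&a[i], len, LEN_LOAD (&b[i], len) + LEN_LOAD (&c[i], len));
       }

   again avoiding a scalar epilogue. */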
1149
1150 static bool
1151 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1152 {
1153 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1154 return false;
1155
1156 unsigned int max_nitems_per_iter = 1;
1157 unsigned int i;
1158 rgroup_controls *rgl;
1159 /* Find the maximum number of items per iteration for every rgroup. */
1160 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1161 {
1162 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1163 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1164 }
1165
1166 /* Work out how many bits we need to represent the length limit. */
1167 unsigned int min_ni_prec
1168 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1169
1170 /* Now use the maximum of the precisions below for one suitable IV type:
1171 - the IV's natural precision
1172 - the precision needed to hold: the maximum number of scalar
1173 iterations multiplied by the scale factor (min_ni_prec above)
1174 - the Pmode precision
1175
1176 If min_ni_prec is less than the precision of the current niters,
1177 we prefer to still use the niters type. Prefer to use Pmode and
1178 wider IV to avoid narrow conversions. */
1179
1180 unsigned int ni_prec
1181 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1182 min_ni_prec = MAX (min_ni_prec, ni_prec);
1183 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1184
1185 tree iv_type = NULL_TREE;
1186 opt_scalar_int_mode tmode_iter;
1187 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1188 {
1189 scalar_mode tmode = tmode_iter.require ();
1190 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1191
1192 /* ??? Do we really want to construct one IV whose precision exceeds
1193 BITS_PER_WORD? */
1194 if (tbits > BITS_PER_WORD)
1195 break;
1196
1197 /* Find the first available standard integral type. */
1198 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1199 {
1200 iv_type = build_nonstandard_integer_type (tbits, true);
1201 break;
1202 }
1203 }
1204
1205 if (!iv_type)
1206 {
1207 if (dump_enabled_p ())
1208 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1209 "can't vectorize with length-based partial vectors"
1210 " because there is no suitable iv type.\n");
1211 return false;
1212 }
1213
1214 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1215 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1216
1217 return true;
1218 }
1219
1220 /* Calculate the cost of one scalar iteration of the loop. */
1221 static void
1222 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1223 {
1224 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1225 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1226 int nbbs = loop->num_nodes, factor;
1227 int innerloop_iters, i;
1228
1229 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1230
1231 /* Gather costs for statements in the scalar loop. */
1232
1233 /* FORNOW. */
1234 innerloop_iters = 1;
1235 if (loop->inner)
1236 innerloop_iters = 50; /* FIXME */
1237
1238 for (i = 0; i < nbbs; i++)
1239 {
1240 gimple_stmt_iterator si;
1241 basic_block bb = bbs[i];
1242
1243 if (bb->loop_father == loop->inner)
1244 factor = innerloop_iters;
1245 else
1246 factor = 1;
1247
1248 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1249 {
1250 gimple *stmt = gsi_stmt (si);
1251 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1252
1253 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1254 continue;
1255
1256 /* Skip stmts that are not vectorized inside the loop. */
1257 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1258 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1259 && (!STMT_VINFO_LIVE_P (vstmt_info)
1260 || !VECTORIZABLE_CYCLE_DEF
1261 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1262 continue;
1263
1264 vect_cost_for_stmt kind;
1265 if (STMT_VINFO_DATA_REF (stmt_info))
1266 {
1267 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1268 kind = scalar_load;
1269 else
1270 kind = scalar_store;
1271 }
1272 else if (vect_nop_conversion_p (stmt_info))
1273 continue;
1274 else
1275 kind = scalar_stmt;
1276
1277 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1278 factor, kind, stmt_info, 0, vect_prologue);
1279 }
1280 }
1281
1282 /* Now accumulate cost. */
1283 void *target_cost_data = init_cost (loop);
1284 stmt_info_for_cost *si;
1285 int j;
1286 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1287 j, si)
1288 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1289 si->kind, si->stmt_info, si->vectype,
1290 si->misalign, vect_body);
1291 unsigned dummy, body_cost = 0;
1292 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1293 destroy_cost_data (target_cost_data);
1294 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1295 }
1296
1297
1298 /* Function vect_analyze_loop_form_1.
1299
1300 Verify that certain CFG restrictions hold, including:
1301 - the loop has a pre-header
1302 - the loop has a single entry and exit
1303 - the loop exit condition is simple enough
1304 - the number of iterations can be analyzed, i.e., a countable loop. The
1305 niter could be analyzed under some assumptions. */
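
/* For example (illustrative only): after if-conversion, a loop such as

     for (i = 0; i < n; i++)
       a[i] = b[i] ? b[i] : 1;

   satisfies these restrictions, whereas

     for (i = 0; i < n; i++)
       if (a[i] == key)
         break;

   does not, because the early break gives the loop two exits. */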
1306
1307 opt_result
1308 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1309 tree *assumptions, tree *number_of_iterationsm1,
1310 tree *number_of_iterations, gcond **inner_loop_cond)
1311 {
1312 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1313
1314 /* Different restrictions apply when we are considering an inner-most loop,
1315 vs. an outer (nested) loop.
1316 (FORNOW. May want to relax some of these restrictions in the future). */
1317
1318 if (!loop->inner)
1319 {
1320 /* Inner-most loop. We currently require that the number of BBs is
1321 exactly 2 (the header and latch). Vectorizable inner-most loops
1322 look like this:
1323
1324 (pre-header)
1325 |
1326 header <--------+
1327 | | |
1328 | +--> latch --+
1329 |
1330 (exit-bb) */
1331
1332 if (loop->num_nodes != 2)
1333 return opt_result::failure_at (vect_location,
1334 "not vectorized:"
1335 " control flow in loop.\n");
1336
1337 if (empty_block_p (loop->header))
1338 return opt_result::failure_at (vect_location,
1339 "not vectorized: empty loop.\n");
1340 }
1341 else
1342 {
1343 class loop *innerloop = loop->inner;
1344 edge entryedge;
1345
1346 /* Nested loop. We currently require that the loop is doubly-nested,
1347 contains a single inner loop, and the number of BBs is exactly 5.
1348 Vectorizable outer-loops look like this:
1349
1350 (pre-header)
1351 |
1352 header <---+
1353 | |
1354 inner-loop |
1355 | |
1356 tail ------+
1357 |
1358 (exit-bb)
1359
1360 The inner-loop has the properties expected of inner-most loops
1361 as described above. */
1362
1363 if ((loop->inner)->inner || (loop->inner)->next)
1364 return opt_result::failure_at (vect_location,
1365 "not vectorized:"
1366 " multiple nested loops.\n");
1367
1368 if (loop->num_nodes != 5)
1369 return opt_result::failure_at (vect_location,
1370 "not vectorized:"
1371 " control flow in loop.\n");
1372
1373 entryedge = loop_preheader_edge (innerloop);
1374 if (entryedge->src != loop->header
1375 || !single_exit (innerloop)
1376 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1377 return opt_result::failure_at (vect_location,
1378 "not vectorized:"
1379 " unsupported outerloop form.\n");
1380
1381 /* Analyze the inner-loop. */
1382 tree inner_niterm1, inner_niter, inner_assumptions;
1383 opt_result res
1384 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1385 &inner_assumptions, &inner_niterm1,
1386 &inner_niter, NULL);
1387 if (!res)
1388 {
1389 if (dump_enabled_p ())
1390 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1391 "not vectorized: Bad inner loop.\n");
1392 return res;
1393 }
1394
1395 /* Don't support analyzing niter under assumptions for inner
1396 loop. */
1397 if (!integer_onep (inner_assumptions))
1398 return opt_result::failure_at (vect_location,
1399 "not vectorized: Bad inner loop.\n");
1400
1401 if (!expr_invariant_in_loop_p (loop, inner_niter))
1402 return opt_result::failure_at (vect_location,
1403 "not vectorized: inner-loop count not"
1404 " invariant.\n");
1405
1406 if (dump_enabled_p ())
1407 dump_printf_loc (MSG_NOTE, vect_location,
1408 "Considering outer-loop vectorization.\n");
1409 }
1410
1411 if (!single_exit (loop))
1412 return opt_result::failure_at (vect_location,
1413 "not vectorized: multiple exits.\n");
1414 if (EDGE_COUNT (loop->header->preds) != 2)
1415 return opt_result::failure_at (vect_location,
1416 "not vectorized:"
1417 " too many incoming edges.\n");
1418
1419 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1420 that the loop is represented as a do-while (with a proper if-guard
1421 before the loop if needed), where the loop header contains all the
1422 executable statements, and the latch is empty. */
1423 if (!empty_block_p (loop->latch)
1424 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1425 return opt_result::failure_at (vect_location,
1426 "not vectorized: latch block not empty.\n");
1427
1428 /* Make sure the exit is not abnormal. */
1429 edge e = single_exit (loop);
1430 if (e->flags & EDGE_ABNORMAL)
1431 return opt_result::failure_at (vect_location,
1432 "not vectorized:"
1433 " abnormal loop exit edge.\n");
1434
1435 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1436 number_of_iterationsm1);
1437 if (!*loop_cond)
1438 return opt_result::failure_at
1439 (vect_location,
1440 "not vectorized: complicated exit condition.\n");
1441
1442 if (integer_zerop (*assumptions)
1443 || !*number_of_iterations
1444 || chrec_contains_undetermined (*number_of_iterations))
1445 return opt_result::failure_at
1446 (*loop_cond,
1447 "not vectorized: number of iterations cannot be computed.\n");
1448
1449 if (integer_zerop (*number_of_iterations))
1450 return opt_result::failure_at
1451 (*loop_cond,
1452 "not vectorized: number of iterations = 0.\n");
1453
1454 return opt_result::success ();
1455 }
1456
1457 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1458
1459 opt_loop_vec_info
1460 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1461 {
1462 tree assumptions, number_of_iterations, number_of_iterationsm1;
1463 gcond *loop_cond, *inner_loop_cond = NULL;
1464
1465 opt_result res
1466 = vect_analyze_loop_form_1 (loop, &loop_cond,
1467 &assumptions, &number_of_iterationsm1,
1468 &number_of_iterations, &inner_loop_cond);
1469 if (!res)
1470 return opt_loop_vec_info::propagate_failure (res);
1471
1472 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1473 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1474 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1475 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1476 if (!integer_onep (assumptions))
1477 {
1478 /* We consider vectorizing this loop by versioning it under
1479 some assumptions. In order to do this, we need to clear
1480 existing information computed by scev and niter analyzer. */
1481 scev_reset_htab ();
1482 free_numbers_of_iterations_estimates (loop);
1483 /* Also set flag for this loop so that following scev and niter
1484 analysis are done under the assumptions. */
1485 loop_constraint_set (loop, LOOP_C_FINITE);
1486 /* Also record the assumptions for versioning. */
1487 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1488 }
1489
1490 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1491 {
1492 if (dump_enabled_p ())
1493 {
1494 dump_printf_loc (MSG_NOTE, vect_location,
1495 "Symbolic number of iterations is ");
1496 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1497 dump_printf (MSG_NOTE, "\n");
1498 }
1499 }
1500
1501 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1502 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1503 if (inner_loop_cond)
1504 {
1505 stmt_vec_info inner_loop_cond_info
1506 = loop_vinfo->lookup_stmt (inner_loop_cond);
1507 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1508 }
1509
1510 gcc_assert (!loop->aux);
1511 loop->aux = loop_vinfo;
1512 return opt_loop_vec_info::success (loop_vinfo);
1513 }
1514
1515
1516
1517 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1518 statements, update the vectorization factor. */
1519
1520 static void
1521 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1522 {
1523 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1524 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1525 int nbbs = loop->num_nodes;
1526 poly_uint64 vectorization_factor;
1527 int i;
1528
1529 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1530
1531 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1532 gcc_assert (known_ne (vectorization_factor, 0U));
1533
1534 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1535 vectorization factor of the loop is the unrolling factor required by
1536 the SLP instances. If that unrolling factor is 1, we say that we
1537 perform pure SLP on the loop - cross iteration parallelism is not
1538 exploited. */
1539 bool only_slp_in_loop = true;
1540 for (i = 0; i < nbbs; i++)
1541 {
1542 basic_block bb = bbs[i];
1543 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1544 gsi_next (&si))
1545 {
1546 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1547 if (!stmt_info)
1548 continue;
1549 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1550 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1551 && !PURE_SLP_STMT (stmt_info))
1552 /* STMT needs both SLP and loop-based vectorization. */
1553 only_slp_in_loop = false;
1554 }
1555 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1556 gsi_next (&si))
1557 {
1558 if (is_gimple_debug (gsi_stmt (si)))
1559 continue;
1560 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1561 stmt_info = vect_stmt_to_vectorize (stmt_info);
1562 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1563 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1564 && !PURE_SLP_STMT (stmt_info))
1565 /* STMT needs both SLP and loop-based vectorization. */
1566 only_slp_in_loop = false;
1567 }
1568 }
1569
1570 if (only_slp_in_loop)
1571 {
1572 if (dump_enabled_p ())
1573 dump_printf_loc (MSG_NOTE, vect_location,
1574 "Loop contains only SLP stmts\n");
1575 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1576 }
1577 else
1578 {
1579 if (dump_enabled_p ())
1580 dump_printf_loc (MSG_NOTE, vect_location,
1581 "Loop contains SLP and non-SLP stmts\n");
1582 /* Both the vectorization factor and unroll factor have the form
1583 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1584 so they must have a common multiple. */
1585 vectorization_factor
1586 = force_common_multiple (vectorization_factor,
1587 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1588 }
1589
1590 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1591 if (dump_enabled_p ())
1592 {
1593 dump_printf_loc (MSG_NOTE, vect_location,
1594 "Updating vectorization factor to ");
1595 dump_dec (MSG_NOTE, vectorization_factor);
1596 dump_printf (MSG_NOTE, ".\n");
1597 }
1598 }
1599
1600 /* Return true if STMT_INFO describes a double reduction phi and if
1601 the other phi in the reduction is also relevant for vectorization.
1602 This rejects cases such as:
1603
1604 outer1:
1605 x_1 = PHI <x_3(outer2), ...>;
1606 ...
1607
1608 inner:
1609 x_2 = ...;
1610 ...
1611
1612 outer2:
1613 x_3 = PHI <x_2(inner)>;
1614
1615 if nothing in x_2 or elsewhere makes x_1 relevant. */
1616
1617 static bool
1618 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1619 {
1620 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1621 return false;
1622
1623 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1624 }
1625
1626 /* Function vect_analyze_loop_operations.
1627
1628 Scan the loop stmts and make sure they are all vectorizable. */
1629
1630 static opt_result
1631 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1632 {
1633 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1634 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1635 int nbbs = loop->num_nodes;
1636 int i;
1637 stmt_vec_info stmt_info;
1638 bool need_to_vectorize = false;
1639 bool ok;
1640
1641 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1642
1643 auto_vec<stmt_info_for_cost> cost_vec;
1644
1645 for (i = 0; i < nbbs; i++)
1646 {
1647 basic_block bb = bbs[i];
1648
1649 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1650 gsi_next (&si))
1651 {
1652 gphi *phi = si.phi ();
1653 ok = true;
1654
1655 stmt_info = loop_vinfo->lookup_stmt (phi);
1656 if (dump_enabled_p ())
1657 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1658 if (virtual_operand_p (gimple_phi_result (phi)))
1659 continue;
1660
1661 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1662 (i.e., a phi in the tail of the outer-loop). */
1663 if (! is_loop_header_bb_p (bb))
1664 {
1665 /* FORNOW: we currently don't support the case that these phis
1666 are not used in the outerloop (unless it is double reduction,
1667 i.e., this phi is vect_reduction_def), because this case
1668 would require actually doing something here. */
1669 if (STMT_VINFO_LIVE_P (stmt_info)
1670 && !vect_active_double_reduction_p (stmt_info))
1671 return opt_result::failure_at (phi,
1672 "Unsupported loop-closed phi"
1673 " in outer-loop.\n");
1674
1675 /* If PHI is used in the outer loop, we check that its operand
1676 is defined in the inner loop. */
1677 if (STMT_VINFO_RELEVANT_P (stmt_info))
1678 {
1679 tree phi_op;
1680
1681 if (gimple_phi_num_args (phi) != 1)
1682 return opt_result::failure_at (phi, "unsupported phi");
1683
1684 phi_op = PHI_ARG_DEF (phi, 0);
1685 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1686 if (!op_def_info)
1687 return opt_result::failure_at (phi, "unsupported phi\n");
1688
1689 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1690 && (STMT_VINFO_RELEVANT (op_def_info)
1691 != vect_used_in_outer_by_reduction))
1692 return opt_result::failure_at (phi, "unsupported phi\n");
1693
1694 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1695 || (STMT_VINFO_DEF_TYPE (stmt_info)
1696 == vect_double_reduction_def))
1697 && !vectorizable_lc_phi (loop_vinfo,
1698 stmt_info, NULL, NULL))
1699 return opt_result::failure_at (phi, "unsupported phi\n");
1700 }
1701
1702 continue;
1703 }
1704
1705 gcc_assert (stmt_info);
1706
1707 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1708 || STMT_VINFO_LIVE_P (stmt_info))
1709 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1710 /* A scalar-dependence cycle that we don't support. */
1711 return opt_result::failure_at (phi,
1712 "not vectorized:"
1713 " scalar dependence cycle.\n");
1714
1715 if (STMT_VINFO_RELEVANT_P (stmt_info))
1716 {
1717 need_to_vectorize = true;
1718 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1719 && ! PURE_SLP_STMT (stmt_info))
1720 ok = vectorizable_induction (loop_vinfo,
1721 stmt_info, NULL, NULL,
1722 &cost_vec);
1723 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1724 || (STMT_VINFO_DEF_TYPE (stmt_info)
1725 == vect_double_reduction_def)
1726 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1727 && ! PURE_SLP_STMT (stmt_info))
1728 ok = vectorizable_reduction (loop_vinfo,
1729 stmt_info, NULL, NULL, &cost_vec);
1730 }
1731
1732 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1733 if (ok
1734 && STMT_VINFO_LIVE_P (stmt_info)
1735 && !PURE_SLP_STMT (stmt_info))
1736 ok = vectorizable_live_operation (loop_vinfo,
1737 stmt_info, NULL, NULL, NULL,
1738 -1, false, &cost_vec);
1739
1740 if (!ok)
1741 return opt_result::failure_at (phi,
1742 "not vectorized: relevant phi not "
1743 "supported: %G",
1744 static_cast <gimple *> (phi));
1745 }
1746
1747 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1748 gsi_next (&si))
1749 {
1750 gimple *stmt = gsi_stmt (si);
1751 if (!gimple_clobber_p (stmt)
1752 && !is_gimple_debug (stmt))
1753 {
1754 opt_result res
1755 = vect_analyze_stmt (loop_vinfo,
1756 loop_vinfo->lookup_stmt (stmt),
1757 &need_to_vectorize,
1758 NULL, NULL, &cost_vec);
1759 if (!res)
1760 return res;
1761 }
1762 }
1763 } /* bbs */
1764
1765 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1766
1767 /* All operations in the loop are either irrelevant (they deal with
1768 loop control, or are dead), or only used outside the loop and can
1769 be moved out of it (e.g. invariants, inductions). The loop can
1770 therefore be optimized away by scalar optimizations. We're better
1771 off not touching this loop. */
1772 if (!need_to_vectorize)
1773 {
1774 if (dump_enabled_p ())
1775 dump_printf_loc (MSG_NOTE, vect_location,
1776 "All the computation can be taken out of the loop.\n");
1777 return opt_result::failure_at
1778 (vect_location,
1779 "not vectorized: redundant loop. no profit to vectorize.\n");
1780 }
1781
1782 return opt_result::success ();
1783 }
1784
1785 /* Return true if we know that the iteration count is smaller than the
1786 vectorization factor. Return false if it isn't, or if we can't be sure
1787 either way. */
1788
1789 static bool
1790 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1791 {
1792 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1793
1794 HOST_WIDE_INT max_niter;
1795 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1796 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1797 else
1798 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1799
1800 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1801 return true;
1802
1803 return false;
1804 }
1805
1806 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1807 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1808 definitely no, or -1 if it's worth retrying. */
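A return of -1 (rather than 0) tells the caller that re-analysis may
still succeed under different settings, e.g. after disabling SLP via
the "again" path in vect_analyze_loop_2, whereas 0 means vectorization
is known not to pay off.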
1809
1810 static int
1811 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1812 {
1813 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1814 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1815
1816 /* Only loops that can handle partially-populated vectors can have iteration
1817 counts less than the vectorization factor. */
1818 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1819 {
1820 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1821 {
1822 if (dump_enabled_p ())
1823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1824 "not vectorized: iteration count smaller than "
1825 "vectorization factor.\n");
1826 return 0;
1827 }
1828 }
1829
1830 /* If using the "very cheap" model, reject cases in which we'd keep
1831 a copy of the scalar code (even if we might be able to vectorize it). */
1832 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1833 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1834 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1835 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1836 {
1837 if (dump_enabled_p ())
1838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1839 "some scalar iterations would need to be peeled\n");
1840 return 0;
1841 }
1842
1843 int min_profitable_iters, min_profitable_estimate;
1844 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1845 &min_profitable_estimate);
1846
1847 if (min_profitable_iters < 0)
1848 {
1849 if (dump_enabled_p ())
1850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1851 "not vectorized: vectorization not profitable.\n");
1852 if (dump_enabled_p ())
1853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854 "not vectorized: vector version will never be "
1855 "profitable.\n");
1856 return -1;
1857 }
1858
1859 int min_scalar_loop_bound = (param_min_vect_loop_bound
1860 * assumed_vf);
1861
1862 /* Use the cost model only if it is more conservative than the
1863 user-specified threshold. */
1864 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1865 min_profitable_iters);
1866
1867 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1868
1869 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1870 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1871 {
1872 if (dump_enabled_p ())
1873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1874 "not vectorized: vectorization not profitable.\n");
1875 if (dump_enabled_p ())
1876 dump_printf_loc (MSG_NOTE, vect_location,
1877 "not vectorized: iteration count smaller than user "
1878 "specified loop bound parameter or minimum profitable "
1879 "iterations (whichever is more conservative).\n");
1880 return 0;
1881 }
1882
1883 /* The static profitability threshold min_profitable_estimate includes
1884 the cost of having to check at runtime whether the scalar loop
1885 should be used instead. If it turns out that we don't need or want
1886 such a check, the threshold we should use for the static estimate
1887 is simply the point at which the vector loop becomes more profitable
1888 than the scalar loop. */
1889 if (min_profitable_estimate > min_profitable_iters
1890 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1891 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1892 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1893 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1894 {
1895 if (dump_enabled_p ())
1896 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1897 " choice between the scalar and vector loops\n");
1898 min_profitable_estimate = min_profitable_iters;
1899 }
1900
1901 /* If the vector loop needs multiple iterations to be beneficial then
1902 things are probably too close to call, and the conservative thing
1903 would be to stick with the scalar code. */
1904 if (flag_vect_cost_model == VECT_COST_MODEL_VERY_CHEAP
1905 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1906 {
1907 if (dump_enabled_p ())
1908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1909 "one iteration of the vector loop would be"
1910 " more expensive than the equivalent number of"
1911 " iterations of the scalar loop\n");
1912 return 0;
1913 }
1914
1915 HOST_WIDE_INT estimated_niter;
1916
1917 /* If we are vectorizing an epilogue then we know the maximum number of
1918 scalar iterations it will cover is at least one lower than the
1919 vectorization factor of the main loop. */
1920 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1921 estimated_niter
1922 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1923 else
1924 {
1925 estimated_niter = estimated_stmt_executions_int (loop);
1926 if (estimated_niter == -1)
1927 estimated_niter = likely_max_stmt_executions_int (loop);
1928 }
1929 if (estimated_niter != -1
1930 && ((unsigned HOST_WIDE_INT) estimated_niter
1931 < MAX (th, (unsigned) min_profitable_estimate)))
1932 {
1933 if (dump_enabled_p ())
1934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1935 "not vectorized: estimated iteration count too "
1936 "small.\n");
1937 if (dump_enabled_p ())
1938 dump_printf_loc (MSG_NOTE, vect_location,
1939 "not vectorized: estimated iteration count smaller "
1940 "than specified loop bound parameter or minimum "
1941 "profitable iterations (whichever is more "
1942 "conservative).\n");
1943 return -1;
1944 }
1945
1946 return 1;
1947 }
1948
1949 static opt_result
1950 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1951 vec<data_reference_p> *datarefs,
1952 unsigned int *n_stmts)
1953 {
1954 *n_stmts = 0;
1955 for (unsigned i = 0; i < loop->num_nodes; i++)
1956 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1957 !gsi_end_p (gsi); gsi_next (&gsi))
1958 {
1959 gimple *stmt = gsi_stmt (gsi);
1960 if (is_gimple_debug (stmt))
1961 continue;
1962 ++(*n_stmts);
1963 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1964 NULL, 0);
1965 if (!res)
1966 {
1967 if (is_gimple_call (stmt) && loop->safelen)
1968 {
1969 tree fndecl = gimple_call_fndecl (stmt), op;
1970 if (fndecl != NULL_TREE)
1971 {
1972 cgraph_node *node = cgraph_node::get (fndecl);
1973 if (node != NULL && node->simd_clones != NULL)
1974 {
1975 unsigned int j, n = gimple_call_num_args (stmt);
1976 for (j = 0; j < n; j++)
1977 {
1978 op = gimple_call_arg (stmt, j);
1979 if (DECL_P (op)
1980 || (REFERENCE_CLASS_P (op)
1981 && get_base_address (op)))
1982 break;
1983 }
1984 op = gimple_call_lhs (stmt);
1985 /* Ignore #pragma omp declare simd functions
1986 if they don't have data references in the
1987 call stmt itself. */
1988 if (j == n
1989 && !(op
1990 && (DECL_P (op)
1991 || (REFERENCE_CLASS_P (op)
1992 && get_base_address (op)))))
1993 continue;
1994 }
1995 }
1996 }
1997 return res;
1998 }
1999 /* If dependence analysis would give up due to the limit on the
2000 number of datarefs, stop here and fail fatally. */
2001 if (datarefs->length ()
2002 > (unsigned)param_loop_max_datarefs_for_datadeps)
2003 return opt_result::failure_at (stmt, "exceeded param "
2004 "loop-max-datarefs-for-datadeps\n");
2005 }
2006 return opt_result::success ();
2007 }
2008
2009 /* Look for SLP-only access groups and turn each individual access into its own
2010 group. */
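/* For instance, a group of four interleaved loads that was only usable
under SLP becomes four independent groups of size 1; each keeps a gap
of group_size - 1 (here 3), or 0 for strided accesses, so the elements
the other former group members used to cover are skipped. */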
2011 static void
2012 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2013 {
2014 unsigned int i;
2015 struct data_reference *dr;
2016
2017 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2018
2019 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2020 FOR_EACH_VEC_ELT (datarefs, i, dr)
2021 {
2022 gcc_assert (DR_REF (dr));
2023 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2024
2025 /* Check whether the access is part of an interleaving chain. */
2026 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2027 {
2028 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2029 unsigned int group_size = DR_GROUP_SIZE (first_element);
2030
2031 /* Check whether this is an SLP-only group. */
2032 if (!STMT_SLP_TYPE (stmt_info)
2033 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2034 {
2035 /* Dissolve the group. */
2036 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2037
2038 stmt_vec_info vinfo = first_element;
2039 while (vinfo)
2040 {
2041 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2042 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2043 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2044 DR_GROUP_SIZE (vinfo) = 1;
2045 if (STMT_VINFO_STRIDED_P (first_element))
2046 DR_GROUP_GAP (vinfo) = 0;
2047 else
2048 DR_GROUP_GAP (vinfo) = group_size - 1;
2049 vinfo = next;
2050 }
2051 }
2052 }
2053 }
2054 }
2055
2056 /* Determine if operating on full vectors for LOOP_VINFO might leave
2057 some scalar iterations still to do. If so, decide how we should
2058 handle those scalar iterations. The possibilities are:
2059
2060 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2061 In this case:
2062
2063 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2064 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2065 LOOP_VINFO_PEELING_FOR_NITER == false
2066
2067 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2068 to handle the remaining scalar iterations. In this case:
2069
2070 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2071 LOOP_VINFO_PEELING_FOR_NITER == true
2072
2073 There are two choices:
2074
2075 (2a) Consider vectorizing the epilogue loop at the same VF as the
2076 main loop, but using partial vectors instead of full vectors.
2077 In this case:
2078
2079 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2080
2081 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2082 In this case:
2083
2084 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2085
2086 When FOR_EPILOGUE_P is true, make this determination based on the
2087 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2088 based on the assumption that LOOP_VINFO is the main loop. The caller
2089 has made sure that the number of iterations is set appropriately for
2090 this value of FOR_EPILOGUE_P. */
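/* As a concrete illustration: with VF == 4 and 10 scalar iterations,
(1) runs three partial-vector iterations (4 + 4 + 2 active lanes) and
needs no epilogue, while (2) runs two full-vector iterations and
leaves 2 scalar iterations for the epilogue loop, which can itself
use partial vectors (2a) or a lower VF (2b). */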
2091
2092 opt_result
2093 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2094 bool for_epilogue_p)
2095 {
2096 /* Determine whether there would be any scalar iterations left over. */
2097 bool need_peeling_or_partial_vectors_p
2098 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2099
2100 /* Decide whether to vectorize the loop with partial vectors. */
2101 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2102 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2103 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2104 && need_peeling_or_partial_vectors_p)
2105 {
2106 /* For partial-vector-usage=1, try to push the handling of partial
2107 vectors to the epilogue, with the main loop continuing to operate
2108 on full vectors.
2109
2110 ??? We could then end up failing to use partial vectors if we
2111 decide to peel iterations into a prologue, and if the main loop
2112 then ends up processing fewer than VF iterations. */
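/* (--param vect-partial-vector-usage is expected to take the value 0
for "never use partial vectors", 1 for "only for epilogue loops" as
handled here, and 2 for "wherever possible".) */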
2113 if (param_vect_partial_vector_usage == 1
2114 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2115 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2116 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2117 else
2118 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2119 }
2120
2121 if (dump_enabled_p ())
2122 {
2123 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2124 dump_printf_loc (MSG_NOTE, vect_location,
2125 "operating on partial vectors%s.\n",
2126 for_epilogue_p ? " for epilogue loop" : "");
2127 else
2128 dump_printf_loc (MSG_NOTE, vect_location,
2129 "operating only on full vectors%s.\n",
2130 for_epilogue_p ? " for epilogue loop" : "");
2131 }
2132
2133 if (for_epilogue_p)
2134 {
2135 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2136 gcc_assert (orig_loop_vinfo);
2137 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2138 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2139 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2140 }
2141
2142 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2143 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2144 {
2145 /* Check that the loop processes at least one full vector. */
2146 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2147 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2148 if (known_lt (wi::to_widest (scalar_niters), vf))
2149 return opt_result::failure_at (vect_location,
2150 "loop does not have enough iterations"
2151 " to support vectorization.\n");
2152
2153 /* If we need to peel an extra epilogue iteration to handle data
2154 accesses with gaps, check that there are enough scalar iterations
2155 available.
2156
2157 The check above is redundant with this one when peeling for gaps,
2158 but the distinction is useful for diagnostics. */
2159 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2160 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2161 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2162 return opt_result::failure_at (vect_location,
2163 "loop does not have enough iterations"
2164 " to support peeling for gaps.\n");
2165 }
2166
2167 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2168 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2169 && need_peeling_or_partial_vectors_p);
2170
2171 return opt_result::success ();
2172 }
2173
2174 /* Function vect_analyze_loop_2.
2175
2176 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2177 for it. The different analyses will record information in the
2178 loop_vec_info struct. */
2179 static opt_result
2180 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
2181 {
2182 opt_result ok = opt_result::success ();
2183 int res;
2184 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2185 poly_uint64 min_vf = 2;
2186 loop_vec_info orig_loop_vinfo = NULL;
2187
2188 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2189 loop_vec_info of the first vectorized loop. */
2190 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2191 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2192 else
2193 orig_loop_vinfo = loop_vinfo;
2194 gcc_assert (orig_loop_vinfo);
2195
2196 /* The first group of checks is independent of the vector size. */
2197 fatal = true;
2198
2199 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2200 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2201 return opt_result::failure_at (vect_location,
2202 "not vectorized: simd if(0)\n");
2203
2204 /* Find all data references in the loop (which correspond to vdefs/vuses)
2205 and analyze their evolution in the loop. */
2206
2207 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2208
2209 /* Gather the data references and count stmts in the loop. */
2210 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2211 {
2212 opt_result res
2213 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2214 &LOOP_VINFO_DATAREFS (loop_vinfo),
2215 n_stmts);
2216 if (!res)
2217 {
2218 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2220 "not vectorized: loop contains function "
2221 "calls or data references that cannot "
2222 "be analyzed\n");
2223 return res;
2224 }
2225 loop_vinfo->shared->save_datarefs ();
2226 }
2227 else
2228 loop_vinfo->shared->check_datarefs ();
2229
2230 /* Analyze the data references and also adjust the minimal
2231 vectorization factor according to the loads and stores. */
2232
2233 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2234 if (!ok)
2235 {
2236 if (dump_enabled_p ())
2237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2238 "bad data references.\n");
2239 return ok;
2240 }
2241
2242 /* Classify all cross-iteration scalar data-flow cycles.
2243 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2244 vect_analyze_scalar_cycles (loop_vinfo);
2245
2246 vect_pattern_recog (loop_vinfo);
2247
2248 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2249
2250 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2251 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2252
2253 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2254 if (!ok)
2255 {
2256 if (dump_enabled_p ())
2257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2258 "bad data access.\n");
2259 return ok;
2260 }
2261
2262 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2263
2264 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2265 if (!ok)
2266 {
2267 if (dump_enabled_p ())
2268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2269 "unexpected pattern.\n");
2270 return ok;
2271 }
2272
2273 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are no longer fatal. */
2274 fatal = false;
2275
2276 /* Analyze data dependences between the data-refs in the loop
2277 and adjust the maximum vectorization factor according to
2278 the dependences.
2279 FORNOW: fail at the first data dependence that we encounter. */
2280
2281 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2282 if (!ok)
2283 {
2284 if (dump_enabled_p ())
2285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2286 "bad data dependence.\n");
2287 return ok;
2288 }
2289 if (max_vf != MAX_VECTORIZATION_FACTOR
2290 && maybe_lt (max_vf, min_vf))
2291 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2292 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2293
2294 ok = vect_determine_vectorization_factor (loop_vinfo);
2295 if (!ok)
2296 {
2297 if (dump_enabled_p ())
2298 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2299 "can't determine vectorization factor.\n");
2300 return ok;
2301 }
2302 if (max_vf != MAX_VECTORIZATION_FACTOR
2303 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2304 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2305
2306 /* Compute the scalar iteration cost. */
2307 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2308
2309 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2310
2311 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2312 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2313 if (!ok)
2314 return ok;
2315
2316 /* If there are any SLP instances mark them as pure_slp. */
2317 bool slp = vect_make_slp_decision (loop_vinfo);
2318 if (slp)
2319 {
2320 /* Find stmts that need to be both vectorized and SLPed. */
2321 vect_detect_hybrid_slp (loop_vinfo);
2322
2323 /* Update the vectorization factor based on the SLP decision. */
2324 vect_update_vf_for_slp (loop_vinfo);
2325
2326 /* Optimize the SLP graph with the vectorization factor fixed. */
2327 vect_optimize_slp (loop_vinfo);
2328
2329 /* Gather the loads reachable from the SLP graph entries. */
2330 vect_gather_slp_loads (loop_vinfo);
2331 }
2332
2333 bool saved_can_use_partial_vectors_p
2334 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2335
2336 /* We don't expect to have to roll back to anything other than an empty
2337 set of rgroups. */
2338 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2339
2340 /* This is the point where we can re-start analysis with SLP forced off. */
2341 start_over:
2342
2343 /* Now the vectorization factor is final. */
2344 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2345 gcc_assert (known_ne (vectorization_factor, 0U));
2346
2347 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2348 {
2349 dump_printf_loc (MSG_NOTE, vect_location,
2350 "vectorization_factor = ");
2351 dump_dec (MSG_NOTE, vectorization_factor);
2352 dump_printf (MSG_NOTE, ", niters = %wd\n",
2353 LOOP_VINFO_INT_NITERS (loop_vinfo));
2354 }
2355
2356 /* Analyze the alignment of the data-refs in the loop.
2357 Fail if a data reference is found that cannot be vectorized. */
2358
2359 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2360 if (!ok)
2361 {
2362 if (dump_enabled_p ())
2363 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2364 "bad data alignment.\n");
2365 return ok;
2366 }
2367
2368 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2369 It is important to call pruning after vect_analyze_data_ref_accesses,
2370 since we use grouping information gathered by interleaving analysis. */
2371 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2372 if (!ok)
2373 return ok;
2374
2375 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2376 vectorization, since we do not want to add extra peeling or
2377 add versioning for alignment. */
2378 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2379 /* This pass will decide on using loop versioning and/or loop peeling in
2380 order to enhance the alignment of data references in the loop. */
2381 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2382 if (!ok)
2383 return ok;
2384
2385 if (slp)
2386 {
2387 /* Analyze operations in the SLP instances. Note this may
2388 remove unsupported SLP instances which makes the above
2389 SLP kind detection invalid. */
2390 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2391 vect_slp_analyze_operations (loop_vinfo);
2392 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2393 {
2394 ok = opt_result::failure_at (vect_location,
2395 "unsupported SLP instances\n");
2396 goto again;
2397 }
2398
2399 /* Check whether any load in ALL SLP instances is possibly permuted. */
2400 slp_tree load_node, slp_root;
2401 unsigned i, x;
2402 slp_instance instance;
2403 bool can_use_lanes = true;
2404 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2405 {
2406 slp_root = SLP_INSTANCE_TREE (instance);
2407 int group_size = SLP_TREE_LANES (slp_root);
2408 tree vectype = SLP_TREE_VECTYPE (slp_root);
2409 bool loads_permuted = false;
2410 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2411 {
2412 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2413 continue;
2414 unsigned j;
2415 stmt_vec_info load_info;
2416 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2417 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2418 {
2419 loads_permuted = true;
2420 break;
2421 }
2422 }
2423
2424 /* If the loads and stores can be handled with load/store-lane
2425 instructions record it and move on to the next instance. */
2426 if (loads_permuted
2427 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2428 && vect_store_lanes_supported (vectype, group_size, false))
2429 {
2430 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2431 {
2432 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2433 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2434 /* Use SLP for strided accesses (or if we can't use
2435 load-lanes). */
2436 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2437 || ! vect_load_lanes_supported
2438 (STMT_VINFO_VECTYPE (stmt_vinfo),
2439 DR_GROUP_SIZE (stmt_vinfo), false))
2440 break;
2441 }
2442
2443 can_use_lanes
2444 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2445
2446 if (can_use_lanes && dump_enabled_p ())
2447 dump_printf_loc (MSG_NOTE, vect_location,
2448 "SLP instance %p can use load/store-lanes\n",
2449 instance);
2450 }
2451 else
2452 {
2453 can_use_lanes = false;
2454 break;
2455 }
2456 }
2457
2458 /* If all SLP instances can use load/store-lanes, abort SLP and try
2459 again with SLP disabled. */
2460 if (can_use_lanes)
2461 {
2462 ok = opt_result::failure_at (vect_location,
2463 "Built SLP cancelled: can use "
2464 "load/store-lanes\n");
2465 if (dump_enabled_p ())
2466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2467 "Built SLP cancelled: all SLP instances support "
2468 "load/store-lanes\n");
2469 goto again;
2470 }
2471 }
2472
2473 /* Dissolve SLP-only groups. */
2474 vect_dissolve_slp_only_groups (loop_vinfo);
2475
2476 /* Scan all the remaining operations in the loop that are not subject
2477 to SLP and make sure they are vectorizable. */
2478 ok = vect_analyze_loop_operations (loop_vinfo);
2479 if (!ok)
2480 {
2481 if (dump_enabled_p ())
2482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2483 "bad operation or unsupported loop bound.\n");
2484 return ok;
2485 }
2486
2487 /* For now, we don't expect to mix both the masking and the length
2488 approaches for one loop; disable partial vectors if both are recorded. */
2489 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2490 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2491 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2492 {
2493 if (dump_enabled_p ())
2494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2495 "can't vectorize a loop with partial vectors"
2496 " because we don't expect to mix different"
2497 " approaches with partial vectors for the"
2498 " same loop.\n");
2499 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2500 }
2501
2502 /* If we still have the option of using partial vectors,
2503 check whether we can generate the necessary loop controls. */
2504 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2505 && !vect_verify_full_masking (loop_vinfo)
2506 && !vect_verify_loop_lens (loop_vinfo))
2507 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2508
2509 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2510 to be able to handle fewer than VF scalars, or needs to have a lower VF
2511 than the main loop. */
2512 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2513 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2514 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2515 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2516 return opt_result::failure_at (vect_location,
2517 "Vectorization factor too high for"
2518 " epilogue loop.\n");
2519
2520 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2521 assuming that the loop will be used as a main loop. We will redo
2522 this analysis later if we instead decide to use the loop as an
2523 epilogue loop. */
2524 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2525 if (!ok)
2526 return ok;
2527
2528 /* Check the costings of the loop make vectorizing worthwhile. */
2529 res = vect_analyze_loop_costing (loop_vinfo);
2530 if (res < 0)
2531 {
2532 ok = opt_result::failure_at (vect_location,
2533 "Loop costings may not be worthwhile.\n");
2534 goto again;
2535 }
2536 if (!res)
2537 return opt_result::failure_at (vect_location,
2538 "Loop costings not worthwhile.\n");
2539
2540 /* If an epilogue loop is required make sure we can create one. */
2541 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2542 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2543 {
2544 if (dump_enabled_p ())
2545 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2546 if (!vect_can_advance_ivs_p (loop_vinfo)
2547 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2548 single_exit (LOOP_VINFO_LOOP
2549 (loop_vinfo))))
2550 {
2551 ok = opt_result::failure_at (vect_location,
2552 "not vectorized: can't create required "
2553 "epilog loop\n");
2554 goto again;
2555 }
2556 }
2557
2558 /* During peeling, we need to check whether the number of loop iterations
2559 is enough for both the peeled prolog loop and the vector loop. This
2560 check can be merged with the threshold check of loop versioning, so
2561 increase the threshold for this case if necessary.
2562
2563 If we are analyzing an epilogue we still want to check what its
2564 versioning threshold would be. If we decide to vectorize the epilogues we
2565 will want to use the lowest versioning threshold of all epilogues and main
2566 loop. This will enable us to enter a vectorized epilogue even when
2567 versioning the loop. We can't simply check whether the epilogue requires
2568 versioning though since we may have skipped some versioning checks when
2569 analyzing the epilogue. For instance, checks for alias versioning will be
2570 skipped when dealing with epilogues as we assume we already checked them
2571 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
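/* For example, if the prologue may need up to nunits - 1 iterations to
align an unaligned data reference, the vector loop needs VF iterations
for one full vector iteration, and peeling for gaps needs one more,
the threshold below becomes (nunits - 1) + VF + 1, possibly raised
further to the cost-model threshold TH. */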
2572 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2573 {
2574 poly_uint64 niters_th = 0;
2575 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2576
2577 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2578 {
2579 /* Niters for peeled prolog loop. */
2580 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2581 {
2582 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2583 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2584 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2585 }
2586 else
2587 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2588 }
2589
2590 /* Niters for at least one iteration of vectorized loop. */
2591 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2592 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2593 /* One additional iteration because of peeling for gap. */
2594 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2595 niters_th += 1;
2596
2597 /* Use the same condition as vect_transform_loop to decide when to use
2598 the cost to determine a versioning threshold. */
2599 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2600 && ordered_p (th, niters_th))
2601 niters_th = ordered_max (poly_uint64 (th), niters_th);
2602
2603 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2604 }
2605
2606 gcc_assert (known_eq (vectorization_factor,
2607 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2608
2609 /* Ok to vectorize! */
2610 return opt_result::success ();
2611
2612 again:
2613 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2614 gcc_assert (!ok);
2615
2616 /* Try again with SLP forced off, but if we didn't do any SLP there is
2617 no point in re-trying. */
2618 if (!slp)
2619 return ok;
2620
2621 /* If there are reduction chains re-trying will fail anyway. */
2622 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2623 return ok;
2624
2625 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2626 via interleaving or lane instructions. */
2627 slp_instance instance;
2628 slp_tree node;
2629 unsigned i, j;
2630 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2631 {
2632 stmt_vec_info vinfo;
2633 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2634 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2635 continue;
2636 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2637 unsigned int size = DR_GROUP_SIZE (vinfo);
2638 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2639 if (! vect_store_lanes_supported (vectype, size, false)
2640 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2641 && ! vect_grouped_store_supported (vectype, size))
2642 return opt_result::failure_at (vinfo->stmt,
2643 "unsupported grouped store\n");
2644 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2645 {
2646 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2647 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2648 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2649 size = DR_GROUP_SIZE (vinfo);
2650 vectype = STMT_VINFO_VECTYPE (vinfo);
2651 if (! vect_load_lanes_supported (vectype, size, false)
2652 && ! vect_grouped_load_supported (vectype, single_element_p,
2653 size))
2654 return opt_result::failure_at (vinfo->stmt,
2655 "unsupported grouped load\n");
2656 }
2657 }
2658
2659 if (dump_enabled_p ())
2660 dump_printf_loc (MSG_NOTE, vect_location,
2661 "re-trying with SLP disabled\n");
2662
2663 /* Roll back state appropriately. No SLP this time. */
2664 slp = false;
2665 /* Restore the vectorization factor to what it was without SLP. */
2666 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2667 /* Free the SLP instances. */
2668 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2669 vect_free_slp_instance (instance);
2670 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2671 /* Reset SLP type to loop_vect on all stmts. */
2672 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2673 {
2674 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2675 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2676 !gsi_end_p (si); gsi_next (&si))
2677 {
2678 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2679 STMT_SLP_TYPE (stmt_info) = loop_vect;
2680 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2681 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2682 {
2683 /* vectorizable_reduction adjusts reduction stmt def-types,
2684 restore them to that of the PHI. */
2685 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2686 = STMT_VINFO_DEF_TYPE (stmt_info);
2687 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2688 (STMT_VINFO_REDUC_DEF (stmt_info)))
2689 = STMT_VINFO_DEF_TYPE (stmt_info);
2690 }
2691 }
2692 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2693 !gsi_end_p (si); gsi_next (&si))
2694 {
2695 if (is_gimple_debug (gsi_stmt (si)))
2696 continue;
2697 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2698 STMT_SLP_TYPE (stmt_info) = loop_vect;
2699 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2700 {
2701 stmt_vec_info pattern_stmt_info
2702 = STMT_VINFO_RELATED_STMT (stmt_info);
2703 if (STMT_VINFO_SLP_VECT_ONLY (pattern_stmt_info))
2704 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2705
2706 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2707 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2708 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2709 !gsi_end_p (pi); gsi_next (&pi))
2710 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2711 = loop_vect;
2712 }
2713 }
2714 }
2715 /* Free optimized alias test DDRS. */
2716 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2717 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2718 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2719 /* Reset target cost data. */
2720 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2721 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2722 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2723 /* Reset accumulated rgroup information. */
2724 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2725 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2726 /* Reset assorted flags. */
2727 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2728 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2729 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2730 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2731 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2732 = saved_can_use_partial_vectors_p;
2733
2734 goto start_over;
2735 }
2736
2737 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2738 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2739 OLD_LOOP_VINFO is better unless something specifically indicates
2740 otherwise.
2741
2742 Note that this deliberately isn't a partial order. */
2743
2744 static bool
2745 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2746 loop_vec_info old_loop_vinfo)
2747 {
2748 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2749 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2750
2751 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2752 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2753
2754 /* Always prefer a VF of loop->simdlen over any other VF. */
2755 if (loop->simdlen)
2756 {
2757 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2758 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2759 if (new_simdlen_p != old_simdlen_p)
2760 return new_simdlen_p;
2761 }
2762
2763 /* Limit the VFs to what is likely to be the maximum number of iterations,
2764 to handle cases in which at least one loop_vinfo is fully-masked. */
2765 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2766 if (estimated_max_niter != -1)
2767 {
2768 if (known_le (estimated_max_niter, new_vf))
2769 new_vf = estimated_max_niter;
2770 if (known_le (estimated_max_niter, old_vf))
2771 old_vf = estimated_max_niter;
2772 }
2773
2774 /* Check whether the (fractional) cost per scalar iteration is lower
2775 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
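/* The division is avoided by cross-multiplying: e.g. a new body cost of
10 at VF 8 against an old body cost of 6 at VF 4 gives rel_new = 10 * 4
= 40 and rel_old = 6 * 8 = 48, so the new loop_vinfo is cheaper per
scalar iteration (10/8 < 6/4). */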
2776 poly_int64 rel_new = new_loop_vinfo->vec_inside_cost * old_vf;
2777 poly_int64 rel_old = old_loop_vinfo->vec_inside_cost * new_vf;
2778
2779 HOST_WIDE_INT est_rel_new_min
2780 = estimated_poly_value (rel_new, POLY_VALUE_MIN);
2781 HOST_WIDE_INT est_rel_new_max
2782 = estimated_poly_value (rel_new, POLY_VALUE_MAX);
2783
2784 HOST_WIDE_INT est_rel_old_min
2785 = estimated_poly_value (rel_old, POLY_VALUE_MIN);
2786 HOST_WIDE_INT est_rel_old_max
2787 = estimated_poly_value (rel_old, POLY_VALUE_MAX);
2788
2789 /* Check first if we can make out an unambiguous total order from the minimum
2790 and maximum estimates. */
2791 if (est_rel_new_min < est_rel_old_min
2792 && est_rel_new_max < est_rel_old_max)
2793 return true;
2794 else if (est_rel_old_min < est_rel_new_min
2795 && est_rel_old_max < est_rel_new_max)
2796 return false;
2797 /* When old_loop_vinfo uses a variable vectorization factor,
2798 we know that it has a lower cost for at least one runtime VF.
2799 However, we don't know how likely that VF is.
2800
2801 One option would be to compare the costs for the estimated VFs.
2802 The problem is that that can put too much pressure on the cost
2803 model. E.g. if the estimated VF is also the lowest possible VF,
2804 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2805 for the estimated VF, we'd then choose new_loop_vinfo even
2806 though (a) new_loop_vinfo might not actually be better than
2807 old_loop_vinfo for that VF and (b) it would be significantly
2808 worse at larger VFs.
2809
2810 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2811 no more expensive than old_loop_vinfo even after doubling the
2812 estimated old_loop_vinfo VF. For all but trivial loops, this
2813 ensures that we only pick new_loop_vinfo if it is significantly
2814 better than old_loop_vinfo at the estimated VF. */
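/* E.g. likely relative costs of est_rel_new_likely = 30 vs.
est_rel_old_likely = 50 keep the old loop_vinfo, since 30 * 2 > 50;
only a value of 25 or less would switch to the new one. */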
2815
2816 if (est_rel_old_min != est_rel_new_min
2817 || est_rel_old_max != est_rel_new_max)
2818 {
2819 HOST_WIDE_INT est_rel_new_likely
2820 = estimated_poly_value (rel_new, POLY_VALUE_LIKELY);
2821 HOST_WIDE_INT est_rel_old_likely
2822 = estimated_poly_value (rel_old, POLY_VALUE_LIKELY);
2823
2824 return est_rel_new_likely * 2 <= est_rel_old_likely;
2825 }
2826
2827 /* If there's nothing to choose between the loop bodies, see whether
2828 there's a difference in the prologue and epilogue costs. */
2829 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2830 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2831
2832 return false;
2833 }
2834
2835 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2836 true if we should. */
2837
2838 static bool
2839 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2840 loop_vec_info old_loop_vinfo)
2841 {
2842 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2843 return false;
2844
2845 if (dump_enabled_p ())
2846 dump_printf_loc (MSG_NOTE, vect_location,
2847 "***** Preferring vector mode %s to vector mode %s\n",
2848 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2849 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2850 return true;
2851 }
2852
2853 /* If LOOP_VINFO is already a main loop, return it unmodified. Otherwise
2854 try to reanalyze it as a main loop. Return the loop_vinfo on success
2855 and null on failure. */
2856
2857 static loop_vec_info
2858 vect_reanalyze_as_main_loop (loop_vec_info loop_vinfo, unsigned int *n_stmts)
2859 {
2860 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2861 return loop_vinfo;
2862
2863 if (dump_enabled_p ())
2864 dump_printf_loc (MSG_NOTE, vect_location,
2865 "***** Reanalyzing as a main loop with vector mode %s\n",
2866 GET_MODE_NAME (loop_vinfo->vector_mode));
2867
2868 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2869 vec_info_shared *shared = loop_vinfo->shared;
2870 opt_loop_vec_info main_loop_vinfo = vect_analyze_loop_form (loop, shared);
2871 gcc_assert (main_loop_vinfo);
2872
2873 main_loop_vinfo->vector_mode = loop_vinfo->vector_mode;
2874
2875 bool fatal = false;
2876 bool res = vect_analyze_loop_2 (main_loop_vinfo, fatal, n_stmts);
2877 loop->aux = NULL;
2878 if (!res)
2879 {
2880 if (dump_enabled_p ())
2881 dump_printf_loc (MSG_NOTE, vect_location,
2882 "***** Failed to analyze main loop with vector"
2883 " mode %s\n",
2884 GET_MODE_NAME (loop_vinfo->vector_mode));
2885 delete main_loop_vinfo;
2886 return NULL;
2887 }
2888 LOOP_VINFO_VECTORIZABLE_P (main_loop_vinfo) = 1;
2889 return main_loop_vinfo;
2890 }
2891
2892 /* Function vect_analyze_loop.
2893
2894 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2895 for it. The different analyses will record information in the
2896 loop_vec_info struct. */
2897 opt_loop_vec_info
2898 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2899 {
2900 auto_vector_modes vector_modes;
2901
2902 /* Autodetect the first vector mode we try. */
2903 unsigned int autovec_flags
2904 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2905 loop->simdlen != 0);
2906 unsigned int mode_i = 0;
2907
2908 DUMP_VECT_SCOPE ("analyze_loop_nest");
2909
2910 if (loop_outer (loop)
2911 && loop_vec_info_for_loop (loop_outer (loop))
2912 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2913 return opt_loop_vec_info::failure_at (vect_location,
2914 "outer-loop already vectorized.\n");
2915
2916 if (!find_loop_nest (loop, &shared->loop_nest))
2917 return opt_loop_vec_info::failure_at
2918 (vect_location,
2919 "not vectorized: loop nest containing two or more consecutive inner"
2920 " loops cannot be vectorized\n");
2921
2922 unsigned n_stmts = 0;
2923 machine_mode autodetected_vector_mode = VOIDmode;
2924 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2925 machine_mode next_vector_mode = VOIDmode;
2926 poly_uint64 lowest_th = 0;
2927 unsigned vectorized_loops = 0;
2928 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2929 && !unlimited_cost_model (loop));
2930
2931 bool vect_epilogues = false;
2932 opt_result res = opt_result::success ();
2933 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2934 while (1)
2935 {
2936 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2937 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2938 if (!loop_vinfo)
2939 {
2940 if (dump_enabled_p ())
2941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2942 "bad loop form.\n");
2943 gcc_checking_assert (first_loop_vinfo == NULL);
2944 return loop_vinfo;
2945 }
2946 loop_vinfo->vector_mode = next_vector_mode;
2947
2948 bool fatal = false;
2949
2950 /* When pick_lowest_cost_p is true, we should in principle iterate
2951 over all the loop_vec_infos that LOOP_VINFO could replace and
2952 try to vectorize LOOP_VINFO under the same conditions.
2953 E.g. when trying to replace an epilogue loop, we should vectorize
2954 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2955 to replace the main loop, we should vectorize LOOP_VINFO as a main
2956 loop too.
2957
2958 However, autovectorize_vector_modes is usually sorted as follows:
2959
2960 - Modes that naturally produce lower VFs usually follow modes that
2961 naturally produce higher VFs.
2962
2963 - When modes naturally produce the same VF, maskable modes
2964 usually follow unmaskable ones, so that the maskable mode
2965 can be used to vectorize the epilogue of the unmaskable mode.
2966
2967 This order is preferred because it leads to the maximum
2968 epilogue vectorization opportunities. Targets should only use
2969 a different order if they want to make wide modes available while
2970 disparaging them relative to earlier, smaller modes. The assumption
2971 in that case is that the wider modes are more expensive in some
2972 way that isn't reflected directly in the costs.
2973
2974 There should therefore be few interesting cases in which
2975 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2976 treated as a standalone loop, and ends up being genuinely cheaper
2977 than FIRST_LOOP_VINFO. */
2978 if (vect_epilogues)
2979 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2980
2981 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2982 if (mode_i == 0)
2983 autodetected_vector_mode = loop_vinfo->vector_mode;
2984 if (dump_enabled_p ())
2985 {
2986 if (res)
2987 dump_printf_loc (MSG_NOTE, vect_location,
2988 "***** Analysis succeeded with vector mode %s\n",
2989 GET_MODE_NAME (loop_vinfo->vector_mode));
2990 else
2991 dump_printf_loc (MSG_NOTE, vect_location,
2992 "***** Analysis failed with vector mode %s\n",
2993 GET_MODE_NAME (loop_vinfo->vector_mode));
2994 }
2995
2996 loop->aux = NULL;
2997
2998 if (!fatal)
2999 while (mode_i < vector_modes.length ()
3000 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
3001 {
3002 if (dump_enabled_p ())
3003 dump_printf_loc (MSG_NOTE, vect_location,
3004 "***** The result for vector mode %s would"
3005 " be the same\n",
3006 GET_MODE_NAME (vector_modes[mode_i]));
3007 mode_i += 1;
3008 }
3009
3010 if (res)
3011 {
3012 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3013 vectorized_loops++;
3014
3015 /* Once we hit the desired simdlen for the first time,
3016 discard any previous attempts. */
3017 if (simdlen
3018 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3019 {
3020 delete first_loop_vinfo;
3021 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3022 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
3023 simdlen = 0;
3024 }
3025 else if (pick_lowest_cost_p && first_loop_vinfo)
3026 {
3027 /* Keep trying to roll back vectorization attempts while the
3028 loop_vec_infos they produced were worse than this one. */
3029 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3030 while (!vinfos.is_empty ()
3031 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3032 {
3033 gcc_assert (vect_epilogues);
3034 delete vinfos.pop ();
3035 }
3036 if (vinfos.is_empty ()
3037 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3038 {
3039 loop_vec_info main_loop_vinfo
3040 = vect_reanalyze_as_main_loop (loop_vinfo, &n_stmts);
3041 if (main_loop_vinfo == loop_vinfo)
3042 {
3043 delete first_loop_vinfo;
3044 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3045 }
3046 else if (main_loop_vinfo
3047 && vect_joust_loop_vinfos (main_loop_vinfo,
3048 first_loop_vinfo))
3049 {
3050 delete first_loop_vinfo;
3051 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3052 delete loop_vinfo;
3053 loop_vinfo
3054 = opt_loop_vec_info::success (main_loop_vinfo);
3055 }
3056 else
3057 delete main_loop_vinfo;
3058 }
3059 }
3060
3061 if (first_loop_vinfo == NULL)
3062 {
3063 first_loop_vinfo = loop_vinfo;
3064 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3065 }
3066 else if (vect_epilogues
3067 /* For now only allow one epilogue loop. */
3068 && first_loop_vinfo->epilogue_vinfos.is_empty ())
3069 {
3070 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3071 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3072 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3073 || maybe_ne (lowest_th, 0U));
3074 /* Keep track of the known smallest versioning
3075 threshold. */
3076 if (ordered_p (lowest_th, th))
3077 lowest_th = ordered_min (lowest_th, th);
3078 }
3079 else
3080 {
3081 delete loop_vinfo;
3082 loop_vinfo = opt_loop_vec_info::success (NULL);
3083 }
3084
3085 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3086 enabled, SIMDUID is not set, it is the innermost loop and we have
3087 either already found the loop's SIMDLEN or there was no SIMDLEN to
3088 begin with.
3089 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3090 vect_epilogues = (!simdlen
3091 && loop->inner == NULL
3092 && param_vect_epilogues_nomask
3093 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3094 && !loop->simduid
3095 /* For now only allow one epilogue loop, but allow
3096 pick_lowest_cost_p to replace it. */
3097 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
3098 || pick_lowest_cost_p));
3099
3100 /* Commit to first_loop_vinfo if we have no reason to try
3101 alternatives. */
3102 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
3103 break;
3104 }
3105 else
3106 {
3107 delete loop_vinfo;
3108 loop_vinfo = opt_loop_vec_info::success (NULL);
3109 if (fatal)
3110 {
3111 gcc_checking_assert (first_loop_vinfo == NULL);
3112 break;
3113 }
3114 }
3115
3116 /* Handle the case in which the original loop can use partial
3117 vectorization, but we only want to adopt it for the epilogue.
3118 The retry should use the same vector mode as the original. */
3119 if (vect_epilogues
3120 && loop_vinfo
3121 && LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo))
3122 {
3123 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3124 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo));
3125 if (dump_enabled_p ())
3126 dump_printf_loc (MSG_NOTE, vect_location,
3127 "***** Re-trying analysis with same vector mode"
3128 " %s for epilogue with partial vectors.\n",
3129 GET_MODE_NAME (loop_vinfo->vector_mode));
3130 continue;
3131 }
3132
3133 if (mode_i < vector_modes.length ()
3134 && VECTOR_MODE_P (autodetected_vector_mode)
3135 && (related_vector_mode (vector_modes[mode_i],
3136 GET_MODE_INNER (autodetected_vector_mode))
3137 == autodetected_vector_mode)
3138 && (related_vector_mode (autodetected_vector_mode,
3139 GET_MODE_INNER (vector_modes[mode_i]))
3140 == vector_modes[mode_i]))
3141 {
3142 if (dump_enabled_p ())
3143 dump_printf_loc (MSG_NOTE, vect_location,
3144 "***** Skipping vector mode %s, which would"
3145 " repeat the analysis for %s\n",
3146 GET_MODE_NAME (vector_modes[mode_i]),
3147 GET_MODE_NAME (autodetected_vector_mode));
3148 mode_i += 1;
3149 }
3150
3151 if (mode_i == vector_modes.length ()
3152 || autodetected_vector_mode == VOIDmode)
3153 break;
3154
3155 /* Try the next vector mode in the list. */
3156 next_vector_mode = vector_modes[mode_i++];
3157 if (dump_enabled_p ())
3158 dump_printf_loc (MSG_NOTE, vect_location,
3159 "***** Re-trying analysis with vector mode %s\n",
3160 GET_MODE_NAME (next_vector_mode));
3161 }
3162
3163 if (first_loop_vinfo)
3164 {
3165 loop->aux = (loop_vec_info) first_loop_vinfo;
3166 if (dump_enabled_p ())
3167 dump_printf_loc (MSG_NOTE, vect_location,
3168 "***** Choosing vector mode %s\n",
3169 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3170 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3171 return first_loop_vinfo;
3172 }
3173
3174 return opt_loop_vec_info::propagate_failure (res);
3175 }
3176
3177 /* Return true if there is an in-order reduction function for CODE, storing
3178 it in *REDUC_FN if so. */
3179
3180 static bool
3181 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3182 {
3183 switch (code)
3184 {
3185 case PLUS_EXPR:
3186 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3187 return true;
3188
3189 default:
3190 return false;
3191 }
3192 }
3193
3194 /* Function reduction_fn_for_scalar_code
3195
3196 Input:
3197 CODE - the tree_code of a reduction operation.
3198
3199 Output:
3200 REDUC_FN - the corresponding internal function to be used to reduce the
3201 vector of partial results into a single scalar result, or IFN_LAST
3202 if the operation is a supported reduction operation, but does not have
3203 such an internal function.
3204
3205 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3206
3207 static bool
3208 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3209 {
3210 switch (code)
3211 {
3212 case MAX_EXPR:
3213 *reduc_fn = IFN_REDUC_MAX;
3214 return true;
3215
3216 case MIN_EXPR:
3217 *reduc_fn = IFN_REDUC_MIN;
3218 return true;
3219
3220 case PLUS_EXPR:
3221 *reduc_fn = IFN_REDUC_PLUS;
3222 return true;
3223
3224 case BIT_AND_EXPR:
3225 *reduc_fn = IFN_REDUC_AND;
3226 return true;
3227
3228 case BIT_IOR_EXPR:
3229 *reduc_fn = IFN_REDUC_IOR;
3230 return true;
3231
3232 case BIT_XOR_EXPR:
3233 *reduc_fn = IFN_REDUC_XOR;
3234 return true;
3235
3236 case MULT_EXPR:
3237 case MINUS_EXPR:
3238 *reduc_fn = IFN_LAST;
3239 return true;
3240
3241 default:
3242 return false;
3243 }
3244 }
3245
3246 /* If there is a neutral value X such that SLP reduction NODE would not
3247 be affected by the introduction of additional X elements, return that X,
3248 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
3249 is the vector type that would hold element X. REDUC_CHAIN is true if
3250 the SLP statements perform a single reduction, false if each statement
3251 performs an independent reduction. */
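/* For instance, 0 is neutral for addition and 1 for multiplication:
padding the vector with extra neutral elements leaves the reduction
result unchanged. */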
3252
3253 static tree
3254 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
3255 tree_code code, bool reduc_chain)
3256 {
3257 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3258 stmt_vec_info stmt_vinfo = stmts[0];
3259 tree scalar_type = TREE_TYPE (vector_type);
3260 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
3261 gcc_assert (loop);
3262
3263 switch (code)
3264 {
3265 case WIDEN_SUM_EXPR:
3266 case DOT_PROD_EXPR:
3267 case SAD_EXPR:
3268 case PLUS_EXPR:
3269 case MINUS_EXPR:
3270 case BIT_IOR_EXPR:
3271 case BIT_XOR_EXPR:
3272 return build_zero_cst (scalar_type);
3273
3274 case MULT_EXPR:
3275 return build_one_cst (scalar_type);
3276
3277 case BIT_AND_EXPR:
3278 return build_all_ones_cst (scalar_type);
3279
3280 case MAX_EXPR:
3281 case MIN_EXPR:
3282 /* For MIN/MAX the initial values are neutral. A reduction chain
3283 has only a single initial value, so that value is neutral for
3284 all statements. */
3285 if (reduc_chain)
3286 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
3287 loop_preheader_edge (loop));
3288 return NULL_TREE;
3289
3290 default:
3291 return NULL_TREE;
3292 }
3293 }
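
/* Illustrative sketch only: with four-element vectors and an SLP group of
   two independent PLUS_EXPR reductions s1 and s2, a partial group can be
   padded with the neutral value 0, e.g. { s1_init, s2_init, 0, 0 }, without
   changing either sum.  Likewise 1 is neutral for MULT_EXPR and all-ones
   for BIT_AND_EXPR, while for MIN/MAX only a reduction chain's single
   initial value is known to be safe padding, as returned above.  */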
3294
3295 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3296 STMT is printed with a message MSG. */
3297
3298 static void
3299 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3300 {
3301 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3302 }
3303
3304 /* Return true if we need an in-order (fold-left) reduction for operation
3305 CODE on type TYPE, i.e. if the reduction must be computed in the
3306 original scalar order. */
3307
3308 bool
3309 needs_fold_left_reduction_p (tree type, tree_code code)
3310 {
3311 /* CHECKME: check for !flag_finite_math_only too? */
3312 if (SCALAR_FLOAT_TYPE_P (type))
3313 switch (code)
3314 {
3315 case MIN_EXPR:
3316 case MAX_EXPR:
3317 return false;
3318
3319 default:
3320 return !flag_associative_math;
3321 }
3322
3323 if (INTEGRAL_TYPE_P (type))
3324 {
3325 if (!operation_no_trapping_overflow (type, code))
3326 return true;
3327 return false;
3328 }
3329
3330 if (SAT_FIXED_POINT_TYPE_P (type))
3331 return true;
3332
3333 return false;
3334 }
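
/* Illustrative sketch only: for a floating-point sum such as

     double s = 0.0;
     for (i = 0; i < n; i++)
       s = s + a[i];

   compiled without -fassociative-math the additions may not be
   reassociated, so this function returns true and the reduction has to
   be vectorized in-order (e.g. via IFN_FOLD_LEFT_PLUS, see
   fold_left_reduction_fn above) instead of using independent partial
   accumulators.  */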
3335
3336 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3337 has a handled computation expression. Store the main reduction
3338 operation in *CODE. */
3339
3340 static bool
3341 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3342 tree loop_arg, enum tree_code *code,
3343 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3344 {
3345 auto_bitmap visited;
3346 tree lookfor = PHI_RESULT (phi);
3347 ssa_op_iter curri;
3348 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3349 while (USE_FROM_PTR (curr) != loop_arg)
3350 curr = op_iter_next_use (&curri);
3351 curri.i = curri.numops;
3352 do
3353 {
3354 path.safe_push (std::make_pair (curri, curr));
3355 tree use = USE_FROM_PTR (curr);
3356 if (use == lookfor)
3357 break;
3358 gimple *def = SSA_NAME_DEF_STMT (use);
3359 if (gimple_nop_p (def)
3360 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3361 {
3362 pop:
3363 do
3364 {
3365 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3366 curri = x.first;
3367 curr = x.second;
3368 do
3369 curr = op_iter_next_use (&curri);
3370 /* Skip already visited or non-SSA operands (from iterating
3371 over PHI args). */
3372 while (curr != NULL_USE_OPERAND_P
3373 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3374 || ! bitmap_set_bit (visited,
3375 SSA_NAME_VERSION
3376 (USE_FROM_PTR (curr)))));
3377 }
3378 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3379 if (curr == NULL_USE_OPERAND_P)
3380 break;
3381 }
3382 else
3383 {
3384 if (gimple_code (def) == GIMPLE_PHI)
3385 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3386 else
3387 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3388 while (curr != NULL_USE_OPERAND_P
3389 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3390 || ! bitmap_set_bit (visited,
3391 SSA_NAME_VERSION
3392 (USE_FROM_PTR (curr)))))
3393 curr = op_iter_next_use (&curri);
3394 if (curr == NULL_USE_OPERAND_P)
3395 goto pop;
3396 }
3397 }
3398 while (1);
3399 if (dump_file && (dump_flags & TDF_DETAILS))
3400 {
3401 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3402 unsigned i;
3403 std::pair<ssa_op_iter, use_operand_p> *x;
3404 FOR_EACH_VEC_ELT (path, i, x)
3405 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3406 dump_printf (MSG_NOTE, "\n");
3407 }
3408
3409 /* Check whether the reduction path detected is valid. */
3410 bool fail = path.length () == 0;
3411 bool neg = false;
3412 int sign = -1;
3413 *code = ERROR_MARK;
3414 for (unsigned i = 1; i < path.length (); ++i)
3415 {
3416 gimple *use_stmt = USE_STMT (path[i].second);
3417 tree op = USE_FROM_PTR (path[i].second);
3418 if (! is_gimple_assign (use_stmt)
3419 /* The following makes sure we can compute the operand index
3420 easily, plus it mostly disallows chaining via COND_EXPR condition
3421 operands. */
3422 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3423 && (gimple_num_ops (use_stmt) <= 2
3424 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3425 && (gimple_num_ops (use_stmt) <= 3
3426 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3427 {
3428 fail = true;
3429 break;
3430 }
3431 /* Check there's only a single stmt the op is used on. For the
3432 non-value-changing tail and the last stmt, allow out-of-loop uses.
3433 ??? We could relax this and handle arbitrary live stmts by
3434 forcing a scalar epilogue for example. */
3435 imm_use_iterator imm_iter;
3436 gimple *op_use_stmt;
3437 unsigned cnt = 0;
3438 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3439 if (!is_gimple_debug (op_use_stmt)
3440 && (*code != ERROR_MARK
3441 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3442 {
3443 /* We want to allow x + x but not x < 1 ? x : 2. */
3444 if (is_gimple_assign (op_use_stmt)
3445 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3446 {
3447 use_operand_p use_p;
3448 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3449 cnt++;
3450 }
3451 else
3452 cnt++;
3453 }
3454 if (cnt != 1)
3455 {
3456 fail = true;
3457 break;
3458 }
3459 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3460 if (use_code == MINUS_EXPR)
3461 {
3462 use_code = PLUS_EXPR;
3463 /* Track whether we negate the reduction value each iteration. */
3464 if (gimple_assign_rhs2 (use_stmt) == op)
3465 neg = ! neg;
3466 }
3467 if (CONVERT_EXPR_CODE_P (use_code)
3468 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3469 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3470 ;
3471 else if (*code == ERROR_MARK)
3472 {
3473 *code = use_code;
3474 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3475 }
3476 else if (use_code != *code)
3477 {
3478 fail = true;
3479 break;
3480 }
3481 else if ((use_code == MIN_EXPR
3482 || use_code == MAX_EXPR)
3483 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3484 {
3485 fail = true;
3486 break;
3487 }
3488 }
3489 return ! fail && ! neg && *code != ERROR_MARK;
3490 }
3491
3492 bool
3493 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3494 tree loop_arg, enum tree_code code)
3495 {
3496 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3497 enum tree_code code_;
3498 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3499 && code_ == code);
3500 }
3501
3502
3503
3504 /* Function vect_is_simple_reduction
3505
3506 (1) Detect a cross-iteration def-use cycle that represents a simple
3507 reduction computation. We look for the following pattern:
3508
3509 loop_header:
3510 a1 = phi < a0, a2 >
3511 a3 = ...
3512 a2 = operation (a3, a1)
3513
3514 or
3515
3516 a3 = ...
3517 loop_header:
3518 a1 = phi < a0, a2 >
3519 a2 = operation (a3, a1)
3520
3521 such that:
3522 1. operation is commutative and associative and it is safe to
3523 change the order of the computation
3524 2. no uses for a2 in the loop (a2 is used out of the loop)
3525 3. no uses of a1 in the loop besides the reduction operation
3526 4. no uses of a1 outside the loop.
3527
3528 Conditions 1,4 are tested here.
3529 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3530
3531 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3532 nested cycles.
3533
3534 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3535 reductions:
3536
3537 a1 = phi < a0, a2 >
3538 inner loop (def of a3)
3539 a2 = phi < a3 >
3540
3541 (4) Detect condition expressions, i.e.:
3542 for (int i = 0; i < N; i++)
3543 if (a[i] < val)
3544 ret_val = a[i];
3545
3546 */
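
/* Illustrative sketch only: pattern (1) above corresponds to source code
   such as

     s = s0;
     for (i = 0; i < n; i++)
       s = s + a[i];

   where a1 is the loop PHI of s, a3 computes a[i] and a2 = a3 + a1 is the
   reduction statement whose value is carried to the next iteration and
   used after the loop.  */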
3547
3548 static stmt_vec_info
3549 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3550 bool *double_reduc, bool *reduc_chain_p)
3551 {
3552 gphi *phi = as_a <gphi *> (phi_info->stmt);
3553 gimple *phi_use_stmt = NULL;
3554 imm_use_iterator imm_iter;
3555 use_operand_p use_p;
3556
3557 *double_reduc = false;
3558 *reduc_chain_p = false;
3559 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3560
3561 tree phi_name = PHI_RESULT (phi);
3562 /* ??? If there are no uses of the PHI result the inner loop reduction
3563 won't be detected as possibly double-reduction by vectorizable_reduction
3564 because that tries to walk the PHI arg from the preheader edge which
3565 can be constant. See PR60382. */
3566 if (has_zero_uses (phi_name))
3567 return NULL;
3568 class loop *loop = (gimple_bb (phi))->loop_father;
3569 unsigned nphi_def_loop_uses = 0;
3570 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3571 {
3572 gimple *use_stmt = USE_STMT (use_p);
3573 if (is_gimple_debug (use_stmt))
3574 continue;
3575
3576 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3577 {
3578 if (dump_enabled_p ())
3579 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3580 "intermediate value used outside loop.\n");
3581
3582 return NULL;
3583 }
3584
3585 nphi_def_loop_uses++;
3586 phi_use_stmt = use_stmt;
3587 }
3588
3589 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3590 if (TREE_CODE (latch_def) != SSA_NAME)
3591 {
3592 if (dump_enabled_p ())
3593 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3594 "reduction: not ssa_name: %T\n", latch_def);
3595 return NULL;
3596 }
3597
3598 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3599 if (!def_stmt_info
3600 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3601 return NULL;
3602
3603 bool nested_in_vect_loop
3604 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3605 unsigned nlatch_def_loop_uses = 0;
3606 auto_vec<gphi *, 3> lcphis;
3607 bool inner_loop_of_double_reduc = false;
3608 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3609 {
3610 gimple *use_stmt = USE_STMT (use_p);
3611 if (is_gimple_debug (use_stmt))
3612 continue;
3613 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3614 nlatch_def_loop_uses++;
3615 else
3616 {
3617 /* We can have more than one loop-closed PHI. */
3618 lcphis.safe_push (as_a <gphi *> (use_stmt));
3619 if (nested_in_vect_loop
3620 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3621 == vect_double_reduction_def))
3622 inner_loop_of_double_reduc = true;
3623 }
3624 }
3625
3626 /* If we are vectorizing an inner reduction, we execute it in the
3627 original order only when we are not dealing with a
3628 double reduction. */
3629 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3630 {
3631 if (dump_enabled_p ())
3632 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3633 "detected nested cycle: ");
3634 return def_stmt_info;
3635 }
3636
3637 /* If this isn't a nested cycle or if the nested cycle reduction value
3638 is used outside of the inner loop, we cannot handle uses of the reduction
3639 value. */
3640 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3641 {
3642 if (dump_enabled_p ())
3643 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3644 "reduction used in loop.\n");
3645 return NULL;
3646 }
3647
3648 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3649 defined in the inner loop. */
3650 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3651 {
3652 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3653 if (gimple_phi_num_args (def_stmt) != 1
3654 || TREE_CODE (op1) != SSA_NAME)
3655 {
3656 if (dump_enabled_p ())
3657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3658 "unsupported phi node definition.\n");
3659
3660 return NULL;
3661 }
3662
3663 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3664 if (gimple_bb (def1)
3665 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3666 && loop->inner
3667 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3668 && is_gimple_assign (def1)
3669 && is_a <gphi *> (phi_use_stmt)
3670 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3671 {
3672 if (dump_enabled_p ())
3673 report_vect_op (MSG_NOTE, def_stmt,
3674 "detected double reduction: ");
3675
3676 *double_reduc = true;
3677 return def_stmt_info;
3678 }
3679
3680 return NULL;
3681 }
3682
3683 /* Look for the expression computing latch_def from the loop PHI result. */
3684 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3685 enum tree_code code;
3686 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3687 path))
3688 {
3689 STMT_VINFO_REDUC_CODE (phi_info) = code;
3690 if (code == COND_EXPR && !nested_in_vect_loop)
3691 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3692
3693 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3694 reduction chain for which the additional restriction is that
3695 all operations in the chain are the same. */
3696 auto_vec<stmt_vec_info, 8> reduc_chain;
3697 unsigned i;
3698 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3699 for (i = path.length () - 1; i >= 1; --i)
3700 {
3701 gimple *stmt = USE_STMT (path[i].second);
3702 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3703 STMT_VINFO_REDUC_IDX (stmt_info)
3704 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3705 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3706 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3707 && (i == 1 || i == path.length () - 1));
3708 if ((stmt_code != code && !leading_conversion)
3709 /* We can only handle the final value in epilogue
3710 generation for reduction chains. */
3711 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3712 is_slp_reduc = false;
3713 /* For reduction chains we support trailing/leading
3714 conversions. We do not store those in the actual chain. */
3715 if (leading_conversion)
3716 continue;
3717 reduc_chain.safe_push (stmt_info);
3718 }
3719 if (is_slp_reduc && reduc_chain.length () > 1)
3720 {
3721 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3722 {
3723 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3724 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3725 }
3726 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3727 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3728
3729 /* Save the chain for further analysis in SLP detection. */
3730 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3731 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3732
3733 *reduc_chain_p = true;
3734 if (dump_enabled_p ())
3735 dump_printf_loc (MSG_NOTE, vect_location,
3736 "reduction: detected reduction chain\n");
3737 }
3738 else if (dump_enabled_p ())
3739 dump_printf_loc (MSG_NOTE, vect_location,
3740 "reduction: detected reduction\n");
3741
3742 return def_stmt_info;
3743 }
3744
3745 if (dump_enabled_p ())
3746 dump_printf_loc (MSG_NOTE, vect_location,
3747 "reduction: unknown pattern\n");
3748
3749 return NULL;
3750 }
3751
3752 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3753 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3754 or -1 if not known. */
3755
3756 static int
3757 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3758 {
3759 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3760 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3761 {
3762 if (dump_enabled_p ())
3763 dump_printf_loc (MSG_NOTE, vect_location,
3764 "cost model: epilogue peel iters set to vf/2 "
3765 "because loop iterations are unknown.\n");
3766 return assumed_vf / 2;
3767 }
3768 else
3769 {
3770 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3771 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3772 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3773 /* If we need to peel for gaps but the epilogue would otherwise need
3774 no iterations, we have to peel VF iterations. */
3775 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3776 peel_iters_epilogue = assumed_vf;
3777 return peel_iters_epilogue;
3778 }
3779 }
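
/* Illustrative sketch only, with made-up numbers: if the scalar loop is
   known to run 23 iterations, the assumed VF is 8 and 3 prologue
   iterations are peeled, the epilogue gets (23 - 3) % 8 = 4 iterations;
   if peeling for gaps were required and that remainder were 0, a full
   VF (8) iterations would be peeled instead.  */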
3780
3781 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3782 int
3783 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3784 int *peel_iters_epilogue,
3785 stmt_vector_for_cost *scalar_cost_vec,
3786 stmt_vector_for_cost *prologue_cost_vec,
3787 stmt_vector_for_cost *epilogue_cost_vec)
3788 {
3789 int retval = 0;
3790
3791 *peel_iters_epilogue
3792 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3793
3794 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3795 {
3796 /* If peeled iterations are known but the number of scalar loop
3797 iterations is unknown, count a taken branch per peeled loop. */
3798 if (peel_iters_prologue > 0)
3799 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3800 NULL, NULL_TREE, 0, vect_prologue);
3801 if (*peel_iters_epilogue > 0)
3802 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3803 NULL, NULL_TREE, 0, vect_epilogue);
3804 }
3805
3806 stmt_info_for_cost *si;
3807 int j;
3808 if (peel_iters_prologue)
3809 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3810 retval += record_stmt_cost (prologue_cost_vec,
3811 si->count * peel_iters_prologue,
3812 si->kind, si->stmt_info, si->misalign,
3813 vect_prologue);
3814 if (*peel_iters_epilogue)
3815 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3816 retval += record_stmt_cost (epilogue_cost_vec,
3817 si->count * *peel_iters_epilogue,
3818 si->kind, si->stmt_info, si->misalign,
3819 vect_epilogue);
3820
3821 return retval;
3822 }
3823
3824 /* Function vect_estimate_min_profitable_iters
3825
3826 Return the number of iterations required for the vector version of the
3827 loop to be profitable relative to the cost of the scalar version of the
3828 loop.
3829
3830 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3831 of iterations for vectorization. A value of -1 means loop vectorization
3832 is not profitable. This returned value may be used for a dynamic
3833 profitability check.
3834
3835 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3836 for static check against estimated number of iterations. */
3837
3838 static void
3839 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3840 int *ret_min_profitable_niters,
3841 int *ret_min_profitable_estimate)
3842 {
3843 int min_profitable_iters;
3844 int min_profitable_estimate;
3845 int peel_iters_prologue;
3846 int peel_iters_epilogue;
3847 unsigned vec_inside_cost = 0;
3848 int vec_outside_cost = 0;
3849 unsigned vec_prologue_cost = 0;
3850 unsigned vec_epilogue_cost = 0;
3851 int scalar_single_iter_cost = 0;
3852 int scalar_outside_cost = 0;
3853 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3854 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3855 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3856
3857 /* Cost model disabled. */
3858 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3859 {
3860 if (dump_enabled_p ())
3861 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3862 *ret_min_profitable_niters = 0;
3863 *ret_min_profitable_estimate = 0;
3864 return;
3865 }
3866
3867 /* Requires loop versioning tests to handle misalignment. */
3868 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3869 {
3870 /* FIXME: Make cost depend on complexity of individual check. */
3871 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3872 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3873 NULL, NULL_TREE, 0, vect_prologue);
3874 if (dump_enabled_p ())
3875 dump_printf (MSG_NOTE,
3876 "cost model: Adding cost of checks for loop "
3877 "versioning to treat misalignment.\n");
3878 }
3879
3880 /* Requires loop versioning with alias checks. */
3881 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3882 {
3883 /* FIXME: Make cost depend on complexity of individual check. */
3884 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3885 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3886 NULL, NULL_TREE, 0, vect_prologue);
3887 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3888 if (len)
3889 /* Count LEN - 1 ANDs and LEN comparisons. */
3890 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3891 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3892 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3893 if (len)
3894 {
3895 /* Count LEN - 1 ANDs and LEN comparisons. */
3896 unsigned int nstmts = len * 2 - 1;
3897 /* +1 for each bias that needs adding. */
3898 for (unsigned int i = 0; i < len; ++i)
3899 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3900 nstmts += 1;
3901 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3902 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3903 }
3904 if (dump_enabled_p ())
3905 dump_printf (MSG_NOTE,
3906 "cost model: Adding cost of checks for loop "
3907 "versioning aliasing.\n");
3908 }
3909
3910 /* Requires loop versioning with niter checks. */
3911 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3912 {
3913 /* FIXME: Make cost depend on complexity of individual check. */
3914 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3915 NULL, NULL_TREE, 0, vect_prologue);
3916 if (dump_enabled_p ())
3917 dump_printf (MSG_NOTE,
3918 "cost model: Adding cost of checks for loop "
3919 "versioning niters.\n");
3920 }
3921
3922 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3923 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3924 NULL, NULL_TREE, 0, vect_prologue);
3925
3926 /* Count statements in scalar loop. Using this as scalar cost for a single
3927 iteration for now.
3928
3929 TODO: Add outer loop support.
3930
3931 TODO: Consider assigning different costs to different scalar
3932 statements. */
3933
3934 scalar_single_iter_cost
3935 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3936
3937 /* Add additional cost for the peeled instructions in prologue and epilogue
3938 loop. (For fully-masked loops there will be no peeling.)
3939
3940 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3941 at compile time, we assume it's vf/2 (the worst case would be vf-1).
3942
3943 TODO: Build an expression that represents peel_iters for prologue and
3944 epilogue to be used in a run-time test. */
3945
3946 bool prologue_need_br_taken_cost = false;
3947 bool prologue_need_br_not_taken_cost = false;
3948
3949 /* Calculate peel_iters_prologue. */
3950 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3951 peel_iters_prologue = 0;
3952 else if (npeel < 0)
3953 {
3954 peel_iters_prologue = assumed_vf / 2;
3955 if (dump_enabled_p ())
3956 dump_printf (MSG_NOTE, "cost model: "
3957 "prologue peel iters set to vf/2.\n");
3958
3959 /* If peeled iterations are unknown, count a taken branch and a not taken
3960 branch per peeled loop. Even if scalar loop iterations are known,
3961 vector iterations are not known since peeled prologue iterations are
3962 not known. Hence guards remain the same. */
3963 prologue_need_br_taken_cost = true;
3964 prologue_need_br_not_taken_cost = true;
3965 }
3966 else
3967 {
3968 peel_iters_prologue = npeel;
3969 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3970 /* If peeled iterations are known but the number of scalar loop
3971 iterations is unknown, count a taken branch per peeled loop. */
3972 prologue_need_br_taken_cost = true;
3973 }
3974
3975 bool epilogue_need_br_taken_cost = false;
3976 bool epilogue_need_br_not_taken_cost = false;
3977
3978 /* Calculate peel_iters_epilogue. */
3979 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3980 /* We need to peel exactly one iteration for gaps. */
3981 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3982 else if (npeel < 0)
3983 {
3984 /* If peeling for alignment is unknown, the loop bound of the main
3985 loop becomes unknown. */
3986 peel_iters_epilogue = assumed_vf / 2;
3987 if (dump_enabled_p ())
3988 dump_printf (MSG_NOTE, "cost model: "
3989 "epilogue peel iters set to vf/2 because "
3990 "peeling for alignment is unknown.\n");
3991
3992 /* See the same reason above in peel_iters_prologue calculation. */
3993 epilogue_need_br_taken_cost = true;
3994 epilogue_need_br_not_taken_cost = true;
3995 }
3996 else
3997 {
3998 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3999 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4000 /* If peeled iterations are known but the number of scalar loop
4001 iterations is unknown, count a taken branch per peeled loop. */
4002 epilogue_need_br_taken_cost = true;
4003 }
4004
4005 stmt_info_for_cost *si;
4006 int j;
4007 /* Add costs associated with peel_iters_prologue. */
4008 if (peel_iters_prologue)
4009 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4010 {
4011 (void) add_stmt_cost (loop_vinfo, target_cost_data,
4012 si->count * peel_iters_prologue, si->kind,
4013 si->stmt_info, si->vectype, si->misalign,
4014 vect_prologue);
4015 }
4016
4017 /* Add costs associated with peel_iters_epilogue. */
4018 if (peel_iters_epilogue)
4019 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4020 {
4021 (void) add_stmt_cost (loop_vinfo, target_cost_data,
4022 si->count * peel_iters_epilogue, si->kind,
4023 si->stmt_info, si->vectype, si->misalign,
4024 vect_epilogue);
4025 }
4026
4027 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4028
4029 if (prologue_need_br_taken_cost)
4030 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4031 NULL, NULL_TREE, 0, vect_prologue);
4032
4033 if (prologue_need_br_not_taken_cost)
4034 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4035 cond_branch_not_taken, NULL, NULL_TREE, 0,
4036 vect_prologue);
4037
4038 if (epilogue_need_br_taken_cost)
4039 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
4040 NULL, NULL_TREE, 0, vect_epilogue);
4041
4042 if (epilogue_need_br_not_taken_cost)
4043 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1,
4044 cond_branch_not_taken, NULL, NULL_TREE, 0,
4045 vect_epilogue);
4046
4047 /* Take care of special costs for rgroup controls of partial vectors. */
4048 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
4049 {
4050 /* Calculate how many masks we need to generate. */
4051 unsigned int num_masks = 0;
4052 rgroup_controls *rgm;
4053 unsigned int num_vectors_m1;
4054 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
4055 if (rgm->type)
4056 num_masks += num_vectors_m1 + 1;
4057 gcc_assert (num_masks > 0);
4058
4059 /* In the worst case, we need to generate each mask in the prologue
4060 and in the loop body. One of the loop body mask instructions
4061 replaces the comparison in the scalar loop, and since we don't
4062 count the scalar comparison against the scalar body, we shouldn't
4063 count that vector instruction against the vector body either.
4064
4065 Sometimes we can use unpacks instead of generating prologue
4066 masks and sometimes the prologue mask will fold to a constant,
4067 so the actual prologue cost might be smaller. However, it's
4068 simpler and safer to use the worst-case cost; if this ends up
4069 being the tie-breaker between vectorizing or not, then it's
4070 probably better not to vectorize. */
4071 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks,
4072 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
4073 (void) add_stmt_cost (loop_vinfo, target_cost_data, num_masks - 1,
4074 vector_stmt, NULL, NULL_TREE, 0, vect_body);
4075 }
4076 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4077 {
4078 /* Referring to the functions vect_set_loop_condition_partial_vectors
4079 and vect_set_loop_controls_directly, we need to generate each
4080 length in the prologue and in the loop body if required. Although
4081 there are some possible optimizations, we consider the worst case
4082 here. */
4083
4084 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4085 bool need_iterate_p
4086 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4087 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4088
4089 /* Calculate how many statements to be added. */
4090 unsigned int prologue_stmts = 0;
4091 unsigned int body_stmts = 0;
4092
4093 rgroup_controls *rgc;
4094 unsigned int num_vectors_m1;
4095 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4096 if (rgc->type)
4097 {
4098 /* May need one SHIFT for nitems_total computation. */
4099 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4100 if (nitems != 1 && !niters_known_p)
4101 prologue_stmts += 1;
4102
4103 /* May need one MAX and one MINUS for wrap around. */
4104 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4105 prologue_stmts += 2;
4106
4107 /* Need one MAX and one MINUS for each batch limit except for
4108 the first one. */
4109 prologue_stmts += num_vectors_m1 * 2;
4110
4111 unsigned int num_vectors = num_vectors_m1 + 1;
4112
4113 /* Need to set up lengths in prologue, only one MIN required
4114 for each since start index is zero. */
4115 prologue_stmts += num_vectors;
4116
4117 /* Each may need two MINs and one MINUS to update lengths in body
4118 for next iteration. */
4119 if (need_iterate_p)
4120 body_stmts += 3 * num_vectors;
4121 }
4122
4123 (void) add_stmt_cost (loop_vinfo, target_cost_data, prologue_stmts,
4124 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4125 (void) add_stmt_cost (loop_vinfo, target_cost_data, body_stmts,
4126 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4127 }
4128
4129 /* FORNOW: The scalar outside cost is incremented in one of the
4130 following ways:
4131
4132 1. The vectorizer checks for alignment and aliasing and generates
4133 a condition that allows dynamic vectorization. A cost model
4134 check is ANDED with the versioning condition. Hence scalar code
4135 path now has the added cost of the versioning check.
4136
4137 if (cost > th & versioning_check)
4138 jmp to vector code
4139
4140 Hence run-time scalar is incremented by not-taken branch cost.
4141
4142 2. The vectorizer then checks if a prologue is required. If the
4143 cost model check was not done before during versioning, it has to
4144 be done before the prologue check.
4145
4146 if (cost <= th)
4147 prologue = scalar_iters
4148 if (prologue == 0)
4149 jmp to vector code
4150 else
4151 execute prologue
4152 if (prologue == num_iters)
4153 go to exit
4154
4155 Hence the run-time scalar cost is incremented by a taken branch,
4156 plus a not-taken branch, plus a taken branch cost.
4157
4158 3. The vectorizer then checks if an epilogue is required. If the
4159 cost model check was not done before during prologue check, it
4160 has to be done with the epilogue check.
4161
4162 if (prologue == 0)
4163 jmp to vector code
4164 else
4165 execute prologue
4166 if (prologue == num_iters)
4167 go to exit
4168 vector code:
4169 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4170 jmp to epilogue
4171
4172 Hence the run-time scalar cost should be incremented by 2 taken
4173 branches.
4174
4175 TODO: The back end may reorder the BBS's differently and reverse
4176 conditions/branch directions. Change the estimates below to
4177 something more reasonable. */
4178
4179 /* If the number of iterations is known and we do not do versioning, we can
4180 decide whether to vectorize at compile time. Hence the scalar version
4181 does not carry cost model guard costs. */
4182 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4183 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4184 {
4185 /* Cost model check occurs at versioning. */
4186 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4187 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4188 else
4189 {
4190 /* Cost model check occurs at prologue generation. */
4191 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4192 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4193 + vect_get_stmt_cost (cond_branch_not_taken);
4194 /* Cost model check occurs at epilogue generation. */
4195 else
4196 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4197 }
4198 }
4199
4200 /* Complete the target-specific cost calculations. */
4201 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
4202 &vec_inside_cost, &vec_epilogue_cost);
4203
4204 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4205
4206 /* Stash the costs so that we can compare two loop_vec_infos. */
4207 loop_vinfo->vec_inside_cost = vec_inside_cost;
4208 loop_vinfo->vec_outside_cost = vec_outside_cost;
4209
4210 if (dump_enabled_p ())
4211 {
4212 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4213 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4214 vec_inside_cost);
4215 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4216 vec_prologue_cost);
4217 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4218 vec_epilogue_cost);
4219 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4220 scalar_single_iter_cost);
4221 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4222 scalar_outside_cost);
4223 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4224 vec_outside_cost);
4225 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4226 peel_iters_prologue);
4227 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4228 peel_iters_epilogue);
4229 }
4230
4231 /* Calculate number of iterations required to make the vector version
4232 profitable, relative to the loop bodies only. The following condition
4233 must hold true:
4234 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4235 where
4236 SIC = scalar iteration cost, VIC = vector iteration cost,
4237 VOC = vector outside cost, VF = vectorization factor,
4238 NPEEL = prologue iterations + epilogue iterations,
4239 SOC = scalar outside cost for run time cost model check. */
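
/* Illustrative sketch only, with made-up costs: for SIC = 4, VIC = 6,
   VF = 4, NPEEL = 2, VOC = 20 and SOC = 0 the condition reads
   4 * niters > 6 * ((niters - 2) / 4) + 20, which first holds at
   niters = 7; the code below also makes sure the returned threshold
   lets the vectorized loop execute at least once.  */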
4240
4241 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4242 - vec_inside_cost);
4243 if (saving_per_viter <= 0)
4244 {
4245 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4246 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4247 "vectorization did not happen for a simd loop");
4248
4249 if (dump_enabled_p ())
4250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4251 "cost model: the vector iteration cost = %d "
4252 "divided by the scalar iteration cost = %d "
4253 "is greater or equal to the vectorization factor = %d"
4254 ".\n",
4255 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4256 *ret_min_profitable_niters = -1;
4257 *ret_min_profitable_estimate = -1;
4258 return;
4259 }
4260
4261 /* ??? The "if" arm is written to handle all cases; see below for what
4262 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4263 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4264 {
4265 /* Rewriting the condition above in terms of the number of
4266 vector iterations (vniters) rather than the number of
4267 scalar iterations (niters) gives:
4268
4269 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4270
4271 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4272
4273 For integer N, X and Y when X > 0:
4274
4275 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4276 int outside_overhead = (vec_outside_cost
4277 - scalar_single_iter_cost * peel_iters_prologue
4278 - scalar_single_iter_cost * peel_iters_epilogue
4279 - scalar_outside_cost);
4280 /* We're only interested in cases that require at least one
4281 vector iteration. */
4282 int min_vec_niters = 1;
4283 if (outside_overhead > 0)
4284 min_vec_niters = outside_overhead / saving_per_viter + 1;
4285
4286 if (dump_enabled_p ())
4287 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4288 min_vec_niters);
4289
4290 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4291 {
4292 /* Now that we know the minimum number of vector iterations,
4293 find the minimum niters for which the scalar cost is larger:
4294
4295 SIC * niters > VIC * vniters + VOC - SOC
4296
4297 We know that the minimum niters is no more than
4298 vniters * VF + NPEEL, but it might be (and often is) less
4299 than that if a partial vector iteration is cheaper than the
4300 equivalent scalar code. */
4301 int threshold = (vec_inside_cost * min_vec_niters
4302 + vec_outside_cost
4303 - scalar_outside_cost);
4304 if (threshold <= 0)
4305 min_profitable_iters = 1;
4306 else
4307 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4308 }
4309 else
4310 /* Convert the number of vector iterations into a number of
4311 scalar iterations. */
4312 min_profitable_iters = (min_vec_niters * assumed_vf
4313 + peel_iters_prologue
4314 + peel_iters_epilogue);
4315 }
4316 else
4317 {
4318 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4319 * assumed_vf
4320 - vec_inside_cost * peel_iters_prologue
4321 - vec_inside_cost * peel_iters_epilogue);
4322 if (min_profitable_iters <= 0)
4323 min_profitable_iters = 0;
4324 else
4325 {
4326 min_profitable_iters /= saving_per_viter;
4327
4328 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4329 <= (((int) vec_inside_cost * min_profitable_iters)
4330 + (((int) vec_outside_cost - scalar_outside_cost)
4331 * assumed_vf)))
4332 min_profitable_iters++;
4333 }
4334 }
4335
4336 if (dump_enabled_p ())
4337 dump_printf (MSG_NOTE,
4338 " Calculated minimum iters for profitability: %d\n",
4339 min_profitable_iters);
4340
4341 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4342 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4343 /* We want the vectorized loop to execute at least once. */
4344 min_profitable_iters = assumed_vf + peel_iters_prologue;
4345 else if (min_profitable_iters < peel_iters_prologue)
4346 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4347 vectorized loop executes at least once. */
4348 min_profitable_iters = peel_iters_prologue;
4349
4350 if (dump_enabled_p ())
4351 dump_printf_loc (MSG_NOTE, vect_location,
4352 " Runtime profitability threshold = %d\n",
4353 min_profitable_iters);
4354
4355 *ret_min_profitable_niters = min_profitable_iters;
4356
4357 /* Calculate number of iterations required to make the vector version
4358 profitable, relative to the loop bodies only.
4359
4360 The non-vectorized variant costs SIC * niters and it must win over the
4361 vector variant on the expected loop trip count. The following condition must hold true:
4362 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4363
4364 if (vec_outside_cost <= 0)
4365 min_profitable_estimate = 0;
4366 /* ??? This "else if" arm is written to handle all cases; see below for
4367 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4368 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4369 {
4370 /* This is a repeat of the code above, but with + SOC rather
4371 than - SOC. */
4372 int outside_overhead = (vec_outside_cost
4373 - scalar_single_iter_cost * peel_iters_prologue
4374 - scalar_single_iter_cost * peel_iters_epilogue
4375 + scalar_outside_cost);
4376 int min_vec_niters = 1;
4377 if (outside_overhead > 0)
4378 min_vec_niters = outside_overhead / saving_per_viter + 1;
4379
4380 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4381 {
4382 int threshold = (vec_inside_cost * min_vec_niters
4383 + vec_outside_cost
4384 + scalar_outside_cost);
4385 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4386 }
4387 else
4388 min_profitable_estimate = (min_vec_niters * assumed_vf
4389 + peel_iters_prologue
4390 + peel_iters_epilogue);
4391 }
4392 else
4393 {
4394 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4395 * assumed_vf
4396 - vec_inside_cost * peel_iters_prologue
4397 - vec_inside_cost * peel_iters_epilogue)
4398 / ((scalar_single_iter_cost * assumed_vf)
4399 - vec_inside_cost);
4400 }
4401 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4402 if (dump_enabled_p ())
4403 dump_printf_loc (MSG_NOTE, vect_location,
4404 " Static estimate profitability threshold = %d\n",
4405 min_profitable_estimate);
4406
4407 *ret_min_profitable_estimate = min_profitable_estimate;
4408 }
4409
4410 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4411 vector elements (not bits) for a vector with NELT elements. */
4412 static void
4413 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4414 vec_perm_builder *sel)
4415 {
4416 /* The encoding is a single stepped pattern. Any wrap-around is handled
4417 by vec_perm_indices. */
4418 sel->new_vector (nelt, 1, 3);
4419 for (unsigned int i = 0; i < 3; i++)
4420 sel->quick_push (i + offset);
4421 }
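
/* Illustrative sketch only: for OFFSET = 2 the three encoded elements are
   { 2, 3, 4 }; vec_perm_indices extends this stepped pattern, so with
   NELT = 8 the full selector is { 2, 3, 4, 5, 6, 7, 8, 9 }, i.e. the
   vector shifted down by two elements with the trailing lanes taken from
   the second permute input (zeros when modelling an actual vec_shr).  */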
4422
4423 /* Checks whether the target supports whole-vector shifts for vectors of mode
4424 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4425 it supports vec_perm_const with masks for all necessary shift amounts. */
4426 static bool
4427 have_whole_vector_shift (machine_mode mode)
4428 {
4429 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4430 return true;
4431
4432 /* Variable-length vectors should be handled via the optab. */
4433 unsigned int nelt;
4434 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4435 return false;
4436
4437 vec_perm_builder sel;
4438 vec_perm_indices indices;
4439 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4440 {
4441 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4442 indices.new_vector (sel, 2, nelt);
4443 if (!can_vec_perm_const_p (mode, indices, false))
4444 return false;
4445 }
4446 return true;
4447 }
4448
4449 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4450 functions. Design better to avoid maintenance issues. */
4451
4452 /* Function vect_model_reduction_cost.
4453
4454 Models cost for a reduction operation, including the vector ops
4455 generated within the strip-mine loop in some cases, the initial
4456 definition before the loop, and the epilogue code that must be generated. */
4457
4458 static void
4459 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4460 stmt_vec_info stmt_info, internal_fn reduc_fn,
4461 vect_reduction_type reduction_type,
4462 int ncopies, stmt_vector_for_cost *cost_vec)
4463 {
4464 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4465 enum tree_code code;
4466 optab optab;
4467 tree vectype;
4468 machine_mode mode;
4469 class loop *loop = NULL;
4470
4471 if (loop_vinfo)
4472 loop = LOOP_VINFO_LOOP (loop_vinfo);
4473
4474 /* Condition reductions generate two reductions in the loop. */
4475 if (reduction_type == COND_REDUCTION)
4476 ncopies *= 2;
4477
4478 vectype = STMT_VINFO_VECTYPE (stmt_info);
4479 mode = TYPE_MODE (vectype);
4480 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4481
4482 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4483
4484 if (reduction_type == EXTRACT_LAST_REDUCTION)
4485 /* No extra instructions are needed in the prologue. The loop body
4486 operations are costed in vectorizable_condition. */
4487 inside_cost = 0;
4488 else if (reduction_type == FOLD_LEFT_REDUCTION)
4489 {
4490 /* No extra instructions needed in the prologue. */
4491 prologue_cost = 0;
4492
4493 if (reduc_fn != IFN_LAST)
4494 /* Count one reduction-like operation per vector. */
4495 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4496 stmt_info, 0, vect_body);
4497 else
4498 {
4499 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4500 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4501 inside_cost = record_stmt_cost (cost_vec, nelements,
4502 vec_to_scalar, stmt_info, 0,
4503 vect_body);
4504 inside_cost += record_stmt_cost (cost_vec, nelements,
4505 scalar_stmt, stmt_info, 0,
4506 vect_body);
4507 }
4508 }
4509 else
4510 {
4511 /* Add in cost for initial definition.
4512 For cond reduction we have four vectors: initial index, step,
4513 initial result of the data reduction, initial value of the index
4514 reduction. */
4515 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4516 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4517 scalar_to_vec, stmt_info, 0,
4518 vect_prologue);
4519 }
4520
4521 /* Determine cost of epilogue code.
4522
4523 We have a reduction operator that will reduce the vector in one statement.
4524 Also requires scalar extract. */
4525
4526 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4527 {
4528 if (reduc_fn != IFN_LAST)
4529 {
4530 if (reduction_type == COND_REDUCTION)
4531 {
4532 /* An EQ stmt and a COND_EXPR stmt. */
4533 epilogue_cost += record_stmt_cost (cost_vec, 2,
4534 vector_stmt, stmt_info, 0,
4535 vect_epilogue);
4536 /* Reduction of the max index and a reduction of the found
4537 values. */
4538 epilogue_cost += record_stmt_cost (cost_vec, 2,
4539 vec_to_scalar, stmt_info, 0,
4540 vect_epilogue);
4541 /* A broadcast of the max value. */
4542 epilogue_cost += record_stmt_cost (cost_vec, 1,
4543 scalar_to_vec, stmt_info, 0,
4544 vect_epilogue);
4545 }
4546 else
4547 {
4548 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4549 stmt_info, 0, vect_epilogue);
4550 epilogue_cost += record_stmt_cost (cost_vec, 1,
4551 vec_to_scalar, stmt_info, 0,
4552 vect_epilogue);
4553 }
4554 }
4555 else if (reduction_type == COND_REDUCTION)
4556 {
4557 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4558 /* Extraction of scalar elements. */
4559 epilogue_cost += record_stmt_cost (cost_vec,
4560 2 * estimated_nunits,
4561 vec_to_scalar, stmt_info, 0,
4562 vect_epilogue);
4563 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4564 epilogue_cost += record_stmt_cost (cost_vec,
4565 2 * estimated_nunits - 3,
4566 scalar_stmt, stmt_info, 0,
4567 vect_epilogue);
4568 }
4569 else if (reduction_type == EXTRACT_LAST_REDUCTION
4570 || reduction_type == FOLD_LEFT_REDUCTION)
4571 /* No extra instructions are needed in the epilogue. */
4572 ;
4573 else
4574 {
4575 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4576 tree bitsize =
4577 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4578 int element_bitsize = tree_to_uhwi (bitsize);
4579 int nelements = vec_size_in_bits / element_bitsize;
4580
4581 if (code == COND_EXPR)
4582 code = MAX_EXPR;
4583
4584 optab = optab_for_tree_code (code, vectype, optab_default);
4585
4586 /* We have a whole vector shift available. */
4587 if (optab != unknown_optab
4588 && VECTOR_MODE_P (mode)
4589 && optab_handler (optab, mode) != CODE_FOR_nothing
4590 && have_whole_vector_shift (mode))
4591 {
4592 /* Final reduction via vector shifts and the reduction operator.
4593 Also requires scalar extract. */
4594 epilogue_cost += record_stmt_cost (cost_vec,
4595 exact_log2 (nelements) * 2,
4596 vector_stmt, stmt_info, 0,
4597 vect_epilogue);
4598 epilogue_cost += record_stmt_cost (cost_vec, 1,
4599 vec_to_scalar, stmt_info, 0,
4600 vect_epilogue);
4601 }
4602 else
4603 /* Use extracts and reduction op for final reduction. For N
4604 elements, we have N extracts and N-1 reduction ops. */
4605 epilogue_cost += record_stmt_cost (cost_vec,
4606 nelements + nelements - 1,
4607 vector_stmt, stmt_info, 0,
4608 vect_epilogue);
4609 }
4610 }
4611
4612 if (dump_enabled_p ())
4613 dump_printf (MSG_NOTE,
4614 "vect_model_reduction_cost: inside_cost = %d, "
4615 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4616 prologue_cost, epilogue_cost);
4617 }
4618
4619
4620
4621 /* Function get_initial_def_for_reduction
4622
4623 Input:
4624 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4625 INIT_VAL - the initial value of the reduction variable
4626
4627 Output:
4628 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4629 of the reduction (used for adjusting the epilog - see below).
4630 Return a vector variable, initialized according to the operation that
4631 STMT_VINFO performs. This vector will be used as the initial value
4632 of the vector of partial results.
4633
4634 Option1 (adjust in epilog): Initialize the vector as follows:
4635 add/bit or/xor: [0,0,...,0,0]
4636 mult/bit and: [1,1,...,1,1]
4637 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4638 and when necessary (e.g. add/mult case) let the caller know
4639 that it needs to adjust the result by init_val.
4640
4641 Option2: Initialize the vector as follows:
4642 add/bit or/xor: [init_val,0,0,...,0]
4643 mult/bit and: [init_val,1,1,...,1]
4644 min/max/cond_expr: [init_val,init_val,...,init_val]
4645 and no adjustments are needed.
4646
4647 For example, for the following code:
4648
4649 s = init_val;
4650 for (i=0;i<n;i++)
4651 s = s + a[i];
4652
4653 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4654 For a vector of 4 units, we want to return either [0,0,0,init_val],
4655 or [0,0,0,0] and let the caller know that it needs to adjust
4656 the result at the end by 'init_val'.
4657
4658 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4659 is not NULL, because that way the initialization vector is simpler
4660 (same element in all entries); otherwise we use Option2.
4661
4662 A cost model should help decide between these two schemes. */
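
/* Illustrative sketch only: for a product reduction p = init_val;
   p *= a[i]; with four-element vectors, Option1 starts from [1,1,1,1]
   and returns init_val in ADJUSTMENT_DEF so the caller folds it back in
   with the reduction operation after the epilogue reduction, whereas
   Option2 starts from [init_val,1,1,1] and needs no adjustment.  */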
4663
4664 static tree
4665 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4666 stmt_vec_info stmt_vinfo,
4667 enum tree_code code, tree init_val,
4668 tree *adjustment_def)
4669 {
4670 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4671 tree scalar_type = TREE_TYPE (init_val);
4672 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4673 tree def_for_init;
4674 tree init_def;
4675 REAL_VALUE_TYPE real_init_val = dconst0;
4676 int int_init_val = 0;
4677 gimple_seq stmts = NULL;
4678
4679 gcc_assert (vectype);
4680
4681 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4682 || SCALAR_FLOAT_TYPE_P (scalar_type));
4683
4684 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4685 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4686
4687 /* ADJUSTMENT_DEF is NULL when called from
4688 vect_create_epilog_for_reduction to vectorize double reduction. */
4689 if (adjustment_def)
4690 *adjustment_def = NULL;
4691
4692 switch (code)
4693 {
4694 case WIDEN_SUM_EXPR:
4695 case DOT_PROD_EXPR:
4696 case SAD_EXPR:
4697 case PLUS_EXPR:
4698 case MINUS_EXPR:
4699 case BIT_IOR_EXPR:
4700 case BIT_XOR_EXPR:
4701 case MULT_EXPR:
4702 case BIT_AND_EXPR:
4703 {
4704 if (code == MULT_EXPR)
4705 {
4706 real_init_val = dconst1;
4707 int_init_val = 1;
4708 }
4709
4710 if (code == BIT_AND_EXPR)
4711 int_init_val = -1;
4712
4713 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4714 def_for_init = build_real (scalar_type, real_init_val);
4715 else
4716 def_for_init = build_int_cst (scalar_type, int_init_val);
4717
4718 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4719 {
4720 /* Option1: the first element is '0' or '1' as well. */
4721 if (!operand_equal_p (def_for_init, init_val, 0))
4722 *adjustment_def = init_val;
4723 init_def = gimple_build_vector_from_val (&stmts, vectype,
4724 def_for_init);
4725 }
4726 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4727 {
4728 /* Option2 (variable length): the first element is INIT_VAL. */
4729 init_def = gimple_build_vector_from_val (&stmts, vectype,
4730 def_for_init);
4731 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4732 vectype, init_def, init_val);
4733 }
4734 else
4735 {
4736 /* Option2: the first element is INIT_VAL. */
4737 tree_vector_builder elts (vectype, 1, 2);
4738 elts.quick_push (init_val);
4739 elts.quick_push (def_for_init);
4740 init_def = gimple_build_vector (&stmts, &elts);
4741 }
4742 }
4743 break;
4744
4745 case MIN_EXPR:
4746 case MAX_EXPR:
4747 case COND_EXPR:
4748 {
4749 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4750 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4751 }
4752 break;
4753
4754 default:
4755 gcc_unreachable ();
4756 }
4757
4758 if (stmts)
4759 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4760 return init_def;
4761 }
4762
4763 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4764 NUMBER_OF_VECTORS is the number of vector defs to create.
4765 If NEUTRAL_OP is nonnull, introducing extra elements of that
4766 value will not change the result. */
4767
4768 static void
4769 get_initial_defs_for_reduction (vec_info *vinfo,
4770 slp_tree slp_node,
4771 vec<tree> *vec_oprnds,
4772 unsigned int number_of_vectors,
4773 bool reduc_chain, tree neutral_op)
4774 {
4775 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4776 stmt_vec_info stmt_vinfo = stmts[0];
4777 unsigned HOST_WIDE_INT nunits;
4778 unsigned j, number_of_places_left_in_vector;
4779 tree vector_type;
4780 unsigned int group_size = stmts.length ();
4781 unsigned int i;
4782 class loop *loop;
4783
4784 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4785
4786 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4787
4788 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4789 gcc_assert (loop);
4790 edge pe = loop_preheader_edge (loop);
4791
4792 gcc_assert (!reduc_chain || neutral_op);
4793
4794 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4795 created vectors. It is greater than 1 if unrolling is performed.
4796
4797 For example, we have two scalar operands, s1 and s2 (e.g., group of
4798 strided accesses of size two), while NUNITS is four (i.e., four scalars
4799 of this type can be packed in a vector). The output vector will contain
4800 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4801 will be 2).
4802
4803 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4804 vectors containing the operands.
4805
4806 For example, NUNITS is four as before, and the group size is 8
4807 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4808 {s5, s6, s7, s8}. */
4809
4810 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4811 nunits = group_size;
4812
4813 number_of_places_left_in_vector = nunits;
4814 bool constant_p = true;
4815 tree_vector_builder elts (vector_type, nunits, 1);
4816 elts.quick_grow (nunits);
4817 gimple_seq ctor_seq = NULL;
4818 for (j = 0; j < nunits * number_of_vectors; ++j)
4819 {
4820 tree op;
4821 i = j % group_size;
4822 stmt_vinfo = stmts[i];
4823
4824 /* Get the def before the loop. In a reduction chain we have only
4825 one initial value; otherwise as many as there are PHIs in the group. */
4826 if (reduc_chain)
4827 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4828 else if (((vec_oprnds->length () + 1) * nunits
4829 - number_of_places_left_in_vector >= group_size)
4830 && neutral_op)
4831 op = neutral_op;
4832 else
4833 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4834
4835 /* Create 'vect_ = {op0,op1,...,opn}'. */
4836 number_of_places_left_in_vector--;
4837 elts[nunits - number_of_places_left_in_vector - 1] = op;
4838 if (!CONSTANT_CLASS_P (op))
4839 constant_p = false;
4840
4841 if (number_of_places_left_in_vector == 0)
4842 {
4843 tree init;
4844 if (constant_p && !neutral_op
4845 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4846 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4847 /* Build the vector directly from ELTS. */
4848 init = gimple_build_vector (&ctor_seq, &elts);
4849 else if (neutral_op)
4850 {
4851 /* Build a vector of the neutral value and shift the
4852 other elements into place. */
4853 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4854 neutral_op);
4855 int k = nunits;
4856 while (k > 0 && elts[k - 1] == neutral_op)
4857 k -= 1;
4858 while (k > 0)
4859 {
4860 k -= 1;
4861 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4862 vector_type, init, elts[k]);
4863 }
4864 }
4865 else
4866 {
4867 /* First time round, duplicate ELTS to fill the
4868 required number of vectors. */
4869 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4870 number_of_vectors, *vec_oprnds);
4871 break;
4872 }
4873 vec_oprnds->quick_push (init);
4874
4875 number_of_places_left_in_vector = nunits;
4876 elts.new_vector (vector_type, nunits, 1);
4877 elts.quick_grow (nunits);
4878 constant_p = true;
4879 }
4880 }
4881 if (ctor_seq != NULL)
4882 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4883 }
4884
4885 /* For a statement STMT_INFO taking part in a reduction operation, return
4886 the stmt_vec_info that the meta information is stored on. */
4887
4888 stmt_vec_info
4889 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4890 {
4891 stmt_info = vect_orig_stmt (stmt_info);
4892 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4893 if (!is_a <gphi *> (stmt_info->stmt)
4894 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4895 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4896 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4897 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4898 {
4899 if (gimple_phi_num_args (phi) == 1)
4900 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4901 }
4902 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4903 {
4904 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4905 stmt_vec_info info
4906 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4907 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4908 stmt_info = info;
4909 }
4910 return stmt_info;
4911 }
4912
4913 /* Function vect_create_epilog_for_reduction
4914
4915 Create code at the loop-epilog to finalize the result of a reduction
4916 computation.
4917
4918 STMT_INFO is the scalar reduction stmt that is being vectorized.
4919 SLP_NODE is an SLP node containing a group of reduction statements. The
4920 first one in this group is STMT_INFO.
4921 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4922 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4923 (counting from 0)
4924
4925 This function:
4926 1. Completes the reduction def-use cycles.
4927 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4928 by calling the function specified by REDUC_FN if available, or by
4929 other means (whole-vector shifts or a scalar loop).
4930 The function also creates a new phi node at the loop exit to preserve
4931 loop-closed form, as illustrated below.
4932
4933 The flow at the entry to this function:
4934
4935 loop:
4936 vec_def = phi <vec_init, null> # REDUCTION_PHI
4937 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4938 s_loop = scalar_stmt # (scalar) STMT_INFO
4939 loop_exit:
4940 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4941 use <s_out0>
4942 use <s_out0>
4943
4944 The above is transformed by this function into:
4945
4946 loop:
4947 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4948 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4949 s_loop = scalar_stmt # (scalar) STMT_INFO
4950 loop_exit:
4951 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4952 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4953 v_out2 = reduce <v_out1>
4954 s_out3 = extract_field <v_out2, 0>
4955 s_out4 = adjust_result <s_out3>
4956 use <s_out4>
4957 use <s_out4>
4958 */
4959
4960 static void
4961 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4962 stmt_vec_info stmt_info,
4963 slp_tree slp_node,
4964 slp_instance slp_node_instance)
4965 {
4966 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4967 gcc_assert (reduc_info->is_reduc_info);
4968 /* For double reductions we need to get at the inner loop reduction
4969 stmt which has the meta info attached. Our stmt_info is that of the
4970 loop-closed PHI of the inner loop which we remember as
4971 def for the reduction PHI generation. */
4972 bool double_reduc = false;
4973 stmt_vec_info rdef_info = stmt_info;
4974 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4975 {
4976 gcc_assert (!slp_node);
4977 double_reduc = true;
4978 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4979 (stmt_info->stmt, 0));
4980 stmt_info = vect_stmt_to_vectorize (stmt_info);
4981 }
4982 gphi *reduc_def_stmt
4983 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4984 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4985 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4986 tree vectype;
4987 machine_mode mode;
4988 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4989 basic_block exit_bb;
4990 tree scalar_dest;
4991 tree scalar_type;
4992 gimple *new_phi = NULL, *phi;
4993 gimple_stmt_iterator exit_gsi;
4994 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4995 gimple *epilog_stmt = NULL;
4996 gimple *exit_phi;
4997 tree bitsize;
4998 tree def;
4999 tree orig_name, scalar_result;
5000 imm_use_iterator imm_iter, phi_imm_iter;
5001 use_operand_p use_p, phi_use_p;
5002 gimple *use_stmt;
5003 bool nested_in_vect_loop = false;
5004 auto_vec<gimple *> new_phis;
5005 int j, i;
5006 auto_vec<tree> scalar_results;
5007 unsigned int group_size = 1, k;
5008 auto_vec<gimple *> phis;
5009 bool slp_reduc = false;
5010 bool direct_slp_reduc;
5011 tree new_phi_result;
5012 tree induction_index = NULL_TREE;
5013
5014 if (slp_node)
5015 group_size = SLP_TREE_LANES (slp_node);
5016
5017 if (nested_in_vect_loop_p (loop, stmt_info))
5018 {
5019 outer_loop = loop;
5020 loop = loop->inner;
5021 nested_in_vect_loop = true;
5022 gcc_assert (!slp_node);
5023 }
5024 gcc_assert (!nested_in_vect_loop || double_reduc);
5025
5026 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5027 gcc_assert (vectype);
5028 mode = TYPE_MODE (vectype);
5029
5030 tree initial_def = NULL;
5031 tree induc_val = NULL_TREE;
5032 tree adjustment_def = NULL;
5033 if (slp_node)
5034 ;
5035 else
5036 {
5037 /* Get at the scalar def before the loop that defines the initial value
5038 of the reduction variable. */
5039 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5040 loop_preheader_edge (loop));
5041 /* Optimize: for induction condition reduction, if we can't use zero
5042 for induc_val, use initial_def. */
5043 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5044 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5045 else if (double_reduc)
5046 ;
5047 else if (nested_in_vect_loop)
5048 ;
5049 else
5050 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5051 }
5052
5053 unsigned vec_num;
5054 int ncopies;
5055 if (slp_node)
5056 {
5057 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5058 ncopies = 1;
5059 }
5060 else
5061 {
5062 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5063 vec_num = 1;
5064 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5065 }
5066
5067 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5068 which is updated with the current index of the loop for every match of
5069 the original loop's cond_expr (VEC_STMT). This results in a vector
5070 containing, for each vector lane, the index of the last time the condition passed.
5071 The first match will be a 1 to allow 0 to be used for non-matching
5072 indexes. If there are no matches at all then the vector will be all
5073 zeroes.
5074
5075 PR92772: This algorithm is broken for architectures that support
5076 masked vectors, but do not provide fold_extract_last. */
5077 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5078 {
5079 auto_vec<std::pair<tree, bool>, 2> ccompares;
5080 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5081 cond_info = vect_stmt_to_vectorize (cond_info);
5082 while (cond_info != reduc_info)
5083 {
5084 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5085 {
5086 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5087 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5088 ccompares.safe_push
5089 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5090 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5091 }
5092 cond_info
5093 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5094 1 + STMT_VINFO_REDUC_IDX
5095 (cond_info)));
5096 cond_info = vect_stmt_to_vectorize (cond_info);
5097 }
5098 gcc_assert (ccompares.length () != 0);
5099
5100 tree indx_before_incr, indx_after_incr;
5101 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5102 int scalar_precision
5103 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5104 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5105 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5106 (TYPE_MODE (vectype), cr_index_scalar_type,
5107 TYPE_VECTOR_SUBPARTS (vectype));
5108
5109 /* First we create a simple vector induction variable which starts
5110 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5111 vector size (STEP). */
5112
5113 /* Create a {1,2,3,...} vector. */
5114 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5115
5116 /* Create a vector of the step value. */
5117 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5118 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5119
5120 /* Create an induction variable. */
5121 gimple_stmt_iterator incr_gsi;
5122 bool insert_after;
5123 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5124 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5125 insert_after, &indx_before_incr, &indx_after_incr);
5126
5127 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5128 filled with zeros (VEC_ZERO). */
5129
5130 /* Create a vector of 0s. */
5131 tree zero = build_zero_cst (cr_index_scalar_type);
5132 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5133
5134 /* Create a vector phi node. */
5135 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5136 new_phi = create_phi_node (new_phi_tree, loop->header);
5137 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5138 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5139
5140 /* Now take the condition from the loop's original cond_exprs
5141 and produce a new cond_expr (INDEX_COND_EXPR) which for
5142 every match uses values from the induction variable
5143 (INDEX_BEFORE_INCR) and otherwise uses values from the phi node
5144 (NEW_PHI_TREE).
5145 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5146 the new cond_expr (INDEX_COND_EXPR). */
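/* For example (illustrative), with a vectorization factor of 4 the
   generated code looks roughly like:

     loop:
       new_phi = PHI <{0,0,0,0} (preheader), index_cond (latch)>
       ivtmp   = PHI <{1,2,3,4} (preheader), ivtmp + {4,4,4,4} (latch)>
       ...
       index_cond = VEC_COND_EXPR <cond, ivtmp, new_phi>

   so after the loop each lane of INDEX_COND holds the 1-based iteration
   index of the last time that lane's condition was true, or 0 if it
   never was.  */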
5147 gimple_seq stmts = NULL;
5148 for (int i = ccompares.length () - 1; i != -1; --i)
5149 {
5150 tree ccompare = ccompares[i].first;
5151 if (ccompares[i].second)
5152 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5153 cr_index_vector_type,
5154 ccompare,
5155 indx_before_incr, new_phi_tree);
5156 else
5157 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5158 cr_index_vector_type,
5159 ccompare,
5160 new_phi_tree, indx_before_incr);
5161 }
5162 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5163
5164 /* Update the phi with the vec cond. */
5165 induction_index = new_phi_tree;
5166 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5167 loop_latch_edge (loop), UNKNOWN_LOCATION);
5168 }
5169
5170 /* 2. Create epilog code.
5171 The reduction epilog code operates across the elements of the vector
5172 of partial results computed by the vectorized loop.
5173 The reduction epilog code consists of:
5174
5175 step 1: compute the scalar result in a vector (v_out2)
5176 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5177 step 3: adjust the scalar result (s_out3) if needed.
5178
5179 Step 1 can be accomplished using one of the following three schemes:
5180 (scheme 1) using reduc_fn, if available.
5181 (scheme 2) using whole-vector shifts, if available.
5182 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5183 combined.
5184
5185 The overall epilog code looks like this:
5186
5187 s_out0 = phi <s_loop> # original EXIT_PHI
5188 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5189 v_out2 = reduce <v_out1> # step 1
5190 s_out3 = extract_field <v_out2, 0> # step 2
5191 s_out4 = adjust_result <s_out3> # step 3
5192
5193 (step 3 is optional, and steps 1 and 2 may be combined).
5194 Lastly, the uses of s_out0 are replaced by s_out4. */
5195
5196
5197 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5198 v_out1 = phi <VECT_DEF>
5199 Store them in NEW_PHIS. */
5200 if (double_reduc)
5201 loop = outer_loop;
5202 exit_bb = single_exit (loop)->dest;
5203 new_phis.create (slp_node ? vec_num : ncopies);
5204 for (unsigned i = 0; i < vec_num; i++)
5205 {
5206 if (slp_node)
5207 def = vect_get_slp_vect_def (slp_node, i);
5208 else
5209 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5210 for (j = 0; j < ncopies; j++)
5211 {
5212 tree new_def = copy_ssa_name (def);
5213 phi = create_phi_node (new_def, exit_bb);
5214 if (j == 0)
5215 new_phis.quick_push (phi);
5216 else
5217 {
5218 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5219 new_phis.quick_push (phi);
5220 }
5221
5222 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5223 }
5224 }
5225
5226 exit_gsi = gsi_after_labels (exit_bb);
5227
5228 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5229 (i.e. when reduc_fn is not available) and in the final adjustment
5230 code (if needed). Also get the original scalar reduction variable as
5231 defined in the loop. In case STMT is a "pattern-stmt" (i.e. it
5232 represents a reduction pattern), the tree-code and scalar-def are
5233 taken from the original stmt that the pattern-stmt (STMT) replaces.
5234 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5235 are taken from STMT. */
5236
5237 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5238 if (orig_stmt_info != stmt_info)
5239 {
5240 /* Reduction pattern */
5241 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5242 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5243 }
5244
5245 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
5246 scalar_type = TREE_TYPE (scalar_dest);
5247 scalar_results.create (group_size);
5248 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5249 bitsize = TYPE_SIZE (scalar_type);
5250
5251 /* SLP reduction without reduction chain, e.g.,
5252 # a1 = phi <a2, a0>
5253 # b1 = phi <b2, b0>
5254 a2 = operation (a1)
5255 b2 = operation (b1) */
5256 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5257
5258 /* True if we should implement SLP_REDUC using native reduction operations
5259 instead of scalar operations. */
5260 direct_slp_reduc = (reduc_fn != IFN_LAST
5261 && slp_reduc
5262 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5263
5264 /* In case of reduction chain, e.g.,
5265 # a1 = phi <a3, a0>
5266 a2 = operation (a1)
5267 a3 = operation (a2),
5268
5269 we may end up with more than one vector result. Here we reduce them to
5270 one vector. */
5271 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
5272 {
5273 gimple_seq stmts = NULL;
5274 tree first_vect = PHI_RESULT (new_phis[0]);
5275 first_vect = gimple_convert (&stmts, vectype, first_vect);
5276 for (k = 1; k < new_phis.length (); k++)
5277 {
5278 gimple *next_phi = new_phis[k];
5279 tree second_vect = PHI_RESULT (next_phi);
5280 second_vect = gimple_convert (&stmts, vectype, second_vect);
5281 first_vect = gimple_build (&stmts, code, vectype,
5282 first_vect, second_vect);
5283 }
5284 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5285
5286 new_phi_result = first_vect;
5287 new_phis.truncate (0);
5288 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5289 }
5290 /* Likewise if we couldn't use a single defuse cycle. */
5291 else if (ncopies > 1)
5292 {
5293 gimple_seq stmts = NULL;
5294 tree first_vect = PHI_RESULT (new_phis[0]);
5295 first_vect = gimple_convert (&stmts, vectype, first_vect);
5296 for (int k = 1; k < ncopies; ++k)
5297 {
5298 tree second_vect = PHI_RESULT (new_phis[k]);
5299 second_vect = gimple_convert (&stmts, vectype, second_vect);
5300 first_vect = gimple_build (&stmts, code, vectype,
5301 first_vect, second_vect);
5302 }
5303 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5304 new_phi_result = first_vect;
5305 new_phis.truncate (0);
5306 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
5307 }
5308 else
5309 new_phi_result = PHI_RESULT (new_phis[0]);
5310
5311 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5312 && reduc_fn != IFN_LAST)
5313 {
5314 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5315 various data values where the condition matched and another vector
5316 (INDUCTION_INDEX) containing all the indexes of those matches. We
5317 need to extract the last matching index (which will be the index with
5318 highest value) and use this to index into the data vector.
5319 For the case where there were no matches, the data vector will contain
5320 all default values and the index vector will be all zeros. */
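/* A worked example (illustrative): with data vector { 7, -3, 9, 5 } and
   index vector { 1, 0, 4, 0 }, the max index is 4, the compare selects
   { 0, 0, 9, 0 }, and the unsigned MAX reduction below then extracts 9
   as the scalar result.  */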
5321
5322 /* Get various versions of the type of the vector of indexes. */
5323 tree index_vec_type = TREE_TYPE (induction_index);
5324 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5325 tree index_scalar_type = TREE_TYPE (index_vec_type);
5326 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5327
5328 /* Get an unsigned integer version of the type of the data vector. */
5329 int scalar_precision
5330 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5331 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5332 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5333 vectype);
5334
5335 /* First we need to create a vector (ZERO_VEC) of zeros and another
5336 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5337 can create using a MAX reduction and then expanding.
5338 In the case where the loop never made any matches, the max index will
5339 be zero. */
5340
5341 /* Vector of {0, 0, 0,...}. */
5342 tree zero_vec = build_zero_cst (vectype);
5343
5344 gimple_seq stmts = NULL;
5345 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5346 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5347
5348 /* Find maximum value from the vector of found indexes. */
5349 tree max_index = make_ssa_name (index_scalar_type);
5350 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5351 1, induction_index);
5352 gimple_call_set_lhs (max_index_stmt, max_index);
5353 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5354
5355 /* Vector of {max_index, max_index, max_index,...}. */
5356 tree max_index_vec = make_ssa_name (index_vec_type);
5357 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5358 max_index);
5359 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5360 max_index_vec_rhs);
5361 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5362
5363 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5364 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5365 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5366 otherwise. Only one value should match, resulting in a vector
5367 (VEC_COND) with one data value and the rest zeros.
5368 In the case where the loop never made any matches, every index will
5369 match, resulting in a vector with all data values (which will all be
5370 the default value). */
5371
5372 /* Compare the max index vector to the vector of found indexes to find
5373 the position of the max value. */
5374 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5375 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5376 induction_index,
5377 max_index_vec);
5378 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5379
5380 /* Use the compare to choose either values from the data vector or
5381 zero. */
5382 tree vec_cond = make_ssa_name (vectype);
5383 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5384 vec_compare, new_phi_result,
5385 zero_vec);
5386 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5387
5388 /* Finally we need to extract the data value from the vector (VEC_COND)
5389 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5390 reduction, but because this doesn't exist, we can use a MAX reduction
5391 instead. The data value might be signed or a float so we need to cast
5392 it first.
5393 In the case where the loop never made any matches, the data values are
5394 all identical, and so will reduce down correctly. */
5395
5396 /* Make the matched data values unsigned. */
5397 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5398 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5399 vec_cond);
5400 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5401 VIEW_CONVERT_EXPR,
5402 vec_cond_cast_rhs);
5403 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5404
5405 /* Reduce down to a scalar value. */
5406 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5407 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5408 1, vec_cond_cast);
5409 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5410 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5411
5412 /* Convert the reduced value back to the result type and set as the
5413 result. */
5414 stmts = NULL;
5415 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5416 data_reduc);
5417 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5418 scalar_results.safe_push (new_temp);
5419 }
5420 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5421 && reduc_fn == IFN_LAST)
5422 {
5423 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5424 idx_val = induction_index[0];
5425 val = data_reduc[0];
5426 for (i = 1; i < nelts; ++i)
5428 if (induction_index[i] > idx_val)
5429 val = data_reduc[i], idx_val = induction_index[i];
5430 return val; */
5431
5432 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5433 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5434 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5435 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5436 /* Enforced by vectorizable_reduction, which ensures we have target
5437 support before allowing a conditional reduction on variable-length
5438 vectors. */
5439 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5440 tree idx_val = NULL_TREE, val = NULL_TREE;
5441 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5442 {
5443 tree old_idx_val = idx_val;
5444 tree old_val = val;
5445 idx_val = make_ssa_name (idx_eltype);
5446 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5447 build3 (BIT_FIELD_REF, idx_eltype,
5448 induction_index,
5449 bitsize_int (el_size),
5450 bitsize_int (off)));
5451 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5452 val = make_ssa_name (data_eltype);
5453 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5454 build3 (BIT_FIELD_REF,
5455 data_eltype,
5456 new_phi_result,
5457 bitsize_int (el_size),
5458 bitsize_int (off)));
5459 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5460 if (off != 0)
5461 {
5462 tree new_idx_val = idx_val;
5463 if (off != v_size - el_size)
5464 {
5465 new_idx_val = make_ssa_name (idx_eltype);
5466 epilog_stmt = gimple_build_assign (new_idx_val,
5467 MAX_EXPR, idx_val,
5468 old_idx_val);
5469 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5470 }
5471 tree new_val = make_ssa_name (data_eltype);
5472 epilog_stmt = gimple_build_assign (new_val,
5473 COND_EXPR,
5474 build2 (GT_EXPR,
5475 boolean_type_node,
5476 idx_val,
5477 old_idx_val),
5478 val, old_val);
5479 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5480 idx_val = new_idx_val;
5481 val = new_val;
5482 }
5483 }
5484 /* Convert the reduced value back to the result type and set as the
5485 result. */
5486 gimple_seq stmts = NULL;
5487 val = gimple_convert (&stmts, scalar_type, val);
5488 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5489 scalar_results.safe_push (val);
5490 }
5491
5492 /* 2.3 Create the reduction code, using one of the three schemes described
5493 above. In SLP we simply need to extract all the elements from the
5494 vector (without reducing them), so we use scalar shifts. */
5495 else if (reduc_fn != IFN_LAST && !slp_reduc)
5496 {
5497 tree tmp;
5498 tree vec_elem_type;
5499
5500 /* Case 1: Create:
5501 v_out2 = reduc_expr <v_out1> */
5502
5503 if (dump_enabled_p ())
5504 dump_printf_loc (MSG_NOTE, vect_location,
5505 "Reduce using direct vector reduction.\n");
5506
5507 gimple_seq stmts = NULL;
5508 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5509 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5510 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5511 vec_elem_type, new_phi_result);
5512 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5513 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5514
5515 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5516 && induc_val)
5517 {
5518 /* Earlier we set the initial value to be a vector of induc_val
5519 values. Check the result and if it is induc_val then replace it
5520 with the original initial value, unless induc_val is
5521 already the same as initial_def. */
5522 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5523 induc_val);
5524
5525 tmp = make_ssa_name (new_scalar_dest);
5526 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5527 initial_def, new_temp);
5528 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5529 new_temp = tmp;
5530 }
5531
5532 scalar_results.safe_push (new_temp);
5533 }
5534 else if (direct_slp_reduc)
5535 {
5536 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5537 with the elements for other SLP statements replaced with the
5538 neutral value. We can then do a normal reduction on each vector. */
5539
5540 /* Enforced by vectorizable_reduction. */
5541 gcc_assert (new_phis.length () == 1);
5542 gcc_assert (pow2p_hwi (group_size));
5543
5544 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5545 vec<stmt_vec_info> orig_phis
5546 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5547 gimple_seq seq = NULL;
5548
5549 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5550 and the same element size as VECTYPE. */
5551 tree index = build_index_vector (vectype, 0, 1);
5552 tree index_type = TREE_TYPE (index);
5553 tree index_elt_type = TREE_TYPE (index_type);
5554 tree mask_type = truth_type_for (index_type);
5555
5556 /* Create a vector that, for each element, identifies which of
5557 the REDUC_GROUP_SIZE results should use it. */
5558 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5559 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5560 build_vector_from_val (index_type, index_mask));
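/* For example (illustrative): with GROUP_SIZE 2 and an 8-element vector
   the index vector { 0, 1, 2, ..., 7 } is masked down to
   { 0, 1, 0, 1, 0, 1, 0, 1 }, so the lanes alternate between the two
   SLP results.  */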
5561
5562 /* Get a neutral vector value. This is simply a splat of the neutral
5563 scalar value if we have one, otherwise the initial scalar value
5564 is itself a neutral value. */
5565 tree vector_identity = NULL_TREE;
5566 tree neutral_op = NULL_TREE;
5567 if (slp_node)
5568 {
5569 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5570 neutral_op
5571 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5572 vectype, code, first != NULL);
5573 }
5574 if (neutral_op)
5575 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5576 neutral_op);
5577 for (unsigned int i = 0; i < group_size; ++i)
5578 {
5579 /* If there's no universal neutral value, we can use the
5580 initial scalar value from the original PHI. This is used
5581 for MIN and MAX reduction, for example. */
5582 if (!neutral_op)
5583 {
5584 tree scalar_value
5585 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5586 loop_preheader_edge (loop));
5587 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5588 scalar_value);
5589 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5590 scalar_value);
5591 }
5592
5593 /* Calculate the equivalent of:
5594
5595 sel[j] = (index[j] == i);
5596
5597 which selects the elements of NEW_PHI_RESULT that should
5598 be included in the result. */
5599 tree compare_val = build_int_cst (index_elt_type, i);
5600 compare_val = build_vector_from_val (index_type, compare_val);
5601 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5602 index, compare_val);
5603
5604 /* Calculate the equivalent of:
5605
5606 vec = sel ? new_phi_result : vector_identity;
5607
5608 VEC is now suitable for a full vector reduction. */
5609 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5610 sel, new_phi_result, vector_identity);
5611
5612 /* Do the reduction and convert it to the appropriate type. */
5613 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5614 TREE_TYPE (vectype), vec);
5615 scalar = gimple_convert (&seq, scalar_type, scalar);
5616 scalar_results.safe_push (scalar);
5617 }
5618 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5619 }
5620 else
5621 {
5622 bool reduce_with_shift;
5623 tree vec_temp;
5624
5625 gcc_assert (slp_reduc || new_phis.length () == 1);
5626
5627 /* See if the target wants to do the final (shift) reduction
5628 in a vector mode of smaller size and first reduce upper/lower
5629 halves against each other. */
5630 enum machine_mode mode1 = mode;
5631 tree stype = TREE_TYPE (vectype);
5632 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5633 unsigned nunits1 = nunits;
5634 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5635 && new_phis.length () == 1)
5636 {
5637 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5638 /* For SLP reductions we have to make sure lanes match up, but
5639 since we're doing an individual element final reduction, reducing the
5640 vector width here is even more important.
5641 ??? We can also separate lanes with permutes; for the common
5642 case of power-of-two group-size, odd/even extracts would work. */
5643 if (slp_reduc && nunits != nunits1)
5644 {
5645 nunits1 = least_common_multiple (nunits1, group_size);
5646 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5647 }
5648 }
5649 if (!slp_reduc
5650 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5651 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5652
5653 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5654 stype, nunits1);
5655 reduce_with_shift = have_whole_vector_shift (mode1);
5656 if (!VECTOR_MODE_P (mode1))
5657 reduce_with_shift = false;
5658 else
5659 {
5660 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5661 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5662 reduce_with_shift = false;
5663 }
5664
5665 /* First reduce the vector to the desired vector size we should
5666 do shift reduction on by combining upper and lower halves. */
5667 new_temp = new_phi_result;
5668 while (nunits > nunits1)
5669 {
5670 nunits /= 2;
5671 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5672 stype, nunits);
5673 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5674
5675 /* The target has to make sure we support lowpart/highpart
5676 extraction, either via direct vector extract or through
5677 integer mode punning. */
5678 tree dst1, dst2;
5679 if (convert_optab_handler (vec_extract_optab,
5680 TYPE_MODE (TREE_TYPE (new_temp)),
5681 TYPE_MODE (vectype1))
5682 != CODE_FOR_nothing)
5683 {
5684 /* Extract sub-vectors directly once vec_extract becomes
5685 a conversion optab. */
5686 dst1 = make_ssa_name (vectype1);
5687 epilog_stmt
5688 = gimple_build_assign (dst1, BIT_FIELD_REF,
5689 build3 (BIT_FIELD_REF, vectype1,
5690 new_temp, TYPE_SIZE (vectype1),
5691 bitsize_int (0)));
5692 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5693 dst2 = make_ssa_name (vectype1);
5694 epilog_stmt
5695 = gimple_build_assign (dst2, BIT_FIELD_REF,
5696 build3 (BIT_FIELD_REF, vectype1,
5697 new_temp, TYPE_SIZE (vectype1),
5698 bitsize_int (bitsize)));
5699 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5700 }
5701 else
5702 {
5703 /* Extract via punning to appropriately sized integer mode
5704 vector. */
5705 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5706 tree etype = build_vector_type (eltype, 2);
5707 gcc_assert (convert_optab_handler (vec_extract_optab,
5708 TYPE_MODE (etype),
5709 TYPE_MODE (eltype))
5710 != CODE_FOR_nothing);
5711 tree tem = make_ssa_name (etype);
5712 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5713 build1 (VIEW_CONVERT_EXPR,
5714 etype, new_temp));
5715 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5716 new_temp = tem;
5717 tem = make_ssa_name (eltype);
5718 epilog_stmt
5719 = gimple_build_assign (tem, BIT_FIELD_REF,
5720 build3 (BIT_FIELD_REF, eltype,
5721 new_temp, TYPE_SIZE (eltype),
5722 bitsize_int (0)));
5723 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5724 dst1 = make_ssa_name (vectype1);
5725 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5726 build1 (VIEW_CONVERT_EXPR,
5727 vectype1, tem));
5728 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5729 tem = make_ssa_name (eltype);
5730 epilog_stmt
5731 = gimple_build_assign (tem, BIT_FIELD_REF,
5732 build3 (BIT_FIELD_REF, eltype,
5733 new_temp, TYPE_SIZE (eltype),
5734 bitsize_int (bitsize)));
5735 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5736 dst2 = make_ssa_name (vectype1);
5737 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5738 build1 (VIEW_CONVERT_EXPR,
5739 vectype1, tem));
5740 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5741 }
5742
5743 new_temp = make_ssa_name (vectype1);
5744 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5745 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5746 new_phis[0] = epilog_stmt;
5747 }
5748
5749 if (reduce_with_shift && !slp_reduc)
5750 {
5751 int element_bitsize = tree_to_uhwi (bitsize);
5752 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5753 for variable-length vectors and also requires direct target support
5754 for loop reductions. */
5755 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5756 int nelements = vec_size_in_bits / element_bitsize;
5757 vec_perm_builder sel;
5758 vec_perm_indices indices;
5759
5760 int elt_offset;
5761
5762 tree zero_vec = build_zero_cst (vectype1);
5763 /* Case 2: Create:
5764 for (offset = nelements/2; offset >= 1; offset/=2)
5765 {
5766 Create: va' = vec_shift <va, offset>
5767 Create: va = vop <va, va'>
5768 } */
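/* For instance (illustrative), reducing a four-element vector
   { a, b, c, d } with a PLUS code proceeds as:
     va' = { c, d, _, _ }       <- shift by two elements
     va  = { a+c, b+d, _, _ }
     va' = { b+d, _, _, _ }     <- shift by one element
     va  = { a+c+b+d, _, _, _ }
   after which element 0 is extracted as the scalar result.  */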
5769
5770 tree rhs;
5771
5772 if (dump_enabled_p ())
5773 dump_printf_loc (MSG_NOTE, vect_location,
5774 "Reduce using vector shifts\n");
5775
5776 gimple_seq stmts = NULL;
5777 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5778 for (elt_offset = nelements / 2;
5779 elt_offset >= 1;
5780 elt_offset /= 2)
5781 {
5782 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5783 indices.new_vector (sel, 2, nelements);
5784 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5785 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5786 new_temp, zero_vec, mask);
5787 new_temp = gimple_build (&stmts, code,
5788 vectype1, new_name, new_temp);
5789 }
5790 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5791
5792 /* 2.4 Extract the final scalar result. Create:
5793 s_out3 = extract_field <v_out2, bitpos> */
5794
5795 if (dump_enabled_p ())
5796 dump_printf_loc (MSG_NOTE, vect_location,
5797 "extract scalar result\n");
5798
5799 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5800 bitsize, bitsize_zero_node);
5801 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5802 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5803 gimple_assign_set_lhs (epilog_stmt, new_temp);
5804 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5805 scalar_results.safe_push (new_temp);
5806 }
5807 else
5808 {
5809 /* Case 3: Create:
5810 s = extract_field <v_out2, 0>
5811 for (offset = element_size;
5812 offset < vector_size;
5813 offset += element_size;)
5814 {
5815 Create: s' = extract_field <v_out2, offset>
5816 Create: s = op <s, s'> // For non SLP cases
5817 } */
5818
5819 if (dump_enabled_p ())
5820 dump_printf_loc (MSG_NOTE, vect_location,
5821 "Reduce using scalar code.\n");
5822
5823 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5824 int element_bitsize = tree_to_uhwi (bitsize);
5825 tree compute_type = TREE_TYPE (vectype);
5826 gimple_seq stmts = NULL;
5827 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5828 {
5829 int bit_offset;
5830 if (gimple_code (new_phi) == GIMPLE_PHI)
5831 vec_temp = PHI_RESULT (new_phi);
5832 else
5833 vec_temp = gimple_assign_lhs (new_phi);
5834 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5835 vec_temp, bitsize, bitsize_zero_node);
5836
5837 /* In SLP we don't need to apply the reduction operation, so we just
5838 collect s' values in SCALAR_RESULTS. */
5839 if (slp_reduc)
5840 scalar_results.safe_push (new_temp);
5841
5842 for (bit_offset = element_bitsize;
5843 bit_offset < vec_size_in_bits;
5844 bit_offset += element_bitsize)
5845 {
5846 tree bitpos = bitsize_int (bit_offset);
5847 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5848 compute_type, vec_temp,
5849 bitsize, bitpos);
5850 if (slp_reduc)
5851 {
5852 /* In SLP we don't need to apply the reduction operation, so
5853 we just collect s' values in SCALAR_RESULTS. */
5854 new_temp = new_name;
5855 scalar_results.safe_push (new_name);
5856 }
5857 else
5858 new_temp = gimple_build (&stmts, code, compute_type,
5859 new_name, new_temp);
5860 }
5861 }
5862
5863 /* The only case in which we need to reduce scalar results in SLP is
5864 unrolling. If the size of SCALAR_RESULTS is greater than
5865 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5866 REDUC_GROUP_SIZE. */
5867 if (slp_reduc)
5868 {
5869 tree res, first_res, new_res;
5870
5871 /* Reduce multiple scalar results in case of SLP unrolling. */
5872 for (j = group_size; scalar_results.iterate (j, &res);
5873 j++)
5874 {
5875 first_res = scalar_results[j % group_size];
5876 new_res = gimple_build (&stmts, code, compute_type,
5877 first_res, res);
5878 scalar_results[j % group_size] = new_res;
5879 }
5880 for (k = 0; k < group_size; k++)
5881 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5882 scalar_results[k]);
5883 }
5884 else
5885 {
5886 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5887 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5888 scalar_results.safe_push (new_temp);
5889 }
5890
5891 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5892 }
5893
5894 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5895 && induc_val)
5896 {
5897 /* Earlier we set the initial value to be a vector of induc_val
5898 values. Check the result and if it is induc_val then replace it
5899 with the original initial value, unless induc_val is
5900 already the same as initial_def. */
5901 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5902 induc_val);
5903
5904 tree tmp = make_ssa_name (new_scalar_dest);
5905 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5906 initial_def, new_temp);
5907 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5908 scalar_results[0] = tmp;
5909 }
5910 }
5911
5912 /* 2.5 Adjust the final result by the initial value of the reduction
5913 variable. (When such adjustment is not needed, then
5914 'adjustment_def' is zero). For example, if code is PLUS we create:
5915 new_temp = loop_exit_def + adjustment_def */
5916
5917 if (adjustment_def)
5918 {
5919 gcc_assert (!slp_reduc);
5920 gimple_seq stmts = NULL;
5921 if (nested_in_vect_loop)
5922 {
5923 new_phi = new_phis[0];
5924 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5925 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5926 new_temp = gimple_build (&stmts, code, vectype,
5927 PHI_RESULT (new_phi), adjustment_def);
5928 }
5929 else
5930 {
5931 new_temp = scalar_results[0];
5932 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5933 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5934 new_temp = gimple_build (&stmts, code, scalar_type,
5935 new_temp, adjustment_def);
5936 }
5937
5938 epilog_stmt = gimple_seq_last_stmt (stmts);
5939 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5940 if (nested_in_vect_loop)
5941 {
5942 if (!double_reduc)
5943 scalar_results.quick_push (new_temp);
5944 else
5945 scalar_results[0] = new_temp;
5946 }
5947 else
5948 scalar_results[0] = new_temp;
5949
5950 new_phis[0] = epilog_stmt;
5951 }
5952
5953 if (double_reduc)
5954 loop = loop->inner;
5955
5956 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5957 phis with new adjusted scalar results, i.e., replace use <s_out0>
5958 with use <s_out4>.
5959
5960 Transform:
5961 loop_exit:
5962 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5963 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5964 v_out2 = reduce <v_out1>
5965 s_out3 = extract_field <v_out2, 0>
5966 s_out4 = adjust_result <s_out3>
5967 use <s_out0>
5968 use <s_out0>
5969
5970 into:
5971
5972 loop_exit:
5973 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5974 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5975 v_out2 = reduce <v_out1>
5976 s_out3 = extract_field <v_out2, 0>
5977 s_out4 = adjust_result <s_out3>
5978 use <s_out4>
5979 use <s_out4> */
5980
5981
5982 /* In an SLP reduction chain we reduce the vector results into one vector if
5983 necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5984 LHS of the last stmt in the reduction chain, since we are looking for
5985 the loop exit phi node. */
5986 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5987 {
5988 stmt_vec_info dest_stmt_info
5989 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5990 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5991 group_size = 1;
5992 }
5993
5994 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5995 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5996 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5997 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5998 correspond to the first vector stmt, etc.
5999 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
6000 if (group_size > new_phis.length ())
6001 gcc_assert (!(group_size % new_phis.length ()));
6002
6003 for (k = 0; k < group_size; k++)
6004 {
6005 if (slp_reduc)
6006 {
6007 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
6008
6009 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
6010 /* SLP statements can't participate in patterns. */
6011 gcc_assert (!orig_stmt_info);
6012 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
6013 }
6014
6015 if (nested_in_vect_loop)
6016 {
6017 if (double_reduc)
6018 loop = outer_loop;
6019 else
6020 gcc_unreachable ();
6021 }
6022
6023 phis.create (3);
6024 /* Find the loop-closed-use at the loop exit of the original scalar
6025 result. (The reduction result is expected to have two immediate uses,
6026 one at the latch block, and one at the loop exit). For double
6027 reductions we are looking for exit phis of the outer loop. */
6028 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6029 {
6030 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6031 {
6032 if (!is_gimple_debug (USE_STMT (use_p)))
6033 phis.safe_push (USE_STMT (use_p));
6034 }
6035 else
6036 {
6037 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6038 {
6039 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6040
6041 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6042 {
6043 if (!flow_bb_inside_loop_p (loop,
6044 gimple_bb (USE_STMT (phi_use_p)))
6045 && !is_gimple_debug (USE_STMT (phi_use_p)))
6046 phis.safe_push (USE_STMT (phi_use_p));
6047 }
6048 }
6049 }
6050 }
6051
6052 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6053 {
6054 /* Replace the uses: */
6055 orig_name = PHI_RESULT (exit_phi);
6056 scalar_result = scalar_results[k];
6057 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6058 {
6059 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6060 SET_USE (use_p, scalar_result);
6061 update_stmt (use_stmt);
6062 }
6063 }
6064
6065 phis.release ();
6066 }
6067 }
6068
6069 /* Return a vector of type VECTYPE that is equal to the vector select
6070 operation "MASK ? VEC : IDENTITY". Insert the select statements
6071 before GSI. */
6072
6073 static tree
6074 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6075 tree vec, tree identity)
6076 {
6077 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6078 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6079 mask, vec, identity);
6080 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6081 return cond;
6082 }
6083
6084 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6085 order, starting with LHS. Insert the extraction statements before GSI and
6086 associate the new scalar SSA names with variable SCALAR_DEST.
6087 Return the SSA name for the result. */
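/* For example (illustrative), with CODE = PLUS_EXPR, LHS = s0 and a
   four-element VECTOR_RHS v of element size SZ bits, this emits roughly:
     e0 = BIT_FIELD_REF <v, SZ, 0>;      s1 = s0 + e0;
     e1 = BIT_FIELD_REF <v, SZ, SZ>;     s2 = s1 + e1;
     e2 = BIT_FIELD_REF <v, SZ, 2*SZ>;   s3 = s2 + e2;
     e3 = BIT_FIELD_REF <v, SZ, 3*SZ>;   s4 = s3 + e3;
   and returns s4.  */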
6088
6089 static tree
6090 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6091 tree_code code, tree lhs, tree vector_rhs)
6092 {
6093 tree vectype = TREE_TYPE (vector_rhs);
6094 tree scalar_type = TREE_TYPE (vectype);
6095 tree bitsize = TYPE_SIZE (scalar_type);
6096 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6097 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6098
6099 for (unsigned HOST_WIDE_INT bit_offset = 0;
6100 bit_offset < vec_size_in_bits;
6101 bit_offset += element_bitsize)
6102 {
6103 tree bitpos = bitsize_int (bit_offset);
6104 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6105 bitsize, bitpos);
6106
6107 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6108 rhs = make_ssa_name (scalar_dest, stmt);
6109 gimple_assign_set_lhs (stmt, rhs);
6110 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6111
6112 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6113 tree new_name = make_ssa_name (scalar_dest, stmt);
6114 gimple_assign_set_lhs (stmt, new_name);
6115 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6116 lhs = new_name;
6117 }
6118 return lhs;
6119 }
6120
6121 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6122 type of the vector input. */
6123
6124 static internal_fn
6125 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6126 {
6127 internal_fn mask_reduc_fn;
6128
6129 switch (reduc_fn)
6130 {
6131 case IFN_FOLD_LEFT_PLUS:
6132 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6133 break;
6134
6135 default:
6136 return IFN_LAST;
6137 }
6138
6139 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6140 OPTIMIZE_FOR_SPEED))
6141 return mask_reduc_fn;
6142 return IFN_LAST;
6143 }
6144
6145 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6146 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6147 statement. CODE is the operation performed by STMT_INFO and OPS are
6148 its scalar operands. REDUC_INDEX is the index of the operand in
6149 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6150 implements in-order reduction, or IFN_LAST if we should open-code it.
6151 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6152 that should be used to control the operation in a fully-masked loop. */
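/* For instance (illustrative), an in-order PLUS reduction of a
   four-element vector V into an accumulator RES computes
     RES = (((RES + V[0]) + V[1]) + V[2]) + V[3]
   either with a single IFN_FOLD_LEFT_PLUS call or, when REDUC_FN is
   IFN_LAST, by open-coding the chain via vect_expand_fold_left.  */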
6153
6154 static bool
6155 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6156 stmt_vec_info stmt_info,
6157 gimple_stmt_iterator *gsi,
6158 gimple **vec_stmt, slp_tree slp_node,
6159 gimple *reduc_def_stmt,
6160 tree_code code, internal_fn reduc_fn,
6161 tree ops[3], tree vectype_in,
6162 int reduc_index, vec_loop_masks *masks)
6163 {
6164 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6165 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6166 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6167
6168 int ncopies;
6169 if (slp_node)
6170 ncopies = 1;
6171 else
6172 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6173
6174 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6175 gcc_assert (ncopies == 1);
6176 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6177
6178 if (slp_node)
6179 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6180 TYPE_VECTOR_SUBPARTS (vectype_in)));
6181
6182 tree op0 = ops[1 - reduc_index];
6183
6184 int group_size = 1;
6185 stmt_vec_info scalar_dest_def_info;
6186 auto_vec<tree> vec_oprnds0;
6187 if (slp_node)
6188 {
6189 auto_vec<vec<tree> > vec_defs (2);
6190 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6191 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6192 vec_defs[0].release ();
6193 vec_defs[1].release ();
6194 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6195 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6196 }
6197 else
6198 {
6199 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6200 op0, &vec_oprnds0);
6201 scalar_dest_def_info = stmt_info;
6202 }
6203
6204 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6205 tree scalar_type = TREE_TYPE (scalar_dest);
6206 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6207
6208 int vec_num = vec_oprnds0.length ();
6209 gcc_assert (vec_num == 1 || slp_node);
6210 tree vec_elem_type = TREE_TYPE (vectype_out);
6211 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6212
6213 tree vector_identity = NULL_TREE;
6214 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6215 vector_identity = build_zero_cst (vectype_out);
6216
6217 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6218 int i;
6219 tree def0;
6220 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6221 {
6222 gimple *new_stmt;
6223 tree mask = NULL_TREE;
6224 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6225 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6226
6227 /* Handle MINUS by adding the negative. */
6228 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6229 {
6230 tree negated = make_ssa_name (vectype_out);
6231 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6232 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6233 def0 = negated;
6234 }
6235
6236 if (mask && mask_reduc_fn == IFN_LAST)
6237 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6238 vector_identity);
6239
6240 /* On the first iteration the input is simply the scalar phi
6241 result, and for subsequent iterations it is the output of
6242 the preceding operation. */
6243 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6244 {
6245 if (mask && mask_reduc_fn != IFN_LAST)
6246 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6247 def0, mask);
6248 else
6249 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6250 def0);
6251 /* For chained SLP reductions the output of the previous reduction
6252 operation serves as the input of the next. For the final statement
6253 the output cannot be a temporary - we reuse the original
6254 scalar destination of the last statement. */
6255 if (i != vec_num - 1)
6256 {
6257 gimple_set_lhs (new_stmt, scalar_dest_var);
6258 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6259 gimple_set_lhs (new_stmt, reduc_var);
6260 }
6261 }
6262 else
6263 {
6264 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6265 reduc_var, def0);
6266 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6267 /* Remove the statement, so that we can use the same code paths
6268 as for statements that we've just created. */
6269 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6270 gsi_remove (&tmp_gsi, true);
6271 }
6272
6273 if (i == vec_num - 1)
6274 {
6275 gimple_set_lhs (new_stmt, scalar_dest);
6276 vect_finish_replace_stmt (loop_vinfo,
6277 scalar_dest_def_info,
6278 new_stmt);
6279 }
6280 else
6281 vect_finish_stmt_generation (loop_vinfo,
6282 scalar_dest_def_info,
6283 new_stmt, gsi);
6284
6285 if (slp_node)
6286 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6287 else
6288 {
6289 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6290 *vec_stmt = new_stmt;
6291 }
6292 }
6293
6294 return true;
6295 }
6296
6297 /* Function is_nonwrapping_integer_induction.
6298
6299 Check if STMT_VINFO (which is part of loop LOOP) is an integer induction
6300 that increments and does not overflow (wrap). */
6301
6302 static bool
6303 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6304 {
6305 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6306 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6307 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6308 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6309 widest_int ni, max_loop_value, lhs_max;
6310 wi::overflow_type overflow = wi::OVF_NONE;
6311
6312 /* Make sure the loop is integer based. */
6313 if (TREE_CODE (base) != INTEGER_CST
6314 || TREE_CODE (step) != INTEGER_CST)
6315 return false;
6316
6317 /* Check that the max size of the loop will not wrap. */
6318
6319 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6320 return true;
6321
6322 if (! max_stmt_executions (loop, &ni))
6323 return false;
6324
6325 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6326 &overflow);
6327 if (overflow)
6328 return false;
6329
6330 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6331 TYPE_SIGN (lhs_type), &overflow);
6332 if (overflow)
6333 return false;
6334
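  /* The induction cannot wrap if its final value BASE + STEP * NI is
     still representable in LHS_TYPE.  */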
6335 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6336 <= TYPE_PRECISION (lhs_type));
6337 }
6338
6339 /* Check if masking can be supported by inserting a conditional expression.
6340 CODE is the code for the operation. COND_FN is the conditional internal
6341 function, if it exists. VECTYPE_IN is the type of the vector input. */
6342 static bool
6343 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6344 tree vectype_in)
6345 {
6346 if (cond_fn != IFN_LAST
6347 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6348 OPTIMIZE_FOR_SPEED))
6349 return false;
6350
6351 switch (code)
6352 {
6353 case DOT_PROD_EXPR:
6354 case SAD_EXPR:
6355 return true;
6356
6357 default:
6358 return false;
6359 }
6360 }
6361
6362 /* Insert a conditional expression to enable masked vectorization. CODE is the
6363 code for the operation. VOP is the array of operands. MASK is the loop
6364 mask. GSI is a statement iterator used to place the new conditional
6365 expression. */
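/* For DOT_PROD_EXPR the masked-off lanes of operand 1 are replaced by
   zero, so their products add nothing to the accumulator; for SAD_EXPR
   they are replaced by operand 0, making the absolute difference zero.  */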
6366 static void
6367 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6368 gimple_stmt_iterator *gsi)
6369 {
6370 switch (code)
6371 {
6372 case DOT_PROD_EXPR:
6373 {
6374 tree vectype = TREE_TYPE (vop[1]);
6375 tree zero = build_zero_cst (vectype);
6376 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6377 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6378 mask, vop[1], zero);
6379 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6380 vop[1] = masked_op1;
6381 break;
6382 }
6383
6384 case SAD_EXPR:
6385 {
6386 tree vectype = TREE_TYPE (vop[1]);
6387 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6388 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6389 mask, vop[1], vop[0]);
6390 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6391 vop[1] = masked_op1;
6392 break;
6393 }
6394
6395 default:
6396 gcc_unreachable ();
6397 }
6398 }
6399
6400 /* Function vectorizable_reduction.
6401
6402 Check if STMT_INFO performs a reduction operation that can be vectorized.
6403 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6404 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6405 Return true if STMT_INFO is vectorizable in this way.
6406
6407 This function also handles reduction idioms (patterns) that have been
6408 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6409 may be of this form:
6410 X = pattern_expr (arg0, arg1, ..., X)
6411 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6412 sequence that had been detected and replaced by the pattern-stmt
6413 (STMT_INFO).
6414
6415 This function also handles reduction of condition expressions, for example:
6416 for (int i = 0; i < N; i++)
6417 if (a[i] < value)
6418 last = a[i];
6419 This is handled by vectorising the loop and creating an additional vector
6420 containing the loop indexes for which "a[i] < value" was true. In the
6421 function epilogue this is reduced to a single max value and then used to
6422 index into the vector of results.
6423
6424 In some cases of reduction patterns, the type of the reduction variable X is
6425 different than the type of the other arguments of STMT_INFO.
6426 In such cases, the vectype that is used when transforming STMT_INFO into
6427 a vector stmt is different than the vectype that is used to determine the
6428 vectorization factor, because it consists of a different number of elements
6429 than the actual number of elements that are being operated upon in parallel.
6430
6431 For example, consider an accumulation of shorts into an int accumulator.
6432 On some targets it's possible to vectorize this pattern operating on 8
6433 shorts at a time (hence, the vectype for purposes of determining the
6434 vectorization factor should be V8HI); on the other hand, the vectype that
6435 is used to create the vector form is actually V4SI (the type of the result).
6436
6437 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6438 indicates what is the actual level of parallelism (V8HI in the example), so
6439 that the right vectorization factor would be derived. This vectype
6440 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6441 be used to create the vectorized stmt. The right vectype for the vectorized
6442 stmt is obtained from the type of the result X:
6443 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6444
6445 This means that, contrary to "regular" reductions (or "regular" stmts in
6446 general), the following equation:
6447 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6448 does *NOT* necessarily hold for reduction patterns. */
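/* As a concrete example (a sketch of the case described above), the
   short-into-int accumulation corresponds to source code like
     short a[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];
   which the pattern recognizer may turn into a WIDEN_SUM_EXPR.  The
   vectorization factor is then derived from V8HI, while the vectorized
   statement itself produces a V4SI result.  */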
6449
6450 bool
6451 vectorizable_reduction (loop_vec_info loop_vinfo,
6452 stmt_vec_info stmt_info, slp_tree slp_node,
6453 slp_instance slp_node_instance,
6454 stmt_vector_for_cost *cost_vec)
6455 {
6456 tree scalar_dest;
6457 tree vectype_in = NULL_TREE;
6458 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6459 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6460 stmt_vec_info cond_stmt_vinfo = NULL;
6461 tree scalar_type;
6462 int i;
6463 int ncopies;
6464 bool single_defuse_cycle = false;
6465 bool nested_cycle = false;
6466 bool double_reduc = false;
6467 int vec_num;
6468 tree tem;
6469 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6470 tree cond_reduc_val = NULL_TREE;
6471
6472 /* Make sure it was already recognized as a reduction computation. */
6473 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6474 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6475 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6476 return false;
6477
6478 /* The stmt we store reduction analysis meta on. */
6479 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6480 reduc_info->is_reduc_info = true;
6481
6482 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6483 {
6484 if (is_a <gphi *> (stmt_info->stmt))
6485 {
6486 if (slp_node)
6487 {
6488 /* We eventually need to set a vector type on invariant
6489 arguments. */
6490 unsigned j;
6491 slp_tree child;
6492 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6493 if (!vect_maybe_update_slp_op_vectype
6494 (child, SLP_TREE_VECTYPE (slp_node)))
6495 {
6496 if (dump_enabled_p ())
6497 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6498 "incompatible vector types for "
6499 "invariants\n");
6500 return false;
6501 }
6502 }
6503 /* Analysis for double-reduction is done on the outer
6504 loop PHI; nested cycles have no further restrictions. */
6505 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6506 }
6507 else
6508 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6509 return true;
6510 }
6511
6512 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6513 stmt_vec_info phi_info = stmt_info;
6514 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6515 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6516 {
6517 if (!is_a <gphi *> (stmt_info->stmt))
6518 {
6519 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6520 return true;
6521 }
6522 if (slp_node)
6523 {
6524 slp_node_instance->reduc_phis = slp_node;
6525 /* ??? We're leaving slp_node to point to the PHIs; we only
6526 need it to get at the number of vector stmts which wasn't
6527 yet initialized for the instance root. */
6528 }
6529 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6530 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6531 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6532 {
6533 use_operand_p use_p;
6534 gimple *use_stmt;
6535 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6536 &use_p, &use_stmt);
6537 gcc_assert (res);
6538 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6539 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6540 }
6541 }
6542
6543 /* PHIs should not participate in patterns. */
6544 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6545 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6546
6547 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6548 and compute the reduction chain length. Discover the real
6549 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
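/* For instance, for a chain like (sketch)
     sum_0 = PHI <init_5 (preheader), sum_3 (latch)>
     t_2 = ...;
     sum_3 = sum_0 + t_2;
   the walk starts at the latch definition sum_3 and follows the operand
   selected by STMT_VINFO_REDUC_IDX until it reaches the PHI result sum_0,
   counting the statements visited on the way.  */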
6550 tree reduc_def
6551 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6552 loop_latch_edge
6553 (gimple_bb (reduc_def_phi)->loop_father));
6554 unsigned reduc_chain_length = 0;
6555 bool only_slp_reduc_chain = true;
6556 stmt_info = NULL;
6557 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6558 while (reduc_def != PHI_RESULT (reduc_def_phi))
6559 {
6560 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6561 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6562 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6563 {
6564 if (dump_enabled_p ())
6565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6566 "reduction chain broken by patterns.\n");
6567 return false;
6568 }
6569 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6570 only_slp_reduc_chain = false;
6571 /* ??? For epilogue generation live members of the chain need
6572 to point back to the PHI via their original stmt for
6573 info_for_reduction to work. */
6574 if (STMT_VINFO_LIVE_P (vdef))
6575 STMT_VINFO_REDUC_DEF (def) = phi_info;
6576 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6577 if (!assign)
6578 {
6579 if (dump_enabled_p ())
6580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6581 "reduction chain includes calls.\n");
6582 return false;
6583 }
6584 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6585 {
6586 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6587 TREE_TYPE (gimple_assign_rhs1 (assign))))
6588 {
6589 if (dump_enabled_p ())
6590 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6591 "conversion in the reduction chain.\n");
6592 return false;
6593 }
6594 }
6595 else if (!stmt_info)
6596 /* First non-conversion stmt. */
6597 stmt_info = vdef;
6598 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6599 reduc_chain_length++;
6600 if (!stmt_info && slp_node)
6601 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6602 }
6603 /* PHIs should not participate in patterns. */
6604 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6605
6606 if (nested_in_vect_loop_p (loop, stmt_info))
6607 {
6608 loop = loop->inner;
6609 nested_cycle = true;
6610 }
6611
6612 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6613 element. */
6614 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6615 {
6616 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6617 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6618 }
6619 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6620 gcc_assert (slp_node
6621 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6622
6623 /* 1. Is vectorizable reduction? */
6624 /* Not supportable if the reduction variable is used in the loop, unless
6625 it's a reduction chain. */
6626 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6627 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6628 return false;
6629
6630 /* Reductions that are not used even in an enclosing outer-loop
6631 are expected to be "live" (used out of the loop). */
6632 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6633 && !STMT_VINFO_LIVE_P (stmt_info))
6634 return false;
6635
6636 /* 2. Has this been recognized as a reduction pattern?
6637
6638 Check if STMT represents a pattern that has been recognized
6639 in earlier analysis stages. For stmts that represent a pattern,
6640 the STMT_VINFO_RELATED_STMT field records the last stmt in
6641 the original sequence that constitutes the pattern. */
6642
6643 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6644 if (orig_stmt_info)
6645 {
6646 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6647 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6648 }
6649
6650 /* 3. Check the operands of the operation. The first operands are defined
6651 inside the loop body. The last operand is the reduction variable,
6652 which is defined by the loop-header-phi. */
6653
6654 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6655 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6656 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6657 enum tree_code code = gimple_assign_rhs_code (stmt);
6658 bool lane_reduc_code_p
6659 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6660 int op_type = TREE_CODE_LENGTH (code);
6661
6662 scalar_dest = gimple_assign_lhs (stmt);
6663 scalar_type = TREE_TYPE (scalar_dest);
6664 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6665 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6666 return false;
6667
6668 /* Do not try to vectorize bit-precision reductions. */
6669 if (!type_has_mode_precision_p (scalar_type))
6670 return false;
6671
6672 /* For lane-reducing ops we're reducing the number of reduction PHIs
6673 which means the only use of the reduction PHI may be in the lane-reducing operation. */
6674 if (lane_reduc_code_p
6675 && reduc_chain_length != 1
6676 && !only_slp_reduc_chain)
6677 {
6678 if (dump_enabled_p ())
6679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6680 "lane-reducing reduction with extra stmts.\n");
6681 return false;
6682 }
6683
6684 /* All uses but the last are expected to be defined in the loop.
6685 The last use is the reduction variable. In case of nested cycle this
6686 assumption is not true: we use reduc_index to record the index of the
6687 reduction variable. */
6688 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6689 /* We need to skip an extra operand for COND_EXPRs with embedded
6690 comparison. */
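/* E.g. for x_1 = a_2 < b_3 ? c_4 : x_0 the embedded comparison occupies
   extra operand slots, so the value operands c_4 and x_0 are reached one
   position later than for a plain ternary operation (a sketch of the
   indexing only).  */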
6691 unsigned opno_adjust = 0;
6692 if (code == COND_EXPR
6693 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6694 opno_adjust = 1;
6695 for (i = 0; i < op_type; i++)
6696 {
6697 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6698 if (i == 0 && code == COND_EXPR)
6699 continue;
6700
6701 stmt_vec_info def_stmt_info;
6702 enum vect_def_type dt;
6703 tree op;
6704 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6705 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6706 &def_stmt_info))
6707 {
6708 if (dump_enabled_p ())
6709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6710 "use not simple.\n");
6711 return false;
6712 }
6713 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6714 continue;
6715
6716 /* There should be only one cycle def in the stmt, the one
6717 leading to reduc_def. */
6718 if (VECTORIZABLE_CYCLE_DEF (dt))
6719 return false;
6720
6721 /* To properly compute ncopies we are interested in the widest
6722 non-reduction input type in case we're looking at a widening
6723 accumulation that we later handle in vect_transform_reduction. */
6724 if (lane_reduc_code_p
6725 && tem
6726 && (!vectype_in
6727 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6728 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6729 vectype_in = tem;
6730
6731 if (code == COND_EXPR)
6732 {
6733 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6734 if (dt == vect_constant_def)
6735 {
6736 cond_reduc_dt = dt;
6737 cond_reduc_val = op;
6738 }
6739 if (dt == vect_induction_def
6740 && def_stmt_info
6741 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6742 {
6743 cond_reduc_dt = dt;
6744 cond_stmt_vinfo = def_stmt_info;
6745 }
6746 }
6747 }
6748 if (!vectype_in)
6749 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6750 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6751
6752 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6753 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6754 /* If we have a condition reduction, see if we can simplify it further. */
6755 if (v_reduc_type == COND_REDUCTION)
6756 {
6757 if (slp_node)
6758 return false;
6759
6760 /* When the condition uses the reduction value in the condition, fail. */
6761 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6762 {
6763 if (dump_enabled_p ())
6764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6765 "condition depends on previous iteration\n");
6766 return false;
6767 }
6768
6769 if (reduc_chain_length == 1
6770 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6771 vectype_in, OPTIMIZE_FOR_SPEED))
6772 {
6773 if (dump_enabled_p ())
6774 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6775 "optimizing condition reduction with"
6776 " FOLD_EXTRACT_LAST.\n");
6777 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6778 }
6779 else if (cond_reduc_dt == vect_induction_def)
6780 {
6781 tree base
6782 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6783 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6784
6785 gcc_assert (TREE_CODE (base) == INTEGER_CST
6786 && TREE_CODE (step) == INTEGER_CST);
6787 cond_reduc_val = NULL_TREE;
6788 enum tree_code cond_reduc_op_code = ERROR_MARK;
6789 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6790 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6791 ;
6792 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6793 above base; punt if base is the minimum value of the type for
6794 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6795 else if (tree_int_cst_sgn (step) == -1)
6796 {
6797 cond_reduc_op_code = MIN_EXPR;
6798 if (tree_int_cst_sgn (base) == -1)
6799 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6800 else if (tree_int_cst_lt (base,
6801 TYPE_MAX_VALUE (TREE_TYPE (base))))
6802 cond_reduc_val
6803 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6804 }
6805 else
6806 {
6807 cond_reduc_op_code = MAX_EXPR;
6808 if (tree_int_cst_sgn (base) == 1)
6809 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6810 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6811 base))
6812 cond_reduc_val
6813 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6814 }
6815 if (cond_reduc_val)
6816 {
6817 if (dump_enabled_p ())
6818 dump_printf_loc (MSG_NOTE, vect_location,
6819 "condition expression based on "
6820 "integer induction.\n");
6821 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6822 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6823 = cond_reduc_val;
6824 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6825 }
6826 }
6827 else if (cond_reduc_dt == vect_constant_def)
6828 {
6829 enum vect_def_type cond_initial_dt;
6830 tree cond_initial_val
6831 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6832
6833 gcc_assert (cond_reduc_val != NULL_TREE);
6834 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6835 if (cond_initial_dt == vect_constant_def
6836 && types_compatible_p (TREE_TYPE (cond_initial_val),
6837 TREE_TYPE (cond_reduc_val)))
6838 {
6839 tree e = fold_binary (LE_EXPR, boolean_type_node,
6840 cond_initial_val, cond_reduc_val);
6841 if (e && (integer_onep (e) || integer_zerop (e)))
6842 {
6843 if (dump_enabled_p ())
6844 dump_printf_loc (MSG_NOTE, vect_location,
6845 "condition expression based on "
6846 "compile time constant.\n");
6847 /* Record reduction code at analysis stage. */
6848 STMT_VINFO_REDUC_CODE (reduc_info)
6849 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6850 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6851 }
6852 }
6853 }
6854 }
6855
6856 if (STMT_VINFO_LIVE_P (phi_info))
6857 return false;
6858
6859 if (slp_node)
6860 ncopies = 1;
6861 else
6862 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6863
6864 gcc_assert (ncopies >= 1);
6865
6866 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6867
6868 if (nested_cycle)
6869 {
6870 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6871 == vect_double_reduction_def);
6872 double_reduc = true;
6873 }
6874
6875 /* 4.2. Check support for the epilog operation.
6876
6877 If STMT represents a reduction pattern, then the type of the
6878 reduction variable may be different than the type of the rest
6879 of the arguments. For example, consider the case of accumulation
6880 of shorts into an int accumulator; The original code:
6881 S1: int_a = (int) short_a;
6882 orig_stmt-> S2: int_acc = plus <int_a, int_acc>;
6883
6884 was replaced with:
6885 STMT: int_acc = widen_sum <short_a, int_acc>
6886
6887 This means that:
6888 1. The tree-code that is used to create the vector operation in the
6889 epilog code (that reduces the partial results) is not the
6890 tree-code of STMT, but is rather the tree-code of the original
6891 stmt from the pattern that STMT is replacing. I.e, in the example
6892 above we want to use 'widen_sum' in the loop, but 'plus' in the
6893 epilog.
6894 2. The type (mode) we use to check available target support
6895 for the vector operation to be created in the *epilog*, is
6896 determined by the type of the reduction variable (in the example
6897 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6898 However the type (mode) we use to check available target support
6899 for the vector operation to be created *inside the loop*, is
6900 determined by the type of the other arguments to STMT (in the
6901 example we'd check this: optab_handler (widen_sum_optab,
6902 vect_short_mode)).
6903
6904 This is contrary to "regular" reductions, in which the types of all
6905 the arguments are the same as the type of the reduction variable.
6906 For "regular" reductions we can therefore use the same vector type
6907 (and also the same tree-code) when generating the epilog code and
6908 when generating the code inside the loop. */
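/* Concretely (a sketch), for the widen_sum example the loop body uses
     vec_acc_1 = WIDEN_SUM_EXPR <vec_short_a, vec_acc_0>;
   on a V8HI input and a V4SI accumulator, while the epilogue reduces the
   V4SI partial sums with PLUS_EXPR, e.g. via a REDUC_PLUS-style
   reduction, to obtain the final scalar result.  */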
6909
6910 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6911 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6912
6913 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6914 if (reduction_type == TREE_CODE_REDUCTION)
6915 {
6916 /* Check whether it's ok to change the order of the computation.
6917 Generally, when vectorizing a reduction we change the order of the
6918 computation. This may change the behavior of the program in some
6919 cases, so we need to check that this is ok. One exception is when
6920 vectorizing an outer-loop: the inner-loop is executed sequentially,
6921 and therefore vectorizing reductions in the inner-loop during
6922 outer-loop vectorization is safe. Likewise when we are vectorizing
6923 a series of reductions using SLP and the VF is one, the reductions
6924 are performed in scalar order. */
6925 if (slp_node
6926 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6927 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
6928 ;
6929 else if (needs_fold_left_reduction_p (scalar_type, orig_code))
6930 {
6931 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6932 is not directly used in stmt. */
6933 if (!only_slp_reduc_chain
6934 && reduc_chain_length != 1)
6935 {
6936 if (dump_enabled_p ())
6937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6938 "in-order reduction chain without SLP.\n");
6939 return false;
6940 }
6941 STMT_VINFO_REDUC_TYPE (reduc_info)
6942 = reduction_type = FOLD_LEFT_REDUCTION;
6943 }
6944 else if (!commutative_tree_code (orig_code)
6945 || !associative_tree_code (orig_code))
6946 {
6947 if (dump_enabled_p ())
6948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6949 "reduction: not commutative/associative");
6950 return false;
6951 }
6952 }
6953
6954 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6955 && ncopies > 1)
6956 {
6957 if (dump_enabled_p ())
6958 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6959 "multiple types in double reduction or condition "
6960 "reduction or fold-left reduction.\n");
6961 return false;
6962 }
6963
6964 internal_fn reduc_fn = IFN_LAST;
6965 if (reduction_type == TREE_CODE_REDUCTION
6966 || reduction_type == FOLD_LEFT_REDUCTION
6967 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6968 || reduction_type == CONST_COND_REDUCTION)
6969 {
6970 if (reduction_type == FOLD_LEFT_REDUCTION
6971 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6972 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6973 {
6974 if (reduc_fn != IFN_LAST
6975 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6976 OPTIMIZE_FOR_SPEED))
6977 {
6978 if (dump_enabled_p ())
6979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6980 "reduc op not supported by target.\n");
6981
6982 reduc_fn = IFN_LAST;
6983 }
6984 }
6985 else
6986 {
6987 if (!nested_cycle || double_reduc)
6988 {
6989 if (dump_enabled_p ())
6990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6991 "no reduc code for scalar code.\n");
6992
6993 return false;
6994 }
6995 }
6996 }
6997 else if (reduction_type == COND_REDUCTION)
6998 {
6999 int scalar_precision
7000 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7001 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7002 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7003 vectype_out);
7004
7005 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7006 OPTIMIZE_FOR_SPEED))
7007 reduc_fn = IFN_REDUC_MAX;
7008 }
7009 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7010
7011 if (reduction_type != EXTRACT_LAST_REDUCTION
7012 && (!nested_cycle || double_reduc)
7013 && reduc_fn == IFN_LAST
7014 && !nunits_out.is_constant ())
7015 {
7016 if (dump_enabled_p ())
7017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7018 "missing target support for reduction on"
7019 " variable-length vectors.\n");
7020 return false;
7021 }
7022
7023 /* For SLP reductions, see if there is a neutral value we can use. */
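/* E.g. 0 for PLUS_EXPR, 1 for MULT_EXPR, all-ones for BIT_AND_EXPR
   (a sketch; the exact choice is made by neutral_op_for_slp_reduction
   below).  */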
7024 tree neutral_op = NULL_TREE;
7025 if (slp_node)
7026 neutral_op = neutral_op_for_slp_reduction
7027 (slp_node_instance->reduc_phis, vectype_out, orig_code,
7028 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7029
7030 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7031 {
7032 /* We can't support in-order reductions of code such as this:
7033
7034 for (int i = 0; i < n1; ++i)
7035 for (int j = 0; j < n2; ++j)
7036 l += a[j];
7037
7038 since GCC effectively transforms the loop when vectorizing:
7039
7040 for (int i = 0; i < n1 / VF; ++i)
7041 for (int j = 0; j < n2; ++j)
7042 for (int k = 0; k < VF; ++k)
7043 l += a[j];
7044
7045 which is a reassociation of the original operation. */
7046 if (dump_enabled_p ())
7047 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7048 "in-order double reduction not supported.\n");
7049
7050 return false;
7051 }
7052
7053 if (reduction_type == FOLD_LEFT_REDUCTION
7054 && slp_node
7055 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7056 {
7057 /* We cannot use in-order reductions in this case because there is
7058 an implicit reassociation of the operations involved. */
7059 if (dump_enabled_p ())
7060 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7061 "in-order unchained SLP reductions not supported.\n");
7062 return false;
7063 }
7064
7065 /* For double reductions, and for SLP reductions with a neutral value,
7066 we construct a variable-length initial vector by loading a vector
7067 full of the neutral value and then shift-and-inserting the start
7068 values into the low-numbered elements. */
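/* E.g. for a summation with start value s and neutral value 0 the
   initial vector contains s in a single low-numbered lane and 0 in all
   other lanes (a sketch of what IFN_VEC_SHL_INSERT produces).  */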
7069 if ((double_reduc || neutral_op)
7070 && !nunits_out.is_constant ()
7071 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7072 vectype_out, OPTIMIZE_FOR_SPEED))
7073 {
7074 if (dump_enabled_p ())
7075 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7076 "reduction on variable-length vectors requires"
7077 " target support for a vector-shift-and-insert"
7078 " operation.\n");
7079 return false;
7080 }
7081
7082 /* Check extra constraints for variable-length unchained SLP reductions. */
7083 if (STMT_SLP_TYPE (stmt_info)
7084 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7085 && !nunits_out.is_constant ())
7086 {
7087 /* We checked above that we could build the initial vector when
7088 there's a neutral element value. Check here for the case in
7089 which each SLP statement has its own initial value and in which
7090 that value needs to be repeated for every instance of the
7091 statement within the initial vector. */
7092 unsigned int group_size = SLP_TREE_LANES (slp_node);
7093 if (!neutral_op
7094 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7095 TREE_TYPE (vectype_out)))
7096 {
7097 if (dump_enabled_p ())
7098 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7099 "unsupported form of SLP reduction for"
7100 " variable-length vectors: cannot build"
7101 " initial vector.\n");
7102 return false;
7103 }
7104 /* The epilogue code relies on the number of elements being a multiple
7105 of the group size. The duplicate-and-interleave approach to setting
7106 up the initial vector does too. */
7107 if (!multiple_p (nunits_out, group_size))
7108 {
7109 if (dump_enabled_p ())
7110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7111 "unsupported form of SLP reduction for"
7112 " variable-length vectors: the vector size"
7113 " is not a multiple of the number of results.\n");
7114 return false;
7115 }
7116 }
7117
7118 if (reduction_type == COND_REDUCTION)
7119 {
7120 widest_int ni;
7121
7122 if (! max_loop_iterations (loop, &ni))
7123 {
7124 if (dump_enabled_p ())
7125 dump_printf_loc (MSG_NOTE, vect_location,
7126 "loop count not known, cannot create cond "
7127 "reduction.\n");
7128 return false;
7129 }
7130 /* Convert backedges to iterations. */
7131 ni += 1;
7132
7133 /* The additional index will be the same type as the condition. Check
7134 that the loop can fit into this less one (because we'll use up the
7135 zero slot for when there are no matches). */
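/* E.g. with a 16-bit scalar type the index type is 16-bit unsigned, so
   the loop must run fewer than 65535 iterations for all indexes plus the
   reserved zero slot to fit (sketch).  */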
7136 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7137 if (wi::geu_p (ni, wi::to_widest (max_index)))
7138 {
7139 if (dump_enabled_p ())
7140 dump_printf_loc (MSG_NOTE, vect_location,
7141 "loop size is greater than data size.\n");
7142 return false;
7143 }
7144 }
7145
7146 /* In case the vectorization factor (VF) is bigger than the number
7147 of elements that we can fit in a vectype (nunits), we have to generate
7148 more than one vector stmt, i.e. we need to "unroll" the
7149 vector stmt by a factor VF/nunits. For more details see documentation
7150 in vectorizable_operation. */
7151
7152 /* If the reduction is used in an outer loop we need to generate
7153 VF intermediate results, like so (e.g. for ncopies=2):
7154 r0 = phi (init, r0)
7155 r1 = phi (init, r1)
7156 r0 = x0 + r0;
7157 r1 = x1 + r1;
7158 (i.e. we generate VF results in 2 registers).
7159 In this case we have a separate def-use cycle for each copy, and therefore
7160 for each copy we get the vector def for the reduction variable from the
7161 respective phi node created for this copy.
7162
7163 Otherwise (the reduction is unused in the loop nest), we can combine
7164 together intermediate results, like so (e.g. for ncopies=2):
7165 r = phi (init, r)
7166 r = x0 + r;
7167 r = x1 + r;
7168 (i.e. we generate VF/2 results in a single register).
7169 In this case for each copy we get the vector def for the reduction variable
7170 from the vectorized reduction operation generated in the previous iteration.
7171
7172 This only works when we see both the reduction PHI and its only consumer
7173 in vectorizable_reduction and there are no intermediate stmts
7174 participating. */
7175 if (ncopies > 1
7176 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7177 && reduc_chain_length == 1)
7178 single_defuse_cycle = true;
7179
7180 if (single_defuse_cycle || lane_reduc_code_p)
7181 {
7182 gcc_assert (code != COND_EXPR);
7183
7184 /* 4. Supportable by target? */
7185 bool ok = true;
7186
7187 /* 4.1. check support for the operation in the loop */
7188 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
7189 if (!optab)
7190 {
7191 if (dump_enabled_p ())
7192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7193 "no optab.\n");
7194 ok = false;
7195 }
7196
7197 machine_mode vec_mode = TYPE_MODE (vectype_in);
7198 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7199 {
7200 if (dump_enabled_p ())
7201 dump_printf (MSG_NOTE, "op not supported by target.\n");
7202 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7203 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
7204 ok = false;
7205 else
7206 if (dump_enabled_p ())
7207 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7208 }
7209
7210 /* Worthwhile without SIMD support? */
7211 if (ok
7212 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
7213 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
7214 {
7215 if (dump_enabled_p ())
7216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7217 "not worthwhile without SIMD support.\n");
7218 ok = false;
7219 }
7220
7221 /* lane-reducing operations have to go through vect_transform_reduction.
7222 For the other cases try without the single cycle optimization. */
7223 if (!ok)
7224 {
7225 if (lane_reduc_code_p)
7226 return false;
7227 else
7228 single_defuse_cycle = false;
7229 }
7230 }
7231 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7232
7233 /* If the reduction stmt is one of the patterns that have lane
7234 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
7235 if ((ncopies > 1 && ! single_defuse_cycle)
7236 && lane_reduc_code_p)
7237 {
7238 if (dump_enabled_p ())
7239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7240 "multi def-use cycle not possible for lane-reducing "
7241 "reduction operation\n");
7242 return false;
7243 }
7244
7245 if (slp_node
7246 && !(!single_defuse_cycle
7247 && code != DOT_PROD_EXPR
7248 && code != WIDEN_SUM_EXPR
7249 && code != SAD_EXPR
7250 && reduction_type != FOLD_LEFT_REDUCTION))
7251 for (i = 0; i < op_type; i++)
7252 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7253 {
7254 if (dump_enabled_p ())
7255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7256 "incompatible vector types for invariants\n");
7257 return false;
7258 }
7259
7260 if (slp_node)
7261 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7262 else
7263 vec_num = 1;
7264
7265 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7266 reduction_type, ncopies, cost_vec);
7267 /* Cost the reduction op inside the loop if transformed via
7268 vect_transform_reduction. Otherwise this is costed by the
7269 separate vectorizable_* routines. */
7270 if (single_defuse_cycle
7271 || code == DOT_PROD_EXPR
7272 || code == WIDEN_SUM_EXPR
7273 || code == SAD_EXPR)
7274 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7275
7276 if (dump_enabled_p ()
7277 && reduction_type == FOLD_LEFT_REDUCTION)
7278 dump_printf_loc (MSG_NOTE, vect_location,
7279 "using an in-order (fold-left) reduction.\n");
7280 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7281 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7282 reductions go through their own vectorizable_* routines. */
7283 if (!single_defuse_cycle
7284 && code != DOT_PROD_EXPR
7285 && code != WIDEN_SUM_EXPR
7286 && code != SAD_EXPR
7287 && reduction_type != FOLD_LEFT_REDUCTION)
7288 {
7289 stmt_vec_info tem
7290 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7291 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7292 {
7293 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7294 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7295 }
7296 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7297 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7298 }
7299 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7300 {
7301 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7302 internal_fn cond_fn = get_conditional_internal_fn (code);
7303
7304 if (reduction_type != FOLD_LEFT_REDUCTION
7305 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7306 && (cond_fn == IFN_LAST
7307 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7308 OPTIMIZE_FOR_SPEED)))
7309 {
7310 if (dump_enabled_p ())
7311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7312 "can't operate on partial vectors because"
7313 " no conditional operation is available.\n");
7314 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7315 }
7316 else if (reduction_type == FOLD_LEFT_REDUCTION
7317 && reduc_fn == IFN_LAST
7318 && !expand_vec_cond_expr_p (vectype_in,
7319 truth_type_for (vectype_in),
7320 SSA_NAME))
7321 {
7322 if (dump_enabled_p ())
7323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7324 "can't operate on partial vectors because"
7325 " no conditional operation is available.\n");
7326 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7327 }
7328 else
7329 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7330 vectype_in, NULL);
7331 }
7332 return true;
7333 }
7334
7335 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7336 value. */
7337
7338 bool
7339 vect_transform_reduction (loop_vec_info loop_vinfo,
7340 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7341 gimple **vec_stmt, slp_tree slp_node)
7342 {
7343 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7344 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7345 int i;
7346 int ncopies;
7347 int vec_num;
7348
7349 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7350 gcc_assert (reduc_info->is_reduc_info);
7351
7352 if (nested_in_vect_loop_p (loop, stmt_info))
7353 {
7354 loop = loop->inner;
7355 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7356 }
7357
7358 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7359 enum tree_code code = gimple_assign_rhs_code (stmt);
7360 int op_type = TREE_CODE_LENGTH (code);
7361
7362 /* Flatten RHS. */
7363 tree ops[3];
7364 switch (get_gimple_rhs_class (code))
7365 {
7366 case GIMPLE_TERNARY_RHS:
7367 ops[2] = gimple_assign_rhs3 (stmt);
7368 /* Fall thru. */
7369 case GIMPLE_BINARY_RHS:
7370 ops[0] = gimple_assign_rhs1 (stmt);
7371 ops[1] = gimple_assign_rhs2 (stmt);
7372 break;
7373 default:
7374 gcc_unreachable ();
7375 }
7376
7377 /* All uses but the last are expected to be defined in the loop.
7378 The last use is the reduction variable. In case of nested cycle this
7379 assumption is not true: we use reduc_index to record the index of the
7380 reduction variable. */
7381 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7382 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7383 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7384 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7385
7386 if (slp_node)
7387 {
7388 ncopies = 1;
7389 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7390 }
7391 else
7392 {
7393 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7394 vec_num = 1;
7395 }
7396
7397 internal_fn cond_fn = get_conditional_internal_fn (code);
7398 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7399 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7400
7401 /* Transform. */
7402 tree new_temp = NULL_TREE;
7403 auto_vec<tree> vec_oprnds0;
7404 auto_vec<tree> vec_oprnds1;
7405 auto_vec<tree> vec_oprnds2;
7406 tree def0;
7407
7408 if (dump_enabled_p ())
7409 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7410
7411 /* FORNOW: Multiple types are not supported for condition. */
7412 if (code == COND_EXPR)
7413 gcc_assert (ncopies == 1);
7414
7415 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7416
7417 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7418 if (reduction_type == FOLD_LEFT_REDUCTION)
7419 {
7420 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7421 return vectorize_fold_left_reduction
7422 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7423 reduc_fn, ops, vectype_in, reduc_index, masks);
7424 }
7425
7426 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7427 gcc_assert (single_defuse_cycle
7428 || code == DOT_PROD_EXPR
7429 || code == WIDEN_SUM_EXPR
7430 || code == SAD_EXPR);
7431
7432 /* Create the destination vector */
7433 tree scalar_dest = gimple_assign_lhs (stmt);
7434 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7435
7436 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7437 single_defuse_cycle && reduc_index == 0
7438 ? NULL_TREE : ops[0], &vec_oprnds0,
7439 single_defuse_cycle && reduc_index == 1
7440 ? NULL_TREE : ops[1], &vec_oprnds1,
7441 op_type == ternary_op
7442 && !(single_defuse_cycle && reduc_index == 2)
7443 ? ops[2] : NULL_TREE, &vec_oprnds2);
7444 if (single_defuse_cycle)
7445 {
7446 gcc_assert (!slp_node);
7447 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7448 ops[reduc_index],
7449 reduc_index == 0 ? &vec_oprnds0
7450 : (reduc_index == 1 ? &vec_oprnds1
7451 : &vec_oprnds2));
7452 }
7453
7454 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7455 {
7456 gimple *new_stmt;
7457 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7458 if (masked_loop_p && !mask_by_cond_expr)
7459 {
7460 /* Make sure that the reduction accumulator is vop[0]. */
7461 if (reduc_index == 1)
7462 {
7463 gcc_assert (commutative_tree_code (code));
7464 std::swap (vop[0], vop[1]);
7465 }
7466 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7467 vectype_in, i);
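/* Inactive lanes pass the accumulator through unchanged, e.g. (sketch)
   acc_1 = .COND_ADD (loop_mask, acc_0, x_1, acc_0);  */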
7468 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7469 vop[0], vop[1], vop[0]);
7470 new_temp = make_ssa_name (vec_dest, call);
7471 gimple_call_set_lhs (call, new_temp);
7472 gimple_call_set_nothrow (call, true);
7473 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7474 new_stmt = call;
7475 }
7476 else
7477 {
7478 if (op_type == ternary_op)
7479 vop[2] = vec_oprnds2[i];
7480
7481 if (masked_loop_p && mask_by_cond_expr)
7482 {
7483 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7484 vectype_in, i);
7485 build_vect_cond_expr (code, vop, mask, gsi);
7486 }
7487
7488 new_stmt = gimple_build_assign (vec_dest, code,
7489 vop[0], vop[1], vop[2]);
7490 new_temp = make_ssa_name (vec_dest, new_stmt);
7491 gimple_assign_set_lhs (new_stmt, new_temp);
7492 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7493 }
7494
7495 if (slp_node)
7496 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7497 else if (single_defuse_cycle
7498 && i < ncopies - 1)
7499 {
7500 if (reduc_index == 0)
7501 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7502 else if (reduc_index == 1)
7503 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7504 else if (reduc_index == 2)
7505 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7506 }
7507 else
7508 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7509 }
7510
7511 if (!slp_node)
7512 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7513
7514 return true;
7515 }
7516
7517 /* Transform phase of a cycle PHI. */
7518
7519 bool
7520 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7521 stmt_vec_info stmt_info, gimple **vec_stmt,
7522 slp_tree slp_node, slp_instance slp_node_instance)
7523 {
7524 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7525 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7526 int i;
7527 int ncopies;
7528 int j;
7529 bool nested_cycle = false;
7530 int vec_num;
7531
7532 if (nested_in_vect_loop_p (loop, stmt_info))
7533 {
7534 loop = loop->inner;
7535 nested_cycle = true;
7536 }
7537
7538 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7539 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7540 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7541 gcc_assert (reduc_info->is_reduc_info);
7542
7543 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7544 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7545 /* Leave the scalar phi in place. */
7546 return true;
7547
7548 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7549 /* For a nested cycle we do not fill the above. */
7550 if (!vectype_in)
7551 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7552 gcc_assert (vectype_in);
7553
7554 if (slp_node)
7555 {
7556 /* The size vect_schedule_slp_instance computes is off for us. */
7557 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7558 * SLP_TREE_LANES (slp_node), vectype_in);
7559 ncopies = 1;
7560 }
7561 else
7562 {
7563 vec_num = 1;
7564 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7565 }
7566
7567 /* Check whether we should use a single PHI node and accumulate
7568 vectors to one before the backedge. */
7569 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7570 ncopies = 1;
7571
7572 /* Create the destination vector */
7573 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7574 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7575 vectype_out);
7576
7577 /* Get the loop-entry arguments. */
7578 tree vec_initial_def;
7579 auto_vec<tree> vec_initial_defs;
7580 if (slp_node)
7581 {
7582 vec_initial_defs.reserve (vec_num);
7583 if (nested_cycle)
7584 {
7585 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7586 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7587 &vec_initial_defs);
7588 }
7589 else
7590 {
7591 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7592 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7593 tree neutral_op
7594 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7595 STMT_VINFO_REDUC_CODE (reduc_info),
7596 first != NULL);
7597 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7598 &vec_initial_defs, vec_num,
7599 first != NULL, neutral_op);
7600 }
7601 }
7602 else
7603 {
7604 /* Get at the scalar def before the loop, that defines the initial
7605 value of the reduction variable. */
7606 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7607 loop_preheader_edge (loop));
7608 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7609 and we can't use zero for induc_val, use initial_def. Similarly
7610 for REDUC_MIN and initial_def larger than the base. */
7611 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7612 {
7613 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7614 if (TREE_CODE (initial_def) == INTEGER_CST
7615 && !integer_zerop (induc_val)
7616 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7617 && tree_int_cst_lt (initial_def, induc_val))
7618 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7619 && tree_int_cst_lt (induc_val, initial_def))))
7620 {
7621 induc_val = initial_def;
7622 /* Communicate that we used the initial_def to epilogue
7623 generation. */
7624 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7625 }
7626 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7627 vec_initial_defs.create (ncopies);
7628 for (i = 0; i < ncopies; ++i)
7629 vec_initial_defs.quick_push (vec_initial_def);
7630 }
7631 else if (nested_cycle)
7632 {
7633 /* Do not use an adjustment def as that case is not supported
7634 correctly if ncopies is not one. */
7635 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7636 ncopies, initial_def,
7637 &vec_initial_defs);
7638 }
7639 else
7640 {
7641 tree adjustment_def = NULL_TREE;
7642 tree *adjustment_defp = &adjustment_def;
7643 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7644 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7645 adjustment_defp = NULL;
7646 vec_initial_def
7647 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7648 initial_def, adjustment_defp);
7649 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7650 vec_initial_defs.create (ncopies);
7651 for (i = 0; i < ncopies; ++i)
7652 vec_initial_defs.quick_push (vec_initial_def);
7653 }
7654 }
7655
7656 /* Generate the reduction PHIs upfront. */
7657 for (i = 0; i < vec_num; i++)
7658 {
7659 tree vec_init_def = vec_initial_defs[i];
7660 for (j = 0; j < ncopies; j++)
7661 {
7662 /* Create the reduction-phi that defines the reduction
7663 operand. */
7664 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7665
7666 /* Set the loop-entry arg of the reduction-phi. */
7667 if (j != 0 && nested_cycle)
7668 vec_init_def = vec_initial_defs[j];
7669 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7670 UNKNOWN_LOCATION);
7671
7672 /* The loop-latch arg is set in epilogue processing. */
7673
7674 if (slp_node)
7675 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7676 else
7677 {
7678 if (j == 0)
7679 *vec_stmt = new_phi;
7680 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7681 }
7682 }
7683 }
7684
7685 return true;
7686 }
7687
7688 /* Vectorizes LC PHIs. */
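/* A loop-closed (LC) PHI has a single argument, e.g. (sketch)
     x_4 = PHI <x_3(loop exit edge)>
   and is vectorized by creating an equivalent single-argument vector PHI
   fed by the vectorized definitions of x_3.  */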
7689
7690 bool
7691 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7692 stmt_vec_info stmt_info, gimple **vec_stmt,
7693 slp_tree slp_node)
7694 {
7695 if (!loop_vinfo
7696 || !is_a <gphi *> (stmt_info->stmt)
7697 || gimple_phi_num_args (stmt_info->stmt) != 1)
7698 return false;
7699
7700 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7701 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7702 return false;
7703
7704 if (!vec_stmt) /* transformation not required. */
7705 {
7706 /* Deal with copies from externs or constants that are disguised as
7707 loop-closed PHI nodes (PR97886). */
7708 if (slp_node
7709 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7710 SLP_TREE_VECTYPE (slp_node)))
7711 {
7712 if (dump_enabled_p ())
7713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7714 "incompatible vector types for invariants\n");
7715 return false;
7716 }
7717 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7718 return true;
7719 }
7720
7721 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7722 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7723 basic_block bb = gimple_bb (stmt_info->stmt);
7724 edge e = single_pred_edge (bb);
7725 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7726 auto_vec<tree> vec_oprnds;
7727 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7728 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7729 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7730 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7731 {
7732 /* Create the vectorized LC PHI node. */
7733 gphi *new_phi = create_phi_node (vec_dest, bb);
7734 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7735 if (slp_node)
7736 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7737 else
7738 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7739 }
7740 if (!slp_node)
7741 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7742
7743 return true;
7744 }
7745
7746 /* Vectorizes PHIs. */
7747
7748 bool
7749 vectorizable_phi (vec_info *,
7750 stmt_vec_info stmt_info, gimple **vec_stmt,
7751 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7752 {
7753 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7754 return false;
7755
7756 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7757 return false;
7758
7759 tree vectype = SLP_TREE_VECTYPE (slp_node);
7760
7761 if (!vec_stmt) /* transformation not required. */
7762 {
7763 slp_tree child;
7764 unsigned i;
7765 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7766 if (!child)
7767 {
7768 if (dump_enabled_p ())
7769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7770 "PHI node with unvectorized backedge def\n");
7771 return false;
7772 }
7773 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7774 {
7775 if (dump_enabled_p ())
7776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7777 "incompatible vector types for invariants\n");
7778 return false;
7779 }
7780 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7781 vector_stmt, stmt_info, vectype, 0, vect_body);
7782 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7783 return true;
7784 }
7785
7786 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7787 basic_block bb = gimple_bb (stmt_info->stmt);
7788 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7789 auto_vec<gphi *> new_phis;
7790 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7791 {
7792 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7793
7794 /* Skip not yet vectorized defs. */
7795 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7796 && SLP_TREE_VEC_STMTS (child).is_empty ())
7797 continue;
7798
7799 auto_vec<tree> vec_oprnds;
7800 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7801 if (!new_phis.exists ())
7802 {
7803 new_phis.create (vec_oprnds.length ());
7804 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7805 {
7806 /* Create the vectorized PHI node. */
7807 new_phis.quick_push (create_phi_node (vec_dest, bb));
7808 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7809 }
7810 }
7811 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7812 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7813 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7814 }
7815 /* We should have at least one already vectorized child. */
7816 gcc_assert (new_phis.exists ());
7817
7818 return true;
7819 }
7820
7821
7822 /* Function vect_min_worthwhile_factor.
7823
7824 For a loop where we could vectorize the operation indicated by CODE,
7825 return the minimum vectorization factor that makes it worthwhile
7826 to use generic vectors. */
7827 static unsigned int
7828 vect_min_worthwhile_factor (enum tree_code code)
7829 {
7830 switch (code)
7831 {
7832 case PLUS_EXPR:
7833 case MINUS_EXPR:
7834 case NEGATE_EXPR:
7835 return 4;
7836
7837 case BIT_AND_EXPR:
7838 case BIT_IOR_EXPR:
7839 case BIT_XOR_EXPR:
7840 case BIT_NOT_EXPR:
7841 return 2;
7842
7843 default:
7844 return INT_MAX;
7845 }
7846 }
7847
7848 /* Return true if VINFO indicates we are doing loop vectorization and if
7849 it is worth decomposing CODE operations into scalar operations for
7850 that loop's vectorization factor. */
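/* For example (a sketch of the heuristic above), with a constant
   vectorization factor of 4 a PLUS_EXPR is considered worthwhile
   (4 >= 4), whereas with a factor of 2 it is not.  */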
7851
7852 bool
7853 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7854 {
7855 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7856 unsigned HOST_WIDE_INT value;
7857 return (loop_vinfo
7858 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7859 && value >= vect_min_worthwhile_factor (code));
7860 }
7861
7862 /* Function vectorizable_induction
7863
7864 Check if STMT_INFO performs an induction computation that can be vectorized.
7865 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7866 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7867 Return true if STMT_INFO is vectorizable in this way. */
7868
7869 bool
7870 vectorizable_induction (loop_vec_info loop_vinfo,
7871 stmt_vec_info stmt_info,
7872 gimple **vec_stmt, slp_tree slp_node,
7873 stmt_vector_for_cost *cost_vec)
7874 {
7875 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7876 unsigned ncopies;
7877 bool nested_in_vect_loop = false;
7878 class loop *iv_loop;
7879 tree vec_def;
7880 edge pe = loop_preheader_edge (loop);
7881 basic_block new_bb;
7882 tree new_vec, vec_init, vec_step, t;
7883 tree new_name;
7884 gimple *new_stmt;
7885 gphi *induction_phi;
7886 tree induc_def, vec_dest;
7887 tree init_expr, step_expr;
7888 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7889 unsigned i;
7890 tree expr;
7891 gimple_stmt_iterator si;
7892
7893 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7894 if (!phi)
7895 return false;
7896
7897 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7898 return false;
7899
7900 /* Make sure it was recognized as induction computation. */
7901 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7902 return false;
7903
7904 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7905 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7906
7907 if (slp_node)
7908 ncopies = 1;
7909 else
7910 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7911 gcc_assert (ncopies >= 1);
7912
7913 /* FORNOW. These restrictions should be relaxed. */
7914 if (nested_in_vect_loop_p (loop, stmt_info))
7915 {
7916 imm_use_iterator imm_iter;
7917 use_operand_p use_p;
7918 gimple *exit_phi;
7919 edge latch_e;
7920 tree loop_arg;
7921
7922 if (ncopies > 1)
7923 {
7924 if (dump_enabled_p ())
7925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7926 "multiple types in nested loop.\n");
7927 return false;
7928 }
7929
7930 exit_phi = NULL;
7931 latch_e = loop_latch_edge (loop->inner);
7932 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7933 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7934 {
7935 gimple *use_stmt = USE_STMT (use_p);
7936 if (is_gimple_debug (use_stmt))
7937 continue;
7938
7939 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7940 {
7941 exit_phi = use_stmt;
7942 break;
7943 }
7944 }
7945 if (exit_phi)
7946 {
7947 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7948 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7949 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7950 {
7951 if (dump_enabled_p ())
7952 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7953 "inner-loop induction only used outside "
7954 "of the outer vectorized loop.\n");
7955 return false;
7956 }
7957 }
7958
7959 nested_in_vect_loop = true;
7960 iv_loop = loop->inner;
7961 }
7962 else
7963 iv_loop = loop;
7964 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7965
7966 if (slp_node && !nunits.is_constant ())
7967 {
7968 /* The current SLP code creates the step value element-by-element. */
7969 if (dump_enabled_p ())
7970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7971 "SLP induction not supported for variable-length"
7972 " vectors.\n");
7973 return false;
7974 }
7975
7976 if (!vec_stmt) /* transformation not required. */
7977 {
7978 unsigned inside_cost = 0, prologue_cost = 0;
7979 if (slp_node)
7980 {
7981 /* We eventually need to set a vector type on invariant
7982 arguments. */
7983 unsigned j;
7984 slp_tree child;
7985 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7986 if (!vect_maybe_update_slp_op_vectype
7987 (child, SLP_TREE_VECTYPE (slp_node)))
7988 {
7989 if (dump_enabled_p ())
7990 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7991 "incompatible vector types for "
7992 "invariants\n");
7993 return false;
7994 }
7995 /* loop cost for vec_loop. */
7996 inside_cost
7997 = record_stmt_cost (cost_vec,
7998 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7999 vector_stmt, stmt_info, 0, vect_body);
8000 /* prologue cost for vec_init (if not nested) and step. */
8001 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8002 scalar_to_vec,
8003 stmt_info, 0, vect_prologue);
8004 }
8005 else /* if (!slp_node) */
8006 {
8007 /* loop cost for vec_loop. */
8008 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8009 stmt_info, 0, vect_body);
8010 /* prologue cost for vec_init and vec_step. */
8011 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8012 stmt_info, 0, vect_prologue);
8013 }
8014 if (dump_enabled_p ())
8015 dump_printf_loc (MSG_NOTE, vect_location,
8016 "vect_model_induction_cost: inside_cost = %d, "
8017 "prologue_cost = %d .\n", inside_cost,
8018 prologue_cost);
8019
8020 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8021 DUMP_VECT_SCOPE ("vectorizable_induction");
8022 return true;
8023 }
8024
8025 /* Transform. */
8026
8027 /* Compute a vector variable, initialized with the first VF values of
8028 the induction variable. E.g., for an iv with IV_PHI='X' and
8029 evolution S, for a vector of 4 units, we want to compute:
8030 [X, X + S, X + 2*S, X + 3*S]. */
8031
8032 if (dump_enabled_p ())
8033 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8034
8035 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8036 gcc_assert (step_expr != NULL_TREE);
8037 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8038
8039 pe = loop_preheader_edge (iv_loop);
8040 /* Find the first insertion point in the BB. */
8041 basic_block bb = gimple_bb (phi);
8042 si = gsi_after_labels (bb);
8043
8044 /* For SLP induction we have to generate several IVs as for example
8045 with group size 3 we need
8046 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8047 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8048 if (slp_node)
8049 {
8050 /* Enforced above. */
8051 unsigned int const_nunits = nunits.to_constant ();
8052
8053 /* The initial values are vectorized, but any lanes > group_size
8054 need adjustment. */
8055 slp_tree init_node
8056 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8057
8058 /* Gather steps. Since we do not vectorize inductions as
8059 cycles we have to reconstruct the step from SCEV data. */
8060 unsigned group_size = SLP_TREE_LANES (slp_node);
8061 tree *steps = XALLOCAVEC (tree, group_size);
8062 tree *inits = XALLOCAVEC (tree, group_size);
8063 stmt_vec_info phi_info;
8064 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8065 {
8066 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8067 if (!init_node)
8068 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8069 pe->dest_idx);
8070 }
8071
8072 /* Now generate the IVs. */
8073 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8074 gcc_assert ((const_nunits * nvects) % group_size == 0);
8075 unsigned nivs;
8076 if (nested_in_vect_loop)
8077 nivs = nvects;
8078 else
8079 {
8080 /* Compute the number of distinct IVs we need. First reduce
8081 group_size if it is a multiple of const_nunits so we get
8082 one IV for a group_size of 4 but const_nunits 2. */
8083 unsigned group_sizep = group_size;
8084 if (group_sizep % const_nunits == 0)
8085 group_sizep = group_sizep / const_nunits;
8086 nivs = least_common_multiple (group_sizep,
8087 const_nunits) / const_nunits;
8088 }
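/* As an illustration: for the group-size-3, four-element-vector example
   above, group_size is not a multiple of const_nunits, so
   nivs = lcm (3, 4) / 4 = 3 distinct IVs; for group_size 8 and
   const_nunits 4 the reduction gives group_sizep 2 and nivs = 1,
   i.e. a single IV.  */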
8089 tree stept = TREE_TYPE (step_vectype);
8090 tree lupdate_mul = NULL_TREE;
8091 if (!nested_in_vect_loop)
8092 {
8093 /* The number of iterations covered in one vector iteration. */
8094 unsigned lup_mul = (nvects * const_nunits) / group_size;
8095 lupdate_mul
8096 = build_vector_from_val (step_vectype,
8097 SCALAR_FLOAT_TYPE_P (stept)
8098 ? build_real_from_wide (stept, lup_mul,
8099 UNSIGNED)
8100 : build_int_cstu (stept, lup_mul));
8101 }
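/* For the same example (nvects 3, const_nunits 4, group_size 3) one
   vector iteration covers 3 * 4 / 3 = 4 scalar iterations, so the
   in-loop IV update adds 4 * step.  */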
8102 tree peel_mul = NULL_TREE;
8103 gimple_seq init_stmts = NULL;
8104 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8105 {
8106 if (SCALAR_FLOAT_TYPE_P (stept))
8107 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8108 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8109 else
8110 peel_mul = gimple_convert (&init_stmts, stept,
8111 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8112 peel_mul = gimple_build_vector_from_val (&init_stmts,
8113 step_vectype, peel_mul);
8114 }
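/* peel_mul, if set, is the number of scalar iterations skipped via
   masking for alignment, broadcast as a vector; it is added to step_mul
   below so the first active lane of each IV starts at the right value.  */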
8115 unsigned ivn;
8116 auto_vec<tree> vec_steps;
8117 for (ivn = 0; ivn < nivs; ++ivn)
8118 {
8119 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8120 tree_vector_builder init_elts (vectype, const_nunits, 1);
8121 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8122 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8123 {
8124 /* The scalar steps of the IVs. */
8125 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8126 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8127 step_elts.quick_push (elt);
8128 if (!init_node)
8129 {
8130 /* The scalar inits of the IVs if not vectorized. */
8131 elt = inits[(ivn*const_nunits + eltn) % group_size];
8132 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8133 TREE_TYPE (elt)))
8134 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8135 TREE_TYPE (vectype), elt);
8136 init_elts.quick_push (elt);
8137 }
8138 /* The number of steps to add to the initial values. */
8139 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8140 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8141 ? build_real_from_wide (stept,
8142 mul_elt, UNSIGNED)
8143 : build_int_cstu (stept, mul_elt));
8144 }
8145 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8146 vec_steps.safe_push (vec_step);
8147 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8148 if (peel_mul)
8149 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8150 step_mul, peel_mul);
8151 if (!init_node)
8152 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8153
8154 /* Create the induction-phi that defines the induction-operand. */
8155 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8156 "vec_iv_");
8157 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8158 induc_def = PHI_RESULT (induction_phi);
8159
8160 /* Create the iv update inside the loop */
8161 tree up = vec_step;
8162 if (lupdate_mul)
8163 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8164 vec_step, lupdate_mul);
8165 gimple_seq stmts = NULL;
8166 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8167 vec_def = gimple_build (&stmts,
8168 PLUS_EXPR, step_vectype, vec_def, up);
8169 vec_def = gimple_convert (&stmts, vectype, vec_def);
8170 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8171 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8172 UNKNOWN_LOCATION);
8173
8174 if (init_node)
8175 vec_init = vect_get_slp_vect_def (init_node, ivn);
8176 if (!nested_in_vect_loop
8177 && !integer_zerop (step_mul))
8178 {
8179 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8180 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8181 vec_step, step_mul);
8182 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8183 vec_def, up);
8184 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8185 }
8186
8187 /* Set the arguments of the phi node: */
8188 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8189
8190 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8191 }
8192 if (!nested_in_vect_loop)
8193 {
8194 /* Fill up to the number of vectors we need for the whole group. */
8195 nivs = least_common_multiple (group_size,
8196 const_nunits) / const_nunits;
8197 for (; ivn < nivs; ++ivn)
8198 {
8199 SLP_TREE_VEC_STMTS (slp_node)
8200 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8201 vec_steps.safe_push (vec_steps[0]);
8202 }
8203 }
8204
8205 /* Re-use IVs when we can. We are generating further vector
8206 stmts by adding VF' * stride to the IVs generated above. */
8207 if (ivn < nvects)
8208 {
8209 unsigned vfp
8210 = least_common_multiple (group_size, const_nunits) / group_size;
8211 tree lupdate_mul
8212 = build_vector_from_val (step_vectype,
8213 SCALAR_FLOAT_TYPE_P (stept)
8214 ? build_real_from_wide (stept,
8215 vfp, UNSIGNED)
8216 : build_int_cstu (stept, vfp));
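/* E.g. with group_size 3 and const_nunits 4, vfp = lcm (3, 4) / 3 = 4:
   the nivs vectors generated above cover 4 occurrences of each scalar
   IV, so each reused vector stmt adds 4 * step to the vector stmt nivs
   positions earlier.  */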
8217 for (; ivn < nvects; ++ivn)
8218 {
8219 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8220 tree def = gimple_get_lhs (iv);
8221 if (ivn < 2*nivs)
8222 vec_steps[ivn - nivs]
8223 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8224 vec_steps[ivn - nivs], lupdate_mul);
8225 gimple_seq stmts = NULL;
8226 def = gimple_convert (&stmts, step_vectype, def);
8227 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8228 def, vec_steps[ivn % nivs]);
8229 def = gimple_convert (&stmts, vectype, def);
8230 if (gimple_code (iv) == GIMPLE_PHI)
8231 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8232 else
8233 {
8234 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8235 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8236 }
8237 SLP_TREE_VEC_STMTS (slp_node)
8238 .quick_push (SSA_NAME_DEF_STMT (def));
8239 }
8240 }
8241
8242 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8243 gcc_assert (!new_bb);
8244
8245 return true;
8246 }
8247
8248 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
8249 loop_preheader_edge (iv_loop));
8250
8251 gimple_seq stmts = NULL;
8252 if (!nested_in_vect_loop)
8253 {
8254 /* Convert the initial value to the IV update type. */
8255 tree new_type = TREE_TYPE (step_expr);
8256 init_expr = gimple_convert (&stmts, new_type, init_expr);
8257
8258 /* If we are using the loop mask to "peel" for alignment then we need
8259 to adjust the start value here. */
8260 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8261 if (skip_niters != NULL_TREE)
8262 {
8263 if (FLOAT_TYPE_P (vectype))
8264 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8265 skip_niters);
8266 else
8267 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8268 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8269 skip_niters, step_expr);
8270 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8271 init_expr, skip_step);
8272 }
8273 }
8274
8275 if (stmts)
8276 {
8277 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8278 gcc_assert (!new_bb);
8279 }
8280
8281 /* Create the vector that holds the initial_value of the induction. */
8282 if (nested_in_vect_loop)
8283 {
8284 /* iv_loop is nested in the loop to be vectorized. init_expr has already
8285 been created during vectorization of previous stmts. We obtain it
8286 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8287 auto_vec<tree> vec_inits;
8288 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8289 init_expr, &vec_inits);
8290 vec_init = vec_inits[0];
8291 /* If the initial value is not of proper type, convert it. */
8292 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8293 {
8294 new_stmt
8295 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8296 vect_simple_var,
8297 "vec_iv_"),
8298 VIEW_CONVERT_EXPR,
8299 build1 (VIEW_CONVERT_EXPR, vectype,
8300 vec_init));
8301 vec_init = gimple_assign_lhs (new_stmt);
8302 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8303 new_stmt);
8304 gcc_assert (!new_bb);
8305 }
8306 }
8307 else
8308 {
8309 /* iv_loop is the loop to be vectorized. Create:
8310 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8311 stmts = NULL;
8312 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8313
8314 unsigned HOST_WIDE_INT const_nunits;
8315 if (nunits.is_constant (&const_nunits))
8316 {
8317 tree_vector_builder elts (step_vectype, const_nunits, 1);
8318 elts.quick_push (new_name);
8319 for (i = 1; i < const_nunits; i++)
8320 {
8321 /* Create: new_name_i = new_name + step_expr */
8322 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8323 new_name, step_expr);
8324 elts.quick_push (new_name);
8325 }
8326 /* Create a vector from [new_name_0, new_name_1, ...,
8327 new_name_nunits-1] */
8328 vec_init = gimple_build_vector (&stmts, &elts);
8329 }
8330 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8331 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8332 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8333 new_name, step_expr);
8334 else
8335 {
8336 /* Build:
8337 [base, base, base, ...]
8338 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8339 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8340 gcc_assert (flag_associative_math);
8341 tree index = build_index_vector (step_vectype, 0, 1);
8342 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8343 new_name);
8344 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8345 step_expr);
8346 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8347 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8348 vec_init, step_vec);
8349 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8350 vec_init, base_vec);
8351 }
8352 vec_init = gimple_convert (&stmts, vectype, vec_init);
8353
8354 if (stmts)
8355 {
8356 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8357 gcc_assert (!new_bb);
8358 }
8359 }
8360
8361
8362 /* Create the vector that holds the step of the induction. */
8363 if (nested_in_vect_loop)
8364 /* iv_loop is nested in the loop to be vectorized. Generate:
8365 vec_step = [S, S, S, S] */
8366 new_name = step_expr;
8367 else
8368 {
8369 /* iv_loop is the loop to be vectorized. Generate:
8370 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8371 gimple_seq seq = NULL;
8372 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8373 {
8374 expr = build_int_cst (integer_type_node, vf);
8375 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8376 }
8377 else
8378 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8379 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8380 expr, step_expr);
8381 if (seq)
8382 {
8383 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8384 gcc_assert (!new_bb);
8385 }
8386 }
8387
8388 t = unshare_expr (new_name);
8389 gcc_assert (CONSTANT_CLASS_P (new_name)
8390 || TREE_CODE (new_name) == SSA_NAME);
8391 new_vec = build_vector_from_val (step_vectype, t);
8392 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8393 new_vec, step_vectype, NULL);
8394
8395
8396 /* Create the following def-use cycle:
8397 loop prolog:
8398 vec_init = ...
8399 vec_step = ...
8400 loop:
8401 vec_iv = PHI <vec_init, vec_loop>
8402 ...
8403 STMT
8404 ...
8405 vec_loop = vec_iv + vec_step; */
8406
8407 /* Create the induction-phi that defines the induction-operand. */
8408 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8409 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8410 induc_def = PHI_RESULT (induction_phi);
8411
8412 /* Create the iv update inside the loop */
8413 stmts = NULL;
8414 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8415 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8416 vec_def = gimple_convert (&stmts, vectype, vec_def);
8417 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8418 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8419
8420 /* Set the arguments of the phi node: */
8421 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8422 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8423 UNKNOWN_LOCATION);
8424
8425 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8426 *vec_stmt = induction_phi;
8427
8428 /* In case the vectorization factor (VF) is bigger than the number
8429 of elements that we can fit in a vectype (nunits), we have to generate
8430 more than one vector stmt - i.e., we need to "unroll" the
8431 vector stmt by a factor VF/nunits. For more details see the
8432 documentation in vectorizable_operation. */
8433
8434 if (ncopies > 1)
8435 {
8436 gimple_seq seq = NULL;
8437 /* FORNOW. This restriction should be relaxed. */
8438 gcc_assert (!nested_in_vect_loop);
8439
8440 /* Create the vector that holds the step of the induction. */
8441 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8442 {
8443 expr = build_int_cst (integer_type_node, nunits);
8444 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8445 }
8446 else
8447 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8448 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8449 expr, step_expr);
8450 if (seq)
8451 {
8452 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8453 gcc_assert (!new_bb);
8454 }
8455
8456 t = unshare_expr (new_name);
8457 gcc_assert (CONSTANT_CLASS_P (new_name)
8458 || TREE_CODE (new_name) == SSA_NAME);
8459 new_vec = build_vector_from_val (step_vectype, t);
8460 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8461 new_vec, step_vectype, NULL);
8462
8463 vec_def = induc_def;
8464 for (i = 1; i < ncopies; i++)
8465 {
8466 /* vec_i = vec_prev + vec_step */
8467 gimple_seq stmts = NULL;
8468 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8469 vec_def = gimple_build (&stmts,
8470 PLUS_EXPR, step_vectype, vec_def, vec_step);
8471 vec_def = gimple_convert (&stmts, vectype, vec_def);
8472
8473 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8474 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8475 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8476 }
8477 }
8478
8479 if (dump_enabled_p ())
8480 dump_printf_loc (MSG_NOTE, vect_location,
8481 "transform induction: created def-use cycle: %G%G",
8482 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8483
8484 return true;
8485 }
8486
8487 /* Function vectorizable_live_operation.
8488
8489 STMT_INFO computes a value that is used outside the loop. Check if
8490 it can be supported. */
8491
8492 bool
8493 vectorizable_live_operation (vec_info *vinfo,
8494 stmt_vec_info stmt_info,
8495 gimple_stmt_iterator *gsi,
8496 slp_tree slp_node, slp_instance slp_node_instance,
8497 int slp_index, bool vec_stmt_p,
8498 stmt_vector_for_cost *cost_vec)
8499 {
8500 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8501 imm_use_iterator imm_iter;
8502 tree lhs, lhs_type, bitsize;
8503 tree vectype = (slp_node
8504 ? SLP_TREE_VECTYPE (slp_node)
8505 : STMT_VINFO_VECTYPE (stmt_info));
8506 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8507 int ncopies;
8508 gimple *use_stmt;
8509 auto_vec<tree> vec_oprnds;
8510 int vec_entry = 0;
8511 poly_uint64 vec_index = 0;
8512
8513 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8514
8515 /* If a stmt of a reduction is live, vectorize it via
8516 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8517 validity so just trigger the transform here. */
8518 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8519 {
8520 if (!vec_stmt_p)
8521 return true;
8522 if (slp_node)
8523 {
8524 /* For reduction chains the meta-info is attached to
8525 the group leader. */
8526 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8527 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8528 /* For SLP reductions we vectorize the epilogue for
8529 all involved stmts together. */
8530 else if (slp_index != 0)
8531 return true;
8532 else
8533 /* For SLP reductions the meta-info is attached to
8534 the representative. */
8535 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8536 }
8537 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8538 gcc_assert (reduc_info->is_reduc_info);
8539 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8540 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8541 return true;
8542 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8543 slp_node_instance);
8544 return true;
8545 }
8546
8547 /* If STMT is not relevant and it is a simple assignment and its inputs are
8548 invariant then it can remain in place, unvectorized. The original last
8549 scalar value that it computes will be used. */
8550 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8551 {
8552 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8553 if (dump_enabled_p ())
8554 dump_printf_loc (MSG_NOTE, vect_location,
8555 "statement is simple and uses invariant. Leaving in "
8556 "place.\n");
8557 return true;
8558 }
8559
8560 if (slp_node)
8561 ncopies = 1;
8562 else
8563 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8564
8565 if (slp_node)
8566 {
8567 gcc_assert (slp_index >= 0);
8568
8569 /* Get the last occurrence of the scalar index from the concatenation of
8570 all the slp vectors. Calculate which slp vector it is and the index
8571 within. */
8572 int num_scalar = SLP_TREE_LANES (slp_node);
8573 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8574 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
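/* For instance, with two four-lane vectors (num_vec 2, nunits 4),
   num_scalar 3 and slp_index 2 this gives pos = 2 * 4 - 3 + 2 = 7,
   i.e. lane 3 of the second vector (vec_entry 1, vec_index 3).  */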
8575
8576 /* Calculate which vector contains the result, and which lane of
8577 that vector we need. */
8578 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8579 {
8580 if (dump_enabled_p ())
8581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8582 "Cannot determine which vector holds the"
8583 " final result.\n");
8584 return false;
8585 }
8586 }
8587
8588 if (!vec_stmt_p)
8589 {
8590 /* No transformation required. */
8591 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8592 {
8593 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8594 OPTIMIZE_FOR_SPEED))
8595 {
8596 if (dump_enabled_p ())
8597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8598 "can't operate on partial vectors "
8599 "because the target doesn't support extract "
8600 "last reduction.\n");
8601 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8602 }
8603 else if (slp_node)
8604 {
8605 if (dump_enabled_p ())
8606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8607 "can't operate on partial vectors "
8608 "because an SLP statement is live after "
8609 "the loop.\n");
8610 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8611 }
8612 else if (ncopies > 1)
8613 {
8614 if (dump_enabled_p ())
8615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8616 "can't operate on partial vectors "
8617 "because ncopies is greater than 1.\n");
8618 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8619 }
8620 else
8621 {
8622 gcc_assert (ncopies == 1 && !slp_node);
8623 vect_record_loop_mask (loop_vinfo,
8624 &LOOP_VINFO_MASKS (loop_vinfo),
8625 1, vectype, NULL);
8626 }
8627 }
8628 /* ??? Enable for loop costing as well. */
8629 if (!loop_vinfo)
8630 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8631 0, vect_epilogue);
8632 return true;
8633 }
8634
8635 /* Use the lhs of the original scalar statement. */
8636 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8637 if (dump_enabled_p ())
8638 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8639 "stmt %G", stmt);
8640
8641 lhs = gimple_get_lhs (stmt);
8642 lhs_type = TREE_TYPE (lhs);
8643
8644 bitsize = vector_element_bits_tree (vectype);
8645
8646 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8647 tree vec_lhs, bitstart;
8648 gimple *vec_stmt;
8649 if (slp_node)
8650 {
8651 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8652
8653 /* Get the correct slp vectorized stmt. */
8654 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8655 vec_lhs = gimple_get_lhs (vec_stmt);
8656
8657 /* Get entry to use. */
8658 bitstart = bitsize_int (vec_index);
8659 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8660 }
8661 else
8662 {
8663 /* For multiple copies, get the last copy. */
8664 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8665 vec_lhs = gimple_get_lhs (vec_stmt);
8666
8667 /* Get the last lane in the vector. */
8668 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
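/* E.g. for four 32-bit elements the last lane starts at bit 3 * 32 = 96.  */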
8669 }
8670
8671 if (loop_vinfo)
8672 {
8673 /* To ensure the VEC_LHS for lane extraction stmts satisfies the loop-closed
8674 PHI requirement, insert one phi node for it. It looks like:
8675 loop;
8676 BB:
8677 # lhs' = PHI <lhs>
8678 ==>
8679 loop;
8680 BB:
8681 # vec_lhs' = PHI <vec_lhs>
8682 new_tree = lane_extract <vec_lhs', ...>;
8683 lhs' = new_tree; */
8684
8685 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8686 basic_block exit_bb = single_exit (loop)->dest;
8687 gcc_assert (single_pred_p (exit_bb));
8688
8689 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8690 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8691 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8692
8693 gimple_seq stmts = NULL;
8694 tree new_tree;
8695 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8696 {
8697 /* Emit:
8698
8699 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8700
8701 where VEC_LHS is the vectorized live-out result and MASK is
8702 the loop mask for the final iteration. */
8703 gcc_assert (ncopies == 1 && !slp_node);
8704 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8705 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8706 1, vectype, 0);
8707 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8708 mask, vec_lhs_phi);
8709
8710 /* Convert the extracted vector element to the scalar type. */
8711 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8712 }
8713 else
8714 {
8715 tree bftype = TREE_TYPE (vectype);
8716 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8717 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8718 new_tree = build3 (BIT_FIELD_REF, bftype,
8719 vec_lhs_phi, bitsize, bitstart);
8720 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8721 &stmts, true, NULL_TREE);
8722 }
8723
8724 if (stmts)
8725 {
8726 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8727 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8728
8729 /* Remove existing phi from lhs and create one copy from new_tree. */
8730 tree lhs_phi = NULL_TREE;
8731 gimple_stmt_iterator gsi;
8732 for (gsi = gsi_start_phis (exit_bb);
8733 !gsi_end_p (gsi); gsi_next (&gsi))
8734 {
8735 gimple *phi = gsi_stmt (gsi);
8736 if ((gimple_phi_arg_def (phi, 0) == lhs))
8737 {
8738 remove_phi_node (&gsi, false);
8739 lhs_phi = gimple_phi_result (phi);
8740 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8741 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8742 break;
8743 }
8744 }
8745 }
8746
8747 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8748 single-arg PHI, just replace all uses of the PHI result. This is necessary
8749 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
8750 use_operand_p use_p;
8751 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8752 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8753 && !is_gimple_debug (use_stmt))
8754 {
8755 if (gimple_code (use_stmt) == GIMPLE_PHI
8756 && gimple_phi_num_args (use_stmt) == 1)
8757 {
8758 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8759 }
8760 else
8761 {
8762 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8763 SET_USE (use_p, new_tree);
8764 }
8765 update_stmt (use_stmt);
8766 }
8767 }
8768 else
8769 {
8770 /* For basic-block vectorization simply insert the lane-extraction. */
8771 tree bftype = TREE_TYPE (vectype);
8772 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8773 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8774 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8775 vec_lhs, bitsize, bitstart);
8776 gimple_seq stmts = NULL;
8777 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8778 &stmts, true, NULL_TREE);
8779 if (TREE_CODE (new_tree) == SSA_NAME
8780 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8781 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8782 if (is_a <gphi *> (vec_stmt))
8783 {
8784 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8785 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8786 }
8787 else
8788 {
8789 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8790 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8791 }
8792
8793 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8794 single-arg PHI, just replace all uses of the PHI result. This is necessary
8795 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
8796 use_operand_p use_p;
8797 stmt_vec_info use_stmt_info;
8798 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8799 if (!is_gimple_debug (use_stmt)
8800 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8801 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8802 {
8803 /* ??? This can happen when the live lane ends up being
8804 used in a vector construction code-generated by an
8805 external SLP node (and code-generation for that already
8806 happened). See gcc.dg/vect/bb-slp-47.c.
8807 Doing this is what would happen if that vector CTOR
8808 were not code-generated yet so it is not too bad.
8809 ??? In fact we'd likely want to avoid this situation
8810 in the first place. */
8811 if (TREE_CODE (new_tree) == SSA_NAME
8812 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8813 && gimple_code (use_stmt) != GIMPLE_PHI
8814 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8815 use_stmt))
8816 {
8817 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8818 gcc_assert (code == CONSTRUCTOR
8819 || code == VIEW_CONVERT_EXPR
8820 || CONVERT_EXPR_CODE_P (code));
8821 if (dump_enabled_p ())
8822 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8823 "Using original scalar computation for "
8824 "live lane because use preceeds vector "
8825 "def\n");
8826 continue;
8827 }
8828 /* ??? It can also happen that we end up pulling a def into
8829 a loop where replacing out-of-loop uses would require
8830 a new LC SSA PHI node. Retain the original scalar in
8831 those cases as well. PR98064. */
8832 if (TREE_CODE (new_tree) == SSA_NAME
8833 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8834 && (gimple_bb (use_stmt)->loop_father
8835 != gimple_bb (vec_stmt)->loop_father)
8836 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
8837 gimple_bb (use_stmt)->loop_father))
8838 {
8839 if (dump_enabled_p ())
8840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8841 "Using original scalar computation for "
8842 "live lane because there is an out-of-loop "
8843 "definition for it\n");
8844 continue;
8845 }
8846 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8847 SET_USE (use_p, new_tree);
8848 update_stmt (use_stmt);
8849 }
8850 }
8851
8852 return true;
8853 }
8854
8855 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8856
8857 static void
8858 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8859 {
8860 ssa_op_iter op_iter;
8861 imm_use_iterator imm_iter;
8862 def_operand_p def_p;
8863 gimple *ustmt;
8864
8865 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8866 {
8867 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8868 {
8869 basic_block bb;
8870
8871 if (!is_gimple_debug (ustmt))
8872 continue;
8873
8874 bb = gimple_bb (ustmt);
8875
8876 if (!flow_bb_inside_loop_p (loop, bb))
8877 {
8878 if (gimple_debug_bind_p (ustmt))
8879 {
8880 if (dump_enabled_p ())
8881 dump_printf_loc (MSG_NOTE, vect_location,
8882 "killing debug use\n");
8883
8884 gimple_debug_bind_reset_value (ustmt);
8885 update_stmt (ustmt);
8886 }
8887 else
8888 gcc_unreachable ();
8889 }
8890 }
8891 }
8892 }
8893
8894 /* Given loop represented by LOOP_VINFO, return true if computation of
8895 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8896 otherwise. */
8897
8898 static bool
8899 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8900 {
8901 /* Constant case. */
8902 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8903 {
8904 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8905 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8906
8907 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8908 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8909 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8910 return true;
8911 }
8912
8913 widest_int max;
8914 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8915 /* Check the upper bound of loop niters. */
8916 if (get_max_loop_iterations (loop, &max))
8917 {
8918 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8919 signop sgn = TYPE_SIGN (type);
8920 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8921 if (max < type_max)
8922 return true;
8923 }
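/* Otherwise we cannot rule out that NITERSM1 equals the maximum value of
   its type, in which case NITERSM1 + 1 would wrap around to zero.  */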
8924 return false;
8925 }
8926
8927 /* Return a mask type with half the number of elements as OLD_TYPE,
8928 given that it should have mode NEW_MODE. */
8929
8930 tree
8931 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8932 {
8933 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8934 return build_truth_vector_type_for_mode (nunits, new_mode);
8935 }
8936
8937 /* Return a mask type with twice as many elements as OLD_TYPE,
8938 given that it should have mode NEW_MODE. */
8939
8940 tree
8941 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8942 {
8943 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8944 return build_truth_vector_type_for_mode (nunits, new_mode);
8945 }
8946
8947 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8948 contain a sequence of NVECTORS masks that each control a vector of type
8949 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8950 these vector masks with the vector version of SCALAR_MASK. */
8951
8952 void
8953 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8954 unsigned int nvectors, tree vectype, tree scalar_mask)
8955 {
8956 gcc_assert (nvectors != 0);
8957 if (masks->length () < nvectors)
8958 masks->safe_grow_cleared (nvectors, true);
8959 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8960 /* The number of scalars per iteration and the number of vectors are
8961 both compile-time constants. */
8962 unsigned int nscalars_per_iter
8963 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8964 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
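/* For example, an rgroup of two 8-element mask vectors with a
   vectorization factor of 8 controls 2 * 8 / 8 = 2 scalars per iteration.  */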
8965
8966 if (scalar_mask)
8967 {
8968 scalar_cond_masked_key cond (scalar_mask, nvectors);
8969 loop_vinfo->scalar_cond_masked_set.add (cond);
8970 }
8971
8972 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8973 {
8974 rgm->max_nscalars_per_iter = nscalars_per_iter;
8975 rgm->type = truth_type_for (vectype);
8976 rgm->factor = 1;
8977 }
8978 }
8979
8980 /* Given a complete set of masks MASKS, extract mask number INDEX
8981 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8982 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8983
8984 See the comment above vec_loop_masks for more details about the mask
8985 arrangement. */
8986
8987 tree
8988 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8989 unsigned int nvectors, tree vectype, unsigned int index)
8990 {
8991 rgroup_controls *rgm = &(*masks)[nvectors - 1];
8992 tree mask_type = rgm->type;
8993
8994 /* Populate the rgroup's mask array, if this is the first time we've
8995 used it. */
8996 if (rgm->controls.is_empty ())
8997 {
8998 rgm->controls.safe_grow_cleared (nvectors, true);
8999 for (unsigned int i = 0; i < nvectors; ++i)
9000 {
9001 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9002 /* Provide a dummy definition until the real one is available. */
9003 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9004 rgm->controls[i] = mask;
9005 }
9006 }
9007
9008 tree mask = rgm->controls[index];
9009 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9010 TYPE_VECTOR_SUBPARTS (vectype)))
9011 {
9012 /* A loop mask for data type X can be reused for data type Y
9013 if X has N times more elements than Y and if Y's elements
9014 are N times bigger than X's. In this case each sequence
9015 of N elements in the loop mask will be all-zero or all-one.
9016 We can then view-convert the mask so that each sequence of
9017 N elements is replaced by a single element. */
9018 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9019 TYPE_VECTOR_SUBPARTS (vectype)));
9020 gimple_seq seq = NULL;
9021 mask_type = truth_type_for (vectype);
9022 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9023 if (seq)
9024 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9025 }
9026 return mask;
9027 }
9028
9029 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9030 lengths for controlling an operation on VECTYPE. The operation splits
9031 each element of VECTYPE into FACTOR separate subelements, measuring the
9032 length as a number of these subelements. */
9033
9034 void
9035 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9036 unsigned int nvectors, tree vectype, unsigned int factor)
9037 {
9038 gcc_assert (nvectors != 0);
9039 if (lens->length () < nvectors)
9040 lens->safe_grow_cleared (nvectors, true);
9041 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9042
9043 /* The number of scalars per iteration, the bytes occupied per scalar,
9044 and the number of vectors are all compile-time constants. */
9045 unsigned int nscalars_per_iter
9046 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9047 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
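/* E.g. two 4-element vectors with a vectorization factor of 8 give one
   scalar per iteration; a load falling back to a VnQI view of 4-byte
   elements would pass FACTOR 4, so its lengths are measured in bytes.  */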
9048
9049 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9050 {
9051 /* For now, we only support cases in which all loads and stores fall back
9052 to VnQI or none do. */
9053 gcc_assert (!rgl->max_nscalars_per_iter
9054 || (rgl->factor == 1 && factor == 1)
9055 || (rgl->max_nscalars_per_iter * rgl->factor
9056 == nscalars_per_iter * factor));
9057 rgl->max_nscalars_per_iter = nscalars_per_iter;
9058 rgl->type = vectype;
9059 rgl->factor = factor;
9060 }
9061 }
9062
9063 /* Given a complete set of length LENS, extract length number INDEX for an
9064 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
9065
9066 tree
9067 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9068 unsigned int nvectors, unsigned int index)
9069 {
9070 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9071
9072 /* Populate the rgroup's len array, if this is the first time we've
9073 used it. */
9074 if (rgl->controls.is_empty ())
9075 {
9076 rgl->controls.safe_grow_cleared (nvectors, true);
9077 for (unsigned int i = 0; i < nvectors; ++i)
9078 {
9079 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9080 gcc_assert (len_type != NULL_TREE);
9081 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9082
9083 /* Provide a dummy definition until the real one is available. */
9084 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9085 rgl->controls[i] = len;
9086 }
9087 }
9088
9089 return rgl->controls[index];
9090 }
9091
9092 /* Scale the profile counters of LOOP, which is vectorized by factor VF,
9093 according to the estimated iteration count. */
9094
9095 static void
9096 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9097 {
9098 edge preheader = loop_preheader_edge (loop);
9099 /* Reduce loop iterations by the vectorization factor. */
9100 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9101 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9102
9103 if (freq_h.nonzero_p ())
9104 {
9105 profile_probability p;
9106
9107 /* Avoid dropping loop body profile counter to 0 because of zero count
9108 in loop's preheader. */
9109 if (!(freq_e == profile_count::zero ()))
9110 freq_e = freq_e.force_nonzero ();
9111 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9112 scale_loop_frequencies (loop, p);
9113 }
9114
9115 edge exit_e = single_exit (loop);
9116 exit_e->probability = profile_probability::always ()
9117 .apply_scale (1, new_est_niter + 1);
9118
9119 edge exit_l = single_pred_edge (loop->latch);
9120 profile_probability prob = exit_l->probability;
9121 exit_l->probability = exit_e->probability.invert ();
9122 if (prob.initialized_p () && exit_l->probability.initialized_p ())
9123 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9124 }
9125
9126 /* For a vectorized stmt DEF_STMT_INFO, fill in the latch edge values of all
9127 vectorized PHIs whose original latch value was defined by it. */
9128
9129 static void
9130 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9131 stmt_vec_info def_stmt_info)
9132 {
9133 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9134 if (!def || TREE_CODE (def) != SSA_NAME)
9135 return;
9136 stmt_vec_info phi_info;
9137 imm_use_iterator iter;
9138 use_operand_p use_p;
9139 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9140 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9141 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9142 && (phi_info = loop_vinfo->lookup_stmt (phi))
9143 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9144 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9145 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9146 {
9147 loop_p loop = gimple_bb (phi)->loop_father;
9148 edge e = loop_latch_edge (loop);
9149 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9150 {
9151 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9152 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9153 gcc_assert (phi_defs.length () == latch_defs.length ());
9154 for (unsigned i = 0; i < phi_defs.length (); ++i)
9155 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9156 gimple_get_lhs (latch_defs[i]), e,
9157 gimple_phi_arg_location (phi, e->dest_idx));
9158 }
9159 }
9160 }
9161
9162 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9163 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9164 stmt_vec_info. */
9165
9166 static bool
9167 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9168 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9169 {
9170 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9171 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9172
9173 if (dump_enabled_p ())
9174 dump_printf_loc (MSG_NOTE, vect_location,
9175 "------>vectorizing statement: %G", stmt_info->stmt);
9176
9177 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9178 vect_loop_kill_debug_uses (loop, stmt_info);
9179
9180 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9181 && !STMT_VINFO_LIVE_P (stmt_info))
9182 return false;
9183
9184 if (STMT_VINFO_VECTYPE (stmt_info))
9185 {
9186 poly_uint64 nunits
9187 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9188 if (!STMT_SLP_TYPE (stmt_info)
9189 && maybe_ne (nunits, vf)
9190 && dump_enabled_p ())
9191 /* For SLP the VF is set according to the unrolling factor, not
9192 the vector size, hence this message is not valid for SLP. */
9193 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9194 }
9195
9196 /* Pure SLP statements have already been vectorized. We still need
9197 to apply loop vectorization to hybrid SLP statements. */
9198 if (PURE_SLP_STMT (stmt_info))
9199 return false;
9200
9201 if (dump_enabled_p ())
9202 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9203
9204 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9205 *seen_store = stmt_info;
9206
9207 return true;
9208 }
9209
9210 /* Helper function to pass to simplify_replace_tree to enable replacing trees
9211 in the hash_map with their corresponding values. */
9212
9213 static tree
9214 find_in_mapping (tree t, void *context)
9215 {
9216 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9217
9218 tree *value = mapping->get (t);
9219 return value ? *value : t;
9220 }
9221
9222 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9223 original loop that has now been vectorized.
9224
9225 The inits of the data_references need to be advanced by the number of
9226 iterations of the main loop. This has been computed in vect_do_peeling and
9227 is stored in the parameter ADVANCE. We first restore the data_references'
9228 initial offsets with the values recorded in ORIG_DRS_INIT.
9229
9230 Since the loop_vec_info of this EPILOGUE was constructed for the original
9231 loop, its stmt_vec_infos all point to the original statements. These need
9232 to be updated to point to their corresponding copies as well as the SSA_NAMES
9233 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9234
9235 The data_reference's connections also need to be updated. Their
9236 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
9237 stmt_vec_infos, their statements need to point to their corresponding copy,
9238 if they are gather loads or scatter stores then their reference needs to be
9239 updated to point to its corresponding copy and finally we set
9240 'base_misaligned' to false as we have already peeled for alignment in the
9241 prologue of the main loop. */
9242
9243 static void
9244 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9245 {
9246 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9247 auto_vec<gimple *> stmt_worklist;
9248 hash_map<tree,tree> mapping;
9249 gimple *orig_stmt, *new_stmt;
9250 gimple_stmt_iterator epilogue_gsi;
9251 gphi_iterator epilogue_phi_gsi;
9252 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9253 basic_block *epilogue_bbs = get_loop_body (epilogue);
9254 unsigned i;
9255
9256 free (LOOP_VINFO_BBS (epilogue_vinfo));
9257 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9258
9259 /* Advance the data_references by the number of iterations of the previous
9260 loop and its prologue. */
9261 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9262
9263
9264 /* The EPILOGUE loop is a copy of the original loop, so they share the same
9265 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9266 point to the copied statements. We also create a mapping from all LHSs in
9267 the original loop to the LHSs in the EPILOGUE and create worklists to
9268 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9269 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9270 {
9271 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9272 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9273 {
9274 new_stmt = epilogue_phi_gsi.phi ();
9275
9276 gcc_assert (gimple_uid (new_stmt) > 0);
9277 stmt_vinfo
9278 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9279
9280 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9281 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9282
9283 mapping.put (gimple_phi_result (orig_stmt),
9284 gimple_phi_result (new_stmt));
9285 /* PHI nodes cannot have patterns or related statements. */
9286 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9287 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9288 }
9289
9290 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9291 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9292 {
9293 new_stmt = gsi_stmt (epilogue_gsi);
9294 if (is_gimple_debug (new_stmt))
9295 continue;
9296
9297 gcc_assert (gimple_uid (new_stmt) > 0);
9298 stmt_vinfo
9299 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9300
9301 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9302 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9303
9304 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9305 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9306
9307 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9308 {
9309 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9310 for (gimple_stmt_iterator gsi = gsi_start (seq);
9311 !gsi_end_p (gsi); gsi_next (&gsi))
9312 stmt_worklist.safe_push (gsi_stmt (gsi));
9313 }
9314
9315 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9316 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9317 {
9318 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9319 stmt_worklist.safe_push (stmt);
9320 /* Set BB such that the assert in
9321 'get_initial_def_for_reduction' is able to determine that
9322 the BB of the related stmt is inside this loop. */
9323 gimple_set_bb (stmt,
9324 gimple_bb (new_stmt));
9325 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9326 gcc_assert (related_vinfo == NULL
9327 || related_vinfo == stmt_vinfo);
9328 }
9329 }
9330 }
9331
9332 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9333 using the original main loop and thus need to be updated to refer to the
9334 cloned variables used in the epilogue. */
9335 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9336 {
9337 gimple *stmt = stmt_worklist[i];
9338 tree *new_op;
9339
9340 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9341 {
9342 tree op = gimple_op (stmt, j);
9343 if ((new_op = mapping.get(op)))
9344 gimple_set_op (stmt, j, *new_op);
9345 else
9346 {
9347 /* PR92429: The last argument of simplify_replace_tree disables
9348 folding when replacing arguments. This is required as
9349 otherwise you might end up with different statements than the
9350 ones analyzed in vect_loop_analyze, leading to different
9351 vectorization. */
9352 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9353 &find_in_mapping, &mapping, false);
9354 gimple_set_op (stmt, j, op);
9355 }
9356 }
9357 }
9358
9359 struct data_reference *dr;
9360 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9361 FOR_EACH_VEC_ELT (datarefs, i, dr)
9362 {
9363 orig_stmt = DR_STMT (dr);
9364 gcc_assert (gimple_uid (orig_stmt) > 0);
9365 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9366 /* Data references for gather loads and scatter stores do not use the
9367 updated offset we set using ADVANCE. Instead we have to make sure the
9368 reference in each data reference points to the corresponding copy of
9369 the original in the epilogue. */
9370 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9371 == VMAT_GATHER_SCATTER)
9372 {
9373 DR_REF (dr)
9374 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9375 &find_in_mapping, &mapping);
9376 DR_BASE_ADDRESS (dr)
9377 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9378 &find_in_mapping, &mapping);
9379 }
9380 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9381 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9382 /* The vector size of the epilogue is smaller than that of the main loop,
9383 so the alignment requirement is either the same or lower. This means
9384 the dr is by definition aligned. */
9385 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9386 }
9387
9388 epilogue_vinfo->shared->datarefs_copy.release ();
9389 epilogue_vinfo->shared->save_datarefs ();
9390 }
9391
9392 /* Function vect_transform_loop.
9393
9394 The analysis phase has determined that the loop is vectorizable.
9395 Vectorize the loop - create vectorized stmts to replace the scalar
9396 stmts in the loop, and update the loop exit condition.
9397 Returns the scalar epilogue loop, if any. */
9398
9399 class loop *
9400 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9401 {
9402 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9403 class loop *epilogue = NULL;
9404 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9405 int nbbs = loop->num_nodes;
9406 int i;
9407 tree niters_vector = NULL_TREE;
9408 tree step_vector = NULL_TREE;
9409 tree niters_vector_mult_vf = NULL_TREE;
9410 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9411 unsigned int lowest_vf = constant_lower_bound (vf);
9412 gimple *stmt;
9413 bool check_profitability = false;
9414 unsigned int th;
9415
9416 DUMP_VECT_SCOPE ("vec_transform_loop");
9417
9418 loop_vinfo->shared->check_datarefs ();
9419
9420 /* Use the more conservative vectorization threshold. If the number
9421 of iterations is constant, assume the cost check has been performed
9422 by our caller. If the threshold makes all loops profitable that
9423 run at least the (estimated) vectorization factor number of times,
9424 checking is pointless, too. */
9425 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9426 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9427 {
9428 if (dump_enabled_p ())
9429 dump_printf_loc (MSG_NOTE, vect_location,
9430 "Profitability threshold is %d loop iterations.\n",
9431 th);
9432 check_profitability = true;
9433 }
9434
9435 /* Make sure there exists a single-predecessor exit bb. Do this before
9436 versioning. */
9437 edge e = single_exit (loop);
9438 if (! single_pred_p (e->dest))
9439 {
9440 split_loop_exit_edge (e, true);
9441 if (dump_enabled_p ())
9442 dump_printf (MSG_NOTE, "split exit edge\n");
9443 }
9444
9445 /* Version the loop first, if required, so the profitability check
9446 comes first. */
9447
9448 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9449 {
9450 class loop *sloop
9451 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9452 sloop->force_vectorize = false;
9453 check_profitability = false;
9454 }
9455
9456 /* Make sure there exists a single-predecessor exit bb also on the
9457 scalar loop copy. Do this after versioning but before peeling
9458 so the CFG structure is fine for both the scalar and the if-converted
9459 loop, letting slpeel_duplicate_current_defs_from_edges see matched
9460 loop-closed PHI nodes on the exit. */
9461 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9462 {
9463 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9464 if (! single_pred_p (e->dest))
9465 {
9466 split_loop_exit_edge (e, true);
9467 if (dump_enabled_p ())
9468 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9469 }
9470 }
9471
9472 tree niters = vect_build_loop_niters (loop_vinfo);
9473 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9474 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9475 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9476 tree advance;
9477 drs_init_vec orig_drs_init;
9478
9479 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9480 &step_vector, &niters_vector_mult_vf, th,
9481 check_profitability, niters_no_overflow,
9482 &advance);
9483
9484 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9485 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9486 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9487 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9488
9489 if (niters_vector == NULL_TREE)
9490 {
9491 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9492 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9493 && known_eq (lowest_vf, vf))
9494 {
9495 niters_vector
9496 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9497 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9498 step_vector = build_one_cst (TREE_TYPE (niters));
9499 }
9500 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9501 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9502 &step_vector, niters_no_overflow);
9503 else
9504 /* vect_do_peeling subtracted the number of peeled prologue
9505 iterations from LOOP_VINFO_NITERS. */
9506 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9507 &niters_vector, &step_vector,
9508 niters_no_overflow);
9509 }
9510
9511 /* 1) Make sure the loop header has exactly two entries
9512 2) Make sure we have a preheader basic block. */
9513
9514 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9515
9516 split_edge (loop_preheader_edge (loop));
9517
9518 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9519 /* This will deal with any possible peeling. */
9520 vect_prepare_for_masked_peels (loop_vinfo);
9521
9522 /* Schedule the SLP instances first, then handle loop vectorization
9523 below. */
9524 if (!loop_vinfo->slp_instances.is_empty ())
9525 {
9526 DUMP_VECT_SCOPE ("scheduling SLP instances");
9527 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9528 }
9529
9530 /* FORNOW: the vectorizer supports only loops whose body consists
9531 of one basic block (header + empty latch). When the vectorizer
9532 supports more involved loop forms, the order in which the BBs are
9533 traversed will need to be reconsidered. */
9534
9535 for (i = 0; i < nbbs; i++)
9536 {
9537 basic_block bb = bbs[i];
9538 stmt_vec_info stmt_info;
9539
9540 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9541 gsi_next (&si))
9542 {
9543 gphi *phi = si.phi ();
9544 if (dump_enabled_p ())
9545 dump_printf_loc (MSG_NOTE, vect_location,
9546 "------>vectorizing phi: %G", phi);
9547 stmt_info = loop_vinfo->lookup_stmt (phi);
9548 if (!stmt_info)
9549 continue;
9550
9551 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9552 vect_loop_kill_debug_uses (loop, stmt_info);
9553
9554 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9555 && !STMT_VINFO_LIVE_P (stmt_info))
9556 continue;
9557
9558 if (STMT_VINFO_VECTYPE (stmt_info)
9559 && (maybe_ne
9560 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9561 && dump_enabled_p ())
9562 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9563
9564 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9565 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9566 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9567 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9568 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9569 && ! PURE_SLP_STMT (stmt_info))
9570 {
9571 if (dump_enabled_p ())
9572 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9573 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9574 }
9575 }
9576
9577 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9578 gsi_next (&si))
9579 {
9580 gphi *phi = si.phi ();
9581 stmt_info = loop_vinfo->lookup_stmt (phi);
9582 if (!stmt_info)
9583 continue;
9584
9585 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9586 && !STMT_VINFO_LIVE_P (stmt_info))
9587 continue;
9588
9589 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9590 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9591 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9592 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9593 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9594 && ! PURE_SLP_STMT (stmt_info))
9595 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9596 }
9597
9598 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9599 !gsi_end_p (si);)
9600 {
9601 stmt = gsi_stmt (si);
9602 /* During vectorization remove existing clobber stmts. */
9603 if (gimple_clobber_p (stmt))
9604 {
9605 unlink_stmt_vdef (stmt);
9606 gsi_remove (&si, true);
9607 release_defs (stmt);
9608 }
9609 else
9610 {
9611 /* Ignore vector stmts created in the outer loop. */
9612 stmt_info = loop_vinfo->lookup_stmt (stmt);
9613
9614 /* vector stmts created in the outer-loop during vectorization of
9615 stmts in an inner-loop may not have a stmt_info, and do not
9616 need to be vectorized. */
9617 stmt_vec_info seen_store = NULL;
9618 if (stmt_info)
9619 {
9620 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9621 {
9622 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9623 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9624 !gsi_end_p (subsi); gsi_next (&subsi))
9625 {
9626 stmt_vec_info pat_stmt_info
9627 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9628 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9629 &si, &seen_store);
9630 }
9631 stmt_vec_info pat_stmt_info
9632 = STMT_VINFO_RELATED_STMT (stmt_info);
9633 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9634 &si, &seen_store))
9635 maybe_set_vectorized_backedge_value (loop_vinfo,
9636 pat_stmt_info);
9637 }
9638 else
9639 {
9640 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9641 &seen_store))
9642 maybe_set_vectorized_backedge_value (loop_vinfo,
9643 stmt_info);
9644 }
9645 }
9646 gsi_next (&si);
9647 if (seen_store)
9648 {
9649 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9650 /* Interleaving. The vectorization of the
9651 interleaving chain was completed - free
9652 all the stores in the chain. */
9653 vect_remove_stores (loop_vinfo,
9654 DR_GROUP_FIRST_ELEMENT (seen_store));
9655 else
9656 /* Free the attached stmt_vec_info and remove the stmt. */
9657 loop_vinfo->remove_stmt (stmt_info);
9658 }
9659 }
9660 }
9661
9662 /* Stub out scalar statements that must not survive vectorization.
9663 Doing this here helps with grouped statements, or statements that
9664 are involved in patterns. */
9665 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9666 !gsi_end_p (gsi); gsi_next (&gsi))
9667 {
9668 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9669 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
9670 {
9671 tree lhs = gimple_get_lhs (call);
9672 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9673 {
9674 tree zero = build_zero_cst (TREE_TYPE (lhs));
9675 gimple *new_stmt = gimple_build_assign (lhs, zero);
9676 gsi_replace (&gsi, new_stmt, true);
9677 }
9678 }
9679 }
9680 } /* BBs in loop */
9681
9682 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
9683 a zero NITERS becomes a nonzero NITERS_VECTOR. */
9684 if (integer_onep (step_vector))
9685 niters_no_overflow = true;
9686 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9687 niters_vector_mult_vf, !niters_no_overflow);
9688
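 /* Each vector iteration now stands for ASSUMED_VF scalar iterations,
 so scale the loop body's profile accordingly. */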
9689 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9690 scale_profile_for_vect_loop (loop, assumed_vf);
9691
9692 /* True if the final iteration might not handle a full vector's
9693 worth of scalar iterations. */
9694 bool final_iter_may_be_partial
9695 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9696 /* The minimum number of iterations performed by the epilogue. This
9697 is 1 when peeling for gaps because we always need a final scalar
9698 iteration. */
9699 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9700 /* +1 to convert latch counts to loop iteration counts,
9701 -min_epilogue_iters to remove iterations that cannot be performed
9702 by the vector code. */
9703 int bias_for_lowest = 1 - min_epilogue_iters;
9704 int bias_for_assumed = bias_for_lowest;
9705 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9706 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9707 {
9708 /* When the amount of peeling is known at compile time, the first
9709 iteration will have exactly alignment_npeels active elements.
9710 In the worst case it will have at least one. */
9711 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9712 bias_for_lowest += lowest_vf - min_first_active;
9713 bias_for_assumed += assumed_vf - min_first_active;
9714 }
9715 /* In these calculations the "- 1" converts loop iteration counts
9716 back to latch counts. */
9717 if (loop->any_upper_bound)
9718 loop->nb_iterations_upper_bound
9719 = (final_iter_may_be_partial
9720 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9721 lowest_vf) - 1
9722 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9723 lowest_vf) - 1);
9724 if (loop->any_likely_upper_bound)
9725 loop->nb_iterations_likely_upper_bound
9726 = (final_iter_may_be_partial
9727 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9728 + bias_for_lowest, lowest_vf) - 1
9729 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9730 + bias_for_lowest, lowest_vf) - 1);
9731 if (loop->any_estimate)
9732 loop->nb_iterations_estimate
9733 = (final_iter_may_be_partial
9734 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9735 assumed_vf) - 1
9736 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9737 assumed_vf) - 1);
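 /* For example, with no peeling for gaps and no partial vectors,
 BIAS_FOR_LOWEST is 1; a scalar latch-count bound of 10 (at most 11
 iterations) and a LOWEST_VF of 4 then give (10 + 1) / 4 - 1 = 1,
 i.e. at most two vector iterations, with the remaining scalar
 iterations left to the epilogue. */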
9738
9739 if (dump_enabled_p ())
9740 {
9741 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9742 {
9743 dump_printf_loc (MSG_NOTE, vect_location,
9744 "LOOP VECTORIZED\n");
9745 if (loop->inner)
9746 dump_printf_loc (MSG_NOTE, vect_location,
9747 "OUTER LOOP VECTORIZED\n");
9748 dump_printf (MSG_NOTE, "\n");
9749 }
9750 else
9751 dump_printf_loc (MSG_NOTE, vect_location,
9752 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9753 GET_MODE_NAME (loop_vinfo->vector_mode));
9754 }
9755
9756 /* Loops vectorized with a variable factor won't benefit from
9757 unrolling/peeling. */
9758 if (!vf.is_constant ())
9759 {
9760 loop->unroll = 1;
9761 if (dump_enabled_p ())
9762 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9763 " variable-length vectorization factor\n");
9764 }
9765 /* Free SLP instances here because otherwise stmt reference counting
9766 won't work. */
9767 slp_instance instance;
9768 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9769 vect_free_slp_instance (instance);
9770 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9771 /* Clear the safelen field since its value is invalid after vectorization:
9772 the vectorized loop can have loop-carried dependencies. */
9773 loop->safelen = 0;
9774
9775 if (epilogue)
9776 {
9777 update_epilogue_loop_vinfo (epilogue, advance);
9778
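 /* Give the epilogue a chance to be vectorized in turn, inheriting
 the user-facing settings of the main loop. */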
9779 epilogue->simduid = loop->simduid;
9780 epilogue->force_vectorize = loop->force_vectorize;
9781 epilogue->dont_vectorize = false;
9782 }
9783
9784 return epilogue;
9785 }
9786
9787 /* The code below performs a simple optimization - it reverts
9788 if-conversion for masked stores: if the mask of a store is zero, the
9789 store and, where possible, the producers of the stored value are skipped.
9790 For example,
9791 for (i=0; i<n; i++)
9792 if (c[i])
9793 {
9794 p1[i] += 1;
9795 p2[i] = p3[i] + 2;
9796 }
9797 this transformation will produce the following semi-hammock:
9798
9799 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9800 {
9801 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9802 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9803 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9804 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9805 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9806 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9807 }
9808 */
9809
9810 void
9811 optimize_mask_stores (class loop *loop)
9812 {
9813 basic_block *bbs = get_loop_body (loop);
9814 unsigned nbbs = loop->num_nodes;
9815 unsigned i;
9816 basic_block bb;
9817 class loop *bb_loop;
9818 gimple_stmt_iterator gsi;
9819 gimple *stmt;
9820 auto_vec<gimple *> worklist;
9821 auto_purge_vect_location sentinel;
9822
9823 vect_location = find_loop_location (loop);
9824 /* Pick up all masked stores in the loop, if any. */
9825 for (i = 0; i < nbbs; i++)
9826 {
9827 bb = bbs[i];
9828 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9829 gsi_next (&gsi))
9830 {
9831 stmt = gsi_stmt (gsi);
9832 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9833 worklist.safe_push (stmt);
9834 }
9835 }
9836
9837 free (bbs);
9838 if (worklist.is_empty ())
9839 return;
9840
9841 /* Loop has masked stores. */
9842 while (!worklist.is_empty ())
9843 {
9844 gimple *last, *last_store;
9845 edge e, efalse;
9846 tree mask;
9847 basic_block store_bb, join_bb;
9848 gimple_stmt_iterator gsi_to;
9849 tree vdef, new_vdef;
9850 gphi *phi;
9851 tree vectype;
9852 tree zero;
9853
9854 last = worklist.pop ();
9855 mask = gimple_call_arg (last, 2);
9856 bb = gimple_bb (last);
9857 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
9858 to the same loop as if_bb. That loop can differ from LOOP when a
9859 two-level loop nest is vectorized and the mask_store belongs to the
9860 inner loop. */
9861 e = split_block (bb, last);
9862 bb_loop = bb->loop_father;
9863 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9864 join_bb = e->dest;
9865 store_bb = create_empty_bb (bb);
9866 add_bb_to_loop (store_bb, bb_loop);
9867 e->flags = EDGE_TRUE_VALUE;
9868 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9869 /* Put STORE_BB on the unlikely path. */
9870 efalse->probability = profile_probability::unlikely ();
9871 store_bb->count = efalse->count ();
9872 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9873 if (dom_info_available_p (CDI_DOMINATORS))
9874 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9875 if (dump_enabled_p ())
9876 dump_printf_loc (MSG_NOTE, vect_location,
9877 "Create new block %d to sink mask stores.",
9878 store_bb->index);
9879 /* Create vector comparison with boolean result. */
9880 vectype = TREE_TYPE (mask);
9881 zero = build_zero_cst (vectype);
9882 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9883 gsi = gsi_last_bb (bb);
9884 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
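 /* The true edge E (MASK all zeros) bypasses STORE_BB and goes
 straight to JOIN_BB; the false edge EFALSE enters STORE_BB. */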
9885 /* Create a new PHI node for the vdef of the last masked store:
9886 .MEM_2 = VDEF <.MEM_1>
9887 will be converted to
9888 .MEM_3 = VDEF <.MEM_1>
9889 and a new PHI node will be created in the join bb:
9890 .MEM_2 = PHI <.MEM_1, .MEM_3>
9891 */
9892 vdef = gimple_vdef (last);
9893 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9894 gimple_set_vdef (last, new_vdef);
9895 phi = create_phi_node (vdef, join_bb);
9896 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9897
9898 /* Put all masked stores with the same mask to STORE_BB if possible. */
9899 while (true)
9900 {
9901 gimple_stmt_iterator gsi_from;
9902 gimple *stmt1 = NULL;
9903
9904 /* Move masked store to STORE_BB. */
9905 last_store = last;
9906 gsi = gsi_for_stmt (last);
9907 gsi_from = gsi;
9908 /* Shift GSI to the previous stmt for further traversal. */
9909 gsi_prev (&gsi);
9910 gsi_to = gsi_start_bb (store_bb);
9911 gsi_move_before (&gsi_from, &gsi_to);
9912 /* Set GSI_TO to the start of the (now non-empty) STORE_BB. */
9913 gsi_to = gsi_start_bb (store_bb);
9914 if (dump_enabled_p ())
9915 dump_printf_loc (MSG_NOTE, vect_location,
9916 "Move stmt to created bb\n%G", last);
9917 /* Move all stored value producers if possible. */
9918 while (!gsi_end_p (gsi))
9919 {
9920 tree lhs;
9921 imm_use_iterator imm_iter;
9922 use_operand_p use_p;
9923 bool res;
9924
9925 /* Skip debug statements. */
9926 if (is_gimple_debug (gsi_stmt (gsi)))
9927 {
9928 gsi_prev (&gsi);
9929 continue;
9930 }
9931 stmt1 = gsi_stmt (gsi);
9932 /* Do not consider statements writing to memory or having a
9933 volatile operand. */
9934 if (gimple_vdef (stmt1)
9935 || gimple_has_volatile_ops (stmt1))
9936 break;
9937 gsi_from = gsi;
9938 gsi_prev (&gsi);
9939 lhs = gimple_get_lhs (stmt1);
9940 if (!lhs)
9941 break;
9942
9943 /* LHS of vectorized stmt must be SSA_NAME. */
9944 if (TREE_CODE (lhs) != SSA_NAME)
9945 break;
9946
9947 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9948 {
9949 /* Remove dead scalar statement. */
9950 if (has_zero_uses (lhs))
9951 {
9952 gsi_remove (&gsi_from, true);
9953 continue;
9954 }
9955 }
9956
9957 /* Check that LHS does not have uses outside of STORE_BB. */
9958 res = true;
9959 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9960 {
9961 gimple *use_stmt;
9962 use_stmt = USE_STMT (use_p);
9963 if (is_gimple_debug (use_stmt))
9964 continue;
9965 if (gimple_bb (use_stmt) != store_bb)
9966 {
9967 res = false;
9968 break;
9969 }
9970 }
9971 if (!res)
9972 break;
9973
9974 if (gimple_vuse (stmt1)
9975 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9976 break;
9977
9978 /* Can move STMT1 to STORE_BB. */
9979 if (dump_enabled_p ())
9980 dump_printf_loc (MSG_NOTE, vect_location,
9981 "Move stmt to created bb\n%G", stmt1);
9982 gsi_move_before (&gsi_from, &gsi_to);
9983 /* Shift GSI_TO for further insertion. */
9984 gsi_prev (&gsi_to);
9985 }
9986 /* Put other masked stores with the same mask to STORE_BB. */
9987 if (worklist.is_empty ()
9988 || gimple_call_arg (worklist.last (), 2) != mask
9989 || worklist.last () != stmt1)
9990 break;
9991 last = worklist.pop ();
9992 }
9993 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9994 }
9995 }
9996
9997 /* Decide whether it is possible to use a zero-based induction variable
9998 when vectorizing LOOP_VINFO with partial vectors. If it is, return
9999 the value that the induction variable must be able to hold in order
10000 to ensure that the rgroups eventually have no active vector elements.
10001 Return -1 otherwise. */
10002
10003 widest_int
10004 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10005 {
10006 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10007 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10008 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10009
10010 /* Calculate the value that the induction variable must be able
10011 to hit in order to ensure that we end the loop with an all-false mask.
10012 This involves adding the maximum number of inactive trailing scalar
10013 iterations. */
10014 widest_int iv_limit = -1;
10015 if (max_loop_iterations (loop, &iv_limit))
10016 {
10017 if (niters_skip)
10018 {
10019 /* Add the maximum number of skipped iterations to the
10020 maximum iteration count. */
10021 if (TREE_CODE (niters_skip) == INTEGER_CST)
10022 iv_limit += wi::to_widest (niters_skip);
10023 else
10024 iv_limit += max_vf - 1;
10025 }
10026 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10027 /* Make a conservatively-correct assumption. */
10028 iv_limit += max_vf - 1;
10029
10030 /* IV_LIMIT is the maximum number of latch iterations, which is also
10031 the maximum in-range IV value. Round this value down to the previous
10032 vector alignment boundary and then add an extra full iteration. */
10033 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10034 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
10035 }
10036 return iv_limit;
10037 }
10038
10039 /* For the given rgroup_controls RGC, check whether an induction variable
10040 would ever hit a value that produces a set of all-false masks or zero
10041 lengths before wrapping around. Return true if it's possible to wrap
10042 around before hitting the desirable value, otherwise return false. */
10043
10044 bool
10045 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10046 {
10047 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10048
10049 if (iv_limit == -1)
10050 return true;
10051
10052 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10053 unsigned int compare_precision = TYPE_PRECISION (compare_type);
10054 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
10055
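 /* The IV advances by NITEMS scalar items per iteration, so its final
 value can be as large as IV_LIMIT * NITEMS; if representing that
 value needs more bits than COMPARE_TYPE provides, the IV might wrap
 before reaching an all-false mask or zero length. */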
10056 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10057 return true;
10058
10059 return false;
10060 }