1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it were manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
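     As an informal illustration (a sketch of the idea rather than a quote of
     the actual implementation), the support check for the V8HI addition in
     the example loop above boils down to:

       if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
         return false;   // no target support; the stmt cannot be vectorized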
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
183
184 if (stmt_vectype)
185 {
186 if (STMT_VINFO_VECTYPE (stmt_info))
187 /* The only case when a vectype had been already set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return true;
203 }
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
211
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info > *mask_producers)
215 {
216 vec_info *vinfo = stmt_info->vinfo;
217 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
219 stmt_info->stmt);
220 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
221 return false;
222
223 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
224 && STMT_VINFO_RELATED_STMT (stmt_info))
225 {
226 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
227 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
228
229 /* If a pattern statement has def stmts, analyze them too. */
230 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
231 !gsi_end_p (si); gsi_next (&si))
232 {
233 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
234 if (dump_enabled_p ())
235 dump_printf_loc (MSG_NOTE, vect_location,
236 "==> examining pattern def stmt: %G",
237 def_stmt_info->stmt);
238 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
239 vf, mask_producers))
240 return false;
241 }
242
243 if (dump_enabled_p ())
244 dump_printf_loc (MSG_NOTE, vect_location,
245 "==> examining pattern statement: %G",
246 stmt_info->stmt);
247 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
248 return false;
249 }
250
251 return true;
252 }
253
254 /* Function vect_determine_vectorization_factor
255
256 Determine the vectorization factor (VF). VF is the number of data elements
257 that are operated upon in parallel in a single iteration of the vectorized
258 loop. For example, when vectorizing a loop that operates on 4-byte elements,
259 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
260 elements can fit in a single vector register.
261
262 We currently support vectorization of loops in which all types operated upon
263 are of the same size. Therefore this function currently sets VF according to
264 the size of the types operated upon, and fails if there are multiple sizes
265 in the loop.
266
267 VF is also the factor by which the loop iterations are strip-mined, e.g.:
268 original loop:
269 for (i=0; i<N; i++){
270 a[i] = b[i] + c[i];
271 }
272
273 vectorized loop:
274 for (i=0; i<N; i+=VF){
275 a[i:VF] = b[i:VF] + c[i:VF];
276 }
277 */
278
279 static bool
280 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 {
282 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
283 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
284 unsigned nbbs = loop->num_nodes;
285 poly_uint64 vectorization_factor = 1;
286 tree scalar_type = NULL_TREE;
287 gphi *phi;
288 tree vectype;
289 stmt_vec_info stmt_info;
290 unsigned i;
291 auto_vec<stmt_vec_info> mask_producers;
292
293 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
294
295 for (i = 0; i < nbbs; i++)
296 {
297 basic_block bb = bbs[i];
298
299 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
300 gsi_next (&si))
301 {
302 phi = si.phi ();
303 stmt_info = loop_vinfo->lookup_stmt (phi);
304 if (dump_enabled_p ())
305 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
306 phi);
307
308 gcc_assert (stmt_info);
309
310 if (STMT_VINFO_RELEVANT_P (stmt_info)
311 || STMT_VINFO_LIVE_P (stmt_info))
312 {
313 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
314 scalar_type = TREE_TYPE (PHI_RESULT (phi));
315
316 if (dump_enabled_p ())
317 dump_printf_loc (MSG_NOTE, vect_location,
318 "get vectype for scalar type: %T\n",
319 scalar_type);
320
321 vectype = get_vectype_for_scalar_type (scalar_type);
322 if (!vectype)
323 {
324 if (dump_enabled_p ())
325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
326 "not vectorized: unsupported "
327 "data-type %T\n",
328 scalar_type);
329 return false;
330 }
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
332
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
336
337 if (dump_enabled_p ())
338 {
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
342 }
343
344 vect_update_max_nunits (&vectorization_factor, vectype);
345 }
346 }
347
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
350 {
351 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
352 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
353 &mask_producers))
354 return false;
355 }
356 }
357
358 /* TODO: Analyze cost. Decide if worth while to vectorize. */
359 if (dump_enabled_p ())
360 {
361 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
362 dump_dec (MSG_NOTE, vectorization_factor);
363 dump_printf (MSG_NOTE, "\n");
364 }
365
366 if (known_le (vectorization_factor, 1U))
367 {
368 if (dump_enabled_p ())
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "not vectorized: unsupported data-type\n");
371 return false;
372 }
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374
375 for (i = 0; i < mask_producers.length (); i++)
376 {
377 stmt_info = mask_producers[i];
378 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
379 if (!mask_type)
380 return false;
381 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
382 }
383
384 return true;
385 }
386
387
388 /* Function vect_is_simple_iv_evolution.
389
390 FORNOW: A simple evolution of an induction variable in the loop is
391 considered a polynomial evolution. */
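/* An illustrative example (informal, not tied to a particular caller): for a
   counter that advances as "i += 4" on every iteration, scev describes the
   access function as the chrec {0, +, 4}_loop, so *INIT becomes 0 and *STEP
   becomes 4.  A step that is itself a chrec (a polynomial of degree >= 2) is
   rejected below.  */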
392
393 static bool
394 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
395 tree * step)
396 {
397 tree init_expr;
398 tree step_expr;
399 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
400 basic_block bb;
401
402 /* When there is no evolution in this loop, the evolution function
403 is not "simple". */
404 if (evolution_part == NULL_TREE)
405 return false;
406
407 /* When the evolution is a polynomial of degree >= 2
408 the evolution function is not "simple". */
409 if (tree_is_chrec (evolution_part))
410 return false;
411
412 step_expr = evolution_part;
413 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
414
415 if (dump_enabled_p ())
416 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
417 step_expr, init_expr);
418
419 *init = init_expr;
420 *step = step_expr;
421
422 if (TREE_CODE (step_expr) != INTEGER_CST
423 && (TREE_CODE (step_expr) != SSA_NAME
424 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
425 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
426 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
427 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
428 || !flag_associative_math)))
429 && (TREE_CODE (step_expr) != REAL_CST
430 || !flag_associative_math))
431 {
432 if (dump_enabled_p ())
433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
434 "step unknown.\n");
435 return false;
436 }
437
438 return true;
439 }
440
441 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
442 what we are assuming is a double reduction. For example, given
443 a structure like this:
444
445 outer1:
446 x_1 = PHI <x_4(outer2), ...>;
447 ...
448
449 inner:
450 x_2 = PHI <x_1(outer1), ...>;
451 ...
452 x_3 = ...;
453 ...
454
455 outer2:
456 x_4 = PHI <x_3(inner)>;
457 ...
458
459 outer loop analysis would treat x_1 as a double reduction phi and
460 this function would then return true for x_2. */
461
462 static bool
463 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
464 {
465 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
466 use_operand_p use_p;
467 ssa_op_iter op_iter;
468 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
469 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
470 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
471 return true;
472 return false;
473 }
474
475 /* Function vect_analyze_scalar_cycles_1.
476
477 Examine the cross iteration def-use cycles of scalar variables
478 in LOOP. LOOP_VINFO represents the loop that is now being
479 considered for vectorization (can be LOOP, or an outer-loop
480 enclosing LOOP). */
481
482 static void
483 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
484 {
485 basic_block bb = loop->header;
486 tree init, step;
487 auto_vec<stmt_vec_info, 64> worklist;
488 gphi_iterator gsi;
489 bool double_reduc;
490
491 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
492
493 /* First - identify all inductions. Reduction detection assumes that all the
494 inductions have been identified, therefore, this order must not be
495 changed. */
496 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
497 {
498 gphi *phi = gsi.phi ();
499 tree access_fn = NULL;
500 tree def = PHI_RESULT (phi);
501 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
502
503 if (dump_enabled_p ())
504 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
505
506 /* Skip virtual phi's. The data dependences that are associated with
507 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
508 if (virtual_operand_p (def))
509 continue;
510
511 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
512
513 /* Analyze the evolution function. */
514 access_fn = analyze_scalar_evolution (loop, def);
515 if (access_fn)
516 {
517 STRIP_NOPS (access_fn);
518 if (dump_enabled_p ())
519 dump_printf_loc (MSG_NOTE, vect_location,
520 "Access function of PHI: %T\n", access_fn);
521 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
522 = initial_condition_in_loop_num (access_fn, loop->num);
523 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
524 = evolution_part_in_loop_num (access_fn, loop->num);
525 }
526
527 if (!access_fn
528 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
529 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
530 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
531 && TREE_CODE (step) != INTEGER_CST))
532 {
533 worklist.safe_push (stmt_vinfo);
534 continue;
535 }
536
537 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
538 != NULL_TREE);
539 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
540
541 if (dump_enabled_p ())
542 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
543 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
544 }
545
546
547 /* Second - identify all reductions and nested cycles. */
548 while (worklist.length () > 0)
549 {
550 stmt_vec_info stmt_vinfo = worklist.pop ();
551 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
552 tree def = PHI_RESULT (phi);
553
554 if (dump_enabled_p ())
555 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
556
557 gcc_assert (!virtual_operand_p (def)
558 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
559
560 stmt_vec_info reduc_stmt_info
561 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
562 &double_reduc, false);
563 if (reduc_stmt_info)
564 {
565 if (double_reduc)
566 {
567 if (dump_enabled_p ())
568 dump_printf_loc (MSG_NOTE, vect_location,
569 "Detected double reduction.\n");
570
571 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
572 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
573 = vect_double_reduction_def;
574 }
575 else
576 {
577 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
578 {
579 if (dump_enabled_p ())
580 dump_printf_loc (MSG_NOTE, vect_location,
581 "Detected vectorizable nested cycle.\n");
582
583 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
584 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
585 }
586 else
587 {
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_NOTE, vect_location,
590 "Detected reduction.\n");
591
592 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
593 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
594 /* Store the reduction cycles for possible vectorization in
595 loop-aware SLP if it was not detected as reduction
596 chain. */
597 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
598 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
599 (reduc_stmt_info);
600 }
601 }
602 }
603 else
604 if (dump_enabled_p ())
605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
606 "Unknown def-use cycle pattern.\n");
607 }
608 }
609
610
611 /* Function vect_analyze_scalar_cycles.
612
613 Examine the cross iteration def-use cycles of scalar variables, by
614 analyzing the loop-header PHIs of scalar variables. Classify each
615 cycle as one of the following: invariant, induction, reduction, unknown.
616 We do that for the loop represented by LOOP_VINFO, and also for its
617 inner-loop, if it exists.
618 Examples for scalar cycles:
619
620 Example1: reduction:
621
622 loop1:
623 for (i=0; i<N; i++)
624 sum += a[i];
625
626 Example2: induction:
627
628 loop2:
629 for (i=0; i<N; i++)
630 a[i] = i; */
631
632 static void
633 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
634 {
635 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
636
637 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
638
639 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
640 Reductions in such inner-loop therefore have different properties than
641 the reductions in the nest that gets vectorized:
642 1. When vectorized, they are executed in the same order as in the original
643 scalar loop, so we can't change the order of computation when
644 vectorizing them.
645 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
646 current checks are too strict. */
647
648 if (loop->inner)
649 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
650 }
651
652 /* Transfer group and reduction information from STMT_INFO to its
653 pattern stmt. */
654
655 static void
656 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
657 {
658 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
659 stmt_vec_info stmtp;
660 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
661 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
662 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
663 do
664 {
665 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
666 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
667 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
668 if (stmt_info)
669 REDUC_GROUP_NEXT_ELEMENT (stmtp)
670 = STMT_VINFO_RELATED_STMT (stmt_info);
671 }
672 while (stmt_info);
673 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
674 }
675
676 /* Fixup scalar cycles that now have their stmts detected as patterns. */
677
678 static void
679 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
680 {
681 stmt_vec_info first;
682 unsigned i;
683
684 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
685 if (STMT_VINFO_IN_PATTERN_P (first))
686 {
687 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
688 while (next)
689 {
690 if (! STMT_VINFO_IN_PATTERN_P (next))
691 break;
692 next = REDUC_GROUP_NEXT_ELEMENT (next);
693 }
694 /* If not all stmts in the chain are patterns, try to handle
695 the chain without patterns. */
696 if (! next)
697 {
698 vect_fixup_reduc_chain (first);
699 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
700 = STMT_VINFO_RELATED_STMT (first);
701 }
702 }
703 }
704
705 /* Function vect_get_loop_niters.
706
707 Determine how many iterations the loop is executed and place it
708 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
709 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
710 niter information holds in ASSUMPTIONS.
711
712 Return the loop exit condition. */
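/* Informal example: for "do { ... } while (++i < n)" entered with i == 0 and
   n > 0, the latch runs n - 1 times, so NUMBER_OF_ITERATIONSM1 is n - 1 and
   NUMBER_OF_ITERATIONS (the number of header executions) is n.  */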
713
714
715 static gcond *
716 vect_get_loop_niters (struct loop *loop, tree *assumptions,
717 tree *number_of_iterations, tree *number_of_iterationsm1)
718 {
719 edge exit = single_exit (loop);
720 struct tree_niter_desc niter_desc;
721 tree niter_assumptions, niter, may_be_zero;
722 gcond *cond = get_loop_exit_condition (loop);
723
724 *assumptions = boolean_true_node;
725 *number_of_iterationsm1 = chrec_dont_know;
726 *number_of_iterations = chrec_dont_know;
727 DUMP_VECT_SCOPE ("get_loop_niters");
728
729 if (!exit)
730 return cond;
731
732 niter = chrec_dont_know;
733 may_be_zero = NULL_TREE;
734 niter_assumptions = boolean_true_node;
735 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
736 || chrec_contains_undetermined (niter_desc.niter))
737 return cond;
738
739 niter_assumptions = niter_desc.assumptions;
740 may_be_zero = niter_desc.may_be_zero;
741 niter = niter_desc.niter;
742
743 if (may_be_zero && integer_zerop (may_be_zero))
744 may_be_zero = NULL_TREE;
745
746 if (may_be_zero)
747 {
748 if (COMPARISON_CLASS_P (may_be_zero))
749 {
750 /* Try to combine may_be_zero with assumptions, this can simplify
751 computation of niter expression. */
752 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
753 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
754 niter_assumptions,
755 fold_build1 (TRUTH_NOT_EXPR,
756 boolean_type_node,
757 may_be_zero));
758 else
759 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
760 build_int_cst (TREE_TYPE (niter), 0),
761 rewrite_to_non_trapping_overflow (niter));
762
763 may_be_zero = NULL_TREE;
764 }
765 else if (integer_nonzerop (may_be_zero))
766 {
767 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
768 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
769 return cond;
770 }
771 else
772 return cond;
773 }
774
775 *assumptions = niter_assumptions;
776 *number_of_iterationsm1 = niter;
777
778 /* We want the number of loop header executions which is the number
779 of latch executions plus one.
780 ??? For UINT_MAX latch executions this number overflows to zero
781 for loops like do { n++; } while (n != 0); */
782 if (niter && !chrec_contains_undetermined (niter))
783 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
784 build_int_cst (TREE_TYPE (niter), 1));
785 *number_of_iterations = niter;
786
787 return cond;
788 }
789
790 /* Function bb_in_loop_p
791
792 Used as predicate for dfs order traversal of the loop bbs. */
793
794 static bool
795 bb_in_loop_p (const_basic_block bb, const void *data)
796 {
797 const struct loop *const loop = (const struct loop *)data;
798 if (flow_bb_inside_loop_p (loop, bb))
799 return true;
800 return false;
801 }
802
803
804 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
805 stmt_vec_info structs for all the stmts in LOOP_IN. */
806
807 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
808 : vec_info (vec_info::loop, init_cost (loop_in), shared),
809 loop (loop_in),
810 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
811 num_itersm1 (NULL_TREE),
812 num_iters (NULL_TREE),
813 num_iters_unchanged (NULL_TREE),
814 num_iters_assumptions (NULL_TREE),
815 th (0),
816 versioning_threshold (0),
817 vectorization_factor (0),
818 max_vectorization_factor (0),
819 mask_skip_niters (NULL_TREE),
820 mask_compare_type (NULL_TREE),
821 unaligned_dr (NULL),
822 peeling_for_alignment (0),
823 ptr_mask (0),
824 ivexpr_map (NULL),
825 slp_unrolling_factor (1),
826 single_scalar_iteration_cost (0),
827 vectorizable (false),
828 can_fully_mask_p (true),
829 fully_masked_p (false),
830 peeling_for_gaps (false),
831 peeling_for_niter (false),
832 operands_swapped (false),
833 no_data_dependencies (false),
834 has_mask_store (false),
835 scalar_loop (NULL),
836 orig_loop_info (NULL)
837 {
838 /* CHECKME: We want to visit all BBs before their successors (except for
839 latch blocks, for which this assertion wouldn't hold). In the simple
840 case of the loop forms we allow, a dfs order of the BBs would be the same
841 as reversed postorder traversal, so we are safe. */
842
843 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
844 bbs, loop->num_nodes, loop);
845 gcc_assert (nbbs == loop->num_nodes);
846
847 for (unsigned int i = 0; i < nbbs; i++)
848 {
849 basic_block bb = bbs[i];
850 gimple_stmt_iterator si;
851
852 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
853 {
854 gimple *phi = gsi_stmt (si);
855 gimple_set_uid (phi, 0);
856 add_stmt (phi);
857 }
858
859 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
860 {
861 gimple *stmt = gsi_stmt (si);
862 gimple_set_uid (stmt, 0);
863 add_stmt (stmt);
864 }
865 }
866 }
867
868 /* Free all levels of MASKS. */
869
870 void
871 release_vec_loop_masks (vec_loop_masks *masks)
872 {
873 rgroup_masks *rgm;
874 unsigned int i;
875 FOR_EACH_VEC_ELT (*masks, i, rgm)
876 rgm->masks.release ();
877 masks->release ();
878 }
879
880 /* Free all memory used by the _loop_vec_info, as well as all the
881 stmt_vec_info structs of all the stmts in the loop. */
882
883 _loop_vec_info::~_loop_vec_info ()
884 {
885 int nbbs;
886 gimple_stmt_iterator si;
887 int j;
888
889 nbbs = loop->num_nodes;
890 for (j = 0; j < nbbs; j++)
891 {
892 basic_block bb = bbs[j];
893 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
894 {
895 gimple *stmt = gsi_stmt (si);
896
897 /* We may have broken canonical form by moving a constant
898 into RHS1 of a commutative op. Fix such occurrences. */
899 if (operands_swapped && is_gimple_assign (stmt))
900 {
901 enum tree_code code = gimple_assign_rhs_code (stmt);
902
903 if ((code == PLUS_EXPR
904 || code == POINTER_PLUS_EXPR
905 || code == MULT_EXPR)
906 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
907 swap_ssa_operands (stmt,
908 gimple_assign_rhs1_ptr (stmt),
909 gimple_assign_rhs2_ptr (stmt));
910 else if (code == COND_EXPR
911 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
912 {
913 tree cond_expr = gimple_assign_rhs1 (stmt);
914 enum tree_code cond_code = TREE_CODE (cond_expr);
915
916 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
917 {
918 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
919 0));
920 cond_code = invert_tree_comparison (cond_code,
921 honor_nans);
922 if (cond_code != ERROR_MARK)
923 {
924 TREE_SET_CODE (cond_expr, cond_code);
925 swap_ssa_operands (stmt,
926 gimple_assign_rhs2_ptr (stmt),
927 gimple_assign_rhs3_ptr (stmt));
928 }
929 }
930 }
931 }
932 gsi_next (&si);
933 }
934 }
935
936 free (bbs);
937
938 release_vec_loop_masks (&masks);
939 delete ivexpr_map;
940
941 loop->aux = NULL;
942 }
943
944 /* Return an invariant or register for EXPR and emit necessary
945 computations in the LOOP_VINFO loop preheader. */
946
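/* Informal example: if two callers both ask for the (hypothetical) expression
   "n_ * 4", the first call gimplifies it to a fresh SSA name computed on the
   preheader edge, and the second call returns that cached SSA name instead of
   emitting the computation again.  */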
947 tree
948 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
949 {
950 if (is_gimple_reg (expr)
951 || is_gimple_min_invariant (expr))
952 return expr;
953
954 if (! loop_vinfo->ivexpr_map)
955 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
956 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
957 if (! cached)
958 {
959 gimple_seq stmts = NULL;
960 cached = force_gimple_operand (unshare_expr (expr),
961 &stmts, true, NULL_TREE);
962 if (stmts)
963 {
964 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
965 gsi_insert_seq_on_edge_immediate (e, stmts);
966 }
967 }
968 return cached;
969 }
970
971 /* Return true if we can use CMP_TYPE as the comparison type to produce
972 all masks required to mask LOOP_VINFO. */
973
974 static bool
975 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
976 {
977 rgroup_masks *rgm;
978 unsigned int i;
979 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
980 if (rgm->mask_type != NULL_TREE
981 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
982 cmp_type, rgm->mask_type,
983 OPTIMIZE_FOR_SPEED))
984 return false;
985 return true;
986 }
987
988 /* Calculate the maximum number of scalars per iteration for every
989 rgroup in LOOP_VINFO. */
990
991 static unsigned int
992 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
993 {
994 unsigned int res = 1;
995 unsigned int i;
996 rgroup_masks *rgm;
997 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
998 res = MAX (res, rgm->max_nscalars_per_iter);
999 return res;
1000 }
1001
1002 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1003 whether we can actually generate the masks required. Return true if so,
1004 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1005
1006 static bool
1007 vect_verify_full_masking (loop_vec_info loop_vinfo)
1008 {
1009 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1010 unsigned int min_ni_width;
1011
1012 /* Use a normal loop if there are no statements that need masking.
1013 This only happens in rare degenerate cases: it means that the loop
1014 has no loads, no stores, and no live-out values. */
1015 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1016 return false;
1017
1018 /* Get the maximum number of iterations that is representable
1019 in the counter type. */
1020 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1021 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1022
1023 /* Get a more refined estimate for the number of iterations. */
1024 widest_int max_back_edges;
1025 if (max_loop_iterations (loop, &max_back_edges))
1026 max_ni = wi::smin (max_ni, max_back_edges + 1);
1027
1028 /* Account for rgroup masks, in which each bit is replicated N times. */
1029 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1030
1031 /* Work out how many bits we need to represent the limit. */
1032 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
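  /* Illustrative only (made-up numbers): if the loop could run for up to
     2^32 iterations and the widest rgroup handles two scalars per iteration,
     max_ni is 2^33 and min_ni_width is 34, so a 32-bit comparison type will
     be skipped by the search below.  */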
1033
1034 /* Find a scalar mode for which WHILE_ULT is supported. */
1035 opt_scalar_int_mode cmp_mode_iter;
1036 tree cmp_type = NULL_TREE;
1037 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1038 {
1039 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1040 if (cmp_bits >= min_ni_width
1041 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1042 {
1043 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1044 if (this_type
1045 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1046 {
1047 /* Although we could stop as soon as we find a valid mode,
1048 it's often better to continue until we hit Pmode, since the
1049 operands to the WHILE are more likely to be reusable in
1050 address calculations. */
1051 cmp_type = this_type;
1052 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1053 break;
1054 }
1055 }
1056 }
1057
1058 if (!cmp_type)
1059 return false;
1060
1061 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1062 return true;
1063 }
1064
1065 /* Calculate the cost of one scalar iteration of the loop. */
1066 static void
1067 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1068 {
1069 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1070 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1071 int nbbs = loop->num_nodes, factor;
1072 int innerloop_iters, i;
1073
1074 /* Gather costs for statements in the scalar loop. */
1075
1076 /* FORNOW. */
1077 innerloop_iters = 1;
1078 if (loop->inner)
1079 innerloop_iters = 50; /* FIXME */
1080
1081 for (i = 0; i < nbbs; i++)
1082 {
1083 gimple_stmt_iterator si;
1084 basic_block bb = bbs[i];
1085
1086 if (bb->loop_father == loop->inner)
1087 factor = innerloop_iters;
1088 else
1089 factor = 1;
1090
1091 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1092 {
1093 gimple *stmt = gsi_stmt (si);
1094 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1095
1096 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1097 continue;
1098
1099 /* Skip stmts that are not vectorized inside the loop. */
1100 if (stmt_info
1101 && !STMT_VINFO_RELEVANT_P (stmt_info)
1102 && (!STMT_VINFO_LIVE_P (stmt_info)
1103 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1104 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1105 continue;
1106
1107 vect_cost_for_stmt kind;
1108 if (STMT_VINFO_DATA_REF (stmt_info))
1109 {
1110 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1111 kind = scalar_load;
1112 else
1113 kind = scalar_store;
1114 }
1115 else
1116 kind = scalar_stmt;
1117
1118 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1119 factor, kind, stmt_info, 0, vect_prologue);
1120 }
1121 }
1122
1123 /* Now accumulate cost. */
1124 void *target_cost_data = init_cost (loop);
1125 stmt_info_for_cost *si;
1126 int j;
1127 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1128 j, si)
1129 (void) add_stmt_cost (target_cost_data, si->count,
1130 si->kind, si->stmt_info, si->misalign,
1131 vect_body);
1132 unsigned dummy, body_cost = 0;
1133 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1134 destroy_cost_data (target_cost_data);
1135 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1136 }
1137
1138
1139 /* Function vect_analyze_loop_form_1.
1140
1141 Verify that certain CFG restrictions hold, including:
1142 - the loop has a pre-header
1143 - the loop has a single entry and exit
1144 - the loop exit condition is simple enough
1145 - the number of iterations can be analyzed, i.e., a countable loop. The
1146 niter could be analyzed under some assumptions. */
1147
1148 bool
1149 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1150 tree *assumptions, tree *number_of_iterationsm1,
1151 tree *number_of_iterations, gcond **inner_loop_cond)
1152 {
1153 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1154
1155 /* Different restrictions apply when we are considering an inner-most loop,
1156 vs. an outer (nested) loop.
1157 (FORNOW. May want to relax some of these restrictions in the future). */
1158
1159 if (!loop->inner)
1160 {
1161 /* Inner-most loop. We currently require that the number of BBs is
1162 exactly 2 (the header and latch). Vectorizable inner-most loops
1163 look like this:
1164
1165 (pre-header)
1166 |
1167 header <--------+
1168 | | |
1169 | +--> latch --+
1170 |
1171 (exit-bb) */
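      /* Informally, a simple counted loop such as
           for (i = 0; i < n; i++) a[i] = b[i] + c[i];
         typically reaches this point in exactly that shape: one header block
         holding the work and the exit test, plus an empty latch.  */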
1172
1173 if (loop->num_nodes != 2)
1174 {
1175 if (dump_enabled_p ())
1176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1177 "not vectorized: control flow in loop.\n");
1178 return false;
1179 }
1180
1181 if (empty_block_p (loop->header))
1182 {
1183 if (dump_enabled_p ())
1184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1185 "not vectorized: empty loop.\n");
1186 return false;
1187 }
1188 }
1189 else
1190 {
1191 struct loop *innerloop = loop->inner;
1192 edge entryedge;
1193
1194 /* Nested loop. We currently require that the loop is doubly-nested,
1195 contains a single inner loop, and the number of BBs is exactly 5.
1196 Vectorizable outer-loops look like this:
1197
1198 (pre-header)
1199 |
1200 header <---+
1201 | |
1202 inner-loop |
1203 | |
1204 tail ------+
1205 |
1206 (exit-bb)
1207
1208 The inner-loop has the properties expected of inner-most loops
1209 as described above. */
1210
1211 if ((loop->inner)->inner || (loop->inner)->next)
1212 {
1213 if (dump_enabled_p ())
1214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1215 "not vectorized: multiple nested loops.\n");
1216 return false;
1217 }
1218
1219 if (loop->num_nodes != 5)
1220 {
1221 if (dump_enabled_p ())
1222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1223 "not vectorized: control flow in loop.\n");
1224 return false;
1225 }
1226
1227 entryedge = loop_preheader_edge (innerloop);
1228 if (entryedge->src != loop->header
1229 || !single_exit (innerloop)
1230 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1231 {
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234 "not vectorized: unsupported outerloop form.\n");
1235 return false;
1236 }
1237
1238 /* Analyze the inner-loop. */
1239 tree inner_niterm1, inner_niter, inner_assumptions;
1240 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1241 &inner_assumptions, &inner_niterm1,
1242 &inner_niter, NULL)
1243 /* Don't support analyzing niter under assumptions for inner
1244 loop. */
1245 || !integer_onep (inner_assumptions))
1246 {
1247 if (dump_enabled_p ())
1248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249 "not vectorized: Bad inner loop.\n");
1250 return false;
1251 }
1252
1253 if (!expr_invariant_in_loop_p (loop, inner_niter))
1254 {
1255 if (dump_enabled_p ())
1256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1257 "not vectorized: inner-loop count not"
1258 " invariant.\n");
1259 return false;
1260 }
1261
1262 if (dump_enabled_p ())
1263 dump_printf_loc (MSG_NOTE, vect_location,
1264 "Considering outer-loop vectorization.\n");
1265 }
1266
1267 if (!single_exit (loop)
1268 || EDGE_COUNT (loop->header->preds) != 2)
1269 {
1270 if (dump_enabled_p ())
1271 {
1272 if (!single_exit (loop))
1273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1274 "not vectorized: multiple exits.\n");
1275 else if (EDGE_COUNT (loop->header->preds) != 2)
1276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1277 "not vectorized: too many incoming edges.\n");
1278 }
1279 return false;
1280 }
1281
1282 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1283 that the loop is represented as a do-while (with a proper if-guard
1284 before the loop if needed), where the loop header contains all the
1285 executable statements, and the latch is empty. */
1286 if (!empty_block_p (loop->latch)
1287 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1288 {
1289 if (dump_enabled_p ())
1290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291 "not vectorized: latch block not empty.\n");
1292 return false;
1293 }
1294
1295 /* Make sure the exit is not abnormal. */
1296 edge e = single_exit (loop);
1297 if (e->flags & EDGE_ABNORMAL)
1298 {
1299 if (dump_enabled_p ())
1300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1301 "not vectorized: abnormal loop exit edge.\n");
1302 return false;
1303 }
1304
1305 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1306 number_of_iterationsm1);
1307 if (!*loop_cond)
1308 {
1309 if (dump_enabled_p ())
1310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1311 "not vectorized: complicated exit condition.\n");
1312 return false;
1313 }
1314
1315 if (integer_zerop (*assumptions)
1316 || !*number_of_iterations
1317 || chrec_contains_undetermined (*number_of_iterations))
1318 {
1319 if (dump_enabled_p ())
1320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1321 "not vectorized: number of iterations cannot be "
1322 "computed.\n");
1323 return false;
1324 }
1325
1326 if (integer_zerop (*number_of_iterations))
1327 {
1328 if (dump_enabled_p ())
1329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1330 "not vectorized: number of iterations = 0.\n");
1331 return false;
1332 }
1333
1334 return true;
1335 }
1336
1337 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1338
1339 loop_vec_info
1340 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1341 {
1342 tree assumptions, number_of_iterations, number_of_iterationsm1;
1343 gcond *loop_cond, *inner_loop_cond = NULL;
1344
1345 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1346 &assumptions, &number_of_iterationsm1,
1347 &number_of_iterations, &inner_loop_cond))
1348 return NULL;
1349
1350 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1351 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1352 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1353 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1354 if (!integer_onep (assumptions))
1355 {
1356 /* We consider to vectorize this loop by versioning it under
1357 some assumptions. In order to do this, we need to clear
1358 existing information computed by scev and niter analyzer. */
1359 scev_reset_htab ();
1360 free_numbers_of_iterations_estimates (loop);
1361 /* Also set flag for this loop so that following scev and niter
1362 analysis are done under the assumptions. */
1363 loop_constraint_set (loop, LOOP_C_FINITE);
1364 /* Also record the assumptions for versioning. */
1365 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1366 }
1367
1368 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1369 {
1370 if (dump_enabled_p ())
1371 {
1372 dump_printf_loc (MSG_NOTE, vect_location,
1373 "Symbolic number of iterations is ");
1374 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1375 dump_printf (MSG_NOTE, "\n");
1376 }
1377 }
1378
1379 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1380 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1381 if (inner_loop_cond)
1382 {
1383 stmt_vec_info inner_loop_cond_info
1384 = loop_vinfo->lookup_stmt (inner_loop_cond);
1385 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1386 }
1387
1388 gcc_assert (!loop->aux);
1389 loop->aux = loop_vinfo;
1390 return loop_vinfo;
1391 }
1392
1393
1394
1395 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1396 statements, update the vectorization factor. */
1397
1398 static void
1399 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1400 {
1401 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1402 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1403 int nbbs = loop->num_nodes;
1404 poly_uint64 vectorization_factor;
1405 int i;
1406
1407 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1408
1409 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1410 gcc_assert (known_ne (vectorization_factor, 0U));
1411
1412 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1413 vectorization factor of the loop is the unrolling factor required by
1414 the SLP instances. If that unrolling factor is 1, we say that we
1415 perform pure SLP on the loop - cross-iteration parallelism is not
1416 exploited. */
1417 bool only_slp_in_loop = true;
1418 for (i = 0; i < nbbs; i++)
1419 {
1420 basic_block bb = bbs[i];
1421 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1422 gsi_next (&si))
1423 {
1424 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1425 stmt_info = vect_stmt_to_vectorize (stmt_info);
1426 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1427 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1428 && !PURE_SLP_STMT (stmt_info))
1429 /* STMT needs both SLP and loop-based vectorization. */
1430 only_slp_in_loop = false;
1431 }
1432 }
1433
1434 if (only_slp_in_loop)
1435 {
1436 dump_printf_loc (MSG_NOTE, vect_location,
1437 "Loop contains only SLP stmts\n");
1438 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1439 }
1440 else
1441 {
1442 dump_printf_loc (MSG_NOTE, vect_location,
1443 "Loop contains SLP and non-SLP stmts\n");
1444 /* Both the vectorization factor and unroll factor have the form
1445 current_vector_size * X for some rational X, so they must have
1446 a common multiple. */
1447 vectorization_factor
1448 = force_common_multiple (vectorization_factor,
1449 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
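      /* For example (hypothetical numbers): a loop-based vectorization factor
         of 4 combined with an SLP unrolling factor of 6 results in a combined
         factor of 12, their least common multiple.  */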
1450 }
1451
1452 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1453 if (dump_enabled_p ())
1454 {
1455 dump_printf_loc (MSG_NOTE, vect_location,
1456 "Updating vectorization factor to ");
1457 dump_dec (MSG_NOTE, vectorization_factor);
1458 dump_printf (MSG_NOTE, ".\n");
1459 }
1460 }
1461
1462 /* Return true if STMT_INFO describes a double reduction phi and if
1463 the other phi in the reduction is also relevant for vectorization.
1464 This rejects cases such as:
1465
1466 outer1:
1467 x_1 = PHI <x_3(outer2), ...>;
1468 ...
1469
1470 inner:
1471 x_2 = ...;
1472 ...
1473
1474 outer2:
1475 x_3 = PHI <x_2(inner)>;
1476
1477 if nothing in x_2 or elsewhere makes x_1 relevant. */
1478
1479 static bool
1480 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1481 {
1482 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1483 return false;
1484
1485 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1486 }
1487
1488 /* Function vect_analyze_loop_operations.
1489
1490 Scan the loop stmts and make sure they are all vectorizable. */
1491
1492 static bool
1493 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1494 {
1495 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1496 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1497 int nbbs = loop->num_nodes;
1498 int i;
1499 stmt_vec_info stmt_info;
1500 bool need_to_vectorize = false;
1501 bool ok;
1502
1503 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1504
1505 stmt_vector_for_cost cost_vec;
1506 cost_vec.create (2);
1507
1508 for (i = 0; i < nbbs; i++)
1509 {
1510 basic_block bb = bbs[i];
1511
1512 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1513 gsi_next (&si))
1514 {
1515 gphi *phi = si.phi ();
1516 ok = true;
1517
1518 stmt_info = loop_vinfo->lookup_stmt (phi);
1519 if (dump_enabled_p ())
1520 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1521 if (virtual_operand_p (gimple_phi_result (phi)))
1522 continue;
1523
1524 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1525 (i.e., a phi in the tail of the outer-loop). */
1526 if (! is_loop_header_bb_p (bb))
1527 {
1528 /* FORNOW: we currently don't support the case that these phis
1529 are not used in the outer loop (unless it is a double reduction,
1530 i.e., this phi is vect_reduction_def), because this case
1531 requires us to actually do something here. */
1532 if (STMT_VINFO_LIVE_P (stmt_info)
1533 && !vect_active_double_reduction_p (stmt_info))
1534 {
1535 if (dump_enabled_p ())
1536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1537 "Unsupported loop-closed phi in "
1538 "outer-loop.\n");
1539 return false;
1540 }
1541
1542 /* If PHI is used in the outer loop, we check that its operand
1543 is defined in the inner loop. */
1544 if (STMT_VINFO_RELEVANT_P (stmt_info))
1545 {
1546 tree phi_op;
1547
1548 if (gimple_phi_num_args (phi) != 1)
1549 return false;
1550
1551 phi_op = PHI_ARG_DEF (phi, 0);
1552 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1553 if (!op_def_info)
1554 return false;
1555
1556 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1557 && (STMT_VINFO_RELEVANT (op_def_info)
1558 != vect_used_in_outer_by_reduction))
1559 return false;
1560 }
1561
1562 continue;
1563 }
1564
1565 gcc_assert (stmt_info);
1566
1567 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1568 || STMT_VINFO_LIVE_P (stmt_info))
1569 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1570 {
1571 /* A scalar-dependence cycle that we don't support. */
1572 if (dump_enabled_p ())
1573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1574 "not vectorized: scalar dependence cycle.\n");
1575 return false;
1576 }
1577
1578 if (STMT_VINFO_RELEVANT_P (stmt_info))
1579 {
1580 need_to_vectorize = true;
1581 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1582 && ! PURE_SLP_STMT (stmt_info))
1583 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1584 &cost_vec);
1585 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1586 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1587 && ! PURE_SLP_STMT (stmt_info))
1588 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1589 &cost_vec);
1590 }
1591
1592 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1593 if (ok
1594 && STMT_VINFO_LIVE_P (stmt_info)
1595 && !PURE_SLP_STMT (stmt_info))
1596 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1597 &cost_vec);
1598
1599 if (!ok)
1600 {
1601 if (dump_enabled_p ())
1602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1603 "not vectorized: relevant phi not "
1604 "supported: %G", phi);
1605 return false;
1606 }
1607 }
1608
1609 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1610 gsi_next (&si))
1611 {
1612 gimple *stmt = gsi_stmt (si);
1613 if (!gimple_clobber_p (stmt)
1614 && !vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1615 &need_to_vectorize,
1616 NULL, NULL, &cost_vec))
1617 return false;
1618 }
1619 } /* bbs */
1620
1621 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1622 cost_vec.release ();
1623
1624 /* All operations in the loop are either irrelevant (deal with loop
1625 control, or dead), or only used outside the loop and can be moved
1626 out of the loop (e.g. invariants, inductions). The loop can be
1627 optimized away by scalar optimizations. We're better off not
1628 touching this loop. */
1629 if (!need_to_vectorize)
1630 {
1631 if (dump_enabled_p ())
1632 dump_printf_loc (MSG_NOTE, vect_location,
1633 "All the computation can be taken out of the loop.\n");
1634 if (dump_enabled_p ())
1635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1636 "not vectorized: redundant loop. no profit to "
1637 "vectorize.\n");
1638 return false;
1639 }
1640
1641 return true;
1642 }
1643
1644 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1645 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1646 definitely no, or -1 if it's worth retrying. */
1647
1648 static int
1649 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1650 {
1651 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1652 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1653
1654 /* Only fully-masked loops can have iteration counts less than the
1655 vectorization factor. */
1656 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1657 {
1658 HOST_WIDE_INT max_niter;
1659
1660 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1661 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1662 else
1663 max_niter = max_stmt_executions_int (loop);
1664
1665 if (max_niter != -1
1666 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1667 {
1668 if (dump_enabled_p ())
1669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1670 "not vectorized: iteration count smaller than "
1671 "vectorization factor.\n");
1672 return 0;
1673 }
1674 }
1675
1676 int min_profitable_iters, min_profitable_estimate;
1677 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1678 &min_profitable_estimate);
1679
1680 if (min_profitable_iters < 0)
1681 {
1682 if (dump_enabled_p ())
1683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1684 "not vectorized: vectorization not profitable.\n");
1685 if (dump_enabled_p ())
1686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1687 "not vectorized: vector version will never be "
1688 "profitable.\n");
1689 return -1;
1690 }
1691
1692 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1693 * assumed_vf);
1694
1695 /* Use the cost model only if it is more conservative than user specified
1696 threshold. */
1697 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1698 min_profitable_iters);
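  /* Worked example (made-up numbers): with --param min-vect-loop-bound=2,
     assumed_vf == 4 and min_profitable_iters == 11, the threshold th becomes
     MAX (2 * 4, 11) == 11 iterations.  */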
1699
1700 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1701
1702 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1703 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1704 {
1705 if (dump_enabled_p ())
1706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1707 "not vectorized: vectorization not profitable.\n");
1708 if (dump_enabled_p ())
1709 dump_printf_loc (MSG_NOTE, vect_location,
1710 "not vectorized: iteration count smaller than user "
1711 "specified loop bound parameter or minimum profitable "
1712 "iterations (whichever is more conservative).\n");
1713 return 0;
1714 }
1715
1716 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1717 if (estimated_niter == -1)
1718 estimated_niter = likely_max_stmt_executions_int (loop);
1719 if (estimated_niter != -1
1720 && ((unsigned HOST_WIDE_INT) estimated_niter
1721 < MAX (th, (unsigned) min_profitable_estimate)))
1722 {
1723 if (dump_enabled_p ())
1724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1725 "not vectorized: estimated iteration count too "
1726 "small.\n");
1727 if (dump_enabled_p ())
1728 dump_printf_loc (MSG_NOTE, vect_location,
1729 "not vectorized: estimated iteration count smaller "
1730 "than specified loop bound parameter or minimum "
1731 "profitable iterations (whichever is more "
1732 "conservative).\n");
1733 return -1;
1734 }
1735
1736 return 1;
1737 }
1738
1739 static bool
1740 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1741 vec<data_reference_p> *datarefs,
1742 unsigned int *n_stmts)
1743 {
1744 *n_stmts = 0;
1745 for (unsigned i = 0; i < loop->num_nodes; i++)
1746 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1747 !gsi_end_p (gsi); gsi_next (&gsi))
1748 {
1749 gimple *stmt = gsi_stmt (gsi);
1750 if (is_gimple_debug (stmt))
1751 continue;
1752 ++(*n_stmts);
1753 if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1754 {
1755 if (is_gimple_call (stmt) && loop->safelen)
1756 {
1757 tree fndecl = gimple_call_fndecl (stmt), op;
1758 if (fndecl != NULL_TREE)
1759 {
1760 cgraph_node *node = cgraph_node::get (fndecl);
1761 if (node != NULL && node->simd_clones != NULL)
1762 {
1763 unsigned int j, n = gimple_call_num_args (stmt);
1764 for (j = 0; j < n; j++)
1765 {
1766 op = gimple_call_arg (stmt, j);
1767 if (DECL_P (op)
1768 || (REFERENCE_CLASS_P (op)
1769 && get_base_address (op)))
1770 break;
1771 }
1772 op = gimple_call_lhs (stmt);
1773 /* Ignore #pragma omp declare simd functions
1774 if they don't have data references in the
1775 call stmt itself. */
1776 if (j == n
1777 && !(op
1778 && (DECL_P (op)
1779 || (REFERENCE_CLASS_P (op)
1780 && get_base_address (op)))))
1781 continue;
1782 }
1783 }
1784 }
1785 return false;
1786 }
1787 /* If dependence analysis will give up due to the limit on the
1788 number of datarefs, stop here and fail fatally. */
1789 if (datarefs->length ()
1790 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1791 return false;
1792 }
1793 return true;
1794 }
1795
1796 /* Function vect_analyze_loop_2.
1797
1798 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1799 for it. The different analyses will record information in the
1800 loop_vec_info struct. */
1801 static bool
1802 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1803 {
1804 bool ok;
1805 int res;
1806 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1807 poly_uint64 min_vf = 2;
1808
1809 /* The first group of checks is independent of the vector size. */
1810 fatal = true;
1811
1812 /* Find all data references in the loop (which correspond to vdefs/vuses)
1813 and analyze their evolution in the loop. */
1814
1815 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1816
1817 /* Gather the data references and count stmts in the loop. */
1818 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1819 {
1820 if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1821 &LOOP_VINFO_DATAREFS (loop_vinfo),
1822 n_stmts))
1823 {
1824 if (dump_enabled_p ())
1825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826 "not vectorized: loop contains function "
1827 "calls or data references that cannot "
1828 "be analyzed\n");
1829 return false;
1830 }
1831 loop_vinfo->shared->save_datarefs ();
1832 }
1833 else
1834 loop_vinfo->shared->check_datarefs ();
1835
1836 /* Analyze the data references and also adjust the minimal
1837 vectorization factor according to the loads and stores. */
1838
1839 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1840 if (!ok)
1841 {
1842 if (dump_enabled_p ())
1843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1844 "bad data references.\n");
1845 return false;
1846 }
1847
1848 /* Classify all cross-iteration scalar data-flow cycles.
1849 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1850 vect_analyze_scalar_cycles (loop_vinfo);
1851
1852 vect_pattern_recog (loop_vinfo);
1853
1854 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1855
1856 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1857 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1858
1859 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1860 if (!ok)
1861 {
1862 if (dump_enabled_p ())
1863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1864 "bad data access.\n");
1865 return false;
1866 }
1867
1868 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1869
1870 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1871 if (!ok)
1872 {
1873 if (dump_enabled_p ())
1874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1875 "unexpected pattern.\n");
1876 return false;
1877 }
1878
1879 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are not fatal. */
1880 fatal = false;
1881
1882 /* Analyze data dependences between the data-refs in the loop
1883 and adjust the maximum vectorization factor according to
1884 the dependences.
1885 FORNOW: fail at the first data dependence that we encounter. */
1886
1887 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1888 if (!ok
1889 || (max_vf != MAX_VECTORIZATION_FACTOR
1890 && maybe_lt (max_vf, min_vf)))
1891 {
1892 if (dump_enabled_p ())
1893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1894 "bad data dependence.\n");
1895 return false;
1896 }
1897 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1898
1899 ok = vect_determine_vectorization_factor (loop_vinfo);
1900 if (!ok)
1901 {
1902 if (dump_enabled_p ())
1903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1904 "can't determine vectorization factor.\n");
1905 return false;
1906 }
1907 if (max_vf != MAX_VECTORIZATION_FACTOR
1908 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1909 {
1910 if (dump_enabled_p ())
1911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1912 "bad data dependence.\n");
1913 return false;
1914 }
1915
1916 /* Compute the scalar iteration cost. */
1917 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1918
1919 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1920 unsigned th;
1921
1922 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1923 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1924 if (!ok)
1925 return false;
1926
1927 /* If there are any SLP instances mark them as pure_slp. */
1928 bool slp = vect_make_slp_decision (loop_vinfo);
1929 if (slp)
1930 {
1931 /* Find stmts that need to be both vectorized and SLPed. */
1932 vect_detect_hybrid_slp (loop_vinfo);
1933
1934 /* Update the vectorization factor based on the SLP decision. */
1935 vect_update_vf_for_slp (loop_vinfo);
1936 }
1937
1938 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1939
1940 /* We don't expect to have to roll back to anything other than an empty
1941 set of rgroups. */
1942 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1943
1944 /* This is the point where we can re-start analysis with SLP forced off. */
1945 start_over:
1946
1947 /* Now the vectorization factor is final. */
1948 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1949 gcc_assert (known_ne (vectorization_factor, 0U));
1950
1951 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1952 {
1953 dump_printf_loc (MSG_NOTE, vect_location,
1954 "vectorization_factor = ");
1955 dump_dec (MSG_NOTE, vectorization_factor);
1956 dump_printf (MSG_NOTE, ", niters = %wd\n",
1957 LOOP_VINFO_INT_NITERS (loop_vinfo));
1958 }
1959
1960 HOST_WIDE_INT max_niter
1961 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1962
1963 /* Analyze the alignment of the data-refs in the loop.
1964 Fail if a data reference is found that cannot be vectorized. */
1965
1966 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1967 if (!ok)
1968 {
1969 if (dump_enabled_p ())
1970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1971 "bad data alignment.\n");
1972 return false;
1973 }
1974
1975 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1976 It is important to call pruning after vect_analyze_data_ref_accesses,
1977 since we use grouping information gathered by interleaving analysis. */
1978 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1979 if (!ok)
1980 return false;
1981
1982 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1983 vectorization. */
1984 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1985 {
1986 /* This pass will decide on using loop versioning and/or loop peeling in
1987 order to enhance the alignment of data references in the loop. */
1988 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1989 if (!ok)
1990 {
1991 if (dump_enabled_p ())
1992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1993 "bad data alignment.\n");
1994 return false;
1995 }
1996 }
1997
1998 if (slp)
1999 {
2000 /* Analyze operations in the SLP instances. Note this may
2001 remove unsupported SLP instances which makes the above
2002 SLP kind detection invalid. */
2003 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2004 vect_slp_analyze_operations (loop_vinfo);
2005 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2006 goto again;
2007 }
2008
2009 /* Scan all the remaining operations in the loop that are not subject
2010 to SLP and make sure they are vectorizable. */
2011 ok = vect_analyze_loop_operations (loop_vinfo);
2012 if (!ok)
2013 {
2014 if (dump_enabled_p ())
2015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2016 "bad operation or unsupported loop bound.\n");
2017 return false;
2018 }
2019
2020 /* Decide whether to use a fully-masked loop for this vectorization
2021 factor. */
2022 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2023 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2024 && vect_verify_full_masking (loop_vinfo));
2025 if (dump_enabled_p ())
2026 {
2027 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2028 dump_printf_loc (MSG_NOTE, vect_location,
2029 "using a fully-masked loop.\n");
2030 else
2031 dump_printf_loc (MSG_NOTE, vect_location,
2032 "not using a fully-masked loop.\n");
2033 }
2034
2035 /* If epilog loop is required because of data accesses with gaps,
2036 one additional iteration needs to be peeled. Check if there are
2037 enough iterations for vectorization. */
2038 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2039 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2040 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2041 {
2042 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2043 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2044
2045 if (known_lt (wi::to_widest (scalar_niters), vf))
2046 {
2047 if (dump_enabled_p ())
2048 dump_printf_loc (MSG_NOTE, vect_location,
2049 "loop has no enough iterations to support"
2050 " peeling for gaps.\n");
2051 return false;
2052 }
2053 }
2054
2055 /* Check that the costings of the loop make vectorizing worthwhile. */
2056 res = vect_analyze_loop_costing (loop_vinfo);
2057 if (res < 0)
2058 goto again;
2059 if (!res)
2060 {
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2063 "Loop costings not worthwhile.\n");
2064 return false;
2065 }
2066
2067 /* Decide whether we need to create an epilogue loop to handle
2068 remaining scalar iterations. */
2069 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2070
2071 unsigned HOST_WIDE_INT const_vf;
2072 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2073 /* The main loop handles all iterations. */
2074 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2075 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2076 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2077 {
2078 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2079 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2080 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2081 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2082 }
2083 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2084 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2085 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2086 < (unsigned) exact_log2 (const_vf))
2087 /* In case of versioning, check if the maximum number of
2088 iterations is greater than th. If they are identical,
2089 the epilogue is unnecessary. */
2090 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2091 || ((unsigned HOST_WIDE_INT) max_niter
2092 > (th / const_vf) * const_vf))))
2093 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2094
2095 /* If an epilogue loop is required make sure we can create one. */
2096 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2097 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2098 {
2099 if (dump_enabled_p ())
2100 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2101 if (!vect_can_advance_ivs_p (loop_vinfo)
2102 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2103 single_exit (LOOP_VINFO_LOOP
2104 (loop_vinfo))))
2105 {
2106 if (dump_enabled_p ())
2107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2108 "not vectorized: can't create required "
2109 "epilog loop\n");
2110 goto again;
2111 }
2112 }
2113
2114 /* During peeling, we need to check whether the number of loop iterations
2115 is enough for both the peeled prolog loop and the vector loop. This
2116 check can be merged with the threshold check of loop versioning, so
2117 increase the threshold for this case if necessary. */
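  /* E.g. with a known misalignment peel of 3 iterations, a non-fully-masked
     VF of 8 and peeling for gaps, the check below ends up requiring at
     least 3 + 8 + 1 iterations before the vector path is taken.  */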
2118 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2119 {
2120 poly_uint64 niters_th = 0;
2121
2122 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2123 {
2124 /* Niters for peeled prolog loop. */
2125 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2126 {
2127 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2128 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2129 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2130 }
2131 else
2132 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2133 }
2134
2135 /* Niters for at least one iteration of vectorized loop. */
2136 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2137 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2138 /* One additional iteration because of peeling for gap. */
2139 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2140 niters_th += 1;
2141 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2142 }
2143
2144 gcc_assert (known_eq (vectorization_factor,
2145 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2146
2147 /* Ok to vectorize! */
2148 return true;
2149
2150 again:
2151 /* Try again with SLP forced off but if we didn't do any SLP there is
2152 no point in re-trying. */
2153 if (!slp)
2154 return false;
2155
2156 /* If there are reduction chains re-trying will fail anyway. */
2157 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2158 return false;
2159
2160 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2161 via interleaving or lane instructions. */
2162 slp_instance instance;
2163 slp_tree node;
2164 unsigned i, j;
2165 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2166 {
2167 stmt_vec_info vinfo;
2168 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2169 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2170 continue;
2171 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2172 unsigned int size = DR_GROUP_SIZE (vinfo);
2173 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2174 if (! vect_store_lanes_supported (vectype, size, false)
2175 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2176 && ! vect_grouped_store_supported (vectype, size))
2177 return false;
2178 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2179 {
2180 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2181 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2182 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2183 size = DR_GROUP_SIZE (vinfo);
2184 vectype = STMT_VINFO_VECTYPE (vinfo);
2185 if (! vect_load_lanes_supported (vectype, size, false)
2186 && ! vect_grouped_load_supported (vectype, single_element_p,
2187 size))
2188 return false;
2189 }
2190 }
2191
2192 if (dump_enabled_p ())
2193 dump_printf_loc (MSG_NOTE, vect_location,
2194 "re-trying with SLP disabled\n");
2195
2196 /* Roll back state appropriately. No SLP this time. */
2197 slp = false;
2198 /* Restore the vectorization factor as it was without SLP. */
2199 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2200 /* Free the SLP instances. */
2201 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2202 vect_free_slp_instance (instance, false);
2203 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2204 /* Reset SLP type to loop_vect on all stmts. */
2205 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2206 {
2207 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2208 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2209 !gsi_end_p (si); gsi_next (&si))
2210 {
2211 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2212 STMT_SLP_TYPE (stmt_info) = loop_vect;
2213 }
2214 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2215 !gsi_end_p (si); gsi_next (&si))
2216 {
2217 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2218 STMT_SLP_TYPE (stmt_info) = loop_vect;
2219 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2220 {
2221 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2222 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2223 STMT_SLP_TYPE (stmt_info) = loop_vect;
2224 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2225 !gsi_end_p (pi); gsi_next (&pi))
2226 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2227 = loop_vect;
2228 }
2229 }
2230 }
2231 /* Free optimized alias test DDRS. */
2232 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2233 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2234 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2235 /* Reset target cost data. */
2236 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2237 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2238 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2239 /* Reset accumulated rgroup information. */
2240 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2241 /* Reset assorted flags. */
2242 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2243 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2244 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2245 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2246 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2247
2248 goto start_over;
2249 }
2250
2251 /* Function vect_analyze_loop.
2252
2253 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2254 for it. The different analyses will record information in the
2255 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, epilogue must
2256 be vectorized. */
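/* The analysis below first uses the autodetected vector size and is then
   retried with each further size returned by
   targetm.vectorize.autovectorize_vector_sizes until one succeeds, a fatal
   failure is seen, or the candidate sizes run out.  */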
2257 loop_vec_info
2258 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2259 vec_info_shared *shared)
2260 {
2261 loop_vec_info loop_vinfo;
2262 auto_vector_sizes vector_sizes;
2263
2264 /* Autodetect first vector size we try. */
2265 current_vector_size = 0;
2266 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2267 unsigned int next_size = 0;
2268
2269 DUMP_VECT_SCOPE ("analyze_loop_nest");
2270
2271 if (loop_outer (loop)
2272 && loop_vec_info_for_loop (loop_outer (loop))
2273 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2274 {
2275 if (dump_enabled_p ())
2276 dump_printf_loc (MSG_NOTE, vect_location,
2277 "outer-loop already vectorized.\n");
2278 return NULL;
2279 }
2280
2281 if (!find_loop_nest (loop, &shared->loop_nest))
2282 {
2283 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285 "not vectorized: loop nest containing two "
2286 "or more consecutive inner loops cannot be "
2287 "vectorized\n");
2288 return NULL;
2289 }
2290
2291 unsigned n_stmts = 0;
2292 poly_uint64 autodetected_vector_size = 0;
2293 while (1)
2294 {
2295 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2296 loop_vinfo = vect_analyze_loop_form (loop, shared);
2297 if (!loop_vinfo)
2298 {
2299 if (dump_enabled_p ())
2300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2301 "bad loop form.\n");
2302 return NULL;
2303 }
2304
2305 bool fatal = false;
2306
2307 if (orig_loop_vinfo)
2308 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2309
2310 if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2311 {
2312 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2313
2314 return loop_vinfo;
2315 }
2316
2317 delete loop_vinfo;
2318
2319 if (next_size == 0)
2320 autodetected_vector_size = current_vector_size;
2321
2322 if (next_size < vector_sizes.length ()
2323 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2324 next_size += 1;
2325
2326 if (fatal
2327 || next_size == vector_sizes.length ()
2328 || known_eq (current_vector_size, 0U))
2329 return NULL;
2330
2331 /* Try the next biggest vector size. */
2332 current_vector_size = vector_sizes[next_size++];
2333 if (dump_enabled_p ())
2334 {
2335 dump_printf_loc (MSG_NOTE, vect_location,
2336 "***** Re-trying analysis with "
2337 "vector size ");
2338 dump_dec (MSG_NOTE, current_vector_size);
2339 dump_printf (MSG_NOTE, "\n");
2340 }
2341 }
2342 }
2343
2344 /* Return true if there is an in-order reduction function for CODE, storing
2345 it in *REDUC_FN if so. */
2346
2347 static bool
2348 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2349 {
2350 switch (code)
2351 {
2352 case PLUS_EXPR:
2353 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2354 return true;
2355
2356 default:
2357 return false;
2358 }
2359 }
2360
2361 /* Function reduction_fn_for_scalar_code
2362
2363 Input:
2364 CODE - tree_code of a reduction operation.
2365
2366 Output:
2367 REDUC_FN - the corresponding internal function to be used to reduce the
2368 vector of partial results into a single scalar result, or IFN_LAST
2369 if the operation is a supported reduction operation, but does not have
2370 such an internal function.
2371
2372 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2373
2374 static bool
2375 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2376 {
2377 switch (code)
2378 {
2379 case MAX_EXPR:
2380 *reduc_fn = IFN_REDUC_MAX;
2381 return true;
2382
2383 case MIN_EXPR:
2384 *reduc_fn = IFN_REDUC_MIN;
2385 return true;
2386
2387 case PLUS_EXPR:
2388 *reduc_fn = IFN_REDUC_PLUS;
2389 return true;
2390
2391 case BIT_AND_EXPR:
2392 *reduc_fn = IFN_REDUC_AND;
2393 return true;
2394
2395 case BIT_IOR_EXPR:
2396 *reduc_fn = IFN_REDUC_IOR;
2397 return true;
2398
2399 case BIT_XOR_EXPR:
2400 *reduc_fn = IFN_REDUC_XOR;
2401 return true;
2402
2403 case MULT_EXPR:
2404 case MINUS_EXPR:
2405 *reduc_fn = IFN_LAST;
2406 return true;
2407
2408 default:
2409 return false;
2410 }
2411 }
2412
2413 /* If there is a neutral value X such that SLP reduction NODE would not
2414 be affected by the introduction of additional X elements, return that X,
2415 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2416 is true if the SLP statements perform a single reduction, false if each
2417 statement performs an independent reduction. */
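/* For example, extra zero elements do not change the result of a PLUS_EXPR
   or BIT_IOR_EXPR reduction and extra one elements do not change a MULT_EXPR
   reduction, whereas MIN_EXPR/MAX_EXPR have no universal neutral value and
   can only reuse the single initial value of a reduction chain.  */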
2418
2419 static tree
2420 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2421 bool reduc_chain)
2422 {
2423 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2424 stmt_vec_info stmt_vinfo = stmts[0];
2425 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2426 tree scalar_type = TREE_TYPE (vector_type);
2427 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2428 gcc_assert (loop);
2429
2430 switch (code)
2431 {
2432 case WIDEN_SUM_EXPR:
2433 case DOT_PROD_EXPR:
2434 case SAD_EXPR:
2435 case PLUS_EXPR:
2436 case MINUS_EXPR:
2437 case BIT_IOR_EXPR:
2438 case BIT_XOR_EXPR:
2439 return build_zero_cst (scalar_type);
2440
2441 case MULT_EXPR:
2442 return build_one_cst (scalar_type);
2443
2444 case BIT_AND_EXPR:
2445 return build_all_ones_cst (scalar_type);
2446
2447 case MAX_EXPR:
2448 case MIN_EXPR:
2449 /* For MIN/MAX the initial values are neutral. A reduction chain
2450 has only a single initial value, so that value is neutral for
2451 all statements. */
2452 if (reduc_chain)
2453 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2454 loop_preheader_edge (loop));
2455 return NULL_TREE;
2456
2457 default:
2458 return NULL_TREE;
2459 }
2460 }
2461
2462 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2463 STMT is printed with a message MSG. */
2464
2465 static void
2466 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2467 {
2468 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2469 }
2470
2471 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2472 operation. Return true if the results of DEF_STMT_INFO are something
2473 that can be accumulated by such a reduction. */
2474
2475 static bool
2476 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2477 {
2478 return (is_gimple_assign (def_stmt_info->stmt)
2479 || is_gimple_call (def_stmt_info->stmt)
2480 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2481 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2482 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2483 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2484 }
2485
2486 /* Detect SLP reduction of the form:
2487
2488 #a1 = phi <a5, a0>
2489 a2 = operation (a1)
2490 a3 = operation (a2)
2491 a4 = operation (a3)
2492 a5 = operation (a4)
2493
2494 #a = phi <a5>
2495
2496 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2497 FIRST_STMT is the first reduction stmt in the chain
2498 (a2 = operation (a1)).
2499
2500 Return TRUE if a reduction chain was detected. */
2501
2502 static bool
2503 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2504 gimple *first_stmt)
2505 {
2506 struct loop *loop = (gimple_bb (phi))->loop_father;
2507 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2508 enum tree_code code;
2509 gimple *loop_use_stmt = NULL;
2510 stmt_vec_info use_stmt_info, current_stmt_info = NULL;
2511 tree lhs;
2512 imm_use_iterator imm_iter;
2513 use_operand_p use_p;
2514 int nloop_uses, size = 0, n_out_of_loop_uses;
2515 bool found = false;
2516
2517 if (loop != vect_loop)
2518 return false;
2519
2520 lhs = PHI_RESULT (phi);
2521 code = gimple_assign_rhs_code (first_stmt);
2522 while (1)
2523 {
2524 nloop_uses = 0;
2525 n_out_of_loop_uses = 0;
2526 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2527 {
2528 gimple *use_stmt = USE_STMT (use_p);
2529 if (is_gimple_debug (use_stmt))
2530 continue;
2531
2532 /* Check if we got back to the reduction phi. */
2533 if (use_stmt == phi)
2534 {
2535 loop_use_stmt = use_stmt;
2536 found = true;
2537 break;
2538 }
2539
2540 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2541 {
2542 loop_use_stmt = use_stmt;
2543 nloop_uses++;
2544 }
2545 else
2546 n_out_of_loop_uses++;
2547
2548 /* There can be either a single use in the loop or two uses in
2549 phi nodes. */
2550 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2551 return false;
2552 }
2553
2554 if (found)
2555 break;
2556
2557 /* We reached a statement with no loop uses. */
2558 if (nloop_uses == 0)
2559 return false;
2560
2561 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2562 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2563 return false;
2564
2565 if (!is_gimple_assign (loop_use_stmt)
2566 || code != gimple_assign_rhs_code (loop_use_stmt)
2567 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2568 return false;
2569
2570 /* Insert USE_STMT into reduction chain. */
2571 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2572 if (current_stmt_info)
2573 {
2574 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt_info;
2575 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2576 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2577 }
2578 else
2579 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt_info;
2580
2581 lhs = gimple_assign_lhs (loop_use_stmt);
2582 current_stmt_info = use_stmt_info;
2583 size++;
2584 }
2585
2586 if (!found || loop_use_stmt != phi || size < 2)
2587 return false;
2588
2589 /* Swap the operands, if needed, to make the reduction operand be the second
2590 operand. */
2591 lhs = PHI_RESULT (phi);
2592 stmt_vec_info next_stmt_info = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2593 while (next_stmt_info)
2594 {
2595 gassign *next_stmt = as_a <gassign *> (next_stmt_info->stmt);
2596 if (gimple_assign_rhs2 (next_stmt) == lhs)
2597 {
2598 tree op = gimple_assign_rhs1 (next_stmt);
2599 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2600
2601 /* Check that the other def is either defined in the loop
2602 ("vect_internal_def"), or it's an induction (defined by a
2603 loop-header phi-node). */
2604 if (def_stmt_info
2605 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2606 && vect_valid_reduction_input_p (def_stmt_info))
2607 {
2608 lhs = gimple_assign_lhs (next_stmt);
2609 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2610 continue;
2611 }
2612
2613 return false;
2614 }
2615 else
2616 {
2617 tree op = gimple_assign_rhs2 (next_stmt);
2618 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2619
2620 /* Check that the other def is either defined in the loop
2621 ("vect_internal_def"), or it's an induction (defined by a
2622 loop-header phi-node). */
2623 if (def_stmt_info
2624 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2625 && vect_valid_reduction_input_p (def_stmt_info))
2626 {
2627 if (dump_enabled_p ())
2628 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2629 next_stmt);
2630
2631 swap_ssa_operands (next_stmt,
2632 gimple_assign_rhs1_ptr (next_stmt),
2633 gimple_assign_rhs2_ptr (next_stmt));
2634 update_stmt (next_stmt);
2635
2636 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2637 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2638 }
2639 else
2640 return false;
2641 }
2642
2643 lhs = gimple_assign_lhs (next_stmt);
2644 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2645 }
2646
2647 /* Save the chain for further analysis in SLP detection. */
2648 stmt_vec_info first_stmt_info
2649 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2650 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first_stmt_info);
2651 REDUC_GROUP_SIZE (first_stmt_info) = size;
2652
2653 return true;
2654 }
2655
2656 /* Return true if we need an in-order reduction for operation CODE
2657 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2658 overflow must wrap. */
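/* For example, a floating-point accumulation such as

       double s = 0.0;
       for (int i = 0; i < n; i++)
         s += a[i];

   cannot be reassociated without changing the rounding of intermediate
   results, so unless -fassociative-math is in effect it has to be
   vectorized as an in-order (fold-left) reduction.  */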
2659
2660 static bool
2661 needs_fold_left_reduction_p (tree type, tree_code code,
2662 bool need_wrapping_integral_overflow)
2663 {
2664 /* CHECKME: check for !flag_finite_math_only too? */
2665 if (SCALAR_FLOAT_TYPE_P (type))
2666 switch (code)
2667 {
2668 case MIN_EXPR:
2669 case MAX_EXPR:
2670 return false;
2671
2672 default:
2673 return !flag_associative_math;
2674 }
2675
2676 if (INTEGRAL_TYPE_P (type))
2677 {
2678 if (!operation_no_trapping_overflow (type, code))
2679 return true;
2680 if (need_wrapping_integral_overflow
2681 && !TYPE_OVERFLOW_WRAPS (type)
2682 && operation_can_overflow (code))
2683 return true;
2684 return false;
2685 }
2686
2687 if (SAT_FIXED_POINT_TYPE_P (type))
2688 return true;
2689
2690 return false;
2691 }
2692
2693 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2694 reduction operation CODE has a handled computation expression. */
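/* For example, with CODE == PLUS_EXPR the path

       sum_1 = PHI <sum_4(latch), sum_0(preheader)>
       sum_2 = sum_1 + a_5;
       sum_3 = sum_2 + b_6;
       sum_4 = sum_3 + c_7;

   is accepted: walking back from the latch argument sum_4 to the PHI result
   sum_1 meets only single-use PLUS_EXPR statements (for PLUS_EXPR,
   MINUS_EXPR statements are also tolerated as long as the running value
   does not end up negated overall).  */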
2695
2696 bool
2697 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2698 tree loop_arg, enum tree_code code)
2699 {
2700 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2701 auto_bitmap visited;
2702 tree lookfor = PHI_RESULT (phi);
2703 ssa_op_iter curri;
2704 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2705 while (USE_FROM_PTR (curr) != loop_arg)
2706 curr = op_iter_next_use (&curri);
2707 curri.i = curri.numops;
2708 do
2709 {
2710 path.safe_push (std::make_pair (curri, curr));
2711 tree use = USE_FROM_PTR (curr);
2712 if (use == lookfor)
2713 break;
2714 gimple *def = SSA_NAME_DEF_STMT (use);
2715 if (gimple_nop_p (def)
2716 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2717 {
2718 pop:
2719 do
2720 {
2721 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2722 curri = x.first;
2723 curr = x.second;
2724 do
2725 curr = op_iter_next_use (&curri);
2726 /* Skip already visited or non-SSA operands (from iterating
2727 over PHI args). */
2728 while (curr != NULL_USE_OPERAND_P
2729 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2730 || ! bitmap_set_bit (visited,
2731 SSA_NAME_VERSION
2732 (USE_FROM_PTR (curr)))));
2733 }
2734 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2735 if (curr == NULL_USE_OPERAND_P)
2736 break;
2737 }
2738 else
2739 {
2740 if (gimple_code (def) == GIMPLE_PHI)
2741 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2742 else
2743 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2744 while (curr != NULL_USE_OPERAND_P
2745 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2746 || ! bitmap_set_bit (visited,
2747 SSA_NAME_VERSION
2748 (USE_FROM_PTR (curr)))))
2749 curr = op_iter_next_use (&curri);
2750 if (curr == NULL_USE_OPERAND_P)
2751 goto pop;
2752 }
2753 }
2754 while (1);
2755 if (dump_file && (dump_flags & TDF_DETAILS))
2756 {
2757 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2758 unsigned i;
2759 std::pair<ssa_op_iter, use_operand_p> *x;
2760 FOR_EACH_VEC_ELT (path, i, x)
2761 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2762 dump_printf (MSG_NOTE, "\n");
2763 }
2764
2765 /* Check whether the reduction path detected is valid. */
2766 bool fail = path.length () == 0;
2767 bool neg = false;
2768 for (unsigned i = 1; i < path.length (); ++i)
2769 {
2770 gimple *use_stmt = USE_STMT (path[i].second);
2771 tree op = USE_FROM_PTR (path[i].second);
2772 if (! has_single_use (op)
2773 || ! is_gimple_assign (use_stmt))
2774 {
2775 fail = true;
2776 break;
2777 }
2778 if (gimple_assign_rhs_code (use_stmt) != code)
2779 {
2780 if (code == PLUS_EXPR
2781 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2782 {
2783 /* Track whether we negate the reduction value each iteration. */
2784 if (gimple_assign_rhs2 (use_stmt) == op)
2785 neg = ! neg;
2786 }
2787 else
2788 {
2789 fail = true;
2790 break;
2791 }
2792 }
2793 }
2794 return ! fail && ! neg;
2795 }
2796
2797
2798 /* Function vect_is_simple_reduction
2799
2800 (1) Detect a cross-iteration def-use cycle that represents a simple
2801 reduction computation. We look for the following pattern:
2802
2803 loop_header:
2804 a1 = phi < a0, a2 >
2805 a3 = ...
2806 a2 = operation (a3, a1)
2807
2808 or
2809
2810 a3 = ...
2811 loop_header:
2812 a1 = phi < a0, a2 >
2813 a2 = operation (a3, a1)
2814
2815 such that:
2816 1. operation is commutative and associative and it is safe to
2817 change the order of the computation
2818 2. no uses for a2 in the loop (a2 is used out of the loop)
2819 3. no uses of a1 in the loop besides the reduction operation
2820 4. no uses of a1 outside the loop.
2821
2822 Conditions 1,4 are tested here.
2823 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2824
2825 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2826 nested cycles.
2827
2828 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2829 reductions:
2830
2831 a1 = phi < a0, a2 >
2832 inner loop (def of a3)
2833 a2 = phi < a3 >
2834
2835 (4) Detect condition expressions, i.e.:
2836 for (int i = 0; i < N; i++)
2837 if (a[i] < val)
2838 ret_val = a[i];
2839
2840 */
2841
2842 static stmt_vec_info
2843 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2844 bool *double_reduc,
2845 bool need_wrapping_integral_overflow,
2846 enum vect_reduction_type *v_reduc_type)
2847 {
2848 gphi *phi = as_a <gphi *> (phi_info->stmt);
2849 struct loop *loop = (gimple_bb (phi))->loop_father;
2850 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2851 gimple *phi_use_stmt = NULL;
2852 enum tree_code orig_code, code;
2853 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2854 tree type;
2855 int nloop_uses;
2856 tree name;
2857 imm_use_iterator imm_iter;
2858 use_operand_p use_p;
2859 bool phi_def;
2860
2861 *double_reduc = false;
2862 *v_reduc_type = TREE_CODE_REDUCTION;
2863
2864 tree phi_name = PHI_RESULT (phi);
2865 /* ??? If there are no uses of the PHI result the inner loop reduction
2866 won't be detected as possibly double-reduction by vectorizable_reduction
2867 because that tries to walk the PHI arg from the preheader edge which
2868 can be constant. See PR60382. */
2869 if (has_zero_uses (phi_name))
2870 return NULL;
2871 nloop_uses = 0;
2872 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2873 {
2874 gimple *use_stmt = USE_STMT (use_p);
2875 if (is_gimple_debug (use_stmt))
2876 continue;
2877
2878 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2879 {
2880 if (dump_enabled_p ())
2881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2882 "intermediate value used outside loop.\n");
2883
2884 return NULL;
2885 }
2886
2887 nloop_uses++;
2888 if (nloop_uses > 1)
2889 {
2890 if (dump_enabled_p ())
2891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2892 "reduction value used in loop.\n");
2893 return NULL;
2894 }
2895
2896 phi_use_stmt = use_stmt;
2897 }
2898
2899 edge latch_e = loop_latch_edge (loop);
2900 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2901 if (TREE_CODE (loop_arg) != SSA_NAME)
2902 {
2903 if (dump_enabled_p ())
2904 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2905 "reduction: not ssa_name: %T\n", loop_arg);
2906 return NULL;
2907 }
2908
2909 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2910 if (!def_stmt_info
2911 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2912 return NULL;
2913
2914 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2915 {
2916 name = gimple_assign_lhs (def_stmt);
2917 phi_def = false;
2918 }
2919 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2920 {
2921 name = PHI_RESULT (def_stmt);
2922 phi_def = true;
2923 }
2924 else
2925 {
2926 if (dump_enabled_p ())
2927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2928 "reduction: unhandled reduction operation: %G",
2929 def_stmt_info->stmt);
2930 return NULL;
2931 }
2932
2933 nloop_uses = 0;
2934 auto_vec<gphi *, 3> lcphis;
2935 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2936 {
2937 gimple *use_stmt = USE_STMT (use_p);
2938 if (is_gimple_debug (use_stmt))
2939 continue;
2940 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2941 nloop_uses++;
2942 else
2943 /* We can have more than one loop-closed PHI. */
2944 lcphis.safe_push (as_a <gphi *> (use_stmt));
2945 if (nloop_uses > 1)
2946 {
2947 if (dump_enabled_p ())
2948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2949 "reduction used in loop.\n");
2950 return NULL;
2951 }
2952 }
2953
2954 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2955 defined in the inner loop. */
2956 if (phi_def)
2957 {
2958 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2959 op1 = PHI_ARG_DEF (def_stmt, 0);
2960
2961 if (gimple_phi_num_args (def_stmt) != 1
2962 || TREE_CODE (op1) != SSA_NAME)
2963 {
2964 if (dump_enabled_p ())
2965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2966 "unsupported phi node definition.\n");
2967
2968 return NULL;
2969 }
2970
2971 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2972 if (gimple_bb (def1)
2973 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2974 && loop->inner
2975 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2976 && is_gimple_assign (def1)
2977 && is_a <gphi *> (phi_use_stmt)
2978 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2979 {
2980 if (dump_enabled_p ())
2981 report_vect_op (MSG_NOTE, def_stmt,
2982 "detected double reduction: ");
2983
2984 *double_reduc = true;
2985 return def_stmt_info;
2986 }
2987
2988 return NULL;
2989 }
2990
2991 /* If we are vectorizing an inner reduction we are executing that
2992 in the original order only in case we are not dealing with a
2993 double reduction. */
2994 bool check_reduction = true;
2995 if (flow_loop_nested_p (vect_loop, loop))
2996 {
2997 gphi *lcphi;
2998 unsigned i;
2999 check_reduction = false;
3000 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3001 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3002 {
3003 gimple *use_stmt = USE_STMT (use_p);
3004 if (is_gimple_debug (use_stmt))
3005 continue;
3006 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3007 check_reduction = true;
3008 }
3009 }
3010
3011 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3012 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3013 code = orig_code = gimple_assign_rhs_code (def_stmt);
3014
3015 /* We can handle "res -= x[i]", which is non-associative, by
3016 simply rewriting it into "res += -x[i]". Avoid changing the
3017 gimple instruction for the first simple tests and only do this
3018 if we're allowed to change the code at all. */
3019 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3020 code = PLUS_EXPR;
3021
3022 if (code == COND_EXPR)
3023 {
3024 if (! nested_in_vect_loop)
3025 *v_reduc_type = COND_REDUCTION;
3026
3027 op3 = gimple_assign_rhs1 (def_stmt);
3028 if (COMPARISON_CLASS_P (op3))
3029 {
3030 op4 = TREE_OPERAND (op3, 1);
3031 op3 = TREE_OPERAND (op3, 0);
3032 }
3033 if (op3 == phi_name || op4 == phi_name)
3034 {
3035 if (dump_enabled_p ())
3036 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3037 "reduction: condition depends on previous"
3038 " iteration: ");
3039 return NULL;
3040 }
3041
3042 op1 = gimple_assign_rhs2 (def_stmt);
3043 op2 = gimple_assign_rhs3 (def_stmt);
3044 }
3045 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3046 {
3047 if (dump_enabled_p ())
3048 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3049 "reduction: not commutative/associative: ");
3050 return NULL;
3051 }
3052 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3053 {
3054 op1 = gimple_assign_rhs1 (def_stmt);
3055 op2 = gimple_assign_rhs2 (def_stmt);
3056 }
3057 else
3058 {
3059 if (dump_enabled_p ())
3060 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3061 "reduction: not handled operation: ");
3062 return NULL;
3063 }
3064
3065 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3066 {
3067 if (dump_enabled_p ())
3068 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3069 "reduction: both uses not ssa_names: ");
3070
3071 return NULL;
3072 }
3073
3074 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3075 if ((TREE_CODE (op1) == SSA_NAME
3076 && !types_compatible_p (type,TREE_TYPE (op1)))
3077 || (TREE_CODE (op2) == SSA_NAME
3078 && !types_compatible_p (type, TREE_TYPE (op2)))
3079 || (op3 && TREE_CODE (op3) == SSA_NAME
3080 && !types_compatible_p (type, TREE_TYPE (op3)))
3081 || (op4 && TREE_CODE (op4) == SSA_NAME
3082 && !types_compatible_p (type, TREE_TYPE (op4))))
3083 {
3084 if (dump_enabled_p ())
3085 {
3086 dump_printf_loc (MSG_NOTE, vect_location,
3087 "reduction: multiple types: operation type: "
3088 "%T, operands types: %T,%T",
3089 type, TREE_TYPE (op1), TREE_TYPE (op2));
3090 if (op3)
3091 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3092
3093 if (op4)
3094 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3095 dump_printf (MSG_NOTE, "\n");
3096 }
3097
3098 return NULL;
3099 }
3100
3101 /* Check whether it's ok to change the order of the computation.
3102 Generally, when vectorizing a reduction we change the order of the
3103 computation. This may change the behavior of the program in some
3104 cases, so we need to check that this is ok. One exception is when
3105 vectorizing an outer-loop: the inner-loop is executed sequentially,
3106 and therefore vectorizing reductions in the inner-loop during
3107 outer-loop vectorization is safe. */
3108 if (check_reduction
3109 && *v_reduc_type == TREE_CODE_REDUCTION
3110 && needs_fold_left_reduction_p (type, code,
3111 need_wrapping_integral_overflow))
3112 *v_reduc_type = FOLD_LEFT_REDUCTION;
3113
3114 /* Reduction is safe. We're dealing with one of the following:
3115 1) integer arithmetic and no trapv
3116 2) floating point arithmetic, and special flags permit this optimization
3117 3) nested cycle (i.e., outer loop vectorization). */
3118 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3119 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3120 if (code != COND_EXPR && !def1_info && !def2_info)
3121 {
3122 if (dump_enabled_p ())
3123 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3124 return NULL;
3125 }
3126
3127 /* Check that one def is the reduction def, defined by PHI,
3128 the other def is either defined in the loop ("vect_internal_def"),
3129 or it's an induction (defined by a loop-header phi-node). */
3130
3131 if (def2_info
3132 && def2_info->stmt == phi
3133 && (code == COND_EXPR
3134 || !def1_info
3135 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3136 || vect_valid_reduction_input_p (def1_info)))
3137 {
3138 if (dump_enabled_p ())
3139 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3140 return def_stmt_info;
3141 }
3142
3143 if (def1_info
3144 && def1_info->stmt == phi
3145 && (code == COND_EXPR
3146 || !def2_info
3147 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3148 || vect_valid_reduction_input_p (def2_info)))
3149 {
3150 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3151 {
3152 /* Check if we can swap operands (just for simplicity - so that
3153 the rest of the code can assume that the reduction variable
3154 is always the last (second) argument). */
3155 if (code == COND_EXPR)
3156 {
3157 /* Swap cond_expr by inverting the condition. */
3158 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3159 enum tree_code invert_code = ERROR_MARK;
3160 enum tree_code cond_code = TREE_CODE (cond_expr);
3161
3162 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3163 {
3164 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3165 invert_code = invert_tree_comparison (cond_code, honor_nans);
3166 }
3167 if (invert_code != ERROR_MARK)
3168 {
3169 TREE_SET_CODE (cond_expr, invert_code);
3170 swap_ssa_operands (def_stmt,
3171 gimple_assign_rhs2_ptr (def_stmt),
3172 gimple_assign_rhs3_ptr (def_stmt));
3173 }
3174 else
3175 {
3176 if (dump_enabled_p ())
3177 report_vect_op (MSG_NOTE, def_stmt,
3178 "detected reduction: cannot swap operands "
3179 "for cond_expr");
3180 return NULL;
3181 }
3182 }
3183 else
3184 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3185 gimple_assign_rhs2_ptr (def_stmt));
3186
3187 if (dump_enabled_p ())
3188 report_vect_op (MSG_NOTE, def_stmt,
3189 "detected reduction: need to swap operands: ");
3190
3191 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3192 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3193 }
3194 else
3195 {
3196 if (dump_enabled_p ())
3197 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3198 }
3199
3200 return def_stmt_info;
3201 }
3202
3203 /* Try to find SLP reduction chain. */
3204 if (! nested_in_vect_loop
3205 && code != COND_EXPR
3206 && orig_code != MINUS_EXPR
3207 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3208 {
3209 if (dump_enabled_p ())
3210 report_vect_op (MSG_NOTE, def_stmt,
3211 "reduction: detected reduction chain: ");
3212
3213 return def_stmt_info;
3214 }
3215
3216 /* Dissolve any group left half-built by vect_is_slp_reduction. */
3217 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (def_stmt_info);
3218 while (first)
3219 {
3220 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
3221 REDUC_GROUP_FIRST_ELEMENT (first) = NULL;
3222 REDUC_GROUP_NEXT_ELEMENT (first) = NULL;
3223 first = next;
3224 }
3225
3226 /* Look for the expression computing loop_arg from loop PHI result. */
3227 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3228 return def_stmt_info;
3229
3230 if (dump_enabled_p ())
3231 {
3232 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3233 "reduction: unknown pattern: ");
3234 }
3235
3236 return NULL;
3237 }
3238
3239 /* Wrapper around vect_is_simple_reduction, which will modify code
3240 in-place if it enables detection of more reductions. Arguments
3241 as there. */
3242
3243 stmt_vec_info
3244 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3245 bool *double_reduc,
3246 bool need_wrapping_integral_overflow)
3247 {
3248 enum vect_reduction_type v_reduc_type;
3249 stmt_vec_info def_info
3250 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3251 need_wrapping_integral_overflow,
3252 &v_reduc_type);
3253 if (def_info)
3254 {
3255 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3256 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3257 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3258 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3259 }
3260 return def_info;
3261 }
3262
3263 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
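/* The prologue statement costs are recorded in *PROLOGUE_COST_VEC and the
   epilogue statement costs in *EPILOGUE_COST_VEC, both scaled from
   *SCALAR_COST_VEC.  *PEEL_ITERS_EPILOGUE is set to
   (niters - prologue) % vf when the iteration count is known and assumed
   to be vf/2 otherwise.  */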
3264 int
3265 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3266 int *peel_iters_epilogue,
3267 stmt_vector_for_cost *scalar_cost_vec,
3268 stmt_vector_for_cost *prologue_cost_vec,
3269 stmt_vector_for_cost *epilogue_cost_vec)
3270 {
3271 int retval = 0;
3272 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3273
3274 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3275 {
3276 *peel_iters_epilogue = assumed_vf / 2;
3277 if (dump_enabled_p ())
3278 dump_printf_loc (MSG_NOTE, vect_location,
3279 "cost model: epilogue peel iters set to vf/2 "
3280 "because loop iterations are unknown .\n");
3281
3282 /* If peeled iterations are known but the number of scalar loop
3283 iterations is unknown, count a taken branch per peeled loop. */
3284 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3285 NULL, 0, vect_prologue);
3286 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3287 NULL, 0, vect_epilogue);
3288 }
3289 else
3290 {
3291 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3292 peel_iters_prologue = niters < peel_iters_prologue ?
3293 niters : peel_iters_prologue;
3294 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3295 /* If we need to peel for gaps but the computed epilogue peel count
3296 is zero, we have to peel VF iterations. */
3297 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3298 *peel_iters_epilogue = assumed_vf;
3299 }
3300
3301 stmt_info_for_cost *si;
3302 int j;
3303 if (peel_iters_prologue)
3304 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3305 retval += record_stmt_cost (prologue_cost_vec,
3306 si->count * peel_iters_prologue,
3307 si->kind, si->stmt_info, si->misalign,
3308 vect_prologue);
3309 if (*peel_iters_epilogue)
3310 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3311 retval += record_stmt_cost (epilogue_cost_vec,
3312 si->count * *peel_iters_epilogue,
3313 si->kind, si->stmt_info, si->misalign,
3314 vect_epilogue);
3315
3316 return retval;
3317 }
3318
3319 /* Function vect_estimate_min_profitable_iters
3320
3321 Return the number of iterations required for the vector version of the
3322 loop to be profitable relative to the cost of the scalar version of the
3323 loop.
3324
3325 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3326 of iterations for vectorization. A value of -1 means loop
3327 vectorization is not profitable. This returned value may be used
3328 for a dynamic profitability check.
3329
3330 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3331 for a static check against the estimated number of iterations. */
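/* Roughly speaking (ignoring the prologue/epilogue peel adjustments made
   below), vectorization is profitable once

       niters * SIC + SOC  >  (niters / VF) * VIC + VOC

   where SIC and VIC are the scalar and vector costs of one iteration and
   SOC and VOC the scalar and vector outside-of-loop costs; the thresholds
   computed here are the smallest iteration counts for which this is
   expected to hold.  */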
3332
3333 static void
3334 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3335 int *ret_min_profitable_niters,
3336 int *ret_min_profitable_estimate)
3337 {
3338 int min_profitable_iters;
3339 int min_profitable_estimate;
3340 int peel_iters_prologue;
3341 int peel_iters_epilogue;
3342 unsigned vec_inside_cost = 0;
3343 int vec_outside_cost = 0;
3344 unsigned vec_prologue_cost = 0;
3345 unsigned vec_epilogue_cost = 0;
3346 int scalar_single_iter_cost = 0;
3347 int scalar_outside_cost = 0;
3348 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3349 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3350 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3351
3352 /* Cost model disabled. */
3353 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3354 {
3355 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3356 *ret_min_profitable_niters = 0;
3357 *ret_min_profitable_estimate = 0;
3358 return;
3359 }
3360
3361 /* Requires loop versioning tests to handle misalignment. */
3362 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3363 {
3364 /* FIXME: Make cost depend on complexity of individual check. */
3365 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3366 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3367 vect_prologue);
3368 dump_printf (MSG_NOTE,
3369 "cost model: Adding cost of checks for loop "
3370 "versioning to treat misalignment.\n");
3371 }
3372
3373 /* Requires loop versioning with alias checks. */
3374 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3375 {
3376 /* FIXME: Make cost depend on complexity of individual check. */
3377 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3378 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3379 vect_prologue);
3380 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3381 if (len)
3382 /* Count LEN - 1 ANDs and LEN comparisons. */
3383 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3384 NULL, 0, vect_prologue);
3385 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3386 if (len)
3387 {
3388 /* Count LEN - 1 ANDs and LEN comparisons. */
3389 unsigned int nstmts = len * 2 - 1;
3390 /* +1 for each bias that needs adding. */
3391 for (unsigned int i = 0; i < len; ++i)
3392 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3393 nstmts += 1;
3394 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3395 NULL, 0, vect_prologue);
3396 }
3397 dump_printf (MSG_NOTE,
3398 "cost model: Adding cost of checks for loop "
3399 "versioning aliasing.\n");
3400 }
3401
3402 /* Requires loop versioning with niter checks. */
3403 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3404 {
3405 /* FIXME: Make cost depend on complexity of individual check. */
3406 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3407 vect_prologue);
3408 dump_printf (MSG_NOTE,
3409 "cost model: Adding cost of checks for loop "
3410 "versioning niters.\n");
3411 }
3412
3413 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3414 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3415 vect_prologue);
3416
3417 /* Count statements in scalar loop. Using this as scalar cost for a single
3418 iteration for now.
3419
3420 TODO: Add outer loop support.
3421
3422 TODO: Consider assigning different costs to different scalar
3423 statements. */
3424
3425 scalar_single_iter_cost
3426 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3427
3428 /* Add additional cost for the peeled instructions in prologue and epilogue
3429 loop. (For fully-masked loops there will be no peeling.)
3430
3431 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3432 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3433
3434 TODO: Build an expression that represents peel_iters for prologue and
3435 epilogue to be used in a run-time test. */
3436
3437 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3438 {
3439 peel_iters_prologue = 0;
3440 peel_iters_epilogue = 0;
3441
3442 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3443 {
3444 /* We need to peel exactly one iteration. */
3445 peel_iters_epilogue += 1;
3446 stmt_info_for_cost *si;
3447 int j;
3448 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3449 j, si)
3450 (void) add_stmt_cost (target_cost_data, si->count,
3451 si->kind, si->stmt_info, si->misalign,
3452 vect_epilogue);
3453 }
3454 }
3455 else if (npeel < 0)
3456 {
3457 peel_iters_prologue = assumed_vf / 2;
3458 dump_printf (MSG_NOTE, "cost model: "
3459 "prologue peel iters set to vf/2.\n");
3460
3461 /* If peeling for alignment is unknown, loop bound of main loop becomes
3462 unknown. */
3463 peel_iters_epilogue = assumed_vf / 2;
3464 dump_printf (MSG_NOTE, "cost model: "
3465 "epilogue peel iters set to vf/2 because "
3466 "peeling for alignment is unknown.\n");
3467
3468 /* If peeled iterations are unknown, count a taken branch and a not taken
3469 branch per peeled loop. Even if scalar loop iterations are known,
3470 vector iterations are not known since peeled prologue iterations are
3471 not known. Hence guards remain the same. */
3472 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3473 NULL, 0, vect_prologue);
3474 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3475 NULL, 0, vect_prologue);
3476 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3477 NULL, 0, vect_epilogue);
3478 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3479 NULL, 0, vect_epilogue);
3480 stmt_info_for_cost *si;
3481 int j;
3482 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3483 {
3484 (void) add_stmt_cost (target_cost_data,
3485 si->count * peel_iters_prologue,
3486 si->kind, si->stmt_info, si->misalign,
3487 vect_prologue);
3488 (void) add_stmt_cost (target_cost_data,
3489 si->count * peel_iters_epilogue,
3490 si->kind, si->stmt_info, si->misalign,
3491 vect_epilogue);
3492 }
3493 }
3494 else
3495 {
3496 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3497 stmt_info_for_cost *si;
3498 int j;
3499 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3500
3501 prologue_cost_vec.create (2);
3502 epilogue_cost_vec.create (2);
3503 peel_iters_prologue = npeel;
3504
3505 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3506 &peel_iters_epilogue,
3507 &LOOP_VINFO_SCALAR_ITERATION_COST
3508 (loop_vinfo),
3509 &prologue_cost_vec,
3510 &epilogue_cost_vec);
3511
3512 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3513 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3514 si->misalign, vect_prologue);
3515
3516 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3517 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3518 si->misalign, vect_epilogue);
3519
3520 prologue_cost_vec.release ();
3521 epilogue_cost_vec.release ();
3522 }
3523
3524 /* FORNOW: The scalar outside cost is incremented in one of the
3525 following ways:
3526
3527 1. The vectorizer checks for alignment and aliasing and generates
3528 a condition that allows dynamic vectorization. A cost model
3529 check is ANDed with the versioning condition. Hence the scalar code
3530 path now has the added cost of the versioning check.
3531
3532 if (cost > th & versioning_check)
3533 jmp to vector code
3534
3535 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3536
3537 2. The vectorizer then checks if a prologue is required. If the
3538 cost model check was not done before during versioning, it has to
3539 be done before the prologue check.
3540
3541 if (cost <= th)
3542 prologue = scalar_iters
3543 if (prologue == 0)
3544 jmp to vector code
3545 else
3546 execute prologue
3547 if (prologue == num_iters)
3548 go to exit
3549
3550 Hence the run-time scalar cost is incremented by a taken branch,
3551 plus a not-taken branch, plus a taken branch cost.
3552
3553 3. The vectorizer then checks if an epilogue is required. If the
3554 cost model check was not done before during prologue check, it
3555 has to be done with the epilogue check.
3556
3557 if (prologue == 0)
3558 jmp to vector code
3559 else
3560 execute prologue
3561 if (prologue == num_iters)
3562 go to exit
3563 vector code:
3564 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3565 jmp to epilogue
3566
3567 Hence the run-time scalar cost should be incremented by 2 taken
3568 branches.
3569
3570 TODO: The back end may reorder the BBs differently and reverse
3571 conditions/branch directions. Change the estimates below to
3572 something more reasonable. */
3573
3574 /* If the number of iterations is known and we do not do versioning, we can
3575 decide whether to vectorize at compile time. Hence the scalar version
3576 does not carry cost model guard costs. */
3577 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3578 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3579 {
3580 /* Cost model check occurs at versioning. */
3581 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3582 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3583 else
3584 {
3585 /* Cost model check occurs at prologue generation. */
3586 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3587 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3588 + vect_get_stmt_cost (cond_branch_not_taken);
3589 /* Cost model check occurs at epilogue generation. */
3590 else
3591 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3592 }
3593 }
3594
3595 /* Complete the target-specific cost calculations. */
3596 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3597 &vec_inside_cost, &vec_epilogue_cost);
3598
3599 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3600
3601 if (dump_enabled_p ())
3602 {
3603 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3604 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3605 vec_inside_cost);
3606 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3607 vec_prologue_cost);
3608 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3609 vec_epilogue_cost);
3610 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3611 scalar_single_iter_cost);
3612 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3613 scalar_outside_cost);
3614 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3615 vec_outside_cost);
3616 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3617 peel_iters_prologue);
3618 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3619 peel_iters_epilogue);
3620 }
3621
3622 /* Calculate number of iterations required to make the vector version
3623 profitable, relative to the loop bodies only. The following condition
3624 must hold true:
3625 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3626 where
3627 SIC = scalar iteration cost, VIC = vector iteration cost,
3628 VOC = vector outside cost, VF = vectorization factor,
3629 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
3630 SOC = scalar outside cost for run time cost model check. */
3631
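/* For illustration, with assumed numbers SIC = 4, VIC = 8, VOC = 40,
   SOC = 10, VF = 4 and no peeling, the condition above becomes

     niters * (SIC * VF - VIC) > (VOC - SOC) * VF
     niters * 8 > 120   i.e.   niters > 15

   so the code below computes 120 / 8 = 15 and then bumps the result to 16
   because 15 iterations only break even. */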
3632 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3633 {
3634 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3635 * assumed_vf
3636 - vec_inside_cost * peel_iters_prologue
3637 - vec_inside_cost * peel_iters_epilogue);
3638 if (min_profitable_iters <= 0)
3639 min_profitable_iters = 0;
3640 else
3641 {
3642 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3643 - vec_inside_cost);
3644
3645 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3646 <= (((int) vec_inside_cost * min_profitable_iters)
3647 + (((int) vec_outside_cost - scalar_outside_cost)
3648 * assumed_vf)))
3649 min_profitable_iters++;
3650 }
3651 }
3652 /* The vector version will never be profitable. */
3653 else
3654 {
3655 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3656 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3657 "vectorization did not happen for a simd loop");
3658
3659 if (dump_enabled_p ())
3660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3661 "cost model: the vector iteration cost = %d "
3662 "divided by the scalar iteration cost = %d "
3663 "is greater or equal to the vectorization factor = %d"
3664 ".\n",
3665 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3666 *ret_min_profitable_niters = -1;
3667 *ret_min_profitable_estimate = -1;
3668 return;
3669 }
3670
3671 dump_printf (MSG_NOTE,
3672 " Calculated minimum iters for profitability: %d\n",
3673 min_profitable_iters);
3674
3675 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3676 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3677 /* We want the vectorized loop to execute at least once. */
3678 min_profitable_iters = assumed_vf + peel_iters_prologue;
3679
3680 if (dump_enabled_p ())
3681 dump_printf_loc (MSG_NOTE, vect_location,
3682 " Runtime profitability threshold = %d\n",
3683 min_profitable_iters);
3684
3685 *ret_min_profitable_niters = min_profitable_iters;
3686
3687 /* Calculate number of iterations required to make the vector version
3688 profitable, relative to the loop bodies only.
3689
3690 The non-vectorized variant costs SIC * niters and must win over the vector
3691 variant on the expected loop trip count. The following condition must hold:
3692 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3693
3694 if (vec_outside_cost <= 0)
3695 min_profitable_estimate = 0;
3696 else
3697 {
3698 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3699 * assumed_vf
3700 - vec_inside_cost * peel_iters_prologue
3701 - vec_inside_cost * peel_iters_epilogue)
3702 / ((scalar_single_iter_cost * assumed_vf)
3703 - vec_inside_cost);
3704 }
3705 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3706 if (dump_enabled_p ())
3707 dump_printf_loc (MSG_NOTE, vect_location,
3708 " Static estimate profitability threshold = %d\n",
3709 min_profitable_estimate);
3710
3711 *ret_min_profitable_estimate = min_profitable_estimate;
3712 }
3713
3714 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3715 vector elements (not bits) for a vector with NELT elements. */
3716 static void
3717 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3718 vec_perm_builder *sel)
3719 {
3720 /* The encoding is a single stepped pattern. Any wrap-around is handled
3721 by vec_perm_indices. */
3722 sel->new_vector (nelt, 1, 3);
3723 for (unsigned int i = 0; i < 3; i++)
3724 sel->quick_push (i + offset);
3725 }
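/* For illustration: with OFFSET == 2 and NELT == 8 the stepped pattern above
   expands to the selector { 2, 3, 4, 5, 6, 7, 8, 9 }.  Indices 8 and 9 pick
   elements of the second vec_perm operand, which is exactly a whole-vector
   shift by two elements. */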
3726
3727 /* Checks whether the target supports whole-vector shifts for vectors of mode
3728 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3729 it supports vec_perm_const with masks for all necessary shift amounts. */
3730 static bool
3731 have_whole_vector_shift (machine_mode mode)
3732 {
3733 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3734 return true;
3735
3736 /* Variable-length vectors should be handled via the optab. */
3737 unsigned int nelt;
3738 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3739 return false;
3740
3741 vec_perm_builder sel;
3742 vec_perm_indices indices;
3743 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3744 {
3745 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3746 indices.new_vector (sel, 2, nelt);
3747 if (!can_vec_perm_const_p (mode, indices, false))
3748 return false;
3749 }
3750 return true;
3751 }
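/* For illustration: for a fixed-width mode with eight elements the loop
   above probes shift amounts 4, 2 and 1, which are the only offsets the
   shift-based reduction epilogue needs; variable-length modes must provide
   vec_shr_optab instead. */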
3752
3753 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3754 functions. Design better to avoid maintenance issues. */
3755
3756 /* Function vect_model_reduction_cost.
3757
3758 Models cost for a reduction operation, including the vector ops
3759 generated within the strip-mine loop, the initial definition before
3760 the loop, and the epilogue code that must be generated. */
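/* For illustration, assuming a plain sum reduction with NCOPIES == 1 and a
   direct REDUC_FN: the function below records one scalar_to_vec in the
   prologue (building the initial {0,...,0} vector), one vector_stmt per
   copy in the loop body, and one vector_stmt plus one vec_to_scalar in the
   epilogue for the final reduction and extract. */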
3761
3762 static void
3763 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3764 int ncopies, stmt_vector_for_cost *cost_vec)
3765 {
3766 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3767 enum tree_code code;
3768 optab optab;
3769 tree vectype;
3770 machine_mode mode;
3771 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3772 struct loop *loop = NULL;
3773
3774 if (loop_vinfo)
3775 loop = LOOP_VINFO_LOOP (loop_vinfo);
3776
3777 /* Condition reductions generate two reductions in the loop. */
3778 vect_reduction_type reduction_type
3779 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3780 if (reduction_type == COND_REDUCTION)
3781 ncopies *= 2;
3782
3783 vectype = STMT_VINFO_VECTYPE (stmt_info);
3784 mode = TYPE_MODE (vectype);
3785 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3786
3787 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3788
3789 if (reduction_type == EXTRACT_LAST_REDUCTION
3790 || reduction_type == FOLD_LEFT_REDUCTION)
3791 {
3792 /* No extra instructions needed in the prologue. */
3793 prologue_cost = 0;
3794
3795 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3796 /* Count one reduction-like operation per vector. */
3797 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3798 stmt_info, 0, vect_body);
3799 else
3800 {
3801 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3802 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3803 inside_cost = record_stmt_cost (cost_vec, nelements,
3804 vec_to_scalar, stmt_info, 0,
3805 vect_body);
3806 inside_cost += record_stmt_cost (cost_vec, nelements,
3807 scalar_stmt, stmt_info, 0,
3808 vect_body);
3809 }
3810 }
3811 else
3812 {
3813 /* Add in cost for initial definition.
3814 For cond reduction we have four vectors: initial index, step,
3815 initial result of the data reduction, initial value of the index
3816 reduction. */
3817 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3818 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3819 scalar_to_vec, stmt_info, 0,
3820 vect_prologue);
3821
3822 /* Cost of reduction op inside loop. */
3823 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3824 stmt_info, 0, vect_body);
3825 }
3826
3827 /* Determine cost of epilogue code.
3828
3829 We have a reduction operator that will reduce the vector in one statement.
3830 Also requires scalar extract. */
3831
3832 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3833 {
3834 if (reduc_fn != IFN_LAST)
3835 {
3836 if (reduction_type == COND_REDUCTION)
3837 {
3838 /* An EQ stmt and a COND_EXPR stmt. */
3839 epilogue_cost += record_stmt_cost (cost_vec, 2,
3840 vector_stmt, stmt_info, 0,
3841 vect_epilogue);
3842 /* Reduction of the max index and a reduction of the found
3843 values. */
3844 epilogue_cost += record_stmt_cost (cost_vec, 2,
3845 vec_to_scalar, stmt_info, 0,
3846 vect_epilogue);
3847 /* A broadcast of the max value. */
3848 epilogue_cost += record_stmt_cost (cost_vec, 1,
3849 scalar_to_vec, stmt_info, 0,
3850 vect_epilogue);
3851 }
3852 else
3853 {
3854 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3855 stmt_info, 0, vect_epilogue);
3856 epilogue_cost += record_stmt_cost (cost_vec, 1,
3857 vec_to_scalar, stmt_info, 0,
3858 vect_epilogue);
3859 }
3860 }
3861 else if (reduction_type == COND_REDUCTION)
3862 {
3863 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3864 /* Extraction of scalar elements. */
3865 epilogue_cost += record_stmt_cost (cost_vec,
3866 2 * estimated_nunits,
3867 vec_to_scalar, stmt_info, 0,
3868 vect_epilogue);
3869 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3870 epilogue_cost += record_stmt_cost (cost_vec,
3871 2 * estimated_nunits - 3,
3872 scalar_stmt, stmt_info, 0,
3873 vect_epilogue);
3874 }
3875 else if (reduction_type == EXTRACT_LAST_REDUCTION
3876 || reduction_type == FOLD_LEFT_REDUCTION)
3877 /* No extra instructions needed in the epilogue. */
3878 ;
3879 else
3880 {
3881 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3882 tree bitsize =
3883 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3884 int element_bitsize = tree_to_uhwi (bitsize);
3885 int nelements = vec_size_in_bits / element_bitsize;
3886
3887 if (code == COND_EXPR)
3888 code = MAX_EXPR;
3889
3890 optab = optab_for_tree_code (code, vectype, optab_default);
3891
3892 /* We have a whole vector shift available. */
3893 if (optab != unknown_optab
3894 && VECTOR_MODE_P (mode)
3895 && optab_handler (optab, mode) != CODE_FOR_nothing
3896 && have_whole_vector_shift (mode))
3897 {
3898 /* Final reduction via vector shifts and the reduction operator.
3899 Also requires scalar extract. */
3900 epilogue_cost += record_stmt_cost (cost_vec,
3901 exact_log2 (nelements) * 2,
3902 vector_stmt, stmt_info, 0,
3903 vect_epilogue);
3904 epilogue_cost += record_stmt_cost (cost_vec, 1,
3905 vec_to_scalar, stmt_info, 0,
3906 vect_epilogue);
3907 }
3908 else
3909 /* Use extracts and reduction op for final reduction. For N
3910 elements, we have N extracts and N-1 reduction ops. */
3911 epilogue_cost += record_stmt_cost (cost_vec,
3912 nelements + nelements - 1,
3913 vector_stmt, stmt_info, 0,
3914 vect_epilogue);
3915 }
3916 }
3917
3918 if (dump_enabled_p ())
3919 dump_printf (MSG_NOTE,
3920 "vect_model_reduction_cost: inside_cost = %d, "
3921 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3922 prologue_cost, epilogue_cost);
3923 }
3924
3925
3926 /* Function vect_model_induction_cost.
3927
3928 Models cost for induction operations. */
3929
3930 static void
3931 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3932 stmt_vector_for_cost *cost_vec)
3933 {
3934 unsigned inside_cost, prologue_cost;
3935
3936 if (PURE_SLP_STMT (stmt_info))
3937 return;
3938
3939 /* loop cost for vec_loop. */
3940 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3941 stmt_info, 0, vect_body);
3942
3943 /* prologue cost for vec_init and vec_step. */
3944 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3945 stmt_info, 0, vect_prologue);
3946
3947 if (dump_enabled_p ())
3948 dump_printf_loc (MSG_NOTE, vect_location,
3949 "vect_model_induction_cost: inside_cost = %d, "
3950 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3951 }
3952
3953
3954
3955 /* Function get_initial_def_for_reduction
3956
3957 Input:
3958 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3959 INIT_VAL - the initial value of the reduction variable
3960
3961 Output:
3962 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3963 of the reduction (used for adjusting the epilog - see below).
3964 Return a vector variable, initialized according to the operation that
3965 STMT_VINFO performs. This vector will be used as the initial value
3966 of the vector of partial results.
3967
3968 Option1 (adjust in epilog): Initialize the vector as follows:
3969 add/bit or/xor: [0,0,...,0,0]
3970 mult/bit and: [1,1,...,1,1]
3971 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3972 and when necessary (e.g. add/mult case) let the caller know
3973 that it needs to adjust the result by init_val.
3974
3975 Option2: Initialize the vector as follows:
3976 add/bit or/xor: [init_val,0,0,...,0]
3977 mult/bit and: [init_val,1,1,...,1]
3978 min/max/cond_expr: [init_val,init_val,...,init_val]
3979 and no adjustments are needed.
3980
3981 For example, for the following code:
3982
3983 s = init_val;
3984 for (i=0;i<n;i++)
3985 s = s + a[i];
3986
3987 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
3988 For a vector of 4 units, we want to return either [0,0,0,init_val],
3989 or [0,0,0,0] and let the caller know that it needs to adjust
3990 the result at the end by 'init_val'.
3991
3992 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3993 is not NULL, because its initialization vector is simpler (same element in
3994 all entries), and Option2 otherwise.
3995
3996 A cost model should help decide between these two schemes. */
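/* For illustration: for 's *= a[i]' with init_val == 5 and a four-element
   vector type, Option1 builds {1,1,1,1} and reports an adjustment of 5,
   while Option2 builds {5,1,1,1} and needs no adjustment. */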
3997
3998 tree
3999 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4000 tree *adjustment_def)
4001 {
4002 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4003 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4004 tree scalar_type = TREE_TYPE (init_val);
4005 tree vectype = get_vectype_for_scalar_type (scalar_type);
4006 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4007 tree def_for_init;
4008 tree init_def;
4009 REAL_VALUE_TYPE real_init_val = dconst0;
4010 int int_init_val = 0;
4011 gimple_seq stmts = NULL;
4012
4013 gcc_assert (vectype);
4014
4015 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4016 || SCALAR_FLOAT_TYPE_P (scalar_type));
4017
4018 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4019 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4020
4021 vect_reduction_type reduction_type
4022 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4023
4024 switch (code)
4025 {
4026 case WIDEN_SUM_EXPR:
4027 case DOT_PROD_EXPR:
4028 case SAD_EXPR:
4029 case PLUS_EXPR:
4030 case MINUS_EXPR:
4031 case BIT_IOR_EXPR:
4032 case BIT_XOR_EXPR:
4033 case MULT_EXPR:
4034 case BIT_AND_EXPR:
4035 {
4036 /* ADJUSTMENT_DEF is NULL when called from
4037 vect_create_epilog_for_reduction to vectorize a double reduction. */
4038 if (adjustment_def)
4039 *adjustment_def = init_val;
4040
4041 if (code == MULT_EXPR)
4042 {
4043 real_init_val = dconst1;
4044 int_init_val = 1;
4045 }
4046
4047 if (code == BIT_AND_EXPR)
4048 int_init_val = -1;
4049
4050 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4051 def_for_init = build_real (scalar_type, real_init_val);
4052 else
4053 def_for_init = build_int_cst (scalar_type, int_init_val);
4054
4055 if (adjustment_def)
4056 /* Option1: the first element is '0' or '1' as well. */
4057 init_def = gimple_build_vector_from_val (&stmts, vectype,
4058 def_for_init);
4059 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4060 {
4061 /* Option2 (variable length): the first element is INIT_VAL. */
4062 init_def = gimple_build_vector_from_val (&stmts, vectype,
4063 def_for_init);
4064 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4065 vectype, init_def, init_val);
4066 }
4067 else
4068 {
4069 /* Option2: the first element is INIT_VAL. */
4070 tree_vector_builder elts (vectype, 1, 2);
4071 elts.quick_push (init_val);
4072 elts.quick_push (def_for_init);
4073 init_def = gimple_build_vector (&stmts, &elts);
4074 }
4075 }
4076 break;
4077
4078 case MIN_EXPR:
4079 case MAX_EXPR:
4080 case COND_EXPR:
4081 {
4082 if (adjustment_def)
4083 {
4084 *adjustment_def = NULL_TREE;
4085 if (reduction_type != COND_REDUCTION
4086 && reduction_type != EXTRACT_LAST_REDUCTION)
4087 {
4088 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4089 break;
4090 }
4091 }
4092 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4093 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4094 }
4095 break;
4096
4097 default:
4098 gcc_unreachable ();
4099 }
4100
4101 if (stmts)
4102 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4103 return init_def;
4104 }
4105
4106 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4107 NUMBER_OF_VECTORS is the number of vector defs to create.
4108 If NEUTRAL_OP is nonnull, introducing extra elements of that
4109 value will not change the result. */
4110
4111 static void
4112 get_initial_defs_for_reduction (slp_tree slp_node,
4113 vec<tree> *vec_oprnds,
4114 unsigned int number_of_vectors,
4115 bool reduc_chain, tree neutral_op)
4116 {
4117 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4118 stmt_vec_info stmt_vinfo = stmts[0];
4119 unsigned HOST_WIDE_INT nunits;
4120 unsigned j, number_of_places_left_in_vector;
4121 tree vector_type;
4122 tree vop;
4123 int group_size = stmts.length ();
4124 unsigned int vec_num, i;
4125 unsigned number_of_copies = 1;
4126 vec<tree> voprnds;
4127 voprnds.create (number_of_vectors);
4128 struct loop *loop;
4129 auto_vec<tree, 16> permute_results;
4130
4131 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4132
4133 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4134
4135 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4136 gcc_assert (loop);
4137 edge pe = loop_preheader_edge (loop);
4138
4139 gcc_assert (!reduc_chain || neutral_op);
4140
4141 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4142 created vectors. It is greater than 1 if unrolling is performed.
4143
4144 For example, we have two scalar operands, s1 and s2 (e.g., group of
4145 strided accesses of size two), while NUNITS is four (i.e., four scalars
4146 of this type can be packed in a vector). The output vector will contain
4147 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4148 will be 2).
4149
4150 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4151 vectors containing the operands.
4152
4153 For example, NUNITS is four as before, and the group size is 8
4154 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4155 {s5, s6, s7, s8}. */
4156
4157 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4158 nunits = group_size;
4159
4160 number_of_copies = nunits * number_of_vectors / group_size;
4161
4162 number_of_places_left_in_vector = nunits;
4163 bool constant_p = true;
4164 tree_vector_builder elts (vector_type, nunits, 1);
4165 elts.quick_grow (nunits);
4166 for (j = 0; j < number_of_copies; j++)
4167 {
4168 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
4169 {
4170 tree op;
4171 /* Get the def before the loop. In reduction chain we have only
4172 one initial value. */
4173 if ((j != (number_of_copies - 1)
4174 || (reduc_chain && i != 0))
4175 && neutral_op)
4176 op = neutral_op;
4177 else
4178 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4179
4180 /* Create 'vect_ = {op0,op1,...,opn}'. */
4181 number_of_places_left_in_vector--;
4182 elts[number_of_places_left_in_vector] = op;
4183 if (!CONSTANT_CLASS_P (op))
4184 constant_p = false;
4185
4186 if (number_of_places_left_in_vector == 0)
4187 {
4188 gimple_seq ctor_seq = NULL;
4189 tree init;
4190 if (constant_p && !neutral_op
4191 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4192 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4193 /* Build the vector directly from ELTS. */
4194 init = gimple_build_vector (&ctor_seq, &elts);
4195 else if (neutral_op)
4196 {
4197 /* Build a vector of the neutral value and shift the
4198 other elements into place. */
4199 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4200 neutral_op);
4201 int k = nunits;
4202 while (k > 0 && elts[k - 1] == neutral_op)
4203 k -= 1;
4204 while (k > 0)
4205 {
4206 k -= 1;
4207 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4208 vector_type, init, elts[k]);
4209 }
4210 }
4211 else
4212 {
4213 /* First time round, duplicate ELTS to fill the
4214 required number of vectors, then cherry pick the
4215 appropriate result for each iteration. */
4216 if (vec_oprnds->is_empty ())
4217 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4218 number_of_vectors,
4219 permute_results);
4220 init = permute_results[number_of_vectors - j - 1];
4221 }
4222 if (ctor_seq != NULL)
4223 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4224 voprnds.quick_push (init);
4225
4226 number_of_places_left_in_vector = nunits;
4227 elts.new_vector (vector_type, nunits, 1);
4228 elts.quick_grow (nunits);
4229 constant_p = true;
4230 }
4231 }
4232 }
4233
4234 /* Since the vectors are created in the reverse order, we should invert
4235 them. */
4236 vec_num = voprnds.length ();
4237 for (j = vec_num; j != 0; j--)
4238 {
4239 vop = voprnds[j - 1];
4240 vec_oprnds->quick_push (vop);
4241 }
4242
4243 voprnds.release ();
4244
4245 /* In case VF is greater than the unrolling factor needed for the SLP
4246 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4247 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4248 to replicate the vectors. */
4249 tree neutral_vec = NULL;
4250 while (number_of_vectors > vec_oprnds->length ())
4251 {
4252 if (neutral_op)
4253 {
4254 if (!neutral_vec)
4255 {
4256 gimple_seq ctor_seq = NULL;
4257 neutral_vec = gimple_build_vector_from_val
4258 (&ctor_seq, vector_type, neutral_op);
4259 if (ctor_seq != NULL)
4260 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4261 }
4262 vec_oprnds->quick_push (neutral_vec);
4263 }
4264 else
4265 {
4266 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4267 vec_oprnds->quick_push (vop);
4268 }
4269 }
4270 }
4271
4272
4273 /* Function vect_create_epilog_for_reduction
4274
4275 Create code at the loop-epilog to finalize the result of a reduction
4276 computation.
4277
4278 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4279 reduction statements.
4280 STMT_INFO is the scalar reduction stmt that is being vectorized.
4281 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4282 number of elements that we can fit in a vectype (nunits). In this case
4283 we have to generate more than one vector stmt - i.e - we need to "unroll"
4284 the vector stmt by a factor VF/nunits. For more details see documentation
4285 in vectorizable_operation.
4286 REDUC_FN is the internal function for the epilog reduction.
4287 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4288 computation.
4289 REDUC_INDEX is the index of the operand in the right hand side of the
4290 statement that is defined by REDUCTION_PHI.
4291 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4292 SLP_NODE is an SLP node containing a group of reduction statements. The
4293 first one in this group is STMT_INFO.
4294 INDUC_VAL is, for INTEGER_INDUC_COND_REDUCTION, the value to use when the
4295 COND_EXPR is never true in the loop. For MAX_EXPR it needs to be smaller
4296 than any value of the IV in the loop, and for MIN_EXPR larger than any
4297 value of the IV in the loop.
4298 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4299 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4300 null if this is not an SLP reduction.
4301
4302 This function:
4303 1. Creates the reduction def-use cycles: sets the arguments for
4304 REDUCTION_PHIS:
4305 The loop-entry argument is the vectorized initial-value of the reduction.
4306 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4307 sums.
4308 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4309 by calling the function specified by REDUC_FN if available, or by
4310 other means (whole-vector shifts or a scalar loop).
4311 The function also creates a new phi node at the loop exit to preserve
4312 loop-closed form, as illustrated below.
4313
4314 The flow at the entry to this function:
4315
4316 loop:
4317 vec_def = phi <null, null> # REDUCTION_PHI
4318 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4319 s_loop = scalar_stmt # (scalar) STMT_INFO
4320 loop_exit:
4321 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4322 use <s_out0>
4323 use <s_out0>
4324
4325 The above is transformed by this function into:
4326
4327 loop:
4328 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4329 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4330 s_loop = scalar_stmt # (scalar) STMT_INFO
4331 loop_exit:
4332 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4333 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4334 v_out2 = reduce <v_out1>
4335 s_out3 = extract_field <v_out2, 0>
4336 s_out4 = adjust_result <s_out3>
4337 use <s_out4>
4338 use <s_out4>
4339 */
4340
4341 static void
4342 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4343 stmt_vec_info stmt_info,
4344 gimple *reduc_def_stmt,
4345 int ncopies, internal_fn reduc_fn,
4346 vec<stmt_vec_info> reduction_phis,
4347 bool double_reduc,
4348 slp_tree slp_node,
4349 slp_instance slp_node_instance,
4350 tree induc_val, enum tree_code induc_code,
4351 tree neutral_op)
4352 {
4353 stmt_vec_info prev_phi_info;
4354 tree vectype;
4355 machine_mode mode;
4356 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4357 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4358 basic_block exit_bb;
4359 tree scalar_dest;
4360 tree scalar_type;
4361 gimple *new_phi = NULL, *phi;
4362 stmt_vec_info phi_info;
4363 gimple_stmt_iterator exit_gsi;
4364 tree vec_dest;
4365 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4366 gimple *epilog_stmt = NULL;
4367 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4368 gimple *exit_phi;
4369 tree bitsize;
4370 tree adjustment_def = NULL;
4371 tree vec_initial_def = NULL;
4372 tree expr, def, initial_def = NULL;
4373 tree orig_name, scalar_result;
4374 imm_use_iterator imm_iter, phi_imm_iter;
4375 use_operand_p use_p, phi_use_p;
4376 gimple *use_stmt;
4377 stmt_vec_info reduction_phi_info = NULL;
4378 bool nested_in_vect_loop = false;
4379 auto_vec<gimple *> new_phis;
4380 auto_vec<stmt_vec_info> inner_phis;
4381 int j, i;
4382 auto_vec<tree> scalar_results;
4383 unsigned int group_size = 1, k, ratio;
4384 auto_vec<tree> vec_initial_defs;
4385 auto_vec<gimple *> phis;
4386 bool slp_reduc = false;
4387 bool direct_slp_reduc;
4388 tree new_phi_result;
4389 stmt_vec_info inner_phi = NULL;
4390 tree induction_index = NULL_TREE;
4391
4392 if (slp_node)
4393 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4394
4395 if (nested_in_vect_loop_p (loop, stmt_info))
4396 {
4397 outer_loop = loop;
4398 loop = loop->inner;
4399 nested_in_vect_loop = true;
4400 gcc_assert (!slp_node);
4401 }
4402
4403 vectype = STMT_VINFO_VECTYPE (stmt_info);
4404 gcc_assert (vectype);
4405 mode = TYPE_MODE (vectype);
4406
4407 /* 1. Create the reduction def-use cycle:
4408 Set the arguments of REDUCTION_PHIS, i.e., transform
4409
4410 loop:
4411 vec_def = phi <null, null> # REDUCTION_PHI
4412 VECT_DEF = vector_stmt # vectorized form of STMT
4413 ...
4414
4415 into:
4416
4417 loop:
4418 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4419 VECT_DEF = vector_stmt # vectorized form of STMT
4420 ...
4421
4422 (in case of SLP, do it for all the phis). */
4423
4424 /* Get the loop-entry arguments. */
4425 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4426 if (slp_node)
4427 {
4428 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4429 vec_initial_defs.reserve (vec_num);
4430 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4431 &vec_initial_defs, vec_num,
4432 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4433 neutral_op);
4434 }
4435 else
4436 {
4437 /* Get at the scalar def before the loop, that defines the initial value
4438 of the reduction variable. */
4439 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4440 loop_preheader_edge (loop));
4441 /* Optimize: for REDUC_MAX, if initial_def is smaller than the base
4442 and we can't use zero for induc_val, use initial_def instead. Similarly
4443 for REDUC_MIN when initial_def is larger than the base. */
4444 if (TREE_CODE (initial_def) == INTEGER_CST
4445 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4446 == INTEGER_INDUC_COND_REDUCTION)
4447 && !integer_zerop (induc_val)
4448 && ((induc_code == MAX_EXPR
4449 && tree_int_cst_lt (initial_def, induc_val))
4450 || (induc_code == MIN_EXPR
4451 && tree_int_cst_lt (induc_val, initial_def))))
4452 induc_val = initial_def;
4453
4454 if (double_reduc)
4455 /* In case of double reduction we only create a vector variable
4456 to be put in the reduction phi node. The actual statement
4457 creation is done later in this function. */
4458 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4459 else if (nested_in_vect_loop)
4460 {
4461 /* Do not use an adjustment def as that case is not supported
4462 correctly if ncopies is not one. */
4463 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4464 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4465 stmt_info);
4466 }
4467 else
4468 vec_initial_def
4469 = get_initial_def_for_reduction (stmt_info, initial_def,
4470 &adjustment_def);
4471 vec_initial_defs.create (1);
4472 vec_initial_defs.quick_push (vec_initial_def);
4473 }
4474
4475 /* Set phi nodes arguments. */
4476 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4477 {
4478 tree vec_init_def = vec_initial_defs[i];
4479 tree def = vect_defs[i];
4480 for (j = 0; j < ncopies; j++)
4481 {
4482 if (j != 0)
4483 {
4484 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4485 if (nested_in_vect_loop)
4486 vec_init_def
4487 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4488 }
4489
4490 /* Set the loop-entry arg of the reduction-phi. */
4491
4492 gphi *phi = as_a <gphi *> (phi_info->stmt);
4493 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4494 == INTEGER_INDUC_COND_REDUCTION)
4495 {
4496 /* Initialise the reduction phi to zero. This prevents non-zero
4497 initial values from interfering with the reduction op. */
4498 gcc_assert (ncopies == 1);
4499 gcc_assert (i == 0);
4500
4501 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4502 tree induc_val_vec
4503 = build_vector_from_val (vec_init_def_type, induc_val);
4504
4505 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4506 UNKNOWN_LOCATION);
4507 }
4508 else
4509 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4510 UNKNOWN_LOCATION);
4511
4512 /* Set the loop-latch arg for the reduction-phi. */
4513 if (j > 0)
4514 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4515
4516 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4517
4518 if (dump_enabled_p ())
4519 dump_printf_loc (MSG_NOTE, vect_location,
4520 "transform reduction: created def-use cycle: %G%G",
4521 phi, SSA_NAME_DEF_STMT (def));
4522 }
4523 }
4524
4525 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4526 which is updated with the current index of the loop for every match of
4527 the original loop's cond_expr (VEC_STMT). This results in a vector
4528 containing the last time the condition passed for that vector lane.
4529 The first match will be a 1 to allow 0 to be used for non-matching
4530 indexes. If there are no matches at all then the vector will be all
4531 zeroes. */
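/* For illustration, with an assumed vectorization factor of 4: the index
   IV below takes the values {1,2,3,4}, {5,6,7,8}, ...  If the condition
   last held in lane 2 during the fourth vector iteration, that lane of the
   index vector ends up holding 3 + 3*4 = 15; lanes that never match keep
   the value 0. */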
4532 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4533 {
4534 tree indx_before_incr, indx_after_incr;
4535 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4536
4537 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4538 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4539
4540 int scalar_precision
4541 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4542 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4543 tree cr_index_vector_type = build_vector_type
4544 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4545
4546 /* First we create a simple vector induction variable which starts
4547 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4548 vector size (STEP). */
4549
4550 /* Create a {1,2,3,...} vector. */
4551 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4552
4553 /* Create a vector of the step value. */
4554 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4555 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4556
4557 /* Create an induction variable. */
4558 gimple_stmt_iterator incr_gsi;
4559 bool insert_after;
4560 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4561 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4562 insert_after, &indx_before_incr, &indx_after_incr);
4563
4564 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4565 filled with zeros (VEC_ZERO). */
4566
4567 /* Create a vector of 0s. */
4568 tree zero = build_zero_cst (cr_index_scalar_type);
4569 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4570
4571 /* Create a vector phi node. */
4572 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4573 new_phi = create_phi_node (new_phi_tree, loop->header);
4574 loop_vinfo->add_stmt (new_phi);
4575 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4576 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4577
4578 /* Now take the condition from the loop's original cond_expr
4579 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4580 every match uses values from the induction variable
4581 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4582 (NEW_PHI_TREE).
4583 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4584 the new cond_expr (INDEX_COND_EXPR). */
4585
4586 /* Duplicate the condition from vec_stmt. */
4587 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4588
4589 /* Create a conditional, where the condition is taken from vec_stmt
4590 (CCOMPARE), the then-value is the induction index (INDEX_BEFORE_INCR)
4591 and the else-value is the phi (NEW_PHI_TREE). */
4592 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4593 ccompare, indx_before_incr,
4594 new_phi_tree);
4595 induction_index = make_ssa_name (cr_index_vector_type);
4596 gimple *index_condition = gimple_build_assign (induction_index,
4597 index_cond_expr);
4598 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4599 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4600 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4601
4602 /* Update the phi with the vec cond. */
4603 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4604 loop_latch_edge (loop), UNKNOWN_LOCATION);
4605 }
4606
4607 /* 2. Create epilog code.
4608 The reduction epilog code operates across the elements of the vector
4609 of partial results computed by the vectorized loop.
4610 The reduction epilog code consists of:
4611
4612 step 1: compute the scalar result in a vector (v_out2)
4613 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4614 step 3: adjust the scalar result (s_out3) if needed.
4615
4616 Step 1 can be accomplished using one of the following three schemes:
4617 (scheme 1) using reduc_fn, if available.
4618 (scheme 2) using whole-vector shifts, if available.
4619 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4620 combined.
4621
4622 The overall epilog code looks like this:
4623
4624 s_out0 = phi <s_loop> # original EXIT_PHI
4625 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4626 v_out2 = reduce <v_out1> # step 1
4627 s_out3 = extract_field <v_out2, 0> # step 2
4628 s_out4 = adjust_result <s_out3> # step 3
4629
4630 (step 3 is optional, and steps 1 and 2 may be combined).
4631 Lastly, the uses of s_out0 are replaced by s_out4. */
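/* For illustration, assuming a simple sum reduction handled by scheme 1,
   the generated epilogue is essentially

     v_out1 = PHI <VECT_DEF>
     s_out3 = .REDUC_PLUS (v_out1);
     s_out4 = s_out3 + adjustment;   <- only if an adjustment is needed

   Schemes 2 and 3 replace the .REDUC_PLUS call by log2 (nunits) shift/add
   steps or by scalar extracts and adds, respectively. */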
4632
4633
4634 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4635 v_out1 = phi <VECT_DEF>
4636 Store them in NEW_PHIS. */
4637
4638 exit_bb = single_exit (loop)->dest;
4639 prev_phi_info = NULL;
4640 new_phis.create (vect_defs.length ());
4641 FOR_EACH_VEC_ELT (vect_defs, i, def)
4642 {
4643 for (j = 0; j < ncopies; j++)
4644 {
4645 tree new_def = copy_ssa_name (def);
4646 phi = create_phi_node (new_def, exit_bb);
4647 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4648 if (j == 0)
4649 new_phis.quick_push (phi);
4650 else
4651 {
4652 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4653 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4654 }
4655
4656 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4657 prev_phi_info = phi_info;
4658 }
4659 }
4660
4661 /* The epilogue is created for the outer-loop, i.e., for the loop being
4662 vectorized. Create exit phis for the outer loop. */
4663 if (double_reduc)
4664 {
4665 loop = outer_loop;
4666 exit_bb = single_exit (loop)->dest;
4667 inner_phis.create (vect_defs.length ());
4668 FOR_EACH_VEC_ELT (new_phis, i, phi)
4669 {
4670 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4671 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4672 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4673 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4674 PHI_RESULT (phi));
4675 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4676 inner_phis.quick_push (phi_info);
4677 new_phis[i] = outer_phi;
4678 while (STMT_VINFO_RELATED_STMT (phi_info))
4679 {
4680 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4681 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4682 outer_phi = create_phi_node (new_result, exit_bb);
4683 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4684 PHI_RESULT (phi_info->stmt));
4685 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4686 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4687 prev_phi_info = outer_phi_info;
4688 }
4689 }
4690 }
4691
4692 exit_gsi = gsi_after_labels (exit_bb);
4693
4694 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4695 (i.e. when reduc_fn is not available) and in the final adjustment
4696 code (if needed). Also get the original scalar reduction variable as
4697 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4698 represents a reduction pattern), the tree-code and scalar-def are
4699 taken from the original stmt that the pattern-stmt (STMT) replaces.
4700 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4701 are taken from STMT. */
4702
4703 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4704 if (orig_stmt_info != stmt_info)
4705 {
4706 /* Reduction pattern */
4707 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4708 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4709 }
4710
4711 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4712 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4713 partial results are added and not subtracted. */
4714 if (code == MINUS_EXPR)
4715 code = PLUS_EXPR;
4716
4717 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4718 scalar_type = TREE_TYPE (scalar_dest);
4719 scalar_results.create (group_size);
4720 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4721 bitsize = TYPE_SIZE (scalar_type);
4722
4723 /* In case this is a reduction in an inner-loop while vectorizing an outer
4724 loop - we don't need to extract a single scalar result at the end of the
4725 inner-loop (unless it is double reduction, i.e., the use of reduction is
4726 outside the outer-loop). The final vector of partial results will be used
4727 in the vectorized outer-loop, or reduced to a scalar result at the end of
4728 the outer-loop. */
4729 if (nested_in_vect_loop && !double_reduc)
4730 goto vect_finalize_reduction;
4731
4732 /* SLP reduction without reduction chain, e.g.,
4733 # a1 = phi <a2, a0>
4734 # b1 = phi <b2, b0>
4735 a2 = operation (a1)
4736 b2 = operation (b1) */
4737 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4738
4739 /* True if we should implement SLP_REDUC using native reduction operations
4740 instead of scalar operations. */
4741 direct_slp_reduc = (reduc_fn != IFN_LAST
4742 && slp_reduc
4743 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4744
4745 /* In case of reduction chain, e.g.,
4746 # a1 = phi <a3, a0>
4747 a2 = operation (a1)
4748 a3 = operation (a2),
4749
4750 we may end up with more than one vector result. Here we reduce them to
4751 one vector. */
4752 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4753 {
4754 tree first_vect = PHI_RESULT (new_phis[0]);
4755 gassign *new_vec_stmt = NULL;
4756 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4757 for (k = 1; k < new_phis.length (); k++)
4758 {
4759 gimple *next_phi = new_phis[k];
4760 tree second_vect = PHI_RESULT (next_phi);
4761 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4762 new_vec_stmt = gimple_build_assign (tem, code,
4763 first_vect, second_vect);
4764 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4765 first_vect = tem;
4766 }
4767
4768 new_phi_result = first_vect;
4769 if (new_vec_stmt)
4770 {
4771 new_phis.truncate (0);
4772 new_phis.safe_push (new_vec_stmt);
4773 }
4774 }
4775 /* Likewise if we couldn't use a single def-use cycle. */
4776 else if (ncopies > 1)
4777 {
4778 gcc_assert (new_phis.length () == 1);
4779 tree first_vect = PHI_RESULT (new_phis[0]);
4780 gassign *new_vec_stmt = NULL;
4781 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4782 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4783 for (int k = 1; k < ncopies; ++k)
4784 {
4785 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4786 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4787 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4788 new_vec_stmt = gimple_build_assign (tem, code,
4789 first_vect, second_vect);
4790 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4791 first_vect = tem;
4792 }
4793 new_phi_result = first_vect;
4794 new_phis.truncate (0);
4795 new_phis.safe_push (new_vec_stmt);
4796 }
4797 else
4798 new_phi_result = PHI_RESULT (new_phis[0]);
4799
4800 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4801 && reduc_fn != IFN_LAST)
4802 {
4803 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4804 various data values where the condition matched and another vector
4805 (INDUCTION_INDEX) containing all the indexes of those matches. We
4806 need to extract the last matching index (which will be the index with
4807 highest value) and use this to index into the data vector.
4808 For the case where there were no matches, the data vector will contain
4809 all default values and the index vector will be all zeros. */
4810
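/* For illustration, with assumed values: if NEW_PHI_RESULT is
   {d0, d1, d2, d3} and INDUCTION_INDEX is {0, 7, 0, 3}, the REDUC_MAX
   below yields 7, the EQ comparison selects lane 1, the VEC_COND becomes
   {0, d1, 0, 0}, and the final unsigned REDUC_MAX extracts d1. */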
4811 /* Get various versions of the type of the vector of indexes. */
4812 tree index_vec_type = TREE_TYPE (induction_index);
4813 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4814 tree index_scalar_type = TREE_TYPE (index_vec_type);
4815 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4816 (index_vec_type);
4817
4818 /* Get an unsigned integer version of the type of the data vector. */
4819 int scalar_precision
4820 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4821 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4822 tree vectype_unsigned = build_vector_type
4823 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4824
4825 /* First we need to create a vector (ZERO_VEC) of zeros and another
4826 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4827 can create using a MAX reduction and then expanding.
4828 In the case where the loop never made any matches, the max index will
4829 be zero. */
4830
4831 /* Vector of {0, 0, 0,...}. */
4832 tree zero_vec = make_ssa_name (vectype);
4833 tree zero_vec_rhs = build_zero_cst (vectype);
4834 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4835 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4836
4837 /* Find maximum value from the vector of found indexes. */
4838 tree max_index = make_ssa_name (index_scalar_type);
4839 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4840 1, induction_index);
4841 gimple_call_set_lhs (max_index_stmt, max_index);
4842 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4843
4844 /* Vector of {max_index, max_index, max_index,...}. */
4845 tree max_index_vec = make_ssa_name (index_vec_type);
4846 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4847 max_index);
4848 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4849 max_index_vec_rhs);
4850 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4851
4852 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4853 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4854 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4855 otherwise. Only one value should match, resulting in a vector
4856 (VEC_COND) with one data value and the rest zeros.
4857 In the case where the loop never made any matches, every index will
4858 match, resulting in a vector with all data values (which will all be
4859 the default value). */
4860
4861 /* Compare the max index vector to the vector of found indexes to find
4862 the position of the max value. */
4863 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4864 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4865 induction_index,
4866 max_index_vec);
4867 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4868
4869 /* Use the compare to choose either values from the data vector or
4870 zero. */
4871 tree vec_cond = make_ssa_name (vectype);
4872 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4873 vec_compare, new_phi_result,
4874 zero_vec);
4875 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4876
4877 /* Finally we need to extract the data value from the vector (VEC_COND)
4878 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4879 reduction, but because this doesn't exist, we can use a MAX reduction
4880 instead. The data value might be signed or a float so we need to cast
4881 it first.
4882 In the case where the loop never made any matches, the data values are
4883 all identical, and so will reduce down correctly. */
4884
4885 /* Make the matched data values unsigned. */
4886 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4887 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4888 vec_cond);
4889 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4890 VIEW_CONVERT_EXPR,
4891 vec_cond_cast_rhs);
4892 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4893
4894 /* Reduce down to a scalar value. */
4895 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4896 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4897 1, vec_cond_cast);
4898 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4899 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4900
4901 /* Convert the reduced value back to the result type and set as the
4902 result. */
4903 gimple_seq stmts = NULL;
4904 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4905 data_reduc);
4906 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4907 scalar_results.safe_push (new_temp);
4908 }
4909 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4910 && reduc_fn == IFN_LAST)
4911 {
4912 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4913 idx = 0;
4914 idx_val = induction_index[0];
4915 val = data_reduc[0];
4916 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4917 if (induction_index[i] > idx_val)
4918 val = data_reduc[i], idx_val = induction_index[i];
4919 return val; */
4920
4921 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4922 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4923 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4924 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4925 /* Enforced by vectorizable_reduction, which ensures we have target
4926 support before allowing a conditional reduction on variable-length
4927 vectors. */
4928 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4929 tree idx_val = NULL_TREE, val = NULL_TREE;
4930 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4931 {
4932 tree old_idx_val = idx_val;
4933 tree old_val = val;
4934 idx_val = make_ssa_name (idx_eltype);
4935 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4936 build3 (BIT_FIELD_REF, idx_eltype,
4937 induction_index,
4938 bitsize_int (el_size),
4939 bitsize_int (off)));
4940 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4941 val = make_ssa_name (data_eltype);
4942 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4943 build3 (BIT_FIELD_REF,
4944 data_eltype,
4945 new_phi_result,
4946 bitsize_int (el_size),
4947 bitsize_int (off)));
4948 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4949 if (off != 0)
4950 {
4951 tree new_idx_val = idx_val;
4952 tree new_val = val;
4953 if (off != v_size - el_size)
4954 {
4955 new_idx_val = make_ssa_name (idx_eltype);
4956 epilog_stmt = gimple_build_assign (new_idx_val,
4957 MAX_EXPR, idx_val,
4958 old_idx_val);
4959 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4960 }
4961 new_val = make_ssa_name (data_eltype);
4962 epilog_stmt = gimple_build_assign (new_val,
4963 COND_EXPR,
4964 build2 (GT_EXPR,
4965 boolean_type_node,
4966 idx_val,
4967 old_idx_val),
4968 val, old_val);
4969 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4970 idx_val = new_idx_val;
4971 val = new_val;
4972 }
4973 }
4974 /* Convert the reduced value back to the result type and set as the
4975 result. */
4976 gimple_seq stmts = NULL;
4977 val = gimple_convert (&stmts, scalar_type, val);
4978 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4979 scalar_results.safe_push (val);
4980 }
4981
4982 /* 2.3 Create the reduction code, using one of the three schemes described
4983 above. In SLP we simply need to extract all the elements from the
4984 vector (without reducing them), so we use scalar shifts. */
4985 else if (reduc_fn != IFN_LAST && !slp_reduc)
4986 {
4987 tree tmp;
4988 tree vec_elem_type;
4989
4990 /* Case 1: Create:
4991 v_out2 = reduc_expr <v_out1> */
4992
4993 if (dump_enabled_p ())
4994 dump_printf_loc (MSG_NOTE, vect_location,
4995 "Reduce using direct vector reduction.\n");
4996
4997 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4998 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4999 {
5000 tree tmp_dest
5001 = vect_create_destination_var (scalar_dest, vec_elem_type);
5002 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5003 new_phi_result);
5004 gimple_set_lhs (epilog_stmt, tmp_dest);
5005 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5006 gimple_set_lhs (epilog_stmt, new_temp);
5007 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5008
5009 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5010 new_temp);
5011 }
5012 else
5013 {
5014 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5015 new_phi_result);
5016 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5017 }
5018
5019 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5020 gimple_set_lhs (epilog_stmt, new_temp);
5021 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5022
5023 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5024 == INTEGER_INDUC_COND_REDUCTION)
5025 && !operand_equal_p (initial_def, induc_val, 0))
5026 {
5027 /* Earlier we set the initial value to be a vector of induc_val
5028 values. Check the result and, if it is induc_val, replace it
5029 with the original initial value, unless induc_val is
5030 the same as initial_def already. */
5031 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5032 induc_val);
5033
5034 tmp = make_ssa_name (new_scalar_dest);
5035 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5036 initial_def, new_temp);
5037 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5038 new_temp = tmp;
5039 }
5040
5041 scalar_results.safe_push (new_temp);
5042 }
5043 else if (direct_slp_reduc)
5044 {
5045 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5046 with the elements for other SLP statements replaced with the
5047 neutral value. We can then do a normal reduction on each vector. */
5048
5049 /* Enforced by vectorizable_reduction. */
5050 gcc_assert (new_phis.length () == 1);
5051 gcc_assert (pow2p_hwi (group_size));
5052
5053 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5054 vec<stmt_vec_info> orig_phis
5055 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5056 gimple_seq seq = NULL;
5057
5058 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5059 and the same element size as VECTYPE. */
5060 tree index = build_index_vector (vectype, 0, 1);
5061 tree index_type = TREE_TYPE (index);
5062 tree index_elt_type = TREE_TYPE (index_type);
5063 tree mask_type = build_same_sized_truth_vector_type (index_type);
5064
5065 /* Create a vector that, for each element, identifies which of
5066 the REDUC_GROUP_SIZE results should use it. */
5067 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5068 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5069 build_vector_from_val (index_type, index_mask));
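      /* Illustrative example (not part of the original code): with
	 GROUP_SIZE == 2 and a four-element vector, INDEX starts out as
	 {0, 1, 2, 3} and the BIT_AND with {1, 1, 1, 1} turns it into
	 {0, 1, 0, 1}, i.e. the even elements feed SLP result 0 and the
	 odd elements feed SLP result 1.  */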
5070
5071 /* Get a neutral vector value. This is simply a splat of the neutral
5072 scalar value if we have one, otherwise the initial scalar value
5073 is itself a neutral value. */
5074 tree vector_identity = NULL_TREE;
5075 if (neutral_op)
5076 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5077 neutral_op);
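      /* Illustrative note (not part of the original code): the neutral
	 value is the identity element of the reduction operation, e.g.
	 0 for addition and 1 for multiplication; MIN and MAX have no such
	 universal value, which is the case handled inside the loop below.  */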
5078 for (unsigned int i = 0; i < group_size; ++i)
5079 {
5080 	      /* If there's no universal neutral value, we can use the
5081 initial scalar value from the original PHI. This is used
5082 for MIN and MAX reduction, for example. */
5083 if (!neutral_op)
5084 {
5085 tree scalar_value
5086 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5087 loop_preheader_edge (loop));
5088 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5089 scalar_value);
5090 }
5091
5092 /* Calculate the equivalent of:
5093
5094 sel[j] = (index[j] == i);
5095
5096 which selects the elements of NEW_PHI_RESULT that should
5097 be included in the result. */
5098 tree compare_val = build_int_cst (index_elt_type, i);
5099 compare_val = build_vector_from_val (index_type, compare_val);
5100 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5101 index, compare_val);
5102
5103 /* Calculate the equivalent of:
5104
5105 		 vec = sel ? new_phi_result : vector_identity;
5106
5107 VEC is now suitable for a full vector reduction. */
5108 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5109 sel, new_phi_result, vector_identity);
5110
5111 /* Do the reduction and convert it to the appropriate type. */
5112 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5113 TREE_TYPE (vectype), vec);
5114 scalar = gimple_convert (&seq, scalar_type, scalar);
5115 scalar_results.safe_push (scalar);
5116 }
5117 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5118 }
5119 else
5120 {
5121 bool reduce_with_shift;
5122 tree vec_temp;
5123
5124 /* COND reductions all do the final reduction with MAX_EXPR
5125 or MIN_EXPR. */
5126 if (code == COND_EXPR)
5127 {
5128 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5129 == INTEGER_INDUC_COND_REDUCTION)
5130 code = induc_code;
5131 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5132 == CONST_COND_REDUCTION)
5133 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5134 else
5135 code = MAX_EXPR;
5136 }
5137
5138 /* See if the target wants to do the final (shift) reduction
5139 in a vector mode of smaller size and first reduce upper/lower
5140 halves against each other. */
5141 enum machine_mode mode1 = mode;
5142 tree vectype1 = vectype;
5143 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5144 unsigned sz1 = sz;
5145 if (!slp_reduc
5146 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5147 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5148
5149 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5150 reduce_with_shift = have_whole_vector_shift (mode1);
5151 if (!VECTOR_MODE_P (mode1))
5152 reduce_with_shift = false;
5153 else
5154 {
5155 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5156 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5157 reduce_with_shift = false;
5158 }
5159
5160 	  /* First reduce the vector to the vector size we should do the shift
5161 	     reduction on, by combining upper and lower halves.  */
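	  /* Illustrative example (not part of the original code): assuming a
	     16-byte V8HI accumulator {a, b, c, d, e, f, g, h} and a target
	     that prefers an 8-byte vector mode, the loop below extracts the
	     low half {a, b, c, d} and the high half {e, f, g, h} and combines
	     them with CODE, for PLUS giving {a+e, b+f, c+g, d+h}, which is
	     then reduced further.  */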
5162 new_temp = new_phi_result;
5163 while (sz > sz1)
5164 {
5165 gcc_assert (!slp_reduc);
5166 sz /= 2;
5167 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5168
5169 /* The target has to make sure we support lowpart/highpart
5170 extraction, either via direct vector extract or through
5171 		 integer mode punning.  */
5172 tree dst1, dst2;
5173 if (convert_optab_handler (vec_extract_optab,
5174 TYPE_MODE (TREE_TYPE (new_temp)),
5175 TYPE_MODE (vectype1))
5176 != CODE_FOR_nothing)
5177 {
5178 /* Extract sub-vectors directly once vec_extract becomes
5179 a conversion optab. */
5180 dst1 = make_ssa_name (vectype1);
5181 epilog_stmt
5182 = gimple_build_assign (dst1, BIT_FIELD_REF,
5183 build3 (BIT_FIELD_REF, vectype1,
5184 new_temp, TYPE_SIZE (vectype1),
5185 bitsize_int (0)));
5186 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5187 dst2 = make_ssa_name (vectype1);
5188 epilog_stmt
5189 = gimple_build_assign (dst2, BIT_FIELD_REF,
5190 build3 (BIT_FIELD_REF, vectype1,
5191 new_temp, TYPE_SIZE (vectype1),
5192 bitsize_int (sz * BITS_PER_UNIT)));
5193 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5194 }
5195 else
5196 {
5197 /* Extract via punning to appropriately sized integer mode
5198 vector. */
5199 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5200 1);
5201 tree etype = build_vector_type (eltype, 2);
5202 gcc_assert (convert_optab_handler (vec_extract_optab,
5203 TYPE_MODE (etype),
5204 TYPE_MODE (eltype))
5205 != CODE_FOR_nothing);
5206 tree tem = make_ssa_name (etype);
5207 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5208 build1 (VIEW_CONVERT_EXPR,
5209 etype, new_temp));
5210 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5211 new_temp = tem;
5212 tem = make_ssa_name (eltype);
5213 epilog_stmt
5214 = gimple_build_assign (tem, BIT_FIELD_REF,
5215 build3 (BIT_FIELD_REF, eltype,
5216 new_temp, TYPE_SIZE (eltype),
5217 bitsize_int (0)));
5218 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5219 dst1 = make_ssa_name (vectype1);
5220 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5221 build1 (VIEW_CONVERT_EXPR,
5222 vectype1, tem));
5223 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5224 tem = make_ssa_name (eltype);
5225 epilog_stmt
5226 = gimple_build_assign (tem, BIT_FIELD_REF,
5227 build3 (BIT_FIELD_REF, eltype,
5228 new_temp, TYPE_SIZE (eltype),
5229 bitsize_int (sz * BITS_PER_UNIT)));
5230 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5231 dst2 = make_ssa_name (vectype1);
5232 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5233 build1 (VIEW_CONVERT_EXPR,
5234 vectype1, tem));
5235 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5236 }
5237
5238 new_temp = make_ssa_name (vectype1);
5239 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5240 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5241 }
5242
5243 if (reduce_with_shift && !slp_reduc)
5244 {
5245 int element_bitsize = tree_to_uhwi (bitsize);
5246 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5247 for variable-length vectors and also requires direct target support
5248 for loop reductions. */
5249 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5250 int nelements = vec_size_in_bits / element_bitsize;
5251 vec_perm_builder sel;
5252 vec_perm_indices indices;
5253
5254 int elt_offset;
5255
5256 tree zero_vec = build_zero_cst (vectype1);
5257 /* Case 2: Create:
5258 for (offset = nelements/2; offset >= 1; offset/=2)
5259 {
5260 Create: va' = vec_shift <va, offset>
5261 Create: va = vop <va, va'>
5262 } */
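	  /* Illustrative example (not part of the original code): for a PLUS
	     reduction of a four-element vector va = {a, b, c, d}:
	       offset 2:  va' = {c, d, 0, 0}      va = {a+c, b+d, c, d}
	       offset 1:  va' = {b+d, c, d, 0}    va = {a+b+c+d, ...}
	     so element 0 of the final vector holds the reduced value; the
	     zero padding never reaches element 0.  */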
5263
5264 tree rhs;
5265
5266 if (dump_enabled_p ())
5267 dump_printf_loc (MSG_NOTE, vect_location,
5268 "Reduce using vector shifts\n");
5269
5270 mode1 = TYPE_MODE (vectype1);
5271 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5272 for (elt_offset = nelements / 2;
5273 elt_offset >= 1;
5274 elt_offset /= 2)
5275 {
5276 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5277 indices.new_vector (sel, 2, nelements);
5278 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5279 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5280 new_temp, zero_vec, mask);
5281 new_name = make_ssa_name (vec_dest, epilog_stmt);
5282 gimple_assign_set_lhs (epilog_stmt, new_name);
5283 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5284
5285 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5286 new_temp);
5287 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5288 gimple_assign_set_lhs (epilog_stmt, new_temp);
5289 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5290 }
5291
5292 /* 2.4 Extract the final scalar result. Create:
5293 s_out3 = extract_field <v_out2, bitpos> */
5294
5295 if (dump_enabled_p ())
5296 dump_printf_loc (MSG_NOTE, vect_location,
5297 "extract scalar result\n");
5298
5299 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5300 bitsize, bitsize_zero_node);
5301 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5302 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5303 gimple_assign_set_lhs (epilog_stmt, new_temp);
5304 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5305 scalar_results.safe_push (new_temp);
5306 }
5307 else
5308 {
5309 /* Case 3: Create:
5310 s = extract_field <v_out2, 0>
5311 for (offset = element_size;
5312 offset < vector_size;
5313 	            offset += element_size)
5314 {
5315 Create: s' = extract_field <v_out2, offset>
5316 Create: s = op <s, s'> // For non SLP cases
5317 } */
5318
5319 if (dump_enabled_p ())
5320 dump_printf_loc (MSG_NOTE, vect_location,
5321 "Reduce using scalar code.\n");
5322
5323 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5324 int element_bitsize = tree_to_uhwi (bitsize);
5325 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5326 {
5327 int bit_offset;
5328 if (gimple_code (new_phi) == GIMPLE_PHI)
5329 vec_temp = PHI_RESULT (new_phi);
5330 else
5331 vec_temp = gimple_assign_lhs (new_phi);
5332 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5333 bitsize_zero_node);
5334 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5335 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5336 gimple_assign_set_lhs (epilog_stmt, new_temp);
5337 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5338
5339 	      /* In SLP we don't need to apply the reduction operation, so we
5340 	         just collect the s' values in SCALAR_RESULTS.  */
5341 if (slp_reduc)
5342 scalar_results.safe_push (new_temp);
5343
5344 for (bit_offset = element_bitsize;
5345 bit_offset < vec_size_in_bits;
5346 bit_offset += element_bitsize)
5347 {
5348 tree bitpos = bitsize_int (bit_offset);
5349 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5350 bitsize, bitpos);
5351
5352 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5353 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5354 gimple_assign_set_lhs (epilog_stmt, new_name);
5355 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5356
5357 if (slp_reduc)
5358 {
5359 		      /* In SLP we don't need to apply the reduction operation,
5360 			 so we just collect the s' values in SCALAR_RESULTS.  */
5361 new_temp = new_name;
5362 scalar_results.safe_push (new_name);
5363 }
5364 else
5365 {
5366 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5367 new_name, new_temp);
5368 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5369 gimple_assign_set_lhs (epilog_stmt, new_temp);
5370 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5371 }
5372 }
5373 }
5374
5375 	  /* The only case where we need to reduce scalar results in SLP is
5376 	     unrolling.  If the size of SCALAR_RESULTS is greater than
5377 	     REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5378 	     REDUC_GROUP_SIZE.  */
5379 if (slp_reduc)
5380 {
5381 tree res, first_res, new_res;
5382 gimple *new_stmt;
5383
5384 /* Reduce multiple scalar results in case of SLP unrolling. */
5385 for (j = group_size; scalar_results.iterate (j, &res);
5386 j++)
5387 {
5388 first_res = scalar_results[j % group_size];
5389 new_stmt = gimple_build_assign (new_scalar_dest, code,
5390 first_res, res);
5391 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5392 gimple_assign_set_lhs (new_stmt, new_res);
5393 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5394 scalar_results[j % group_size] = new_res;
5395 }
5396 }
5397 else
5398 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5399 scalar_results.safe_push (new_temp);
5400 }
5401
5402 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5403 == INTEGER_INDUC_COND_REDUCTION)
5404 && !operand_equal_p (initial_def, induc_val, 0))
5405 {
5406 	  /* Earlier we set the initial value to be a vector of induc_val
5407 	     values.  Check the result and if it is induc_val then replace
5408 	     it with the original initial value, unless induc_val is
5409 	     the same as initial_def already.  */
5410 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5411 induc_val);
5412
5413 tree tmp = make_ssa_name (new_scalar_dest);
5414 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5415 initial_def, new_temp);
5416 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5417 scalar_results[0] = tmp;
5418 }
5419 }
5420
5421 vect_finalize_reduction:
5422
5423 if (double_reduc)
5424 loop = loop->inner;
5425
5426 /* 2.5 Adjust the final result by the initial value of the reduction
5427 variable. (When such adjustment is not needed, then
5428 'adjustment_def' is zero). For example, if code is PLUS we create:
5429 new_temp = loop_exit_def + adjustment_def */
5430
5431 if (adjustment_def)
5432 {
5433 gcc_assert (!slp_reduc);
5434 if (nested_in_vect_loop)
5435 {
5436 new_phi = new_phis[0];
5437 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5438 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5439 new_dest = vect_create_destination_var (scalar_dest, vectype);
5440 }
5441 else
5442 {
5443 new_temp = scalar_results[0];
5444 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5445 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5446 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5447 }
5448
5449 epilog_stmt = gimple_build_assign (new_dest, expr);
5450 new_temp = make_ssa_name (new_dest, epilog_stmt);
5451 gimple_assign_set_lhs (epilog_stmt, new_temp);
5452 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5453 if (nested_in_vect_loop)
5454 {
5455 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5456 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5457 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5458
5459 if (!double_reduc)
5460 scalar_results.quick_push (new_temp);
5461 else
5462 scalar_results[0] = new_temp;
5463 }
5464 else
5465 scalar_results[0] = new_temp;
5466
5467 new_phis[0] = epilog_stmt;
5468 }
5469
5470 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5471 phis with new adjusted scalar results, i.e., replace use <s_out0>
5472 with use <s_out4>.
5473
5474 Transform:
5475 loop_exit:
5476 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5477 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5478 v_out2 = reduce <v_out1>
5479 s_out3 = extract_field <v_out2, 0>
5480 s_out4 = adjust_result <s_out3>
5481 use <s_out0>
5482 use <s_out0>
5483
5484 into:
5485
5486 loop_exit:
5487 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5488 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5489 v_out2 = reduce <v_out1>
5490 s_out3 = extract_field <v_out2, 0>
5491 s_out4 = adjust_result <s_out3>
5492 use <s_out4>
5493 use <s_out4> */
5494
5495
5496   /* In an SLP reduction chain we reduce the vector results into one vector
5497      if necessary, hence we set REDUC_GROUP_SIZE to 1 here.  SCALAR_DEST is
5498      the LHS of the last stmt in the reduction chain, since we are looking
5499      for the loop exit phi node.  */
5500 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5501 {
5502 stmt_vec_info dest_stmt_info
5503 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5504 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5505 group_size = 1;
5506 }
5507
5508   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5509      case REDUC_GROUP_SIZE is greater than the vectorization factor).
5510      Therefore, we need to match SCALAR_RESULTS with the corresponding
5511      statements.  The first (REDUC_GROUP_SIZE / number of new vector stmts)
5512      scalar results correspond to the first vector stmt, etc.
5513      (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
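  /* Illustrative example (not part of the original code): with
     REDUC_GROUP_SIZE == 4 and two new vector stmts, RATIO is 2, so
     scalar_results[0] and scalar_results[1] belong to new_phis[0] while
     scalar_results[2] and scalar_results[3] belong to new_phis[1].  */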
5514 if (group_size > new_phis.length ())
5515 {
5516 ratio = group_size / new_phis.length ();
5517 gcc_assert (!(group_size % new_phis.length ()));
5518 }
5519 else
5520 ratio = 1;
5521
5522 stmt_vec_info epilog_stmt_info = NULL;
5523 for (k = 0; k < group_size; k++)
5524 {
5525 if (k % ratio == 0)
5526 {
5527 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5528 reduction_phi_info = reduction_phis[k / ratio];
5529 if (double_reduc)
5530 inner_phi = inner_phis[k / ratio];
5531 }
5532
5533 if (slp_reduc)
5534 {
5535 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5536
5537 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5538 /* SLP statements can't participate in patterns. */
5539 gcc_assert (!orig_stmt_info);
5540 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5541 }
5542
5543 phis.create (3);
5544 /* Find the loop-closed-use at the loop exit of the original scalar
5545 result. (The reduction result is expected to have two immediate uses -
5546 one at the latch block, and one at the loop exit). */
5547 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5548 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5549 && !is_gimple_debug (USE_STMT (use_p)))
5550 phis.safe_push (USE_STMT (use_p));
5551
5552       /* While we expect to have found an exit_phi because of loop-closed-ssa
5553 	 form, we can end up without one if the scalar cycle is dead.  */
5554
5555 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5556 {
5557 if (outer_loop)
5558 {
5559 stmt_vec_info exit_phi_vinfo
5560 = loop_vinfo->lookup_stmt (exit_phi);
5561 gphi *vect_phi;
5562
5563 	      /* FORNOW.  Currently we do not support the case in which an
5564 	         inner-loop reduction is not used in the outer-loop (but only
5565 	         outside the outer-loop), unless it is a double reduction.  */
5566 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5567 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5568 || double_reduc);
5569
5570 if (double_reduc)
5571 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5572 else
5573 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5574 if (!double_reduc
5575 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5576 != vect_double_reduction_def)
5577 continue;
5578
5579 /* Handle double reduction:
5580
5581 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5582 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5583 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5584 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5585
5586 At that point the regular reduction (stmt2 and stmt3) is
5587 already vectorized, as well as the exit phi node, stmt4.
5588 Here we vectorize the phi node of double reduction, stmt1, and
5589 update all relevant statements. */
5590
5591 	      /* Go through all the uses of s2 to find the double reduction phi
5592 		 node, i.e., stmt1 above.  */
5593 orig_name = PHI_RESULT (exit_phi);
5594 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5595 {
5596 stmt_vec_info use_stmt_vinfo;
5597 tree vect_phi_init, preheader_arg, vect_phi_res;
5598 basic_block bb = gimple_bb (use_stmt);
5599
5600 		  /* Check that USE_STMT is really a double reduction phi
5601 		     node.  */
5602 if (gimple_code (use_stmt) != GIMPLE_PHI
5603 || gimple_phi_num_args (use_stmt) != 2
5604 || bb->loop_father != outer_loop)
5605 continue;
5606 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5607 if (!use_stmt_vinfo
5608 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5609 != vect_double_reduction_def)
5610 continue;
5611
5612 /* Create vector phi node for double reduction:
5613 vs1 = phi <vs0, vs2>
5614 vs1 was created previously in this function by a call to
5615 vect_get_vec_def_for_operand and is stored in
5616 vec_initial_def;
5617 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5618 vs0 is created here. */
5619
5620 /* Create vector phi node. */
5621 vect_phi = create_phi_node (vec_initial_def, bb);
5622 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5623
5624 /* Create vs0 - initial def of the double reduction phi. */
5625 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5626 loop_preheader_edge (outer_loop));
5627 vect_phi_init = get_initial_def_for_reduction
5628 (stmt_info, preheader_arg, NULL);
5629
5630 /* Update phi node arguments with vs0 and vs2. */
5631 add_phi_arg (vect_phi, vect_phi_init,
5632 loop_preheader_edge (outer_loop),
5633 UNKNOWN_LOCATION);
5634 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5635 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5636 if (dump_enabled_p ())
5637 dump_printf_loc (MSG_NOTE, vect_location,
5638 "created double reduction phi node: %G",
5639 vect_phi);
5640
5641 vect_phi_res = PHI_RESULT (vect_phi);
5642
5643 /* Replace the use, i.e., set the correct vs1 in the regular
5644 reduction phi node. FORNOW, NCOPIES is always 1, so the
5645 loop is redundant. */
5646 stmt_vec_info use_info = reduction_phi_info;
5647 for (j = 0; j < ncopies; j++)
5648 {
5649 edge pr_edge = loop_preheader_edge (loop);
5650 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5651 pr_edge->dest_idx, vect_phi_res);
5652 use_info = STMT_VINFO_RELATED_STMT (use_info);
5653 }
5654 }
5655 }
5656 }
5657
5658 phis.release ();
5659 if (nested_in_vect_loop)
5660 {
5661 if (double_reduc)
5662 loop = outer_loop;
5663 else
5664 continue;
5665 }
5666
5667 phis.create (3);
5668 /* Find the loop-closed-use at the loop exit of the original scalar
5669 result. (The reduction result is expected to have two immediate uses,
5670 one at the latch block, and one at the loop exit). For double
5671 reductions we are looking for exit phis of the outer loop. */
5672 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5673 {
5674 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5675 {
5676 if (!is_gimple_debug (USE_STMT (use_p)))
5677 phis.safe_push (USE_STMT (use_p));
5678 }
5679 else
5680 {
5681 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5682 {
5683 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5684
5685 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5686 {
5687 if (!flow_bb_inside_loop_p (loop,
5688 gimple_bb (USE_STMT (phi_use_p)))
5689 && !is_gimple_debug (USE_STMT (phi_use_p)))
5690 phis.safe_push (USE_STMT (phi_use_p));
5691 }
5692 }
5693 }
5694 }
5695
5696 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5697 {
5698 /* Replace the uses: */
5699 orig_name = PHI_RESULT (exit_phi);
5700 scalar_result = scalar_results[k];
5701 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5702 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5703 SET_USE (use_p, scalar_result);
5704 }
5705
5706 phis.release ();
5707 }
5708 }
5709
5710 /* Return a vector of type VECTYPE that is equal to the vector select
5711 operation "MASK ? VEC : IDENTITY". Insert the select statements
5712 before GSI. */
5713
5714 static tree
5715 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5716 tree vec, tree identity)
5717 {
5718 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5719 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5720 mask, vec, identity);
5721 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5722 return cond;
5723 }
5724
5725 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5726 order, starting with LHS. Insert the extraction statements before GSI and
5727 associate the new scalar SSA names with variable SCALAR_DEST.
5728 Return the SSA name for the result. */
5729
5730 static tree
5731 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5732 tree_code code, tree lhs, tree vector_rhs)
5733 {
5734 tree vectype = TREE_TYPE (vector_rhs);
5735 tree scalar_type = TREE_TYPE (vectype);
5736 tree bitsize = TYPE_SIZE (scalar_type);
5737 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5738 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5739
5740 for (unsigned HOST_WIDE_INT bit_offset = 0;
5741 bit_offset < vec_size_in_bits;
5742 bit_offset += element_bitsize)
5743 {
5744 tree bitpos = bitsize_int (bit_offset);
5745 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5746 bitsize, bitpos);
5747
5748 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5749 rhs = make_ssa_name (scalar_dest, stmt);
5750 gimple_assign_set_lhs (stmt, rhs);
5751 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5752
5753 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5754 tree new_name = make_ssa_name (scalar_dest, stmt);
5755 gimple_assign_set_lhs (stmt, new_name);
5756 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5757 lhs = new_name;
5758 }
5759 return lhs;
5760 }
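/* For illustration (not part of the original code): for a four-element
   VECTOR_RHS v and CODE PLUS_EXPR the function above emits the strictly
   ordered sequence

     s0 = lhs + v[0];  s1 = s0 + v[1];  s2 = s1 + v[2];  s3 = s2 + v[3];

   and returns s3, preserving the left-to-right evaluation order that an
   in-order (fold-left) reduction requires.  */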
5761
5762 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5763 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5764 statement. CODE is the operation performed by STMT_INFO and OPS are
5765 its scalar operands. REDUC_INDEX is the index of the operand in
5766 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5767 implements in-order reduction, or IFN_LAST if we should open-code it.
5768 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5769 that should be used to control the operation in a fully-masked loop. */
5770
5771 static bool
5772 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5773 gimple_stmt_iterator *gsi,
5774 stmt_vec_info *vec_stmt, slp_tree slp_node,
5775 gimple *reduc_def_stmt,
5776 tree_code code, internal_fn reduc_fn,
5777 tree ops[3], tree vectype_in,
5778 int reduc_index, vec_loop_masks *masks)
5779 {
5780 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5781 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5782 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5783 stmt_vec_info new_stmt_info = NULL;
5784
5785 int ncopies;
5786 if (slp_node)
5787 ncopies = 1;
5788 else
5789 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5790
5791 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5792 gcc_assert (ncopies == 1);
5793 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5794 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5795 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5796 == FOLD_LEFT_REDUCTION);
5797
5798 if (slp_node)
5799 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5800 TYPE_VECTOR_SUBPARTS (vectype_in)));
5801
5802 tree op0 = ops[1 - reduc_index];
5803
5804 int group_size = 1;
5805 stmt_vec_info scalar_dest_def_info;
5806 auto_vec<tree> vec_oprnds0;
5807 if (slp_node)
5808 {
5809 vect_get_vec_defs (op0, NULL_TREE, stmt_info, &vec_oprnds0, NULL,
5810 slp_node);
5811 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5812 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5813 }
5814 else
5815 {
5816 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5817 vec_oprnds0.create (1);
5818 vec_oprnds0.quick_push (loop_vec_def0);
5819 scalar_dest_def_info = stmt_info;
5820 }
5821
5822 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5823 tree scalar_type = TREE_TYPE (scalar_dest);
5824 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5825
5826 int vec_num = vec_oprnds0.length ();
5827 gcc_assert (vec_num == 1 || slp_node);
5828 tree vec_elem_type = TREE_TYPE (vectype_out);
5829 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5830
5831 tree vector_identity = NULL_TREE;
5832 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5833 vector_identity = build_zero_cst (vectype_out);
5834
5835 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5836 int i;
5837 tree def0;
5838 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5839 {
5840 gimple *new_stmt;
5841 tree mask = NULL_TREE;
5842 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5843 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5844
5845 /* Handle MINUS by adding the negative. */
5846 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5847 {
5848 tree negated = make_ssa_name (vectype_out);
5849 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5850 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5851 def0 = negated;
5852 }
5853
5854 if (mask)
5855 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5856 vector_identity);
5857
5858 /* On the first iteration the input is simply the scalar phi
5859 result, and for subsequent iterations it is the output of
5860 the preceding operation. */
5861 if (reduc_fn != IFN_LAST)
5862 {
5863 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5864 /* For chained SLP reductions the output of the previous reduction
5865 operation serves as the input of the next. For the final statement
5866 the output cannot be a temporary - we reuse the original
5867 scalar destination of the last statement. */
5868 if (i != vec_num - 1)
5869 {
5870 gimple_set_lhs (new_stmt, scalar_dest_var);
5871 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5872 gimple_set_lhs (new_stmt, reduc_var);
5873 }
5874 }
5875 else
5876 {
5877 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5878 reduc_var, def0);
5879 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5880 /* Remove the statement, so that we can use the same code paths
5881 as for statements that we've just created. */
5882 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5883 gsi_remove (&tmp_gsi, false);
5884 }
5885
5886 if (i == vec_num - 1)
5887 {
5888 gimple_set_lhs (new_stmt, scalar_dest);
5889 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5890 new_stmt);
5891 }
5892 else
5893 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5894 new_stmt, gsi);
5895
5896 if (slp_node)
5897 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5898 }
5899
5900 if (!slp_node)
5901 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5902
5903 return true;
5904 }
5905
5906 /* Function is_nonwrapping_integer_induction.
5907
5908    Check if STMT_VINFO (which is part of loop LOOP) describes an induction
5909    that only increments and does not cause overflow.  */
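/* Illustrative example (not taken from the original comment): for an
   unsigned char induction with base 0 and step 1 in a loop that executes
   at most 200 times, the maximum value 200 still fits in the 8-bit
   precision, so the induction is accepted; with up to 300 iterations it
   would need 9 bits and be rejected, since the value could wrap.  */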
5910
5911 static bool
5912 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5913 {
5914 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5915 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5916 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5917 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5918 widest_int ni, max_loop_value, lhs_max;
5919 wi::overflow_type overflow = wi::OVF_NONE;
5920
5921 /* Make sure the loop is integer based. */
5922 if (TREE_CODE (base) != INTEGER_CST
5923 || TREE_CODE (step) != INTEGER_CST)
5924 return false;
5925
5926   /* Check that the maximum value reached by the induction will not wrap.  */
5927
5928 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5929 return true;
5930
5931 if (! max_stmt_executions (loop, &ni))
5932 return false;
5933
5934 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5935 &overflow);
5936 if (overflow)
5937 return false;
5938
5939 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5940 TYPE_SIGN (lhs_type), &overflow);
5941 if (overflow)
5942 return false;
5943
5944 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5945 <= TYPE_PRECISION (lhs_type));
5946 }
5947
5948 /* Function vectorizable_reduction.
5949
5950 Check if STMT_INFO performs a reduction operation that can be vectorized.
5951 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5952 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5953 Return true if STMT_INFO is vectorizable in this way.
5954
5955 This function also handles reduction idioms (patterns) that have been
5956 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5957 may be of this form:
5958 X = pattern_expr (arg0, arg1, ..., X)
5959 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5960 sequence that had been detected and replaced by the pattern-stmt
5961 (STMT_INFO).
5962
5963 This function also handles reduction of condition expressions, for example:
5964 for (int i = 0; i < N; i++)
5965 if (a[i] < value)
5966 last = a[i];
5967 This is handled by vectorising the loop and creating an additional vector
5968 containing the loop indexes for which "a[i] < value" was true. In the
5969 function epilogue this is reduced to a single max value and then used to
5970 index into the vector of results.
5971
5972 In some cases of reduction patterns, the type of the reduction variable X is
5973 different than the type of the other arguments of STMT_INFO.
5974 In such cases, the vectype that is used when transforming STMT_INFO into
5975 a vector stmt is different than the vectype that is used to determine the
5976 vectorization factor, because it consists of a different number of elements
5977 than the actual number of elements that are being operated upon in parallel.
5978
5979 For example, consider an accumulation of shorts into an int accumulator.
5980 On some targets it's possible to vectorize this pattern operating on 8
5981 shorts at a time (hence, the vectype for purposes of determining the
5982 vectorization factor should be V8HI); on the other hand, the vectype that
5983 is used to create the vector form is actually V4SI (the type of the result).
5984
5985 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5986 indicates what is the actual level of parallelism (V8HI in the example), so
5987 that the right vectorization factor would be derived. This vectype
5988 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5989 be used to create the vectorized stmt. The right vectype for the vectorized
5990 stmt is obtained from the type of the result X:
5991 get_vectype_for_scalar_type (TREE_TYPE (X))
5992
5993 This means that, contrary to "regular" reductions (or "regular" stmts in
5994 general), the following equation:
5995 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5996 does *NOT* necessarily hold for reduction patterns. */
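/* For illustration only (a sketch of the scalar source that the comment
   above refers to, not taken from the original file), on a target with
   128-bit vectors:

     short a[N];
     int sum = 0;
     for (int i = 0; i < N; i++)
       sum += a[i];

   Here STMT_VINFO_VECTYPE records V8HI (eight shorts determine the
   vectorization factor), while the vectorized statement itself produces
   V4SI values, obtained via get_vectype_for_scalar_type (TREE_TYPE (sum)).  */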
5997
5998 bool
5999 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6000 stmt_vec_info *vec_stmt, slp_tree slp_node,
6001 slp_instance slp_node_instance,
6002 stmt_vector_for_cost *cost_vec)
6003 {
6004 tree vec_dest;
6005 tree scalar_dest;
6006 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6007 tree vectype_in = NULL_TREE;
6008 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6009 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6010 enum tree_code code, orig_code;
6011 internal_fn reduc_fn;
6012 machine_mode vec_mode;
6013 int op_type;
6014 optab optab;
6015 tree new_temp = NULL_TREE;
6016 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6017 stmt_vec_info cond_stmt_vinfo = NULL;
6018 enum tree_code cond_reduc_op_code = ERROR_MARK;
6019 tree scalar_type;
6020 bool is_simple_use;
6021 int i;
6022 int ncopies;
6023 int epilog_copies;
6024 stmt_vec_info prev_stmt_info, prev_phi_info;
6025 bool single_defuse_cycle = false;
6026 stmt_vec_info new_stmt_info = NULL;
6027 int j;
6028 tree ops[3];
6029 enum vect_def_type dts[3];
6030 bool nested_cycle = false, found_nested_cycle_def = false;
6031 bool double_reduc = false;
6032 basic_block def_bb;
6033 struct loop * def_stmt_loop;
6034 tree def_arg;
6035 auto_vec<tree> vec_oprnds0;
6036 auto_vec<tree> vec_oprnds1;
6037 auto_vec<tree> vec_oprnds2;
6038 auto_vec<tree> vect_defs;
6039 auto_vec<stmt_vec_info> phis;
6040 int vec_num;
6041 tree def0, tem;
6042 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6043 tree cond_reduc_val = NULL_TREE;
6044
6045 /* Make sure it was already recognized as a reduction computation. */
6046 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6047 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6048 return false;
6049
6050 if (nested_in_vect_loop_p (loop, stmt_info))
6051 {
6052 loop = loop->inner;
6053 nested_cycle = true;
6054 }
6055
6056 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6057 gcc_assert (slp_node
6058 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6059
6060 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6061 {
6062 tree phi_result = gimple_phi_result (phi);
6063 /* Analysis is fully done on the reduction stmt invocation. */
6064 if (! vec_stmt)
6065 {
6066 if (slp_node)
6067 slp_node_instance->reduc_phis = slp_node;
6068
6069 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6070 return true;
6071 }
6072
6073 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6074 /* Leave the scalar phi in place. Note that checking
6075 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6076 for reductions involving a single statement. */
6077 return true;
6078
6079 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6080 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6081
6082 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6083 == EXTRACT_LAST_REDUCTION)
6084 /* Leave the scalar phi in place. */
6085 return true;
6086
6087 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6088 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6089 {
6090 tree op = gimple_op (reduc_stmt, k);
6091 if (op == phi_result)
6092 continue;
6093 if (k == 1
6094 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6095 continue;
6096 if (!vectype_in
6097 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6098 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6099 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6100 break;
6101 }
6102 gcc_assert (vectype_in);
6103
6104 if (slp_node)
6105 ncopies = 1;
6106 else
6107 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6108
6109 stmt_vec_info use_stmt_info;
6110 if (ncopies > 1
6111 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6112 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6113 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6114 single_defuse_cycle = true;
6115
6116 /* Create the destination vector */
6117 scalar_dest = gimple_assign_lhs (reduc_stmt);
6118 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6119
6120 if (slp_node)
6121 /* The size vect_schedule_slp_instance computes is off for us. */
6122 vec_num = vect_get_num_vectors
6123 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6124 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6125 vectype_in);
6126 else
6127 vec_num = 1;
6128
6129 /* Generate the reduction PHIs upfront. */
6130 prev_phi_info = NULL;
6131 for (j = 0; j < ncopies; j++)
6132 {
6133 if (j == 0 || !single_defuse_cycle)
6134 {
6135 for (i = 0; i < vec_num; i++)
6136 {
6137 /* Create the reduction-phi that defines the reduction
6138 operand. */
6139 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6140 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6141
6142 if (slp_node)
6143 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6144 else
6145 {
6146 if (j == 0)
6147 STMT_VINFO_VEC_STMT (stmt_info)
6148 = *vec_stmt = new_phi_info;
6149 else
6150 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6151 prev_phi_info = new_phi_info;
6152 }
6153 }
6154 }
6155 }
6156
6157 return true;
6158 }
6159
6160 /* 1. Is vectorizable reduction? */
6161 /* Not supportable if the reduction variable is used in the loop, unless
6162 it's a reduction chain. */
6163 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6164 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6165 return false;
6166
6167   /* Reductions that are not used even in an enclosing outer-loop are
6168      expected to be "live" (used out of the loop).  */
6169 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6170 && !STMT_VINFO_LIVE_P (stmt_info))
6171 return false;
6172
6173 /* 2. Has this been recognized as a reduction pattern?
6174
6175 Check if STMT represents a pattern that has been recognized
6176 in earlier analysis stages. For stmts that represent a pattern,
6177 the STMT_VINFO_RELATED_STMT field records the last stmt in
6178 the original sequence that constitutes the pattern. */
6179
6180 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6181 if (orig_stmt_info)
6182 {
6183 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6184 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6185 }
6186
6187 /* 3. Check the operands of the operation. The first operands are defined
6188 inside the loop body. The last operand is the reduction variable,
6189 which is defined by the loop-header-phi. */
6190
6191 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6192
6193 /* Flatten RHS. */
6194 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6195 {
6196 case GIMPLE_BINARY_RHS:
6197 code = gimple_assign_rhs_code (stmt);
6198 op_type = TREE_CODE_LENGTH (code);
6199 gcc_assert (op_type == binary_op);
6200 ops[0] = gimple_assign_rhs1 (stmt);
6201 ops[1] = gimple_assign_rhs2 (stmt);
6202 break;
6203
6204 case GIMPLE_TERNARY_RHS:
6205 code = gimple_assign_rhs_code (stmt);
6206 op_type = TREE_CODE_LENGTH (code);
6207 gcc_assert (op_type == ternary_op);
6208 ops[0] = gimple_assign_rhs1 (stmt);
6209 ops[1] = gimple_assign_rhs2 (stmt);
6210 ops[2] = gimple_assign_rhs3 (stmt);
6211 break;
6212
6213 case GIMPLE_UNARY_RHS:
6214 return false;
6215
6216 default:
6217 gcc_unreachable ();
6218 }
6219
6220 if (code == COND_EXPR && slp_node)
6221 return false;
6222
6223 scalar_dest = gimple_assign_lhs (stmt);
6224 scalar_type = TREE_TYPE (scalar_dest);
6225 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6226 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6227 return false;
6228
6229 /* Do not try to vectorize bit-precision reductions. */
6230 if (!type_has_mode_precision_p (scalar_type))
6231 return false;
6232
6233 /* All uses but the last are expected to be defined in the loop.
6234 The last use is the reduction variable. In case of nested cycle this
6235 assumption is not true: we use reduc_index to record the index of the
6236 reduction variable. */
6237 stmt_vec_info reduc_def_info = NULL;
6238 int reduc_index = -1;
6239 for (i = 0; i < op_type; i++)
6240 {
6241 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6242 if (i == 0 && code == COND_EXPR)
6243 continue;
6244
6245 stmt_vec_info def_stmt_info;
6246 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6247 &def_stmt_info);
6248 dt = dts[i];
6249 gcc_assert (is_simple_use);
6250 if (dt == vect_reduction_def)
6251 {
6252 reduc_def_info = def_stmt_info;
6253 reduc_index = i;
6254 continue;
6255 }
6256 else if (tem)
6257 {
6258 /* To properly compute ncopies we are interested in the widest
6259 input type in case we're looking at a widening accumulation. */
6260 if (!vectype_in
6261 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6262 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6263 vectype_in = tem;
6264 }
6265
6266 if (dt != vect_internal_def
6267 && dt != vect_external_def
6268 && dt != vect_constant_def
6269 && dt != vect_induction_def
6270 && !(dt == vect_nested_cycle && nested_cycle))
6271 return false;
6272
6273 if (dt == vect_nested_cycle)
6274 {
6275 found_nested_cycle_def = true;
6276 reduc_def_info = def_stmt_info;
6277 reduc_index = i;
6278 }
6279
6280 if (i == 1 && code == COND_EXPR)
6281 {
6282 /* Record how value of COND_EXPR is defined. */
6283 if (dt == vect_constant_def)
6284 {
6285 cond_reduc_dt = dt;
6286 cond_reduc_val = ops[i];
6287 }
6288 if (dt == vect_induction_def
6289 && def_stmt_info
6290 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6291 {
6292 cond_reduc_dt = dt;
6293 cond_stmt_vinfo = def_stmt_info;
6294 }
6295 }
6296 }
6297
6298 if (!vectype_in)
6299 vectype_in = vectype_out;
6300
6301   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6302      directly used in the stmt.  */
6303 if (reduc_index == -1)
6304 {
6305 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6306 {
6307 if (dump_enabled_p ())
6308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6309 "in-order reduction chain without SLP.\n");
6310 return false;
6311 }
6312
6313 if (orig_stmt_info)
6314 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6315 else
6316 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6317 }
6318
6319 if (! reduc_def_info)
6320 return false;
6321
6322 gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt);
6323 if (!reduc_def_phi)
6324 return false;
6325
6326 if (!(reduc_index == -1
6327 || dts[reduc_index] == vect_reduction_def
6328 || dts[reduc_index] == vect_nested_cycle
6329 || ((dts[reduc_index] == vect_internal_def
6330 || dts[reduc_index] == vect_external_def
6331 || dts[reduc_index] == vect_constant_def
6332 || dts[reduc_index] == vect_induction_def)
6333 && nested_cycle && found_nested_cycle_def)))
6334 {
6335 /* For pattern recognized stmts, orig_stmt might be a reduction,
6336 but some helper statements for the pattern might not, or
6337 might be COND_EXPRs with reduction uses in the condition. */
6338 gcc_assert (orig_stmt_info);
6339 return false;
6340 }
6341
6342 /* PHIs should not participate in patterns. */
6343 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6344 enum vect_reduction_type v_reduc_type
6345 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6346 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6347
6348 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6349 /* If we have a condition reduction, see if we can simplify it further. */
6350 if (v_reduc_type == COND_REDUCTION)
6351 {
6352 /* TODO: We can't yet handle reduction chains, since we need to treat
6353 each COND_EXPR in the chain specially, not just the last one.
6354 E.g. for:
6355
6356 x_1 = PHI <x_3, ...>
6357 x_2 = a_2 ? ... : x_1;
6358 x_3 = a_3 ? ... : x_2;
6359
6360 we're interested in the last element in x_3 for which a_2 || a_3
6361 is true, whereas the current reduction chain handling would
6362 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6363 as a reduction operation. */
6364 if (reduc_index == -1)
6365 {
6366 if (dump_enabled_p ())
6367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6368 "conditional reduction chains not supported\n");
6369 return false;
6370 }
6371
6372 /* vect_is_simple_reduction ensured that operand 2 is the
6373 loop-carried operand. */
6374 gcc_assert (reduc_index == 2);
6375
6376       /* Loop peeling modifies the initial value of the reduction PHI, which
6377 	 makes the reduction stmt to be transformed different from the
6378 	 original stmt analyzed.  We need to record the reduction code for a
6379 	 CONST_COND_REDUCTION type reduction at the analysis stage, so that
6380 	 it can be used directly at the transform stage.  */
6381 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6382 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6383 {
6384 /* Also set the reduction type to CONST_COND_REDUCTION. */
6385 gcc_assert (cond_reduc_dt == vect_constant_def);
6386 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6387 }
6388 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6389 vectype_in, OPTIMIZE_FOR_SPEED))
6390 {
6391 if (dump_enabled_p ())
6392 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6393 "optimizing condition reduction with"
6394 " FOLD_EXTRACT_LAST.\n");
6395 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6396 }
6397 else if (cond_reduc_dt == vect_induction_def)
6398 {
6399 tree base
6400 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6401 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6402
6403 gcc_assert (TREE_CODE (base) == INTEGER_CST
6404 && TREE_CODE (step) == INTEGER_CST);
6405 cond_reduc_val = NULL_TREE;
6406 	  /* Find a suitable value: below BASE for MAX_EXPR, above BASE for
6407 	     MIN_EXPR; for now, punt if BASE is the minimum value of the type
6408 	     for MAX_EXPR or the maximum value of the type for MIN_EXPR.  */
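	  /* Illustrative example (not part of the original code): for a
	     decreasing induction (negative STEP) with BASE == 10, MIN_EXPR
	     is chosen and COND_REDUC_VAL becomes 11 (BASE + 1); for an
	     increasing induction with BASE == 10, MAX_EXPR is chosen and 0
	     is already a suitable value below BASE, so COND_REDUC_VAL
	     becomes 0.  */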
6409 if (tree_int_cst_sgn (step) == -1)
6410 {
6411 cond_reduc_op_code = MIN_EXPR;
6412 if (tree_int_cst_sgn (base) == -1)
6413 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6414 else if (tree_int_cst_lt (base,
6415 TYPE_MAX_VALUE (TREE_TYPE (base))))
6416 cond_reduc_val
6417 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6418 }
6419 else
6420 {
6421 cond_reduc_op_code = MAX_EXPR;
6422 if (tree_int_cst_sgn (base) == 1)
6423 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6424 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6425 base))
6426 cond_reduc_val
6427 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6428 }
6429 if (cond_reduc_val)
6430 {
6431 if (dump_enabled_p ())
6432 dump_printf_loc (MSG_NOTE, vect_location,
6433 "condition expression based on "
6434 "integer induction.\n");
6435 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6436 = INTEGER_INDUC_COND_REDUCTION;
6437 }
6438 }
6439 else if (cond_reduc_dt == vect_constant_def)
6440 {
6441 enum vect_def_type cond_initial_dt;
6442 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6443 tree cond_initial_val
6444 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6445
6446 gcc_assert (cond_reduc_val != NULL_TREE);
6447 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6448 if (cond_initial_dt == vect_constant_def
6449 && types_compatible_p (TREE_TYPE (cond_initial_val),
6450 TREE_TYPE (cond_reduc_val)))
6451 {
6452 tree e = fold_binary (LE_EXPR, boolean_type_node,
6453 cond_initial_val, cond_reduc_val);
6454 if (e && (integer_onep (e) || integer_zerop (e)))
6455 {
6456 if (dump_enabled_p ())
6457 dump_printf_loc (MSG_NOTE, vect_location,
6458 "condition expression based on "
6459 "compile time constant.\n");
6460 /* Record reduction code at analysis stage. */
6461 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6462 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6463 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6464 = CONST_COND_REDUCTION;
6465 }
6466 }
6467 }
6468 }
6469
6470 if (orig_stmt_info)
6471 gcc_assert (tmp == orig_stmt_info
6472 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6473 else
6474     /* We changed STMT to be the first stmt in the reduction chain, hence we
6475        check that in this case the first element in the chain is STMT.  */
6476 gcc_assert (tmp == stmt_info
6477 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6478
6479 if (STMT_VINFO_LIVE_P (reduc_def_info))
6480 return false;
6481
6482 if (slp_node)
6483 ncopies = 1;
6484 else
6485 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6486
6487 gcc_assert (ncopies >= 1);
6488
6489 vec_mode = TYPE_MODE (vectype_in);
6490 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6491
6492 if (code == COND_EXPR)
6493 {
6494 /* Only call during the analysis stage, otherwise we'll lose
6495 STMT_VINFO_TYPE. */
6496 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6497 ops[reduc_index], 0, NULL,
6498 cost_vec))
6499 {
6500 if (dump_enabled_p ())
6501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6502 "unsupported condition in reduction\n");
6503 return false;
6504 }
6505 }
6506 else
6507 {
6508 /* 4. Supportable by target? */
6509
6510 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6511 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6512 {
6513 /* Shifts and rotates are only supported by vectorizable_shifts,
6514 not vectorizable_reduction. */
6515 if (dump_enabled_p ())
6516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6517 "unsupported shift or rotation.\n");
6518 return false;
6519 }
6520
6521 /* 4.1. check support for the operation in the loop */
6522 optab = optab_for_tree_code (code, vectype_in, optab_default);
6523 if (!optab)
6524 {
6525 if (dump_enabled_p ())
6526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6527 "no optab.\n");
6528
6529 return false;
6530 }
6531
6532 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6533 {
6534 if (dump_enabled_p ())
6535 dump_printf (MSG_NOTE, "op not supported by target.\n");
6536
6537 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6538 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6539 return false;
6540
6541 if (dump_enabled_p ())
6542 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6543 }
6544
6545 /* Worthwhile without SIMD support? */
6546 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6547 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6548 {
6549 if (dump_enabled_p ())
6550 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6551 "not worthwhile without SIMD support.\n");
6552
6553 return false;
6554 }
6555 }
6556
6557 /* 4.2. Check support for the epilog operation.
6558
6559 If STMT represents a reduction pattern, then the type of the
6560 reduction variable may be different than the type of the rest
6561 of the arguments. For example, consider the case of accumulation
6562      of shorts into an int accumulator.  The original code:
6563 S1: int_a = (int) short_a;
6564 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6565
6566 was replaced with:
6567 STMT: int_acc = widen_sum <short_a, int_acc>
6568
6569 This means that:
6570 1. The tree-code that is used to create the vector operation in the
6571 epilog code (that reduces the partial results) is not the
6572 tree-code of STMT, but is rather the tree-code of the original
6573 stmt from the pattern that STMT is replacing. I.e, in the example
6574 above we want to use 'widen_sum' in the loop, but 'plus' in the
6575 epilog.
6576 2. The type (mode) we use to check available target support
6577 for the vector operation to be created in the *epilog*, is
6578 determined by the type of the reduction variable (in the example
6579 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6580 However the type (mode) we use to check available target support
6581 for the vector operation to be created *inside the loop*, is
6582 determined by the type of the other arguments to STMT (in the
6583 example we'd check this: optab_handler (widen_sum_optab,
6584 vect_short_mode)).
6585
6586 This is contrary to "regular" reductions, in which the types of all
6587 the arguments are the same as the type of the reduction variable.
6588 For "regular" reductions we can therefore use the same vector type
6589 (and also the same tree-code) when generating the epilog code and
6590 when generating the code inside the loop. */
6591
6592 vect_reduction_type reduction_type
6593 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6594 if (orig_stmt_info
6595 && (reduction_type == TREE_CODE_REDUCTION
6596 || reduction_type == FOLD_LEFT_REDUCTION))
6597 {
6598 /* This is a reduction pattern: get the vectype from the type of the
6599 reduction variable, and get the tree-code from orig_stmt. */
6600 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6601 gcc_assert (vectype_out);
6602 vec_mode = TYPE_MODE (vectype_out);
6603 }
6604 else
6605 {
6606       /* Regular reduction: the same vectype and tree-code as used for the
6607          vector code inside the loop can also be used for the epilog code.  */
6608 orig_code = code;
6609
6610 if (code == MINUS_EXPR)
6611 orig_code = PLUS_EXPR;
6612
6613 /* For simple condition reductions, replace with the actual expression
6614 we want to base our reduction around. */
6615 if (reduction_type == CONST_COND_REDUCTION)
6616 {
6617 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6618 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6619 }
6620 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6621 orig_code = cond_reduc_op_code;
6622 }
6623
6624 if (nested_cycle)
6625 {
6626 def_bb = gimple_bb (reduc_def_phi);
6627 def_stmt_loop = def_bb->loop_father;
6628 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6629 loop_preheader_edge (def_stmt_loop));
6630 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6631 if (def_arg_stmt_info
6632 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6633 == vect_double_reduction_def))
6634 double_reduc = true;
6635 }
6636
6637 reduc_fn = IFN_LAST;
6638
6639 if (reduction_type == TREE_CODE_REDUCTION
6640 || reduction_type == FOLD_LEFT_REDUCTION
6641 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6642 || reduction_type == CONST_COND_REDUCTION)
6643 {
6644 if (reduction_type == FOLD_LEFT_REDUCTION
6645 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6646 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6647 {
6648 if (reduc_fn != IFN_LAST
6649 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6650 OPTIMIZE_FOR_SPEED))
6651 {
6652 if (dump_enabled_p ())
6653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6654 "reduc op not supported by target.\n");
6655
6656 reduc_fn = IFN_LAST;
6657 }
6658 }
6659 else
6660 {
6661 if (!nested_cycle || double_reduc)
6662 {
6663 if (dump_enabled_p ())
6664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6665 "no reduc code for scalar code.\n");
6666
6667 return false;
6668 }
6669 }
6670 }
6671 else if (reduction_type == COND_REDUCTION)
6672 {
6673 int scalar_precision
6674 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6675 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6676 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6677 nunits_out);
6678
6679 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6680 OPTIMIZE_FOR_SPEED))
6681 reduc_fn = IFN_REDUC_MAX;
6682 }
6683
6684 if (reduction_type != EXTRACT_LAST_REDUCTION
6685 && (!nested_cycle || double_reduc)
6686 && reduc_fn == IFN_LAST
6687 && !nunits_out.is_constant ())
6688 {
6689 if (dump_enabled_p ())
6690 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6691 "missing target support for reduction on"
6692 " variable-length vectors.\n");
6693 return false;
6694 }
6695
6696 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6697 && ncopies > 1)
6698 {
6699 if (dump_enabled_p ())
6700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6701 "multiple types in double reduction or condition "
6702 "reduction.\n");
6703 return false;
6704 }
6705
6706 /* For SLP reductions, see if there is a neutral value we can use. */
6707 tree neutral_op = NULL_TREE;
6708 if (slp_node)
6709 neutral_op = neutral_op_for_slp_reduction
6710 (slp_node_instance->reduc_phis, code,
6711 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6712
6713 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6714 {
6715 /* We can't support in-order reductions of code such as this:
6716
6717 for (int i = 0; i < n1; ++i)
6718 for (int j = 0; j < n2; ++j)
6719 l += a[j];
6720
6721 since GCC effectively transforms the loop when vectorizing:
6722
6723 for (int i = 0; i < n1 / VF; ++i)
6724 for (int j = 0; j < n2; ++j)
6725 for (int k = 0; k < VF; ++k)
6726 l += a[j];
6727
6728 which is a reassociation of the original operation. */
6729 if (dump_enabled_p ())
6730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6731 "in-order double reduction not supported.\n");
6732
6733 return false;
6734 }
6735
6736 if (reduction_type == FOLD_LEFT_REDUCTION
6737 && slp_node
6738 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6739 {
6740 /* We cannot use in-order reductions in this case because there is
6741 an implicit reassociation of the operations involved. */
6742 if (dump_enabled_p ())
6743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6744 "in-order unchained SLP reductions not supported.\n");
6745 return false;
6746 }
6747
6748 /* For double reductions, and for SLP reductions with a neutral value,
6749 we construct a variable-length initial vector by loading a vector
6750 full of the neutral value and then shift-and-inserting the start
6751 values into the low-numbered elements. */
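  /* A small sketch of that construction (values illustrative): for a plus
     reduction with scalar start value S and neutral value 0, we splat the
     neutral value and then shift-and-insert the start value, giving

       { 0, 0, ..., 0 }  -->  { S, 0, ..., 0 }

     so summing every lane of the final vector still yields the correct
     scalar result whatever the runtime vector length turns out to be.  */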
6752 if ((double_reduc || neutral_op)
6753 && !nunits_out.is_constant ()
6754 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6755 vectype_out, OPTIMIZE_FOR_SPEED))
6756 {
6757 if (dump_enabled_p ())
6758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6759 "reduction on variable-length vectors requires"
6760 " target support for a vector-shift-and-insert"
6761 " operation.\n");
6762 return false;
6763 }
6764
6765 /* Check extra constraints for variable-length unchained SLP reductions. */
6766 if (STMT_SLP_TYPE (stmt_info)
6767 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6768 && !nunits_out.is_constant ())
6769 {
6770 /* We checked above that we could build the initial vector when
6771 there's a neutral element value. Check here for the case in
6772 which each SLP statement has its own initial value and in which
6773 that value needs to be repeated for every instance of the
6774 statement within the initial vector. */
6775 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6776 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6777 if (!neutral_op
6778 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6779 {
6780 if (dump_enabled_p ())
6781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6782 "unsupported form of SLP reduction for"
6783 " variable-length vectors: cannot build"
6784 " initial vector.\n");
6785 return false;
6786 }
6787 /* The epilogue code relies on the number of elements being a multiple
6788 of the group size. The duplicate-and-interleave approach to setting
6789 up the initial vector does too. */
6790 if (!multiple_p (nunits_out, group_size))
6791 {
6792 if (dump_enabled_p ())
6793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6794 "unsupported form of SLP reduction for"
6795 " variable-length vectors: the vector size"
6796 " is not a multiple of the number of results.\n");
6797 return false;
6798 }
6799 }
6800
6801 /* In case of a widening multiplication by a constant, we update the type
6802 of the constant to be the type of the other operand. We check that the
6803 constant fits the type in the pattern recognition pass. */
6804 if (code == DOT_PROD_EXPR
6805 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6806 {
6807 if (TREE_CODE (ops[0]) == INTEGER_CST)
6808 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6809 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6810 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6811 else
6812 {
6813 if (dump_enabled_p ())
6814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6815 "invalid types in dot-prod\n");
6816
6817 return false;
6818 }
6819 }
6820
6821 if (reduction_type == COND_REDUCTION)
6822 {
6823 widest_int ni;
6824
6825 if (! max_loop_iterations (loop, &ni))
6826 {
6827 if (dump_enabled_p ())
6828 dump_printf_loc (MSG_NOTE, vect_location,
6829 "loop count not known, cannot create cond "
6830 "reduction.\n");
6831 return false;
6832 }
6833 /* Convert backedges to iterations. */
6834 ni += 1;
6835
6836 /* The additional index will be the same type as the condition. Check
6837 that the loop count fits into this type less one (because we'll use up
6838 the zero slot for when there are no matches). */
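     /* Worked example (numbers illustrative): if the index type ends up as
	an 8-bit unsigned type, its maximum value is 255; with index 0
	reserved for "no match", the check below accepts loops of at most
	254 iterations and rejects, say, a loop whose maximum iteration
	count is 300.  */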
6839 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6840 if (wi::geu_p (ni, wi::to_widest (max_index)))
6841 {
6842 if (dump_enabled_p ())
6843 dump_printf_loc (MSG_NOTE, vect_location,
6844 "loop size is greater than data size.\n");
6845 return false;
6846 }
6847 }
6848
6849 /* In case the vectorization factor (VF) is bigger than the number
6850 of elements that we can fit in a vectype (nunits), we have to generate
6851 more than one vector stmt, i.e. we need to "unroll" the
6852 vector stmt by a factor VF/nunits. For more details see documentation
6853 in vectorizable_operation. */
6854
6855 /* If the reduction is used in an outer loop we need to generate
6856 VF intermediate results, like so (e.g. for ncopies=2):
6857 r0 = phi (init, r0)
6858 r1 = phi (init, r1)
6859 r0 = x0 + r0;
6860 r1 = x1 + r1;
6861 (i.e. we generate VF results in 2 registers).
6862 In this case we have a separate def-use cycle for each copy, and therefore
6863 for each copy we get the vector def for the reduction variable from the
6864 respective phi node created for this copy.
6865
6866 Otherwise (the reduction is unused in the loop nest), we can combine
6867 together intermediate results, like so (e.g. for ncopies=2):
6868 r = phi (init, r)
6869 r = x0 + r;
6870 r = x1 + r;
6871 (i.e. we generate VF/2 results in a single register).
6872 In this case for each copy we get the vector def for the reduction variable
6873 from the vectorized reduction operation generated in the previous iteration.
6874
6875 This only works when we see both the reduction PHI and its only consumer
6876 in vectorizable_reduction and there are no intermediate stmts
6877 participating. */
6878 stmt_vec_info use_stmt_info;
6879 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6880 if (ncopies > 1
6881 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6882 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6883 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6884 {
6885 single_defuse_cycle = true;
6886 epilog_copies = 1;
6887 }
6888 else
6889 epilog_copies = ncopies;
6890
6891 /* If the reduction stmt is one of the patterns that have a lane-reducing
6892 operation embedded, we cannot handle the case of !single_defuse_cycle. */
6893 if ((ncopies > 1
6894 && ! single_defuse_cycle)
6895 && (code == DOT_PROD_EXPR
6896 || code == WIDEN_SUM_EXPR
6897 || code == SAD_EXPR))
6898 {
6899 if (dump_enabled_p ())
6900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6901 "multi def-use cycle not possible for lane-reducing "
6902 "reduction operation\n");
6903 return false;
6904 }
6905
6906 if (slp_node)
6907 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6908 else
6909 vec_num = 1;
6910
6911 internal_fn cond_fn = get_conditional_internal_fn (code);
6912 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6913
6914 if (!vec_stmt) /* transformation not required. */
6915 {
6916 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6917 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6918 {
6919 if (reduction_type != FOLD_LEFT_REDUCTION
6920 && (cond_fn == IFN_LAST
6921 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6922 OPTIMIZE_FOR_SPEED)))
6923 {
6924 if (dump_enabled_p ())
6925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6926 "can't use a fully-masked loop because no"
6927 " conditional operation is available.\n");
6928 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6929 }
6930 else if (reduc_index == -1)
6931 {
6932 if (dump_enabled_p ())
6933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6934 "can't use a fully-masked loop for chained"
6935 " reductions.\n");
6936 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6937 }
6938 else
6939 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6940 vectype_in);
6941 }
6942 if (dump_enabled_p ()
6943 && reduction_type == FOLD_LEFT_REDUCTION)
6944 dump_printf_loc (MSG_NOTE, vect_location,
6945 "using an in-order (fold-left) reduction.\n");
6946 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6947 return true;
6948 }
6949
6950 /* Transform. */
6951
6952 if (dump_enabled_p ())
6953 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6954
6955 /* FORNOW: Multiple types are not supported for condition. */
6956 if (code == COND_EXPR)
6957 gcc_assert (ncopies == 1);
6958
6959 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6960
6961 if (reduction_type == FOLD_LEFT_REDUCTION)
6962 return vectorize_fold_left_reduction
6963 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6964 reduc_fn, ops, vectype_in, reduc_index, masks);
6965
6966 if (reduction_type == EXTRACT_LAST_REDUCTION)
6967 {
6968 gcc_assert (!slp_node);
6969 return vectorizable_condition (stmt_info, gsi, vec_stmt,
6970 NULL, reduc_index, NULL, NULL);
6971 }
6972
6973 /* Create the destination vector. */
6974 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6975
6976 prev_stmt_info = NULL;
6977 prev_phi_info = NULL;
6978 if (!slp_node)
6979 {
6980 vec_oprnds0.create (1);
6981 vec_oprnds1.create (1);
6982 if (op_type == ternary_op)
6983 vec_oprnds2.create (1);
6984 }
6985
6986 phis.create (vec_num);
6987 vect_defs.create (vec_num);
6988 if (!slp_node)
6989 vect_defs.quick_push (NULL_TREE);
6990
6991 if (slp_node)
6992 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6993 else
6994 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
6995
6996 for (j = 0; j < ncopies; j++)
6997 {
6998 if (code == COND_EXPR)
6999 {
7000 gcc_assert (!slp_node);
7001 vectorizable_condition (stmt_info, gsi, vec_stmt,
7002 PHI_RESULT (phis[0]->stmt),
7003 reduc_index, NULL, NULL);
7004 /* Multiple types are not supported for condition. */
7005 break;
7006 }
7007
7008 /* Handle uses. */
7009 if (j == 0)
7010 {
7011 if (slp_node)
7012 {
7013 /* Get vec defs for all the operands except the reduction index,
7014 ensuring the ordering of the ops in the vector is kept. */
7015 auto_vec<tree, 3> slp_ops;
7016 auto_vec<vec<tree>, 3> vec_defs;
7017
7018 slp_ops.quick_push (ops[0]);
7019 slp_ops.quick_push (ops[1]);
7020 if (op_type == ternary_op)
7021 slp_ops.quick_push (ops[2]);
7022
7023 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7024
7025 vec_oprnds0.safe_splice (vec_defs[0]);
7026 vec_defs[0].release ();
7027 vec_oprnds1.safe_splice (vec_defs[1]);
7028 vec_defs[1].release ();
7029 if (op_type == ternary_op)
7030 {
7031 vec_oprnds2.safe_splice (vec_defs[2]);
7032 vec_defs[2].release ();
7033 }
7034 }
7035 else
7036 {
7037 vec_oprnds0.quick_push
7038 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7039 vec_oprnds1.quick_push
7040 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7041 if (op_type == ternary_op)
7042 vec_oprnds2.quick_push
7043 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7044 }
7045 }
7046 else
7047 {
7048 if (!slp_node)
7049 {
7050 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7051
7052 if (single_defuse_cycle && reduc_index == 0)
7053 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7054 else
7055 vec_oprnds0[0]
7056 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7057 vec_oprnds0[0]);
7058 if (single_defuse_cycle && reduc_index == 1)
7059 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7060 else
7061 vec_oprnds1[0]
7062 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7063 vec_oprnds1[0]);
7064 if (op_type == ternary_op)
7065 {
7066 if (single_defuse_cycle && reduc_index == 2)
7067 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7068 else
7069 vec_oprnds2[0]
7070 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7071 vec_oprnds2[0]);
7072 }
7073 }
7074 }
7075
7076 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7077 {
7078 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7079 if (masked_loop_p)
7080 {
7081 /* Make sure that the reduction accumulator is vop[0]. */
7082 if (reduc_index == 1)
7083 {
7084 gcc_assert (commutative_tree_code (code));
7085 std::swap (vop[0], vop[1]);
7086 }
7087 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7088 vectype_in, i * ncopies + j);
7089 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7090 vop[0], vop[1],
7091 vop[0]);
7092 new_temp = make_ssa_name (vec_dest, call);
7093 gimple_call_set_lhs (call, new_temp);
7094 gimple_call_set_nothrow (call, true);
7095 new_stmt_info
7096 = vect_finish_stmt_generation (stmt_info, call, gsi);
7097 }
7098 else
7099 {
7100 if (op_type == ternary_op)
7101 vop[2] = vec_oprnds2[i];
7102
7103 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7104 vop[0], vop[1], vop[2]);
7105 new_temp = make_ssa_name (vec_dest, new_stmt);
7106 gimple_assign_set_lhs (new_stmt, new_temp);
7107 new_stmt_info
7108 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7109 }
7110
7111 if (slp_node)
7112 {
7113 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7114 vect_defs.quick_push (new_temp);
7115 }
7116 else
7117 vect_defs[0] = new_temp;
7118 }
7119
7120 if (slp_node)
7121 continue;
7122
7123 if (j == 0)
7124 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7125 else
7126 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7127
7128 prev_stmt_info = new_stmt_info;
7129 }
7130
7131 /* Finalize the reduction-phi (set its arguments) and create the
7132 epilog reduction code. */
7133 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7134 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7135
7136 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7137 epilog_copies, reduc_fn, phis,
7138 double_reduc, slp_node, slp_node_instance,
7139 cond_reduc_val, cond_reduc_op_code,
7140 neutral_op);
7141
7142 return true;
7143 }
7144
7145 /* Function vect_min_worthwhile_factor.
7146
7147 For a loop where we could vectorize the operation indicated by CODE,
7148 return the minimum vectorization factor that makes it worthwhile
7149 to use generic vectors. */
7150 static unsigned int
7151 vect_min_worthwhile_factor (enum tree_code code)
7152 {
7153 switch (code)
7154 {
7155 case PLUS_EXPR:
7156 case MINUS_EXPR:
7157 case NEGATE_EXPR:
7158 return 4;
7159
7160 case BIT_AND_EXPR:
7161 case BIT_IOR_EXPR:
7162 case BIT_XOR_EXPR:
7163 case BIT_NOT_EXPR:
7164 return 2;
7165
7166 default:
7167 return INT_MAX;
7168 }
7169 }
7170
7171 /* Return true if VINFO indicates we are doing loop vectorization and if
7172 it is worth decomposing CODE operations into scalar operations for
7173 that loop's vectorization factor. */
7174
7175 bool
7176 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7177 {
7178 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7179 unsigned HOST_WIDE_INT value;
7180 return (loop_vinfo
7181 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7182 && value >= vect_min_worthwhile_factor (code));
7183 }
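/* Illustrative example for the two helpers above (thresholds as defined in
   vect_min_worthwhile_factor): with a compile-time vectorization factor of
   2, emulating BIT_AND_EXPR without SIMD is considered worthwhile
   (threshold 2) but PLUS_EXPR is not (threshold 4); with a factor of 4 or
   more, both qualify.  A caller would test e.g.

     if (vect_worthwhile_without_simd_p (vinfo, PLUS_EXPR))
       ...

   before deciding to decompose the operation.  */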
7184
7185 /* Function vectorizable_induction
7186
7187 Check if STMT_INFO performs an induction computation that can be vectorized.
7188 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7189 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7190 Return true if STMT_INFO is vectorizable in this way. */
7191
7192 bool
7193 vectorizable_induction (stmt_vec_info stmt_info,
7194 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7195 stmt_vec_info *vec_stmt, slp_tree slp_node,
7196 stmt_vector_for_cost *cost_vec)
7197 {
7198 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7199 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7200 unsigned ncopies;
7201 bool nested_in_vect_loop = false;
7202 struct loop *iv_loop;
7203 tree vec_def;
7204 edge pe = loop_preheader_edge (loop);
7205 basic_block new_bb;
7206 tree new_vec, vec_init, vec_step, t;
7207 tree new_name;
7208 gimple *new_stmt;
7209 gphi *induction_phi;
7210 tree induc_def, vec_dest;
7211 tree init_expr, step_expr;
7212 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7213 unsigned i;
7214 tree expr;
7215 gimple_seq stmts;
7216 imm_use_iterator imm_iter;
7217 use_operand_p use_p;
7218 gimple *exit_phi;
7219 edge latch_e;
7220 tree loop_arg;
7221 gimple_stmt_iterator si;
7222
7223 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7224 if (!phi)
7225 return false;
7226
7227 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7228 return false;
7229
7230 /* Make sure it was recognized as induction computation. */
7231 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7232 return false;
7233
7234 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7235 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7236
7237 if (slp_node)
7238 ncopies = 1;
7239 else
7240 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7241 gcc_assert (ncopies >= 1);
7242
7243 /* FORNOW. These restrictions should be relaxed. */
7244 if (nested_in_vect_loop_p (loop, stmt_info))
7245 {
7246 imm_use_iterator imm_iter;
7247 use_operand_p use_p;
7248 gimple *exit_phi;
7249 edge latch_e;
7250 tree loop_arg;
7251
7252 if (ncopies > 1)
7253 {
7254 if (dump_enabled_p ())
7255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7256 "multiple types in nested loop.\n");
7257 return false;
7258 }
7259
7260 /* FORNOW: outer loop induction with SLP not supported. */
7261 if (STMT_SLP_TYPE (stmt_info))
7262 return false;
7263
7264 exit_phi = NULL;
7265 latch_e = loop_latch_edge (loop->inner);
7266 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7267 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7268 {
7269 gimple *use_stmt = USE_STMT (use_p);
7270 if (is_gimple_debug (use_stmt))
7271 continue;
7272
7273 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7274 {
7275 exit_phi = use_stmt;
7276 break;
7277 }
7278 }
7279 if (exit_phi)
7280 {
7281 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7282 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7283 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7284 {
7285 if (dump_enabled_p ())
7286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7287 "inner-loop induction only used outside "
7288 "of the outer vectorized loop.\n");
7289 return false;
7290 }
7291 }
7292
7293 nested_in_vect_loop = true;
7294 iv_loop = loop->inner;
7295 }
7296 else
7297 iv_loop = loop;
7298 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7299
7300 if (slp_node && !nunits.is_constant ())
7301 {
7302 /* The current SLP code creates the initial value element-by-element. */
7303 if (dump_enabled_p ())
7304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7305 "SLP induction not supported for variable-length"
7306 " vectors.\n");
7307 return false;
7308 }
7309
7310 if (!vec_stmt) /* transformation not required. */
7311 {
7312 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7313 DUMP_VECT_SCOPE ("vectorizable_induction");
7314 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7315 return true;
7316 }
7317
7318 /* Transform. */
7319
7320 /* Compute a vector variable, initialized with the first VF values of
7321 the induction variable. E.g., for an iv with IV_PHI='X' and
7322 evolution S, for a vector of 4 units, we want to compute:
7323 [X, X + S, X + 2*S, X + 3*S]. */
7324
7325 if (dump_enabled_p ())
7326 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7327
7328 latch_e = loop_latch_edge (iv_loop);
7329 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7330
7331 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7332 gcc_assert (step_expr != NULL_TREE);
7333
7334 pe = loop_preheader_edge (iv_loop);
7335 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7336 loop_preheader_edge (iv_loop));
7337
7338 stmts = NULL;
7339 if (!nested_in_vect_loop)
7340 {
7341 /* Convert the initial value to the desired type. */
7342 tree new_type = TREE_TYPE (vectype);
7343 init_expr = gimple_convert (&stmts, new_type, init_expr);
7344
7345 /* If we are using the loop mask to "peel" for alignment then we need
7346 to adjust the start value here. */
7347 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7348 if (skip_niters != NULL_TREE)
7349 {
7350 if (FLOAT_TYPE_P (vectype))
7351 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7352 skip_niters);
7353 else
7354 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7355 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7356 skip_niters, step_expr);
7357 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7358 init_expr, skip_step);
7359 }
7360 }
7361
7362 /* Convert the step to the desired type. */
7363 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7364
7365 if (stmts)
7366 {
7367 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7368 gcc_assert (!new_bb);
7369 }
7370
7371 /* Find the first insertion point in the BB. */
7372 basic_block bb = gimple_bb (phi);
7373 si = gsi_after_labels (bb);
7374
7375 /* For SLP induction we have to generate several IVs as for example
7376 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7377 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7378 [VF*S, VF*S, VF*S, VF*S] for all. */
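  /* To make the arithmetic below concrete (same illustrative group size 3
     and 4-lane vectors as above): nivs = least_common_multiple (3, 4) / 4
     = 3, so three induction phis cover lcm (3, 4) = 12 lanes before the
     pattern of group members repeats; any remaining vector stmts are then
     derived from those three by adding a uniform step vector (see the
     "Re-use IVs" code below).  */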
7379 if (slp_node)
7380 {
7381 /* Enforced above. */
7382 unsigned int const_nunits = nunits.to_constant ();
7383
7384 /* Generate [VF*S, VF*S, ... ]. */
7385 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7386 {
7387 expr = build_int_cst (integer_type_node, vf);
7388 expr = fold_convert (TREE_TYPE (step_expr), expr);
7389 }
7390 else
7391 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7392 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7393 expr, step_expr);
7394 if (! CONSTANT_CLASS_P (new_name))
7395 new_name = vect_init_vector (stmt_info, new_name,
7396 TREE_TYPE (step_expr), NULL);
7397 new_vec = build_vector_from_val (vectype, new_name);
7398 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7399
7400 /* Now generate the IVs. */
7401 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7402 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7403 unsigned elts = const_nunits * nvects;
7404 unsigned nivs = least_common_multiple (group_size,
7405 const_nunits) / const_nunits;
7406 gcc_assert (elts % group_size == 0);
7407 tree elt = init_expr;
7408 unsigned ivn;
7409 for (ivn = 0; ivn < nivs; ++ivn)
7410 {
7411 tree_vector_builder elts (vectype, const_nunits, 1);
7412 stmts = NULL;
7413 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7414 {
7415 if (ivn*const_nunits + eltn >= group_size
7416 && (ivn * const_nunits + eltn) % group_size == 0)
7417 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7418 elt, step_expr);
7419 elts.quick_push (elt);
7420 }
7421 vec_init = gimple_build_vector (&stmts, &elts);
7422 if (stmts)
7423 {
7424 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7425 gcc_assert (!new_bb);
7426 }
7427
7428 /* Create the induction-phi that defines the induction-operand. */
7429 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7430 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7431 stmt_vec_info induction_phi_info
7432 = loop_vinfo->add_stmt (induction_phi);
7433 induc_def = PHI_RESULT (induction_phi);
7434
7435 /* Create the iv update inside the loop */
7436 vec_def = make_ssa_name (vec_dest);
7437 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7438 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7439 loop_vinfo->add_stmt (new_stmt);
7440
7441 /* Set the arguments of the phi node: */
7442 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7443 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7444 UNKNOWN_LOCATION);
7445
7446 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7447 }
7448
7449 /* Re-use IVs when we can. */
7450 if (ivn < nvects)
7451 {
7452 unsigned vfp
7453 = least_common_multiple (group_size, const_nunits) / group_size;
7454 /* Generate [VF'*S, VF'*S, ... ]. */
7455 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7456 {
7457 expr = build_int_cst (integer_type_node, vfp);
7458 expr = fold_convert (TREE_TYPE (step_expr), expr);
7459 }
7460 else
7461 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7462 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7463 expr, step_expr);
7464 if (! CONSTANT_CLASS_P (new_name))
7465 new_name = vect_init_vector (stmt_info, new_name,
7466 TREE_TYPE (step_expr), NULL);
7467 new_vec = build_vector_from_val (vectype, new_name);
7468 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7469 for (; ivn < nvects; ++ivn)
7470 {
7471 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7472 tree def;
7473 if (gimple_code (iv) == GIMPLE_PHI)
7474 def = gimple_phi_result (iv);
7475 else
7476 def = gimple_assign_lhs (iv);
7477 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7478 PLUS_EXPR,
7479 def, vec_step);
7480 if (gimple_code (iv) == GIMPLE_PHI)
7481 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7482 else
7483 {
7484 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7485 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7486 }
7487 SLP_TREE_VEC_STMTS (slp_node).quick_push
7488 (loop_vinfo->add_stmt (new_stmt));
7489 }
7490 }
7491
7492 return true;
7493 }
7494
7495 /* Create the vector that holds the initial_value of the induction. */
7496 if (nested_in_vect_loop)
7497 {
7498 /* iv_loop is nested in the loop to be vectorized. init_expr has already
7499 been created during vectorization of previous stmts. We obtain it
7500 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7501 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7502 /* If the initial value is not of proper type, convert it. */
7503 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7504 {
7505 new_stmt
7506 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7507 vect_simple_var,
7508 "vec_iv_"),
7509 VIEW_CONVERT_EXPR,
7510 build1 (VIEW_CONVERT_EXPR, vectype,
7511 vec_init));
7512 vec_init = gimple_assign_lhs (new_stmt);
7513 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7514 new_stmt);
7515 gcc_assert (!new_bb);
7516 loop_vinfo->add_stmt (new_stmt);
7517 }
7518 }
7519 else
7520 {
7521 /* iv_loop is the loop to be vectorized. Create:
7522 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7523 stmts = NULL;
7524 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7525
7526 unsigned HOST_WIDE_INT const_nunits;
7527 if (nunits.is_constant (&const_nunits))
7528 {
7529 tree_vector_builder elts (vectype, const_nunits, 1);
7530 elts.quick_push (new_name);
7531 for (i = 1; i < const_nunits; i++)
7532 {
7533 /* Create: new_name_i = new_name + step_expr */
7534 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7535 new_name, step_expr);
7536 elts.quick_push (new_name);
7537 }
7538 /* Create a vector from [new_name_0, new_name_1, ...,
7539 new_name_nunits-1] */
7540 vec_init = gimple_build_vector (&stmts, &elts);
7541 }
7542 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7543 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7544 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7545 new_name, step_expr);
7546 else
7547 {
7548 /* Build:
7549 [base, base, base, ...]
7550 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
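	  /* E.g. (values illustrative): base 1.0 and step 0.5 with four
	     lanes give
	       { 1.0, 1.0, 1.0, 1.0 } + { 0, 1, 2, 3 } * { 0.5, 0.5, 0.5, 0.5 }
	       = { 1.0, 1.5, 2.0, 2.5 }.  */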
7551 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7552 gcc_assert (flag_associative_math);
7553 tree index = build_index_vector (vectype, 0, 1);
7554 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7555 new_name);
7556 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7557 step_expr);
7558 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7559 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7560 vec_init, step_vec);
7561 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7562 vec_init, base_vec);
7563 }
7564
7565 if (stmts)
7566 {
7567 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7568 gcc_assert (!new_bb);
7569 }
7570 }
7571
7572
7573 /* Create the vector that holds the step of the induction. */
7574 if (nested_in_vect_loop)
7575 /* iv_loop is nested in the loop to be vectorized. Generate:
7576 vec_step = [S, S, S, S] */
7577 new_name = step_expr;
7578 else
7579 {
7580 /* iv_loop is the loop to be vectorized. Generate:
7581 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7582 gimple_seq seq = NULL;
7583 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7584 {
7585 expr = build_int_cst (integer_type_node, vf);
7586 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7587 }
7588 else
7589 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7590 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7591 expr, step_expr);
7592 if (seq)
7593 {
7594 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7595 gcc_assert (!new_bb);
7596 }
7597 }
7598
7599 t = unshare_expr (new_name);
7600 gcc_assert (CONSTANT_CLASS_P (new_name)
7601 || TREE_CODE (new_name) == SSA_NAME);
7602 new_vec = build_vector_from_val (vectype, t);
7603 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7604
7605
7606 /* Create the following def-use cycle:
7607 loop prolog:
7608 vec_init = ...
7609 vec_step = ...
7610 loop:
7611 vec_iv = PHI <vec_init, vec_loop>
7612 ...
7613 STMT
7614 ...
7615 vec_loop = vec_iv + vec_step; */
7616
7617 /* Create the induction-phi that defines the induction-operand. */
7618 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7619 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7620 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7621 induc_def = PHI_RESULT (induction_phi);
7622
7623 /* Create the iv update inside the loop */
7624 vec_def = make_ssa_name (vec_dest);
7625 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7626 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7627 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7628
7629 /* Set the arguments of the phi node: */
7630 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7631 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7632 UNKNOWN_LOCATION);
7633
7634 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7635
7636 /* In case the vectorization factor (VF) is bigger than the number
7637 of elements that we can fit in a vectype (nunits), we have to generate
7638 more than one vector stmt, i.e. we need to "unroll" the
7639 vector stmt by a factor VF/nunits. For more details see documentation
7640 in vectorizable_operation. */
7641
7642 if (ncopies > 1)
7643 {
7644 gimple_seq seq = NULL;
7645 stmt_vec_info prev_stmt_vinfo;
7646 /* FORNOW. This restriction should be relaxed. */
7647 gcc_assert (!nested_in_vect_loop);
7648
7649 /* Create the vector that holds the step of the induction. */
7650 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7651 {
7652 expr = build_int_cst (integer_type_node, nunits);
7653 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7654 }
7655 else
7656 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7657 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7658 expr, step_expr);
7659 if (seq)
7660 {
7661 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7662 gcc_assert (!new_bb);
7663 }
7664
7665 t = unshare_expr (new_name);
7666 gcc_assert (CONSTANT_CLASS_P (new_name)
7667 || TREE_CODE (new_name) == SSA_NAME);
7668 new_vec = build_vector_from_val (vectype, t);
7669 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7670
7671 vec_def = induc_def;
7672 prev_stmt_vinfo = induction_phi_info;
7673 for (i = 1; i < ncopies; i++)
7674 {
7675 /* vec_i = vec_prev + vec_step */
7676 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7677 vec_def, vec_step);
7678 vec_def = make_ssa_name (vec_dest, new_stmt);
7679 gimple_assign_set_lhs (new_stmt, vec_def);
7680
7681 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7682 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7683 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7684 prev_stmt_vinfo = new_stmt_info;
7685 }
7686 }
7687
7688 if (nested_in_vect_loop)
7689 {
7690 /* Find the loop-closed exit-phi of the induction, and record
7691 the final vector of induction results: */
7692 exit_phi = NULL;
7693 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7694 {
7695 gimple *use_stmt = USE_STMT (use_p);
7696 if (is_gimple_debug (use_stmt))
7697 continue;
7698
7699 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7700 {
7701 exit_phi = use_stmt;
7702 break;
7703 }
7704 }
7705 if (exit_phi)
7706 {
7707 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7708 /* FORNOW. Currently not supporting the case that an inner-loop induction
7709 is not used in the outer-loop (i.e. only outside the outer-loop). */
7710 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7711 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7712
7713 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7714 if (dump_enabled_p ())
7715 dump_printf_loc (MSG_NOTE, vect_location,
7716 "vector of inductions after inner-loop:%G",
7717 new_stmt);
7718 }
7719 }
7720
7721
7722 if (dump_enabled_p ())
7723 dump_printf_loc (MSG_NOTE, vect_location,
7724 "transform induction: created def-use cycle: %G%G",
7725 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7726
7727 return true;
7728 }
7729
7730 /* Function vectorizable_live_operation.
7731
7732 STMT_INFO computes a value that is used outside the loop. Check if
7733 it can be supported. */
7734
7735 bool
7736 vectorizable_live_operation (stmt_vec_info stmt_info,
7737 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7738 slp_tree slp_node, int slp_index,
7739 stmt_vec_info *vec_stmt,
7740 stmt_vector_for_cost *)
7741 {
7742 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7743 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7744 imm_use_iterator imm_iter;
7745 tree lhs, lhs_type, bitsize, vec_bitsize;
7746 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7747 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7748 int ncopies;
7749 gimple *use_stmt;
7750 auto_vec<tree> vec_oprnds;
7751 int vec_entry = 0;
7752 poly_uint64 vec_index = 0;
7753
7754 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7755
7756 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7757 return false;
7758
7759 /* FORNOW. CHECKME. */
7760 if (nested_in_vect_loop_p (loop, stmt_info))
7761 return false;
7762
7763 /* If STMT is not relevant and it is a simple assignment and its inputs are
7764 invariant then it can remain in place, unvectorized. The original last
7765 scalar value that it computes will be used. */
7766 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7767 {
7768 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7769 if (dump_enabled_p ())
7770 dump_printf_loc (MSG_NOTE, vect_location,
7771 "statement is simple and uses invariant. Leaving in "
7772 "place.\n");
7773 return true;
7774 }
7775
7776 if (slp_node)
7777 ncopies = 1;
7778 else
7779 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7780
7781 if (slp_node)
7782 {
7783 gcc_assert (slp_index >= 0);
7784
7785 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7786 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7787
7788 /* Get the last occurrence of the scalar index from the concatenation of
7789 all the slp vectors. Calculate which slp vector it is and the index
7790 within. */
7791 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
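      /* Worked example (numbers illustrative): with nunits == 4,
	 num_vec == 1, num_scalar == 2 and slp_index == 1, the last
	 occurrence is at pos = 1 * 4 - 2 + 1 = 3, i.e. the final lane of
	 the single vector (vec_entry 0, vec_index 3 below).  */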
7792
7793 /* Calculate which vector contains the result, and which lane of
7794 that vector we need. */
7795 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7796 {
7797 if (dump_enabled_p ())
7798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7799 "Cannot determine which vector holds the"
7800 " final result.\n");
7801 return false;
7802 }
7803 }
7804
7805 if (!vec_stmt)
7806 {
7807 /* No transformation required. */
7808 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7809 {
7810 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7811 OPTIMIZE_FOR_SPEED))
7812 {
7813 if (dump_enabled_p ())
7814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7815 "can't use a fully-masked loop because "
7816 "the target doesn't support extract last "
7817 "reduction.\n");
7818 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7819 }
7820 else if (slp_node)
7821 {
7822 if (dump_enabled_p ())
7823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7824 "can't use a fully-masked loop because an "
7825 "SLP statement is live after the loop.\n");
7826 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7827 }
7828 else if (ncopies > 1)
7829 {
7830 if (dump_enabled_p ())
7831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7832 "can't use a fully-masked loop because"
7833 " ncopies is greater than 1.\n");
7834 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7835 }
7836 else
7837 {
7838 gcc_assert (ncopies == 1 && !slp_node);
7839 vect_record_loop_mask (loop_vinfo,
7840 &LOOP_VINFO_MASKS (loop_vinfo),
7841 1, vectype);
7842 }
7843 }
7844 return true;
7845 }
7846
7847 /* Use the lhs of the original scalar statement. */
7848 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7849
7850 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7851 : gimple_get_lhs (stmt);
7852 lhs_type = TREE_TYPE (lhs);
7853
7854 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7855 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7856 : TYPE_SIZE (TREE_TYPE (vectype)));
7857 vec_bitsize = TYPE_SIZE (vectype);
7858
7859 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7860 tree vec_lhs, bitstart;
7861 if (slp_node)
7862 {
7863 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7864
7865 /* Get the correct slp vectorized stmt. */
7866 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7867 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7868 vec_lhs = gimple_phi_result (phi);
7869 else
7870 vec_lhs = gimple_get_lhs (vec_stmt);
7871
7872 /* Get entry to use. */
7873 bitstart = bitsize_int (vec_index);
7874 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7875 }
7876 else
7877 {
7878 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7879 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7880 gcc_checking_assert (ncopies == 1
7881 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7882
7883 /* For multiple copies, get the last copy. */
7884 for (int i = 1; i < ncopies; ++i)
7885 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7886
7887 /* Get the last lane in the vector. */
7888 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7889 }
7890
7891 gimple_seq stmts = NULL;
7892 tree new_tree;
7893 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7894 {
7895 /* Emit:
7896
7897 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7898
7899 where VEC_LHS is the vectorized live-out result and MASK is
7900 the loop mask for the final iteration. */
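      /* As a sketch: if the live value is the scalar computed in the last
	 iteration that actually executes, the mask for the final vector
	 iteration is set exactly for the lanes whose iterations ran, so
	 EXTRACT_LAST returns the lane of VEC_LHS corresponding to that
	 last scalar iteration.  */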
7901 gcc_assert (ncopies == 1 && !slp_node);
7902 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7903 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7904 1, vectype, 0);
7905 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7906 scalar_type, mask, vec_lhs);
7907
7908 /* Convert the extracted vector element to the required scalar type. */
7909 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7910 }
7911 else
7912 {
7913 tree bftype = TREE_TYPE (vectype);
7914 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7915 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7916 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7917 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7918 &stmts, true, NULL_TREE);
7919 }
7920
7921 if (stmts)
7922 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7923
7924 /* Replace uses of lhs with the newly computed result. If the use stmt is
7925 a single-argument PHI, just replace all uses of the PHI result. This is
7926 necessary because the lcssa PHI defining lhs may precede the new stmt. */
7927 use_operand_p use_p;
7928 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7929 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7930 && !is_gimple_debug (use_stmt))
7931 {
7932 if (gimple_code (use_stmt) == GIMPLE_PHI
7933 && gimple_phi_num_args (use_stmt) == 1)
7934 {
7935 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7936 }
7937 else
7938 {
7939 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7940 SET_USE (use_p, new_tree);
7941 }
7942 update_stmt (use_stmt);
7943 }
7944
7945 return true;
7946 }
7947
7948 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
7949
7950 static void
7951 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
7952 {
7953 ssa_op_iter op_iter;
7954 imm_use_iterator imm_iter;
7955 def_operand_p def_p;
7956 gimple *ustmt;
7957
7958 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
7959 {
7960 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7961 {
7962 basic_block bb;
7963
7964 if (!is_gimple_debug (ustmt))
7965 continue;
7966
7967 bb = gimple_bb (ustmt);
7968
7969 if (!flow_bb_inside_loop_p (loop, bb))
7970 {
7971 if (gimple_debug_bind_p (ustmt))
7972 {
7973 if (dump_enabled_p ())
7974 dump_printf_loc (MSG_NOTE, vect_location,
7975 "killing debug use\n");
7976
7977 gimple_debug_bind_reset_value (ustmt);
7978 update_stmt (ustmt);
7979 }
7980 else
7981 gcc_unreachable ();
7982 }
7983 }
7984 }
7985 }
7986
7987 /* Given the loop represented by LOOP_VINFO, return true if the computation of
7988 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7989 otherwise. */
7990
7991 static bool
7992 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7993 {
7994 /* Constant case. */
7995 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7996 {
7997 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7998 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7999
8000 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8001 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8002 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8003 return true;
8004 }
8005
8006 widest_int max;
8007 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8008 /* Check the upper bound of loop niters. */
8009 if (get_max_loop_iterations (loop, &max))
8010 {
8011 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8012 signop sgn = TYPE_SIGN (type);
8013 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8014 if (max < type_max)
8015 return true;
8016 }
8017 return false;
8018 }
8019
8020 /* Return a mask type with half as many elements as TYPE. */
8021
8022 tree
8023 vect_halve_mask_nunits (tree type)
8024 {
8025 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8026 return build_truth_vector_type (nunits, current_vector_size);
8027 }
8028
8029 /* Return a mask type with twice as many elements as TYPE. */
8030
8031 tree
8032 vect_double_mask_nunits (tree type)
8033 {
8034 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8035 return build_truth_vector_type (nunits, current_vector_size);
8036 }
8037
8038 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8039 contain a sequence of NVECTORS masks that each control a vector of type
8040 VECTYPE. */
8041
8042 void
8043 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8044 unsigned int nvectors, tree vectype)
8045 {
8046 gcc_assert (nvectors != 0);
8047 if (masks->length () < nvectors)
8048 masks->safe_grow_cleared (nvectors);
8049 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8050 /* The number of scalars per iteration and the number of vectors are
8051 both compile-time constants. */
8052 unsigned int nscalars_per_iter
8053 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8054 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8055 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8056 {
8057 rgm->max_nscalars_per_iter = nscalars_per_iter;
8058 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8059 }
8060 }
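/* For example (numbers illustrative): recording 2 vectors of a type with
   8 elements in a loop whose vectorization factor is 8 gives
   nscalars_per_iter = 2 * 8 / 8 = 2, i.e. each mask in this rgroup has to
   control two scalar values per scalar iteration.  */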
8061
8062 /* Given a complete set of masks MASKS, extract mask number INDEX
8063 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8064 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8065
8066 See the comment above vec_loop_masks for more details about the mask
8067 arrangement. */
8068
8069 tree
8070 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8071 unsigned int nvectors, tree vectype, unsigned int index)
8072 {
8073 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8074 tree mask_type = rgm->mask_type;
8075
8076 /* Populate the rgroup's mask array, if this is the first time we've
8077 used it. */
8078 if (rgm->masks.is_empty ())
8079 {
8080 rgm->masks.safe_grow_cleared (nvectors);
8081 for (unsigned int i = 0; i < nvectors; ++i)
8082 {
8083 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8084 /* Provide a dummy definition until the real one is available. */
8085 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8086 rgm->masks[i] = mask;
8087 }
8088 }
8089
8090 tree mask = rgm->masks[index];
8091 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8092 TYPE_VECTOR_SUBPARTS (vectype)))
8093 {
8094 /* A loop mask for data type X can be reused for data type Y
8095 if X has N times more elements than Y and if Y's elements
8096 are N times bigger than X's. In this case each sequence
8097 of N elements in the loop mask will be all-zero or all-one.
8098 We can then view-convert the mask so that each sequence of
8099 N elements is replaced by a single element. */
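      /* Concrete sketch (element sizes illustrative): a mask for a vector
	 of eight 16-bit elements can be reused for a vector of four
	 32-bit elements, since each 32-bit lane overlaps two 16-bit lanes
	 whose mask bits are guaranteed to be equal; the VIEW_CONVERT_EXPR
	 below collapses each such pair into a single mask element.  */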
8100 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8101 TYPE_VECTOR_SUBPARTS (vectype)));
8102 gimple_seq seq = NULL;
8103 mask_type = build_same_sized_truth_vector_type (vectype);
8104 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8105 if (seq)
8106 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8107 }
8108 return mask;
8109 }
8110
8111 /* Scale the profiling counters of LOOP, which is vectorized by factor VF,
8112 according to the estimated number of iterations of the vector loop. */
8113
8114 static void
8115 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8116 {
8117 edge preheader = loop_preheader_edge (loop);
8118 /* Reduce loop iterations by the vectorization factor. */
8119 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8120 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8121
8122 if (freq_h.nonzero_p ())
8123 {
8124 profile_probability p;
8125
8126 /* Avoid dropping loop body profile counter to 0 because of zero count
8127 in loop's preheader. */
8128 if (!(freq_e == profile_count::zero ()))
8129 freq_e = freq_e.force_nonzero ();
8130 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8131 scale_loop_frequencies (loop, p);
8132 }
8133
8134 edge exit_e = single_exit (loop);
8135 exit_e->probability = profile_probability::always ()
8136 .apply_scale (1, new_est_niter + 1);
8137
8138 edge exit_l = single_pred_edge (loop->latch);
8139 profile_probability prob = exit_l->probability;
8140 exit_l->probability = exit_e->probability.invert ();
8141 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8142 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8143 }
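/* Rough worked example for the scaling above (numbers illustrative): if
   new_est_niter comes out as 3 (e.g. a scalar loop of about 16 iterations
   vectorized with VF 4), the exit edge gets probability 1 / (3 + 1) = 1/4
   and the body counts are scaled so that the header executes about four
   times per entry through the preheader.  */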
8144
8145 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8146 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8147 stmt_vec_info. */
8148
8149 static void
8150 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8151 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8152 {
8153 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8154 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8155
8156 if (dump_enabled_p ())
8157 dump_printf_loc (MSG_NOTE, vect_location,
8158 "------>vectorizing statement: %G", stmt_info->stmt);
8159
8160 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8161 vect_loop_kill_debug_uses (loop, stmt_info);
8162
8163 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8164 && !STMT_VINFO_LIVE_P (stmt_info))
8165 return;
8166
8167 if (STMT_VINFO_VECTYPE (stmt_info))
8168 {
8169 poly_uint64 nunits
8170 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8171 if (!STMT_SLP_TYPE (stmt_info)
8172 && maybe_ne (nunits, vf)
8173 && dump_enabled_p ())
8174 /* For SLP, VF is set according to the unrolling factor, and not
8175 to the vector size, hence this diagnostic is not valid for SLP. */
8176 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8177 }
8178
8179 /* Pure SLP statements have already been vectorized. We still need
8180 to apply loop vectorization to hybrid SLP statements. */
8181 if (PURE_SLP_STMT (stmt_info))
8182 return;
8183
8184 if (dump_enabled_p ())
8185 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8186
8187 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8188 *seen_store = stmt_info;
8189 }
8190
8191 /* Function vect_transform_loop.
8192
8193 The analysis phase has determined that the loop is vectorizable.
8194 Vectorize the loop - create vectorized stmts to replace the scalar
8195 stmts in the loop, and update the loop exit condition.
8196 Returns the scalar epilogue loop, if any. */
8197
8198 struct loop *
8199 vect_transform_loop (loop_vec_info loop_vinfo)
8200 {
8201 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8202 struct loop *epilogue = NULL;
8203 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8204 int nbbs = loop->num_nodes;
8205 int i;
8206 tree niters_vector = NULL_TREE;
8207 tree step_vector = NULL_TREE;
8208 tree niters_vector_mult_vf = NULL_TREE;
8209 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8210 unsigned int lowest_vf = constant_lower_bound (vf);
8211 gimple *stmt;
8212 bool check_profitability = false;
8213 unsigned int th;
8214
8215 DUMP_VECT_SCOPE ("vec_transform_loop");
8216
8217 loop_vinfo->shared->check_datarefs ();
8218
8219 /* Use the more conservative vectorization threshold. If the number
8220 of iterations is constant, assume the cost check has been performed
8221 by our caller. If the threshold makes all loops profitable that
8222 run at least the (estimated) vectorization factor number of times,
8223 checking is pointless, too. */
8224 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8225 if (th >= vect_vf_for_cost (loop_vinfo)
8226 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8227 {
8228 if (dump_enabled_p ())
8229 dump_printf_loc (MSG_NOTE, vect_location,
8230 "Profitability threshold is %d loop iterations.\n",
8231 th);
8232 check_profitability = true;
8233 }
8234
8235 /* Make sure there exists a single-predecessor exit bb. Do this before
8236 versioning. */
8237 edge e = single_exit (loop);
8238 if (! single_pred_p (e->dest))
8239 {
8240 split_loop_exit_edge (e);
8241 if (dump_enabled_p ())
8242 dump_printf (MSG_NOTE, "split exit edge\n");
8243 }
8244
8245 /* Version the loop first, if required, so the profitability check
8246 comes first. */
8247
8248 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8249 {
8250 poly_uint64 versioning_threshold
8251 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8252 if (check_profitability
8253 && ordered_p (poly_uint64 (th), versioning_threshold))
8254 {
8255 versioning_threshold = ordered_max (poly_uint64 (th),
8256 versioning_threshold);
8257 check_profitability = false;
8258 }
8259 vect_loop_versioning (loop_vinfo, th, check_profitability,
8260 versioning_threshold);
8261 check_profitability = false;
8262 }
8263
8264 /* Make sure there exists a single-predecessor exit bb also on the
8265 scalar loop copy. Do this after versioning but before peeling
8266 so that the CFG structure is fine for both the scalar and the
8267 if-converted loop, and slpeel_duplicate_current_defs_from_edges
8268 sees matched loop-closed PHI nodes on the exit. */
8269 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8270 {
8271 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8272 if (! single_pred_p (e->dest))
8273 {
8274 split_loop_exit_edge (e);
8275 if (dump_enabled_p ())
8276 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8277 }
8278 }
8279
8280 tree niters = vect_build_loop_niters (loop_vinfo);
8281 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8282 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8283 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8284 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8285 &step_vector, &niters_vector_mult_vf, th,
8286 check_profitability, niters_no_overflow);
8287
8288 if (niters_vector == NULL_TREE)
8289 {
8290 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8291 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8292 && known_eq (lowest_vf, vf))
8293 {
8294 niters_vector
8295 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8296 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8297 step_vector = build_one_cst (TREE_TYPE (niters));
8298 }
8299 else
8300 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8301 &step_vector, niters_no_overflow);
8302 }
8303
8304 /* 1) Make sure the loop header has exactly two entries
8305 2) Make sure we have a preheader basic block. */
8306
8307 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8308
8309 split_edge (loop_preheader_edge (loop));
8310
8311 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8312 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8313 /* This will deal with any possible peeling. */
8314 vect_prepare_for_masked_peels (loop_vinfo);
8315
8316 /* Schedule the SLP instances first, then handle loop vectorization
8317 below. */
8318 if (!loop_vinfo->slp_instances.is_empty ())
8319 {
8320 DUMP_VECT_SCOPE ("scheduling SLP instances");
8321 vect_schedule_slp (loop_vinfo);
8322 }
8323
8324 /* FORNOW: the vectorizer supports only loops whose body consists
8325 of one basic block (header + empty latch). When the vectorizer
8326 supports more involved loop forms, the order in which the BBs are
8327 traversed will need to be reconsidered. */
8328
8329 for (i = 0; i < nbbs; i++)
8330 {
8331 basic_block bb = bbs[i];
8332 stmt_vec_info stmt_info;
8333
8334 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8335 gsi_next (&si))
8336 {
8337 gphi *phi = si.phi ();
8338 if (dump_enabled_p ())
8339 dump_printf_loc (MSG_NOTE, vect_location,
8340 "------>vectorizing phi: %G", phi);
8341 stmt_info = loop_vinfo->lookup_stmt (phi);
8342 if (!stmt_info)
8343 continue;
8344
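/* If the scalar result of this PHI will not be live after vectorization,
   kill any debug uses of it outside the loop so that stale scalar values
   do not survive into the debug info.  */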
8345 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8346 vect_loop_kill_debug_uses (loop, stmt_info);
8347
8348 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8349 && !STMT_VINFO_LIVE_P (stmt_info))
8350 continue;
8351
8352 if (STMT_VINFO_VECTYPE (stmt_info)
8353 && (maybe_ne
8354 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8355 && dump_enabled_p ())
8356 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8357
8358 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8359 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8360 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8361 && ! PURE_SLP_STMT (stmt_info))
8362 {
8363 if (dump_enabled_p ())
8364 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8365 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8366 }
8367 }
8368
8369 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8370 !gsi_end_p (si);)
8371 {
8372 stmt = gsi_stmt (si);
8373 /* During vectorization remove existing clobber stmts. */
8374 if (gimple_clobber_p (stmt))
8375 {
8376 unlink_stmt_vdef (stmt);
8377 gsi_remove (&si, true);
8378 release_defs (stmt);
8379 }
8380 else
8381 {
8382 stmt_info = loop_vinfo->lookup_stmt (stmt);
8383
8384 /* vector stmts created in the outer-loop during vectorization of
8385 stmts in an inner-loop may not have a stmt_info, and do not
8386 need to be vectorized. */
8387 stmt_vec_info seen_store = NULL;
8388 if (stmt_info)
8389 {
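/* If this statement was replaced by a pattern, vectorize the statements
   of the pattern definition sequence first and then the main pattern
   statement; the original statement is handled by the call to
   vect_transform_loop_stmt below.  */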
8390 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8391 {
8392 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8393 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8394 !gsi_end_p (subsi); gsi_next (&subsi))
8395 {
8396 stmt_vec_info pat_stmt_info
8397 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8398 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8399 &si, &seen_store);
8400 }
8401 stmt_vec_info pat_stmt_info
8402 = STMT_VINFO_RELATED_STMT (stmt_info);
8403 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8404 &seen_store);
8405 }
8406 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8407 &seen_store);
8408 }
8409 gsi_next (&si);
8410 if (seen_store)
8411 {
8412 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8413 /* Interleaving. The vectorization of the
8414 interleaving chain was completed - free all
8415 the stores in the chain. */
8416 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8417 else
8418 /* Free the attached stmt_vec_info and remove the stmt. */
8419 loop_vinfo->remove_stmt (stmt_info);
8420 }
8421 }
8422 }
8423
8424 /* Stub out scalar statements that must not survive vectorization.
8425 Doing this here helps with grouped statements, or statements that
8426 are involved in patterns. */
8427 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8428 !gsi_end_p (gsi); gsi_next (&gsi))
8429 {
8430 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8431 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8432 {
8433 tree lhs = gimple_get_lhs (call);
8434 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8435 {
8436 tree zero = build_zero_cst (TREE_TYPE (lhs));
8437 gimple *new_stmt = gimple_build_assign (lhs, zero);
8438 gsi_replace (&gsi, new_stmt, true);
8439 }
8440 }
8441 }
8442 } /* BBs in loop */
8443
8444 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8445 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8446 if (integer_onep (step_vector))
8447 niters_no_overflow = true;
8448 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8449 niters_vector_mult_vf, !niters_no_overflow);
8450
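/* Each iteration of the vector loop now covers roughly ASSUMED_VF scalar
   iterations, so scale the loop's profile information accordingly.  */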
8451 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8452 scale_profile_for_vect_loop (loop, assumed_vf);
8453
8454 /* True if the final iteration might not handle a full vector's
8455 worth of scalar iterations. */
8456 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8457 /* The minimum number of iterations performed by the epilogue. This
8458 is 1 when peeling for gaps because we always need a final scalar
8459 iteration. */
8460 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8461 /* +1 to convert latch counts to loop iteration counts,
8462 -min_epilogue_iters to remove iterations that cannot be performed
8463 by the vector code. */
8464 int bias_for_lowest = 1 - min_epilogue_iters;
8465 int bias_for_assumed = bias_for_lowest;
8466 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8467 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8468 {
8469 /* When the amount of peeling is known at compile time, the first
8470 iteration will have exactly alignment_npeels active elements.
8471 In the worst case it will have at least one. */
8472 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8473 bias_for_lowest += lowest_vf - min_first_active;
8474 bias_for_assumed += assumed_vf - min_first_active;
8475 }
8476 /* In these calculations the "- 1" converts loop iteration counts
8477 back to latch counts. */
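/* As a purely illustrative example (numbers made up): with a latch-count
   upper bound of 11 (i.e. 12 iterations), VF 4, no epilogue iterations
   and no masking, bias_for_lowest is 1 and the new bound is
   floor ((11 + 1) / 4) - 1 = 2 latch iterations of the vector loop.  */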
8478 if (loop->any_upper_bound)
8479 loop->nb_iterations_upper_bound
8480 = (final_iter_may_be_partial
8481 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8482 lowest_vf) - 1
8483 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8484 lowest_vf) - 1);
8485 if (loop->any_likely_upper_bound)
8486 loop->nb_iterations_likely_upper_bound
8487 = (final_iter_may_be_partial
8488 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8489 + bias_for_lowest, lowest_vf) - 1
8490 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8491 + bias_for_lowest, lowest_vf) - 1);
8492 if (loop->any_estimate)
8493 loop->nb_iterations_estimate
8494 = (final_iter_may_be_partial
8495 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8496 assumed_vf) - 1
8497 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8498 assumed_vf) - 1);
8499
8500 if (dump_enabled_p ())
8501 {
8502 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8503 {
8504 dump_printf_loc (MSG_NOTE, vect_location,
8505 "LOOP VECTORIZED\n");
8506 if (loop->inner)
8507 dump_printf_loc (MSG_NOTE, vect_location,
8508 "OUTER LOOP VECTORIZED\n");
8509 dump_printf (MSG_NOTE, "\n");
8510 }
8511 else
8512 {
8513 dump_printf_loc (MSG_NOTE, vect_location,
8514 "LOOP EPILOGUE VECTORIZED (VS=");
8515 dump_dec (MSG_NOTE, current_vector_size);
8516 dump_printf (MSG_NOTE, ")\n");
8517 }
8518 }
8519
8520 /* Free SLP instances here because otherwise stmt reference counting
8521 won't work. */
8522 slp_instance instance;
8523 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8524 vect_free_slp_instance (instance, true);
8525 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8526 /* Clear the safelen field since its value is invalid after vectorization,
8527 as the vectorized loop can have loop-carried dependencies. */
8528 loop->safelen = 0;
8529
8530 /* Don't vectorize the epilogue of an epilogue loop. */
8531 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8532 epilogue = NULL;
8533
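/* Also honor --param vect-epilogues-nomask: if it is zero, do not
   vectorize the epilogue loop.  */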
8534 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8535 epilogue = NULL;
8536
8537 if (epilogue)
8538 {
8539 auto_vector_sizes vector_sizes;
8540 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8541 unsigned int next_size = 0;
8542
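/* Choose the vector size to try for the epilogue loop: if the trip count
   is known, pick the first supported size that evenly divides the current
   one and for which the remaining scalar iterations roughly fill at least
   one vector; otherwise pick the first supported size that is no larger
   than the current vector size.  If no such size exists, drop the
   epilogue.  */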
8543 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8544 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8545 && known_eq (vf, lowest_vf))
8546 {
8547 unsigned int eiters
8548 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8549 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8550 eiters = eiters % lowest_vf;
8551 epilogue->nb_iterations_upper_bound = eiters - 1;
8552
8553 unsigned int ratio;
8554 while (next_size < vector_sizes.length ()
8555 && !(constant_multiple_p (current_vector_size,
8556 vector_sizes[next_size], &ratio)
8557 && eiters >= lowest_vf / ratio))
8558 next_size += 1;
8559 }
8560 else
8561 while (next_size < vector_sizes.length ()
8562 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8563 next_size += 1;
8564
8565 if (next_size == vector_sizes.length ())
8566 epilogue = NULL;
8567 }
8568
8569 if (epilogue)
8570 {
8571 epilogue->force_vectorize = loop->force_vectorize;
8572 epilogue->safelen = loop->safelen;
8573 epilogue->dont_vectorize = false;
8574
8575 /* We may need to if-convert the epilogue to vectorize it. */
8576 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8577 tree_if_conversion (epilogue);
8578 }
8579
8580 return epilogue;
8581 }
8582
8583 /* The code below performs a simple optimization - it reverts
8584 if-conversion for masked stores: if the mask of a store is zero,
8585 the store is not performed, and neither, if possible, are the
8586 statements producing the stored values. For example,
8587 for (i=0; i<n; i++)
8588 if (c[i])
8589 {
8590 p1[i] += 1;
8591 p2[i] = p3[i] + 2;
8592 }
8593 this transformation will produce the following semi-hammock:
8594
8595 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8596 {
8597 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8598 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8599 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8600 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8601 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8602 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8603 }
8604 */
8605
8606 void
8607 optimize_mask_stores (struct loop *loop)
8608 {
8609 basic_block *bbs = get_loop_body (loop);
8610 unsigned nbbs = loop->num_nodes;
8611 unsigned i;
8612 basic_block bb;
8613 struct loop *bb_loop;
8614 gimple_stmt_iterator gsi;
8615 gimple *stmt;
8616 auto_vec<gimple *> worklist;
8617
8618 vect_location = find_loop_location (loop);
8619 /* Collect all masked stores in the loop, if any. */
8620 for (i = 0; i < nbbs; i++)
8621 {
8622 bb = bbs[i];
8623 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8624 gsi_next (&gsi))
8625 {
8626 stmt = gsi_stmt (gsi);
8627 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8628 worklist.safe_push (stmt);
8629 }
8630 }
8631
8632 free (bbs);
8633 if (worklist.is_empty ())
8634 return;
8635
8636 /* Loop has masked stores. */
8637 while (!worklist.is_empty ())
8638 {
8639 gimple *last, *last_store;
8640 edge e, efalse;
8641 tree mask;
8642 basic_block store_bb, join_bb;
8643 gimple_stmt_iterator gsi_to;
8644 tree vdef, new_vdef;
8645 gphi *phi;
8646 tree vectype;
8647 tree zero;
8648
8649 last = worklist.pop ();
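/* The mask is argument 2 of the IFN_MASK_STORE call.  */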
8650 mask = gimple_call_arg (last, 2);
8651 bb = gimple_bb (last);
8652 /* Create then_bb and if-then structure in CFG, then_bb belongs to
8653 the same loop as if_bb. It could be different from LOOP when a
8654 two-level loop nest is vectorized and the mask_store belongs to the
8655 inner one. */
8656 e = split_block (bb, last);
8657 bb_loop = bb->loop_father;
8658 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8659 join_bb = e->dest;
8660 store_bb = create_empty_bb (bb);
8661 add_bb_to_loop (store_bb, bb_loop);
8662 e->flags = EDGE_TRUE_VALUE;
8663 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8664 /* Put STORE_BB on the unlikely path. */
8665 efalse->probability = profile_probability::unlikely ();
8666 store_bb->count = efalse->count ();
8667 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8668 if (dom_info_available_p (CDI_DOMINATORS))
8669 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8670 if (dump_enabled_p ())
8671 dump_printf_loc (MSG_NOTE, vect_location,
8672 "Create new block %d to sink mask stores.",
8673 store_bb->index);
8674 /* Create vector comparison with boolean result. */
8675 vectype = TREE_TYPE (mask);
8676 zero = build_zero_cst (vectype);
8677 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8678 gsi = gsi_last_bb (bb);
8679 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8680 /* Create new PHI node for vdef of the last masked store:
8681 .MEM_2 = VDEF <.MEM_1>
8682 will be converted to
8683 .MEM.3 = VDEF <.MEM_1>
8684 and new PHI node will be created in join bb
8685 .MEM_2 = PHI <.MEM_1, .MEM_3>
8686 */
8687 vdef = gimple_vdef (last);
8688 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8689 gimple_set_vdef (last, new_vdef);
8690 phi = create_phi_node (vdef, join_bb);
8691 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8692
8693 /* Put all masked stores with the same mask into STORE_BB if possible. */
8694 while (true)
8695 {
8696 gimple_stmt_iterator gsi_from;
8697 gimple *stmt1 = NULL;
8698
8699 /* Move masked store to STORE_BB. */
8700 last_store = last;
8701 gsi = gsi_for_stmt (last);
8702 gsi_from = gsi;
8703 /* Shift GSI to the previous stmt for further traversal. */
8704 gsi_prev (&gsi);
8705 gsi_to = gsi_start_bb (store_bb);
8706 gsi_move_before (&gsi_from, &gsi_to);
8707 /* Set up GSI_TO at the start of the now non-empty block. */
8708 gsi_to = gsi_start_bb (store_bb);
8709 if (dump_enabled_p ())
8710 dump_printf_loc (MSG_NOTE, vect_location,
8711 "Move stmt to created bb\n%G", last);
8712 /* Move all stored value producers if possible. */
8713 while (!gsi_end_p (gsi))
8714 {
8715 tree lhs;
8716 imm_use_iterator imm_iter;
8717 use_operand_p use_p;
8718 bool res;
8719
8720 /* Skip debug statements. */
8721 if (is_gimple_debug (gsi_stmt (gsi)))
8722 {
8723 gsi_prev (&gsi);
8724 continue;
8725 }
8726 stmt1 = gsi_stmt (gsi);
8727 /* Do not consider statements writing to memory or having
8728 a volatile operand. */
8729 if (gimple_vdef (stmt1)
8730 || gimple_has_volatile_ops (stmt1))
8731 break;
8732 gsi_from = gsi;
8733 gsi_prev (&gsi);
8734 lhs = gimple_get_lhs (stmt1);
8735 if (!lhs)
8736 break;
8737
8738 /* The LHS of a vectorized stmt must be an SSA_NAME. */
8739 if (TREE_CODE (lhs) != SSA_NAME)
8740 break;
8741
8742 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8743 {
8744 /* Remove dead scalar statement. */
8745 if (has_zero_uses (lhs))
8746 {
8747 gsi_remove (&gsi_from, true);
8748 continue;
8749 }
8750 }
8751
8752 /* Check that LHS does not have uses outside of STORE_BB. */
8753 res = true;
8754 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8755 {
8756 gimple *use_stmt;
8757 use_stmt = USE_STMT (use_p);
8758 if (is_gimple_debug (use_stmt))
8759 continue;
8760 if (gimple_bb (use_stmt) != store_bb)
8761 {
8762 res = false;
8763 break;
8764 }
8765 }
8766 if (!res)
8767 break;
8768
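/* Only sink statements whose memory state (VUSE) matches that of the
   masked store being moved.  */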
8769 if (gimple_vuse (stmt1)
8770 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8771 break;
8772
8773 /* Can move STMT1 to STORE_BB. */
8774 if (dump_enabled_p ())
8775 dump_printf_loc (MSG_NOTE, vect_location,
8776 "Move stmt to created bb\n%G", stmt1);
8777 gsi_move_before (&gsi_from, &gsi_to);
8778 /* Shift GSI_TO for further insertion. */
8779 gsi_prev (&gsi_to);
8780 }
8781 /* Put other masked stores with the same mask into STORE_BB. */
8782 if (worklist.is_empty ()
8783 || gimple_call_arg (worklist.last (), 2) != mask
8784 || worklist.last () != stmt1)
8785 break;
8786 last = worklist.pop ();
8787 }
8788 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8789 }
8790 }