1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it was manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1) and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
183
184 if (stmt_vectype)
185 {
186 if (STMT_VINFO_VECTYPE (stmt_info))
187 /* The only case in which a vectype has already been set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return true;
203 }
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
211
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info > *mask_producers)
215 {
216 vec_info *vinfo = stmt_info->vinfo;
217 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
219 stmt_info->stmt);
220 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
221 return false;
222
223 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
224 && STMT_VINFO_RELATED_STMT (stmt_info))
225 {
226 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
227 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
228
229 /* If a pattern statement has def stmts, analyze them too. */
230 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
231 !gsi_end_p (si); gsi_next (&si))
232 {
233 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
234 if (dump_enabled_p ())
235 dump_printf_loc (MSG_NOTE, vect_location,
236 "==> examining pattern def stmt: %G",
237 def_stmt_info->stmt);
238 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
239 vf, mask_producers))
240 return false;
241 }
242
243 if (dump_enabled_p ())
244 dump_printf_loc (MSG_NOTE, vect_location,
245 "==> examining pattern statement: %G",
246 stmt_info->stmt);
247 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
248 return false;
249 }
250
251 return true;
252 }
253
254 /* Function vect_determine_vectorization_factor
255
256 Determine the vectorization factor (VF). VF is the number of data elements
257 that are operated upon in parallel in a single iteration of the vectorized
258 loop. For example, when vectorizing a loop that operates on 4-byte elements
259 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
260 elements can fit in a single vector register.
261
262 We currently support vectorization of loops in which all types operated upon
263 are of the same size. Therefore this function currently sets VF according to
264 the size of the types operated upon, and fails if there are multiple sizes
265 in the loop.
266
267 VF is also the factor by which the loop iterations are strip-mined, e.g.:
268 original loop:
269 for (i=0; i<N; i++){
270 a[i] = b[i] + c[i];
271 }
272
273 vectorized loop:
274 for (i=0; i<N; i+=VF){
275 a[i:VF] = b[i:VF] + c[i:VF];
276 }
277 */
278
279 static bool
280 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 {
282 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
283 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
284 unsigned nbbs = loop->num_nodes;
285 poly_uint64 vectorization_factor = 1;
286 tree scalar_type = NULL_TREE;
287 gphi *phi;
288 tree vectype;
289 stmt_vec_info stmt_info;
290 unsigned i;
291 auto_vec<stmt_vec_info> mask_producers;
292
293 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
294
295 for (i = 0; i < nbbs; i++)
296 {
297 basic_block bb = bbs[i];
298
299 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
300 gsi_next (&si))
301 {
302 phi = si.phi ();
303 stmt_info = loop_vinfo->lookup_stmt (phi);
304 if (dump_enabled_p ())
305 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
306 phi);
307
308 gcc_assert (stmt_info);
309
310 if (STMT_VINFO_RELEVANT_P (stmt_info)
311 || STMT_VINFO_LIVE_P (stmt_info))
312 {
313 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
314 scalar_type = TREE_TYPE (PHI_RESULT (phi));
315
316 if (dump_enabled_p ())
317 dump_printf_loc (MSG_NOTE, vect_location,
318 "get vectype for scalar type: %T\n",
319 scalar_type);
320
321 vectype = get_vectype_for_scalar_type (scalar_type);
322 if (!vectype)
323 {
324 if (dump_enabled_p ())
325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
326 "not vectorized: unsupported "
327 "data-type %T\n",
328 scalar_type);
329 return false;
330 }
331 STMT_VINFO_VECTYPE (stmt_info) = vectype;
332
333 if (dump_enabled_p ())
334 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
335 vectype);
336
337 if (dump_enabled_p ())
338 {
339 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
340 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
341 dump_printf (MSG_NOTE, "\n");
342 }
343
344 vect_update_max_nunits (&vectorization_factor, vectype);
345 }
346 }
347
348 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
349 gsi_next (&si))
350 {
351 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
352 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
353 &mask_producers))
354 return false;
355 }
356 }
357
358 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
359 if (dump_enabled_p ())
360 {
361 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
362 dump_dec (MSG_NOTE, vectorization_factor);
363 dump_printf (MSG_NOTE, "\n");
364 }
365
366 if (known_le (vectorization_factor, 1U))
367 {
368 if (dump_enabled_p ())
369 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
370 "not vectorized: unsupported data-type\n");
371 return false;
372 }
373 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
374
375 for (i = 0; i < mask_producers.length (); i++)
376 {
377 stmt_info = mask_producers[i];
378 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
379 if (!mask_type)
380 return false;
381 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
382 }
383
384 return true;
385 }
386
387
388 /* Function vect_is_simple_iv_evolution.
389
390 FORNOW: A simple evolution of an induction variable in the loop is
391 considered a polynomial evolution. */
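/* For instance (illustrative only): an access function {0, +, 4}_1 is a
   simple IV with INIT 0 and STEP 4, whereas {{0, +, 1}_1, +, 1}_1 has an
   evolution part that is itself a chrec, so it is rejected below as not
   "simple".  */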
392
393 static bool
394 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
395 tree * step)
396 {
397 tree init_expr;
398 tree step_expr;
399 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
400 basic_block bb;
401
402 /* When there is no evolution in this loop, the evolution function
403 is not "simple". */
404 if (evolution_part == NULL_TREE)
405 return false;
406
407 /* When the evolution is a polynomial of degree >= 2
408 the evolution function is not "simple". */
409 if (tree_is_chrec (evolution_part))
410 return false;
411
412 step_expr = evolution_part;
413 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
414
415 if (dump_enabled_p ())
416 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
417 step_expr, init_expr);
418
419 *init = init_expr;
420 *step = step_expr;
421
422 if (TREE_CODE (step_expr) != INTEGER_CST
423 && (TREE_CODE (step_expr) != SSA_NAME
424 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
425 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
426 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
427 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
428 || !flag_associative_math)))
429 && (TREE_CODE (step_expr) != REAL_CST
430 || !flag_associative_math))
431 {
432 if (dump_enabled_p ())
433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
434 "step unknown.\n");
435 return false;
436 }
437
438 return true;
439 }
440
441 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
442 what we are assuming is a double reduction. For example, given
443 a structure like this:
444
445 outer1:
446 x_1 = PHI <x_4(outer2), ...>;
447 ...
448
449 inner:
450 x_2 = PHI <x_1(outer1), ...>;
451 ...
452 x_3 = ...;
453 ...
454
455 outer2:
456 x_4 = PHI <x_3(inner)>;
457 ...
458
459 outer loop analysis would treat x_1 as a double reduction phi and
460 this function would then return true for x_2. */
461
462 static bool
463 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
464 {
465 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
466 use_operand_p use_p;
467 ssa_op_iter op_iter;
468 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
469 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
470 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
471 return true;
472 return false;
473 }
474
475 /* Function vect_analyze_scalar_cycles_1.
476
477 Examine the cross iteration def-use cycles of scalar variables
478 in LOOP. LOOP_VINFO represents the loop that is now being
479 considered for vectorization (can be LOOP, or an outer-loop
480 enclosing LOOP). */
481
482 static void
483 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
484 {
485 basic_block bb = loop->header;
486 tree init, step;
487 auto_vec<stmt_vec_info, 64> worklist;
488 gphi_iterator gsi;
489 bool double_reduc;
490
491 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
492
493 /* First - identify all inductions. Reduction detection assumes that all the
494 inductions have been identified, therefore, this order must not be
495 changed. */
496 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
497 {
498 gphi *phi = gsi.phi ();
499 tree access_fn = NULL;
500 tree def = PHI_RESULT (phi);
501 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
502
503 if (dump_enabled_p ())
504 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
505
506 /* Skip virtual phi's. The data dependences that are associated with
507 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
508 if (virtual_operand_p (def))
509 continue;
510
511 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
512
513 /* Analyze the evolution function. */
514 access_fn = analyze_scalar_evolution (loop, def);
515 if (access_fn)
516 {
517 STRIP_NOPS (access_fn);
518 if (dump_enabled_p ())
519 dump_printf_loc (MSG_NOTE, vect_location,
520 "Access function of PHI: %T\n", access_fn);
521 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
522 = initial_condition_in_loop_num (access_fn, loop->num);
523 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
524 = evolution_part_in_loop_num (access_fn, loop->num);
525 }
526
527 if (!access_fn
528 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
529 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
530 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
531 && TREE_CODE (step) != INTEGER_CST))
532 {
533 worklist.safe_push (stmt_vinfo);
534 continue;
535 }
536
537 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
538 != NULL_TREE);
539 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
540
541 if (dump_enabled_p ())
542 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
543 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
544 }
545
546
547 /* Second - identify all reductions and nested cycles. */
548 while (worklist.length () > 0)
549 {
550 stmt_vec_info stmt_vinfo = worklist.pop ();
551 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
552 tree def = PHI_RESULT (phi);
553
554 if (dump_enabled_p ())
555 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
556
557 gcc_assert (!virtual_operand_p (def)
558 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
559
560 stmt_vec_info reduc_stmt_info
561 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
562 &double_reduc, false);
563 if (reduc_stmt_info)
564 {
565 if (double_reduc)
566 {
567 if (dump_enabled_p ())
568 dump_printf_loc (MSG_NOTE, vect_location,
569 "Detected double reduction.\n");
570
571 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
572 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
573 = vect_double_reduction_def;
574 }
575 else
576 {
577 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
578 {
579 if (dump_enabled_p ())
580 dump_printf_loc (MSG_NOTE, vect_location,
581 "Detected vectorizable nested cycle.\n");
582
583 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
584 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
585 }
586 else
587 {
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_NOTE, vect_location,
590 "Detected reduction.\n");
591
592 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
593 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
594 /* Store the reduction cycles for possible vectorization in
595 loop-aware SLP if it was not detected as a reduction
596 chain. */
597 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
598 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
599 (reduc_stmt_info);
600 }
601 }
602 }
603 else
604 if (dump_enabled_p ())
605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
606 "Unknown def-use cycle pattern.\n");
607 }
608 }
609
610
611 /* Function vect_analyze_scalar_cycles.
612
613 Examine the cross iteration def-use cycles of scalar variables, by
614 analyzing the loop-header PHIs of scalar variables. Classify each
615 cycle as one of the following: invariant, induction, reduction, unknown.
616 We do that for the loop represented by LOOP_VINFO, and also for its
617 inner loop, if it exists.
618 Examples for scalar cycles:
619
620 Example1: reduction:
621
622 loop1:
623 for (i=0; i<N; i++)
624 sum += a[i];
625
626 Example2: induction:
627
628 loop2:
629 for (i=0; i<N; i++)
630 a[i] = i; */
631
632 static void
633 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
634 {
635 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
636
637 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
638
639 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
640 Reductions in such an inner loop therefore have different properties than
641 the reductions in the nest that gets vectorized:
642 1. When vectorized, they are executed in the same order as in the original
643 scalar loop, so we can't change the order of computation when
644 vectorizing them.
645 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
646 current checks are too strict. */
647
648 if (loop->inner)
649 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
650 }
651
652 /* Transfer group and reduction information from STMT_INFO to its
653 pattern stmt. */
654
655 static void
656 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
657 {
658 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
659 stmt_vec_info stmtp;
660 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
661 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
662 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
663 do
664 {
665 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
666 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
667 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
668 if (stmt_info)
669 REDUC_GROUP_NEXT_ELEMENT (stmtp)
670 = STMT_VINFO_RELATED_STMT (stmt_info);
671 }
672 while (stmt_info);
673 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
674 }
675
676 /* Fixup scalar cycles that now have their stmts detected as patterns. */
677
678 static void
679 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
680 {
681 stmt_vec_info first;
682 unsigned i;
683
684 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
685 if (STMT_VINFO_IN_PATTERN_P (first))
686 {
687 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
688 while (next)
689 {
690 if (! STMT_VINFO_IN_PATTERN_P (next))
691 break;
692 next = REDUC_GROUP_NEXT_ELEMENT (next);
693 }
694 /* If not all stmts in the chain are patterns, try to handle
695 the chain without patterns. */
696 if (! next)
697 {
698 vect_fixup_reduc_chain (first);
699 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
700 = STMT_VINFO_RELATED_STMT (first);
701 }
702 }
703 }
704
705 /* Function vect_get_loop_niters.
706
707 Determine how many iterations the loop executes and place it
708 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
709 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
710 niter information holds in ASSUMPTIONS.
711
712 Return the loop exit condition. */
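/* Illustrative example: for a loop such as

     for (i = 0; i < n; i++)
       a[i] = 0;

   with n > 0, the latch executes n - 1 times, so NUMBER_OF_ITERATIONSM1
   is n - 1 while NUMBER_OF_ITERATIONS (the number of header executions)
   is n.  */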
713
714
715 static gcond *
716 vect_get_loop_niters (struct loop *loop, tree *assumptions,
717 tree *number_of_iterations, tree *number_of_iterationsm1)
718 {
719 edge exit = single_exit (loop);
720 struct tree_niter_desc niter_desc;
721 tree niter_assumptions, niter, may_be_zero;
722 gcond *cond = get_loop_exit_condition (loop);
723
724 *assumptions = boolean_true_node;
725 *number_of_iterationsm1 = chrec_dont_know;
726 *number_of_iterations = chrec_dont_know;
727 DUMP_VECT_SCOPE ("get_loop_niters");
728
729 if (!exit)
730 return cond;
731
732 niter = chrec_dont_know;
733 may_be_zero = NULL_TREE;
734 niter_assumptions = boolean_true_node;
735 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
736 || chrec_contains_undetermined (niter_desc.niter))
737 return cond;
738
739 niter_assumptions = niter_desc.assumptions;
740 may_be_zero = niter_desc.may_be_zero;
741 niter = niter_desc.niter;
742
743 if (may_be_zero && integer_zerop (may_be_zero))
744 may_be_zero = NULL_TREE;
745
746 if (may_be_zero)
747 {
748 if (COMPARISON_CLASS_P (may_be_zero))
749 {
750 /* Try to combine may_be_zero with assumptions, this can simplify
751 computation of niter expression. */
752 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
753 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
754 niter_assumptions,
755 fold_build1 (TRUTH_NOT_EXPR,
756 boolean_type_node,
757 may_be_zero));
758 else
759 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
760 build_int_cst (TREE_TYPE (niter), 0),
761 rewrite_to_non_trapping_overflow (niter));
762
763 may_be_zero = NULL_TREE;
764 }
765 else if (integer_nonzerop (may_be_zero))
766 {
767 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
768 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
769 return cond;
770 }
771 else
772 return cond;
773 }
774
775 *assumptions = niter_assumptions;
776 *number_of_iterationsm1 = niter;
777
778 /* We want the number of loop header executions which is the number
779 of latch executions plus one.
780 ??? For UINT_MAX latch executions this number overflows to zero
781 for loops like do { n++; } while (n != 0); */
782 if (niter && !chrec_contains_undetermined (niter))
783 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
784 build_int_cst (TREE_TYPE (niter), 1));
785 *number_of_iterations = niter;
786
787 return cond;
788 }
789
790 /* Function bb_in_loop_p
791
792 Used as predicate for dfs order traversal of the loop bbs. */
793
794 static bool
795 bb_in_loop_p (const_basic_block bb, const void *data)
796 {
797 const struct loop *const loop = (const struct loop *)data;
798 if (flow_bb_inside_loop_p (loop, bb))
799 return true;
800 return false;
801 }
802
803
804 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
805 stmt_vec_info structs for all the stmts in LOOP_IN. */
806
807 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
808 : vec_info (vec_info::loop, init_cost (loop_in), shared),
809 loop (loop_in),
810 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
811 num_itersm1 (NULL_TREE),
812 num_iters (NULL_TREE),
813 num_iters_unchanged (NULL_TREE),
814 num_iters_assumptions (NULL_TREE),
815 th (0),
816 versioning_threshold (0),
817 vectorization_factor (0),
818 max_vectorization_factor (0),
819 mask_skip_niters (NULL_TREE),
820 mask_compare_type (NULL_TREE),
821 unaligned_dr (NULL),
822 peeling_for_alignment (0),
823 ptr_mask (0),
824 ivexpr_map (NULL),
825 slp_unrolling_factor (1),
826 single_scalar_iteration_cost (0),
827 vectorizable (false),
828 can_fully_mask_p (true),
829 fully_masked_p (false),
830 peeling_for_gaps (false),
831 peeling_for_niter (false),
832 operands_swapped (false),
833 no_data_dependencies (false),
834 has_mask_store (false),
835 scalar_loop (NULL),
836 orig_loop_info (NULL)
837 {
838 /* CHECKME: We want to visit all BBs before their successors (except for
839 latch blocks, for which this assertion wouldn't hold). In the simple
840 case of the loop forms we allow, a dfs order of the BBs would be the same
841 as reversed postorder traversal, so we are safe. */
842
843 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
844 bbs, loop->num_nodes, loop);
845 gcc_assert (nbbs == loop->num_nodes);
846
847 for (unsigned int i = 0; i < nbbs; i++)
848 {
849 basic_block bb = bbs[i];
850 gimple_stmt_iterator si;
851
852 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
853 {
854 gimple *phi = gsi_stmt (si);
855 gimple_set_uid (phi, 0);
856 add_stmt (phi);
857 }
858
859 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
860 {
861 gimple *stmt = gsi_stmt (si);
862 gimple_set_uid (stmt, 0);
863 add_stmt (stmt);
864 }
865 }
866 }
867
868 /* Free all levels of MASKS. */
869
870 void
871 release_vec_loop_masks (vec_loop_masks *masks)
872 {
873 rgroup_masks *rgm;
874 unsigned int i;
875 FOR_EACH_VEC_ELT (*masks, i, rgm)
876 rgm->masks.release ();
877 masks->release ();
878 }
879
880 /* Free all memory used by the _loop_vec_info, as well as all the
881 stmt_vec_info structs of all the stmts in the loop. */
882
883 _loop_vec_info::~_loop_vec_info ()
884 {
885 int nbbs;
886 gimple_stmt_iterator si;
887 int j;
888
889 nbbs = loop->num_nodes;
890 for (j = 0; j < nbbs; j++)
891 {
892 basic_block bb = bbs[j];
893 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
894 {
895 gimple *stmt = gsi_stmt (si);
896
897 /* We may have broken canonical form by moving a constant
898 into RHS1 of a commutative op. Fix such occurrences. */
899 if (operands_swapped && is_gimple_assign (stmt))
900 {
901 enum tree_code code = gimple_assign_rhs_code (stmt);
902
903 if ((code == PLUS_EXPR
904 || code == POINTER_PLUS_EXPR
905 || code == MULT_EXPR)
906 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
907 swap_ssa_operands (stmt,
908 gimple_assign_rhs1_ptr (stmt),
909 gimple_assign_rhs2_ptr (stmt));
910 else if (code == COND_EXPR
911 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
912 {
913 tree cond_expr = gimple_assign_rhs1 (stmt);
914 enum tree_code cond_code = TREE_CODE (cond_expr);
915
916 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
917 {
918 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
919 0));
920 cond_code = invert_tree_comparison (cond_code,
921 honor_nans);
922 if (cond_code != ERROR_MARK)
923 {
924 TREE_SET_CODE (cond_expr, cond_code);
925 swap_ssa_operands (stmt,
926 gimple_assign_rhs2_ptr (stmt),
927 gimple_assign_rhs3_ptr (stmt));
928 }
929 }
930 }
931 }
932 gsi_next (&si);
933 }
934 }
935
936 free (bbs);
937
938 release_vec_loop_masks (&masks);
939 delete ivexpr_map;
940
941 loop->aux = NULL;
942 }
943
944 /* Return an invariant or register for EXPR and emit necessary
945 computations in the LOOP_VINFO loop preheader. */
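/* Illustrative sketch of the behaviour (the expression below is
   hypothetical): asked for something like n_5 * 16, this gimplifies the
   computation into a fresh SSA name, inserts the defining statements on
   the preheader edge, and caches the result so that a later request for
   the same expression reuses the same SSA name.  */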
946
947 tree
948 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
949 {
950 if (is_gimple_reg (expr)
951 || is_gimple_min_invariant (expr))
952 return expr;
953
954 if (! loop_vinfo->ivexpr_map)
955 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
956 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
957 if (! cached)
958 {
959 gimple_seq stmts = NULL;
960 cached = force_gimple_operand (unshare_expr (expr),
961 &stmts, true, NULL_TREE);
962 if (stmts)
963 {
964 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
965 gsi_insert_seq_on_edge_immediate (e, stmts);
966 }
967 }
968 return cached;
969 }
970
971 /* Return true if we can use CMP_TYPE as the comparison type to produce
972 all masks required to mask LOOP_VINFO. */
973
974 static bool
975 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
976 {
977 rgroup_masks *rgm;
978 unsigned int i;
979 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
980 if (rgm->mask_type != NULL_TREE
981 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
982 cmp_type, rgm->mask_type,
983 OPTIMIZE_FOR_SPEED))
984 return false;
985 return true;
986 }
987
988 /* Return the maximum number of scalars per iteration over all the
989 rgroups in LOOP_VINFO. */
990
991 static unsigned int
992 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
993 {
994 unsigned int res = 1;
995 unsigned int i;
996 rgroup_masks *rgm;
997 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
998 res = MAX (res, rgm->max_nscalars_per_iter);
999 return res;
1000 }
1001
1002 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1003 whether we can actually generate the masks required. Return true if so,
1004 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
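/* Worked example (hypothetical numbers): if the loop runs at most 1000
   iterations and the largest rgroup needs 2 scalars per iteration, then
   max_ni is 2000 and min_ni_width is wi::min_precision (2000, UNSIGNED),
   i.e. 11 bits, so the first integer mode of at least 11 bits for which
   WHILE_ULT is supported (e.g. a 16-bit type) can serve as the comparison
   type, although the search continues up to Pmode as noted below.  */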
1005
1006 static bool
1007 vect_verify_full_masking (loop_vec_info loop_vinfo)
1008 {
1009 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1010 unsigned int min_ni_width;
1011
1012 /* Use a normal loop if there are no statements that need masking.
1013 This only happens in rare degenerate cases: it means that the loop
1014 has no loads, no stores, and no live-out values. */
1015 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1016 return false;
1017
1018 /* Get the maximum number of iterations that is representable
1019 in the counter type. */
1020 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1021 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1022
1023 /* Get a more refined estimate for the number of iterations. */
1024 widest_int max_back_edges;
1025 if (max_loop_iterations (loop, &max_back_edges))
1026 max_ni = wi::smin (max_ni, max_back_edges + 1);
1027
1028 /* Account for rgroup masks, in which each bit is replicated N times. */
1029 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1030
1031 /* Work out how many bits we need to represent the limit. */
1032 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1033
1034 /* Find a scalar mode for which WHILE_ULT is supported. */
1035 opt_scalar_int_mode cmp_mode_iter;
1036 tree cmp_type = NULL_TREE;
1037 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1038 {
1039 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1040 if (cmp_bits >= min_ni_width
1041 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1042 {
1043 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1044 if (this_type
1045 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1046 {
1047 /* Although we could stop as soon as we find a valid mode,
1048 it's often better to continue until we hit Pmode, since the
1049 operands to the WHILE are more likely to be reusable in
1050 address calculations. */
1051 cmp_type = this_type;
1052 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1053 break;
1054 }
1055 }
1056 }
1057
1058 if (!cmp_type)
1059 return false;
1060
1061 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1062 return true;
1063 }
1064
1065 /* Calculate the cost of one scalar iteration of the loop. */
1066 static void
1067 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1068 {
1069 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1070 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1071 int nbbs = loop->num_nodes, factor;
1072 int innerloop_iters, i;
1073
1074 /* Gather costs for statements in the scalar loop. */
1075
1076 /* FORNOW. */
1077 innerloop_iters = 1;
1078 if (loop->inner)
1079 innerloop_iters = 50; /* FIXME */
1080
1081 for (i = 0; i < nbbs; i++)
1082 {
1083 gimple_stmt_iterator si;
1084 basic_block bb = bbs[i];
1085
1086 if (bb->loop_father == loop->inner)
1087 factor = innerloop_iters;
1088 else
1089 factor = 1;
1090
1091 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1092 {
1093 gimple *stmt = gsi_stmt (si);
1094 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1095
1096 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1097 continue;
1098
1099 /* Skip stmts that are not vectorized inside the loop. */
1100 if (stmt_info
1101 && !STMT_VINFO_RELEVANT_P (stmt_info)
1102 && (!STMT_VINFO_LIVE_P (stmt_info)
1103 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1104 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1105 continue;
1106
1107 vect_cost_for_stmt kind;
1108 if (STMT_VINFO_DATA_REF (stmt_info))
1109 {
1110 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1111 kind = scalar_load;
1112 else
1113 kind = scalar_store;
1114 }
1115 else
1116 kind = scalar_stmt;
1117
1118 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1119 factor, kind, stmt_info, 0, vect_prologue);
1120 }
1121 }
1122
1123 /* Now accumulate cost. */
1124 void *target_cost_data = init_cost (loop);
1125 stmt_info_for_cost *si;
1126 int j;
1127 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1128 j, si)
1129 (void) add_stmt_cost (target_cost_data, si->count,
1130 si->kind, si->stmt_info, si->misalign,
1131 vect_body);
1132 unsigned dummy, body_cost = 0;
1133 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1134 destroy_cost_data (target_cost_data);
1135 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1136 }
1137
1138
1139 /* Function vect_analyze_loop_form_1.
1140
1141 Verify that certain CFG restrictions hold, including:
1142 - the loop has a pre-header
1143 - the loop has a single entry and exit
1144 - the loop exit condition is simple enough
1145 - the number of iterations can be analyzed, i.e., a countable loop. The
1146 niter could be analyzed under some assumptions. */
1147
1148 bool
1149 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1150 tree *assumptions, tree *number_of_iterationsm1,
1151 tree *number_of_iterations, gcond **inner_loop_cond)
1152 {
1153 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1154
1155 /* Different restrictions apply when we are considering an inner-most loop,
1156 vs. an outer (nested) loop.
1157 (FORNOW. May want to relax some of these restrictions in the future). */
1158
1159 if (!loop->inner)
1160 {
1161 /* Inner-most loop. We currently require that the number of BBs is
1162 exactly 2 (the header and latch). Vectorizable inner-most loops
1163 look like this:
1164
1165 (pre-header)
1166 |
1167 header <--------+
1168 | | |
1169 | +--> latch --+
1170 |
1171 (exit-bb) */
1172
1173 if (loop->num_nodes != 2)
1174 {
1175 if (dump_enabled_p ())
1176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1177 "not vectorized: control flow in loop.\n");
1178 return false;
1179 }
1180
1181 if (empty_block_p (loop->header))
1182 {
1183 if (dump_enabled_p ())
1184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1185 "not vectorized: empty loop.\n");
1186 return false;
1187 }
1188 }
1189 else
1190 {
1191 struct loop *innerloop = loop->inner;
1192 edge entryedge;
1193
1194 /* Nested loop. We currently require that the loop is doubly-nested,
1195 contains a single inner loop, and the number of BBs is exactly 5.
1196 Vectorizable outer-loops look like this:
1197
1198 (pre-header)
1199 |
1200 header <---+
1201 | |
1202 inner-loop |
1203 | |
1204 tail ------+
1205 |
1206 (exit-bb)
1207
1208 The inner-loop has the properties expected of inner-most loops
1209 as described above. */
1210
1211 if ((loop->inner)->inner || (loop->inner)->next)
1212 {
1213 if (dump_enabled_p ())
1214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1215 "not vectorized: multiple nested loops.\n");
1216 return false;
1217 }
1218
1219 if (loop->num_nodes != 5)
1220 {
1221 if (dump_enabled_p ())
1222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1223 "not vectorized: control flow in loop.\n");
1224 return false;
1225 }
1226
1227 entryedge = loop_preheader_edge (innerloop);
1228 if (entryedge->src != loop->header
1229 || !single_exit (innerloop)
1230 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1231 {
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234 "not vectorized: unsupported outerloop form.\n");
1235 return false;
1236 }
1237
1238 /* Analyze the inner-loop. */
1239 tree inner_niterm1, inner_niter, inner_assumptions;
1240 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1241 &inner_assumptions, &inner_niterm1,
1242 &inner_niter, NULL)
1243 /* Don't support analyzing niter under assumptions for inner
1244 loop. */
1245 || !integer_onep (inner_assumptions))
1246 {
1247 if (dump_enabled_p ())
1248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249 "not vectorized: Bad inner loop.\n");
1250 return false;
1251 }
1252
1253 if (!expr_invariant_in_loop_p (loop, inner_niter))
1254 {
1255 if (dump_enabled_p ())
1256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1257 "not vectorized: inner-loop count not"
1258 " invariant.\n");
1259 return false;
1260 }
1261
1262 if (dump_enabled_p ())
1263 dump_printf_loc (MSG_NOTE, vect_location,
1264 "Considering outer-loop vectorization.\n");
1265 }
1266
1267 if (!single_exit (loop)
1268 || EDGE_COUNT (loop->header->preds) != 2)
1269 {
1270 if (dump_enabled_p ())
1271 {
1272 if (!single_exit (loop))
1273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1274 "not vectorized: multiple exits.\n");
1275 else if (EDGE_COUNT (loop->header->preds) != 2)
1276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1277 "not vectorized: too many incoming edges.\n");
1278 }
1279 return false;
1280 }
1281
1282 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1283 that the loop is represented as a do-while (with a proper if-guard
1284 before the loop if needed), where the loop header contains all the
1285 executable statements, and the latch is empty. */
1286 if (!empty_block_p (loop->latch)
1287 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1288 {
1289 if (dump_enabled_p ())
1290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1291 "not vectorized: latch block not empty.\n");
1292 return false;
1293 }
1294
1295 /* Make sure the exit is not abnormal. */
1296 edge e = single_exit (loop);
1297 if (e->flags & EDGE_ABNORMAL)
1298 {
1299 if (dump_enabled_p ())
1300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1301 "not vectorized: abnormal loop exit edge.\n");
1302 return false;
1303 }
1304
1305 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1306 number_of_iterationsm1);
1307 if (!*loop_cond)
1308 {
1309 if (dump_enabled_p ())
1310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1311 "not vectorized: complicated exit condition.\n");
1312 return false;
1313 }
1314
1315 if (integer_zerop (*assumptions)
1316 || !*number_of_iterations
1317 || chrec_contains_undetermined (*number_of_iterations))
1318 {
1319 if (dump_enabled_p ())
1320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1321 "not vectorized: number of iterations cannot be "
1322 "computed.\n");
1323 return false;
1324 }
1325
1326 if (integer_zerop (*number_of_iterations))
1327 {
1328 if (dump_enabled_p ())
1329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1330 "not vectorized: number of iterations = 0.\n");
1331 return false;
1332 }
1333
1334 return true;
1335 }
1336
1337 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1338
1339 loop_vec_info
1340 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1341 {
1342 tree assumptions, number_of_iterations, number_of_iterationsm1;
1343 gcond *loop_cond, *inner_loop_cond = NULL;
1344
1345 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1346 &assumptions, &number_of_iterationsm1,
1347 &number_of_iterations, &inner_loop_cond))
1348 return NULL;
1349
1350 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1351 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1352 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1353 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1354 if (!integer_onep (assumptions))
1355 {
1356 /* We consider vectorizing this loop by versioning it under
1357 some assumptions. In order to do this, we need to clear
1358 existing information computed by scev and niter analyzer. */
1359 scev_reset_htab ();
1360 free_numbers_of_iterations_estimates (loop);
1361 /* Also set a flag for this loop so that subsequent scev and niter
1362 analyses are done under the assumptions. */
1363 loop_constraint_set (loop, LOOP_C_FINITE);
1364 /* Also record the assumptions for versioning. */
1365 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1366 }
1367
1368 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1369 {
1370 if (dump_enabled_p ())
1371 {
1372 dump_printf_loc (MSG_NOTE, vect_location,
1373 "Symbolic number of iterations is ");
1374 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1375 dump_printf (MSG_NOTE, "\n");
1376 }
1377 }
1378
1379 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1380 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1381 if (inner_loop_cond)
1382 {
1383 stmt_vec_info inner_loop_cond_info
1384 = loop_vinfo->lookup_stmt (inner_loop_cond);
1385 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1386 }
1387
1388 gcc_assert (!loop->aux);
1389 loop->aux = loop_vinfo;
1390 return loop_vinfo;
1391 }
1392
1393
1394
1395 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1396 statements, update the vectorization factor. */
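/* Illustrative example (hypothetical factors): if the loop-based
   vectorization factor is 4 but the SLP instances need an unrolling
   factor of 8, the updated vectorization factor is their least common
   multiple, 8; if instead every relevant stmt is covered by SLP, the SLP
   unrolling factor is used directly.  */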
1397
1398 static void
1399 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1400 {
1401 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1402 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1403 int nbbs = loop->num_nodes;
1404 poly_uint64 vectorization_factor;
1405 int i;
1406
1407 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1408
1409 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1410 gcc_assert (known_ne (vectorization_factor, 0U));
1411
1412 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1413 vectorization factor of the loop is the unrolling factor required by
1414 the SLP instances. If that unrolling factor is 1, we say that we
1415 perform pure SLP on the loop - cross-iteration parallelism is not
1416 exploited. */
1417 bool only_slp_in_loop = true;
1418 for (i = 0; i < nbbs; i++)
1419 {
1420 basic_block bb = bbs[i];
1421 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1422 gsi_next (&si))
1423 {
1424 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1425 stmt_info = vect_stmt_to_vectorize (stmt_info);
1426 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1427 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1428 && !PURE_SLP_STMT (stmt_info))
1429 /* STMT needs both SLP and loop-based vectorization. */
1430 only_slp_in_loop = false;
1431 }
1432 }
1433
1434 if (only_slp_in_loop)
1435 {
1436 dump_printf_loc (MSG_NOTE, vect_location,
1437 "Loop contains only SLP stmts\n");
1438 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1439 }
1440 else
1441 {
1442 dump_printf_loc (MSG_NOTE, vect_location,
1443 "Loop contains SLP and non-SLP stmts\n");
1444 /* Both the vectorization factor and unroll factor have the form
1445 current_vector_size * X for some rational X, so they must have
1446 a common multiple. */
1447 vectorization_factor
1448 = force_common_multiple (vectorization_factor,
1449 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1450 }
1451
1452 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1453 if (dump_enabled_p ())
1454 {
1455 dump_printf_loc (MSG_NOTE, vect_location,
1456 "Updating vectorization factor to ");
1457 dump_dec (MSG_NOTE, vectorization_factor);
1458 dump_printf (MSG_NOTE, ".\n");
1459 }
1460 }
1461
1462 /* Return true if STMT_INFO describes a double reduction phi and if
1463 the other phi in the reduction is also relevant for vectorization.
1464 This rejects cases such as:
1465
1466 outer1:
1467 x_1 = PHI <x_3(outer2), ...>;
1468 ...
1469
1470 inner:
1471 x_2 = ...;
1472 ...
1473
1474 outer2:
1475 x_3 = PHI <x_2(inner)>;
1476
1477 if nothing in x_2 or elsewhere makes x_1 relevant. */
1478
1479 static bool
1480 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1481 {
1482 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1483 return false;
1484
1485 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1486 }
1487
1488 /* Function vect_analyze_loop_operations.
1489
1490 Scan the loop stmts and make sure they are all vectorizable. */
1491
1492 static bool
1493 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1494 {
1495 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1496 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1497 int nbbs = loop->num_nodes;
1498 int i;
1499 stmt_vec_info stmt_info;
1500 bool need_to_vectorize = false;
1501 bool ok;
1502
1503 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1504
1505 stmt_vector_for_cost cost_vec;
1506 cost_vec.create (2);
1507
1508 for (i = 0; i < nbbs; i++)
1509 {
1510 basic_block bb = bbs[i];
1511
1512 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1513 gsi_next (&si))
1514 {
1515 gphi *phi = si.phi ();
1516 ok = true;
1517
1518 stmt_info = loop_vinfo->lookup_stmt (phi);
1519 if (dump_enabled_p ())
1520 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1521 if (virtual_operand_p (gimple_phi_result (phi)))
1522 continue;
1523
1524 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1525 (i.e., a phi in the tail of the outer-loop). */
1526 if (! is_loop_header_bb_p (bb))
1527 {
1528 /* FORNOW: we currently don't support the case that these phis
1529 are not used in the outer loop (unless it is a double reduction,
1530 i.e., this phi is vect_reduction_def), because this case
1531 would require us to actually do something here. */
1532 if (STMT_VINFO_LIVE_P (stmt_info)
1533 && !vect_active_double_reduction_p (stmt_info))
1534 {
1535 if (dump_enabled_p ())
1536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1537 "Unsupported loop-closed phi in "
1538 "outer-loop.\n");
1539 return false;
1540 }
1541
1542 /* If PHI is used in the outer loop, we check that its operand
1543 is defined in the inner loop. */
1544 if (STMT_VINFO_RELEVANT_P (stmt_info))
1545 {
1546 tree phi_op;
1547
1548 if (gimple_phi_num_args (phi) != 1)
1549 return false;
1550
1551 phi_op = PHI_ARG_DEF (phi, 0);
1552 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1553 if (!op_def_info)
1554 return false;
1555
1556 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1557 && (STMT_VINFO_RELEVANT (op_def_info)
1558 != vect_used_in_outer_by_reduction))
1559 return false;
1560 }
1561
1562 continue;
1563 }
1564
1565 gcc_assert (stmt_info);
1566
1567 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1568 || STMT_VINFO_LIVE_P (stmt_info))
1569 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1570 {
1571 /* A scalar-dependence cycle that we don't support. */
1572 if (dump_enabled_p ())
1573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1574 "not vectorized: scalar dependence cycle.\n");
1575 return false;
1576 }
1577
1578 if (STMT_VINFO_RELEVANT_P (stmt_info))
1579 {
1580 need_to_vectorize = true;
1581 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1582 && ! PURE_SLP_STMT (stmt_info))
1583 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1584 &cost_vec);
1585 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1586 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1587 && ! PURE_SLP_STMT (stmt_info))
1588 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1589 &cost_vec);
1590 }
1591
1592 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1593 if (ok
1594 && STMT_VINFO_LIVE_P (stmt_info)
1595 && !PURE_SLP_STMT (stmt_info))
1596 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1597 &cost_vec);
1598
1599 if (!ok)
1600 {
1601 if (dump_enabled_p ())
1602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1603 "not vectorized: relevant phi not "
1604 "supported: %G", phi);
1605 return false;
1606 }
1607 }
1608
1609 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1610 gsi_next (&si))
1611 {
1612 gimple *stmt = gsi_stmt (si);
1613 if (!gimple_clobber_p (stmt)
1614 && !vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1615 &need_to_vectorize,
1616 NULL, NULL, &cost_vec))
1617 return false;
1618 }
1619 } /* bbs */
1620
1621 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1622 cost_vec.release ();
1623
1624 /* All operations in the loop are either irrelevant (deal with loop
1625 control, or dead), or only used outside the loop and can be moved
1626 out of the loop (e.g. invariants, inductions). The loop can be
1627 optimized away by scalar optimizations. We're better off not
1628 touching this loop. */
1629 if (!need_to_vectorize)
1630 {
1631 if (dump_enabled_p ())
1632 dump_printf_loc (MSG_NOTE, vect_location,
1633 "All the computation can be taken out of the loop.\n");
1634 if (dump_enabled_p ())
1635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1636 "not vectorized: redundant loop. no profit to "
1637 "vectorize.\n");
1638 return false;
1639 }
1640
1641 return true;
1642 }
1643
1644 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1645 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1646 definitely no, or -1 if it's worth retrying. */
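/* Worked example (hypothetical numbers): with --param
   min-vect-loop-bound=2, an assumed vectorization factor of 4 and a
   computed min_profitable_iters of 12, the threshold becomes
   MAX (2 * 4, 12) = 12, so a loop whose known iteration count is below
   12 is rejected as not profitable.  */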
1647
1648 static int
1649 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1650 {
1651 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1652 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1653
1654 /* Only fully-masked loops can have iteration counts less than the
1655 vectorization factor. */
1656 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1657 {
1658 HOST_WIDE_INT max_niter;
1659
1660 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1661 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1662 else
1663 max_niter = max_stmt_executions_int (loop);
1664
1665 if (max_niter != -1
1666 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1667 {
1668 if (dump_enabled_p ())
1669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1670 "not vectorized: iteration count smaller than "
1671 "vectorization factor.\n");
1672 return 0;
1673 }
1674 }
1675
1676 int min_profitable_iters, min_profitable_estimate;
1677 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1678 &min_profitable_estimate);
1679
1680 if (min_profitable_iters < 0)
1681 {
1682 if (dump_enabled_p ())
1683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1684 "not vectorized: vectorization not profitable.\n");
1685 if (dump_enabled_p ())
1686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1687 "not vectorized: vector version will never be "
1688 "profitable.\n");
1689 return -1;
1690 }
1691
1692 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1693 * assumed_vf);
1694
1695 /* Use the cost model only if it is more conservative than the
1696 user-specified threshold. */
1697 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1698 min_profitable_iters);
1699
1700 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1701
1702 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1703 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1704 {
1705 if (dump_enabled_p ())
1706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1707 "not vectorized: vectorization not profitable.\n");
1708 if (dump_enabled_p ())
1709 dump_printf_loc (MSG_NOTE, vect_location,
1710 "not vectorized: iteration count smaller than user "
1711 "specified loop bound parameter or minimum profitable "
1712 "iterations (whichever is more conservative).\n");
1713 return 0;
1714 }
1715
1716 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1717 if (estimated_niter == -1)
1718 estimated_niter = likely_max_stmt_executions_int (loop);
1719 if (estimated_niter != -1
1720 && ((unsigned HOST_WIDE_INT) estimated_niter
1721 < MAX (th, (unsigned) min_profitable_estimate)))
1722 {
1723 if (dump_enabled_p ())
1724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1725 "not vectorized: estimated iteration count too "
1726 "small.\n");
1727 if (dump_enabled_p ())
1728 dump_printf_loc (MSG_NOTE, vect_location,
1729 "not vectorized: estimated iteration count smaller "
1730 "than specified loop bound parameter or minimum "
1731 "profitable iterations (whichever is more "
1732 "conservative).\n");
1733 return -1;
1734 }
1735
1736 return 1;
1737 }
1738
1739 static bool
1740 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1741 vec<data_reference_p> *datarefs,
1742 unsigned int *n_stmts)
1743 {
1744 *n_stmts = 0;
1745 for (unsigned i = 0; i < loop->num_nodes; i++)
1746 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1747 !gsi_end_p (gsi); gsi_next (&gsi))
1748 {
1749 gimple *stmt = gsi_stmt (gsi);
1750 if (is_gimple_debug (stmt))
1751 continue;
1752 ++(*n_stmts);
1753 if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1754 {
1755 if (is_gimple_call (stmt) && loop->safelen)
1756 {
1757 tree fndecl = gimple_call_fndecl (stmt), op;
1758 if (fndecl != NULL_TREE)
1759 {
1760 cgraph_node *node = cgraph_node::get (fndecl);
1761 if (node != NULL && node->simd_clones != NULL)
1762 {
1763 unsigned int j, n = gimple_call_num_args (stmt);
1764 for (j = 0; j < n; j++)
1765 {
1766 op = gimple_call_arg (stmt, j);
1767 if (DECL_P (op)
1768 || (REFERENCE_CLASS_P (op)
1769 && get_base_address (op)))
1770 break;
1771 }
1772 op = gimple_call_lhs (stmt);
1773 /* Ignore #pragma omp declare simd functions
1774 if they don't have data references in the
1775 call stmt itself. */
1776 if (j == n
1777 && !(op
1778 && (DECL_P (op)
1779 || (REFERENCE_CLASS_P (op)
1780 && get_base_address (op)))))
1781 continue;
1782 }
1783 }
1784 }
1785 return false;
1786 }
1787 /* If dependence analysis will give up due to the limit on the
1788 number of datarefs, stop here and fail fatally. */
1789 if (datarefs->length ()
1790 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1791 return false;
1792 }
1793 return true;
1794 }
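/* Illustrative source-level example (not from the original sources) of
   the "#pragma omp declare simd" special case handled above:

       #pragma omp declare simd
       extern int f (int);

       #pragma omp simd
       for (int i = 0; i < n; i++)
	 a[i] = f (b[i]);

   After gimplification the load from b[i] and the store to a[i] are
   separate statements, so the call statement itself has only SSA
   operands and no data reference; because the simd pragma sets
   loop->safelen and the declare simd pragma typically creates simd
   clones, the call is skipped rather than making the whole loop
   unanalyzable.  */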
1795
1796 /* Function vect_analyze_loop_2.
1797
1798 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1799 for it. The different analyses will record information in the
1800 loop_vec_info struct. */
1801 static bool
1802 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1803 {
1804 bool ok;
1805 int res;
1806 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1807 poly_uint64 min_vf = 2;
1808
1809 /* The first group of checks is independent of the vector size. */
1810 fatal = true;
1811
1812 /* Find all data references in the loop (which correspond to vdefs/vuses)
1813 and analyze their evolution in the loop. */
1814
1815 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1816
1817 /* Gather the data references and count stmts in the loop. */
1818 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1819 {
1820 if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1821 &LOOP_VINFO_DATAREFS (loop_vinfo),
1822 n_stmts))
1823 {
1824 if (dump_enabled_p ())
1825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826 "not vectorized: loop contains function "
1827 "calls or data references that cannot "
1828 "be analyzed\n");
1829 return false;
1830 }
1831 loop_vinfo->shared->save_datarefs ();
1832 }
1833 else
1834 loop_vinfo->shared->check_datarefs ();
1835
1836 /* Analyze the data references and also adjust the minimal
1837 vectorization factor according to the loads and stores. */
1838
1839 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1840 if (!ok)
1841 {
1842 if (dump_enabled_p ())
1843 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1844 "bad data references.\n");
1845 return false;
1846 }
1847
1848 /* Classify all cross-iteration scalar data-flow cycles.
1849 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1850 vect_analyze_scalar_cycles (loop_vinfo);
1851
1852 vect_pattern_recog (loop_vinfo);
1853
1854 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1855
1856 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1857 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1858
1859 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1860 if (!ok)
1861 {
1862 if (dump_enabled_p ())
1863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1864 "bad data access.\n");
1865 return false;
1866 }
1867
1868 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1869
1870 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1871 if (!ok)
1872 {
1873 if (dump_enabled_p ())
1874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1875 "unexpected pattern.\n");
1876 return false;
1877 }
1878
1879 /* The rest of the analysis below depends on the vector size in some way, so from here on a failure is not necessarily fatal. */
1880 fatal = false;
1881
1882 /* Analyze data dependences between the data-refs in the loop
1883 and adjust the maximum vectorization factor according to
1884 the dependences.
1885 FORNOW: fail at the first data dependence that we encounter. */
1886
1887 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1888 if (!ok
1889 || (max_vf != MAX_VECTORIZATION_FACTOR
1890 && maybe_lt (max_vf, min_vf)))
1891 {
1892 if (dump_enabled_p ())
1893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1894 "bad data dependence.\n");
1895 return false;
1896 }
1897 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1898
1899 ok = vect_determine_vectorization_factor (loop_vinfo);
1900 if (!ok)
1901 {
1902 if (dump_enabled_p ())
1903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1904 "can't determine vectorization factor.\n");
1905 return false;
1906 }
1907 if (max_vf != MAX_VECTORIZATION_FACTOR
1908 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1909 {
1910 if (dump_enabled_p ())
1911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1912 "bad data dependence.\n");
1913 return false;
1914 }
1915
1916 /* Compute the scalar iteration cost. */
1917 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1918
1919 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1920 unsigned th;
1921
1922 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1923 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1924 if (!ok)
1925 return false;
1926
1927 /* If there are any SLP instances mark them as pure_slp. */
1928 bool slp = vect_make_slp_decision (loop_vinfo);
1929 if (slp)
1930 {
1931 /* Find stmts that need to be both vectorized and SLPed. */
1932 vect_detect_hybrid_slp (loop_vinfo);
1933
1934 /* Update the vectorization factor based on the SLP decision. */
1935 vect_update_vf_for_slp (loop_vinfo);
1936 }
1937
1938 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1939
1940 /* We don't expect to have to roll back to anything other than an empty
1941 set of rgroups. */
1942 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1943
1944 /* This is the point where we can re-start analysis with SLP forced off. */
1945 start_over:
1946
1947 /* Now the vectorization factor is final. */
1948 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1949 gcc_assert (known_ne (vectorization_factor, 0U));
1950
1951 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1952 {
1953 dump_printf_loc (MSG_NOTE, vect_location,
1954 "vectorization_factor = ");
1955 dump_dec (MSG_NOTE, vectorization_factor);
1956 dump_printf (MSG_NOTE, ", niters = %wd\n",
1957 LOOP_VINFO_INT_NITERS (loop_vinfo));
1958 }
1959
1960 HOST_WIDE_INT max_niter
1961 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1962
1963 /* Analyze the alignment of the data-refs in the loop.
1964 Fail if a data reference is found that cannot be vectorized. */
1965
1966 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1967 if (!ok)
1968 {
1969 if (dump_enabled_p ())
1970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1971 "bad data alignment.\n");
1972 return false;
1973 }
1974
1975 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1976 It is important to call pruning after vect_analyze_data_ref_accesses,
1977 since we use grouping information gathered by interleaving analysis. */
1978 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1979 if (!ok)
1980 return false;
1981
1982 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1983 vectorization, since we do not want to add extra peeling or
1984 add versioning for alignment. */
1985 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1986 /* This pass will decide on using loop versioning and/or loop peeling in
1987 order to enhance the alignment of data references in the loop. */
1988 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1989 else
1990 ok = vect_verify_datarefs_alignment (loop_vinfo);
1991 if (!ok)
1992 {
1993 if (dump_enabled_p ())
1994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1995 "bad data alignment.\n");
1996 return false;
1997 }
1998
1999 if (slp)
2000 {
2001 /* Analyze operations in the SLP instances. Note this may
2002 remove unsupported SLP instances which makes the above
2003 SLP kind detection invalid. */
2004 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2005 vect_slp_analyze_operations (loop_vinfo);
2006 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2007 goto again;
2008 }
2009
2010 /* Scan all the remaining operations in the loop that are not subject
2011 to SLP and make sure they are vectorizable. */
2012 ok = vect_analyze_loop_operations (loop_vinfo);
2013 if (!ok)
2014 {
2015 if (dump_enabled_p ())
2016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2017 "bad operation or unsupported loop bound.\n");
2018 return false;
2019 }
2020
2021 /* Decide whether to use a fully-masked loop for this vectorization
2022 factor. */
2023 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2024 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2025 && vect_verify_full_masking (loop_vinfo));
2026 if (dump_enabled_p ())
2027 {
2028 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2029 dump_printf_loc (MSG_NOTE, vect_location,
2030 "using a fully-masked loop.\n");
2031 else
2032 dump_printf_loc (MSG_NOTE, vect_location,
2033 "not using a fully-masked loop.\n");
2034 }
2035
2036 /* If epilog loop is required because of data accesses with gaps,
2037 one additional iteration needs to be peeled. Check if there is
2038 enough iterations for vectorization. */
2039 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2040 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2041 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2042 {
2043 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2044 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2045
2046 if (known_lt (wi::to_widest (scalar_niters), vf))
2047 {
2048 if (dump_enabled_p ())
2049 dump_printf_loc (MSG_NOTE, vect_location,
2050 "loop has no enough iterations to support"
2051 " peeling for gaps.\n");
2052 return false;
2053 }
2054 }
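 /* Worked example (made-up numbers) for the check above: with VF = 4
    and a known iteration count of 4, NITERSM1 = 3 is less than VF, so
    after peeling one iteration for the gap only three scalar iterations
    would remain, not enough for a single vector iteration, and the
    loop is rejected.  */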
2055
2056 /* Check the costings of the loop make vectorizing worthwhile. */
2057 res = vect_analyze_loop_costing (loop_vinfo);
2058 if (res < 0)
2059 goto again;
2060 if (!res)
2061 {
2062 if (dump_enabled_p ())
2063 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2064 "Loop costings not worthwhile.\n");
2065 return false;
2066 }
2067
2068 /* Decide whether we need to create an epilogue loop to handle
2069 remaining scalar iterations. */
2070 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2071
2072 unsigned HOST_WIDE_INT const_vf;
2073 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2074 /* The main loop handles all iterations. */
2075 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2076 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2077 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2078 {
2079 /* Work out the (constant) number of iterations that need to be
2080 peeled for reasons other than niters. */
2081 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2082 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2083 peel_niter += 1;
2084 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2085 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2086 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2087 }
2088 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2089 /* ??? When peeling for gaps but not alignment, we could
2090 try to check whether the (variable) niters is known to be
2091 VF * N + 1. That's something of a niche case though. */
2092 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2093 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2094 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2095 < (unsigned) exact_log2 (const_vf))
2096 /* In case of versioning, check if the maximum number of
2097 iterations is greater than th. If they are identical,
2098 the epilogue is unnecessary. */
2099 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2100 || ((unsigned HOST_WIDE_INT) max_niter
2101 > (th / const_vf) * const_vf))))
2102 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
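 /* Worked example (made-up numbers) for the constant-niters case above:
    with niters = 100, VF = 8, three iterations peeled for alignment and
    none for gaps, peel_niter = 3 and (100 - 3) % 8 != 0, so an epilogue
    is needed and PEELING_FOR_NITER is set; with niters = 67 instead,
    67 - 3 = 64 is a multiple of 8 and no epilogue is required on this
    account.  */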
2103
2104 /* If an epilogue loop is required make sure we can create one. */
2105 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2106 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2107 {
2108 if (dump_enabled_p ())
2109 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2110 if (!vect_can_advance_ivs_p (loop_vinfo)
2111 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2112 single_exit (LOOP_VINFO_LOOP
2113 (loop_vinfo))))
2114 {
2115 if (dump_enabled_p ())
2116 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2117 "not vectorized: can't create required "
2118 "epilog loop\n");
2119 goto again;
2120 }
2121 }
2122
2123 /* During peeling, we need to check if number of loop iterations is
2124 enough for both peeled prolog loop and vector loop. This check
2125 can be merged along with threshold check of loop versioning, so
2126 increase threshold for this case if necessary. */
2127 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2128 {
2129 poly_uint64 niters_th = 0;
2130
2131 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2132 {
2133 /* Niters for peeled prolog loop. */
2134 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2135 {
2136 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2137 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2138 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2139 }
2140 else
2141 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2142 }
2143
2144 /* Niters for at least one iteration of vectorized loop. */
2145 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2146 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2147 /* One additional iteration because of peeling for gap. */
2148 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2149 niters_th += 1;
2150 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2151 }
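 /* Worked example (made-up numbers) for the threshold computed above:
    with a VF of 4, unknown peeling for alignment (so up to
    TYPE_VECTOR_SUBPARTS - 1 = 3 prologue iterations) and peeling for
    gaps, niters_th = 3 + 4 + 1 = 8, i.e. roughly eight scalar
    iterations are needed before the versioned vector path pays off.  */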
2152
2153 gcc_assert (known_eq (vectorization_factor,
2154 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2155
2156 /* Ok to vectorize! */
2157 return true;
2158
2159 again:
2160 /* Try again with SLP forced off but if we didn't do any SLP there is
2161 no point in re-trying. */
2162 if (!slp)
2163 return false;
2164
2165 /* If there are reduction chains re-trying will fail anyway. */
2166 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2167 return false;
2168
2169 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2170 via interleaving or lane instructions. */
2171 slp_instance instance;
2172 slp_tree node;
2173 unsigned i, j;
2174 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2175 {
2176 stmt_vec_info vinfo;
2177 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2178 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2179 continue;
2180 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2181 unsigned int size = DR_GROUP_SIZE (vinfo);
2182 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2183 if (! vect_store_lanes_supported (vectype, size, false)
2184 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2185 && ! vect_grouped_store_supported (vectype, size))
2186 return false;
2187 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2188 {
2189 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2190 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2191 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2192 size = DR_GROUP_SIZE (vinfo);
2193 vectype = STMT_VINFO_VECTYPE (vinfo);
2194 if (! vect_load_lanes_supported (vectype, size, false)
2195 && ! vect_grouped_load_supported (vectype, single_element_p,
2196 size))
2197 return false;
2198 }
2199 }
2200
2201 if (dump_enabled_p ())
2202 dump_printf_loc (MSG_NOTE, vect_location,
2203 "re-trying with SLP disabled\n");
2204
2205 /* Roll back state appropriately. No SLP this time. */
2206 slp = false;
2207 /* Restore the vectorization factor to what it was without SLP. */
2208 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2209 /* Free the SLP instances. */
2210 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2211 vect_free_slp_instance (instance, false);
2212 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2213 /* Reset SLP type to loop_vect on all stmts. */
2214 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2215 {
2216 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2217 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2218 !gsi_end_p (si); gsi_next (&si))
2219 {
2220 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2221 STMT_SLP_TYPE (stmt_info) = loop_vect;
2222 }
2223 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2224 !gsi_end_p (si); gsi_next (&si))
2225 {
2226 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2227 STMT_SLP_TYPE (stmt_info) = loop_vect;
2228 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2229 {
2230 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2231 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2232 STMT_SLP_TYPE (stmt_info) = loop_vect;
2233 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2234 !gsi_end_p (pi); gsi_next (&pi))
2235 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2236 = loop_vect;
2237 }
2238 }
2239 }
2240 /* Free optimized alias test DDRS. */
2241 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2242 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2243 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2244 /* Reset target cost data. */
2245 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2246 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2247 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2248 /* Reset accumulated rgroup information. */
2249 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2250 /* Reset assorted flags. */
2251 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2252 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2253 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2254 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2255 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2256
2257 goto start_over;
2258 }
2259
2260 /* Function vect_analyze_loop.
2261
2262 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2263 for it. The different analyses will record information in the
2264 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2265 be vectorized. */
2266 loop_vec_info
2267 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2268 vec_info_shared *shared)
2269 {
2270 loop_vec_info loop_vinfo;
2271 auto_vector_sizes vector_sizes;
2272
2273 /* Autodetect first vector size we try. */
2274 current_vector_size = 0;
2275 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
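 /* Illustrative note (target-dependent, not from the original sources):
    on x86_64 with AVX-512 enabled this hook typically reports vector
    sizes of 64, 32 and 16 bytes, so if analysis fails for the first
    (autodetected) size, the loop below retries with each remaining
    size before giving up.  */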
2276 unsigned int next_size = 0;
2277
2278 DUMP_VECT_SCOPE ("analyze_loop_nest");
2279
2280 if (loop_outer (loop)
2281 && loop_vec_info_for_loop (loop_outer (loop))
2282 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2283 {
2284 if (dump_enabled_p ())
2285 dump_printf_loc (MSG_NOTE, vect_location,
2286 "outer-loop already vectorized.\n");
2287 return NULL;
2288 }
2289
2290 if (!find_loop_nest (loop, &shared->loop_nest))
2291 {
2292 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2294 "not vectorized: loop nest containing two "
2295 "or more consecutive inner loops cannot be "
2296 "vectorized\n");
2297 return NULL;
2298 }
2299
2300 unsigned n_stmts = 0;
2301 poly_uint64 autodetected_vector_size = 0;
2302 while (1)
2303 {
2304 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2305 loop_vinfo = vect_analyze_loop_form (loop, shared);
2306 if (!loop_vinfo)
2307 {
2308 if (dump_enabled_p ())
2309 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2310 "bad loop form.\n");
2311 return NULL;
2312 }
2313
2314 bool fatal = false;
2315
2316 if (orig_loop_vinfo)
2317 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2318
2319 if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2320 {
2321 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2322
2323 return loop_vinfo;
2324 }
2325
2326 delete loop_vinfo;
2327
2328 if (next_size == 0)
2329 autodetected_vector_size = current_vector_size;
2330
2331 if (next_size < vector_sizes.length ()
2332 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2333 next_size += 1;
2334
2335 if (fatal
2336 || next_size == vector_sizes.length ()
2337 || known_eq (current_vector_size, 0U))
2338 return NULL;
2339
2340 /* Try the next biggest vector size. */
2341 current_vector_size = vector_sizes[next_size++];
2342 if (dump_enabled_p ())
2343 {
2344 dump_printf_loc (MSG_NOTE, vect_location,
2345 "***** Re-trying analysis with "
2346 "vector size ");
2347 dump_dec (MSG_NOTE, current_vector_size);
2348 dump_printf (MSG_NOTE, "\n");
2349 }
2350 }
2351 }
2352
2353 /* Return true if there is an in-order reduction function for CODE, storing
2354 it in *REDUC_FN if so. */
2355
2356 static bool
2357 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2358 {
2359 switch (code)
2360 {
2361 case PLUS_EXPR:
2362 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2363 return true;
2364
2365 default:
2366 return false;
2367 }
2368 }
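/* Illustrative example (not from the original sources): for an
   in-order reduction such as

       double s = 0.0;
       for (int i = 0; i < n; i++)
	 s += a[i];

   compiled without -ffast-math/-fassociative-math, IFN_FOLD_LEFT_PLUS
   accumulates the vector elements strictly in order into the scalar
   accumulator, preserving the original rounding behaviour, instead of
   using a reassociating tree reduction.  */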
2369
2370 /* Function reduction_fn_for_scalar_code
2371
2372 Input:
2373 CODE - tree_code of a reduction operations.
2374
2375 Output:
2376 REDUC_FN - the corresponding internal function to be used to reduce the
2377 vector of partial results into a single scalar result, or IFN_LAST
2378 if the operation is a supported reduction operation, but does not have
2379 such an internal function.
2380
2381 Return FALSE if CODE currently cannot be vectorized as reduction. */
2382
2383 static bool
2384 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2385 {
2386 switch (code)
2387 {
2388 case MAX_EXPR:
2389 *reduc_fn = IFN_REDUC_MAX;
2390 return true;
2391
2392 case MIN_EXPR:
2393 *reduc_fn = IFN_REDUC_MIN;
2394 return true;
2395
2396 case PLUS_EXPR:
2397 *reduc_fn = IFN_REDUC_PLUS;
2398 return true;
2399
2400 case BIT_AND_EXPR:
2401 *reduc_fn = IFN_REDUC_AND;
2402 return true;
2403
2404 case BIT_IOR_EXPR:
2405 *reduc_fn = IFN_REDUC_IOR;
2406 return true;
2407
2408 case BIT_XOR_EXPR:
2409 *reduc_fn = IFN_REDUC_XOR;
2410 return true;
2411
2412 case MULT_EXPR:
2413 case MINUS_EXPR:
2414 *reduc_fn = IFN_LAST;
2415 return true;
2416
2417 default:
2418 return false;
2419 }
2420 }
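/* Illustrative example (not from the original sources): a loop
   computing a maximum, e.g.

       int m = a[0];
       for (int i = 1; i < n; i++)
	 m = m < a[i] ? a[i] : m;

   is typically folded to a MAX_EXPR reduction by earlier passes, and
   the vector of partial maxima produced by the vectorized body is then
   reduced to a single scalar with IFN_REDUC_MAX.  MULT_EXPR and
   MINUS_EXPR return IFN_LAST above: they can still be vectorized as
   reductions, but the final cross-lane reduction is open-coded rather
   than emitted as a single internal function call.  */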
2421
2422 /* If there is a neutral value X such that SLP reduction NODE would not
2423 be affected by the introduction of additional X elements, return that X,
2424 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2425 is true if the SLP statements perform a single reduction, false if each
2426 statement performs an independent reduction. */
2427
2428 static tree
2429 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2430 bool reduc_chain)
2431 {
2432 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2433 stmt_vec_info stmt_vinfo = stmts[0];
2434 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2435 tree scalar_type = TREE_TYPE (vector_type);
2436 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2437 gcc_assert (loop);
2438
2439 switch (code)
2440 {
2441 case WIDEN_SUM_EXPR:
2442 case DOT_PROD_EXPR:
2443 case SAD_EXPR:
2444 case PLUS_EXPR:
2445 case MINUS_EXPR:
2446 case BIT_IOR_EXPR:
2447 case BIT_XOR_EXPR:
2448 return build_zero_cst (scalar_type);
2449
2450 case MULT_EXPR:
2451 return build_one_cst (scalar_type);
2452
2453 case BIT_AND_EXPR:
2454 return build_all_ones_cst (scalar_type);
2455
2456 case MAX_EXPR:
2457 case MIN_EXPR:
2458 /* For MIN/MAX the initial values are neutral. A reduction chain
2459 has only a single initial value, so that value is neutral for
2460 all statements. */
2461 if (reduc_chain)
2462 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2463 loop_preheader_edge (loop));
2464 return NULL_TREE;
2465
2466 default:
2467 return NULL_TREE;
2468 }
2469 }
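/* Illustrative example (not from the original sources): for an SLP
   reduction with three independent accumulators,

       s0 += a[3*i];
       s1 += a[3*i + 1];
       s2 += a[3*i + 2];

   the accumulators may not fill a whole vector; padding the remaining
   lane(s) with the neutral value returned here (0 for PLUS_EXPR, 1 for
   MULT_EXPR, all-ones for BIT_AND_EXPR, ...) guarantees the extra
   elements do not change any of the final results.  */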
2470
2471 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2472 STMT is printed with a message MSG. */
2473
2474 static void
2475 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2476 {
2477 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2478 }
2479
2480 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2481 operation. Return true if the results of DEF_STMT_INFO are something
2482 that can be accumulated by such a reduction. */
2483
2484 static bool
2485 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2486 {
2487 return (is_gimple_assign (def_stmt_info->stmt)
2488 || is_gimple_call (def_stmt_info->stmt)
2489 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2490 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2491 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2492 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2493 }
2494
2495 /* Detect SLP reduction of the form:
2496
2497 #a1 = phi <a5, a0>
2498 a2 = operation (a1)
2499 a3 = operation (a2)
2500 a4 = operation (a3)
2501 a5 = operation (a4)
2502
2503 #a = phi <a5>
2504
2505 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2506 FIRST_STMT is the first reduction stmt in the chain
2507 (a2 = operation (a1)).
2508
2509 Return TRUE if a reduction chain was detected. */
2510
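/* Illustrative source-level form of such a chain (not from the
   original sources):

       for (int i = 0; i < n; i++)
	 s = s + a[2*i] + a[2*i + 1];

   which gimplifies to two chained additions per iteration feeding the
   reduction PHI, matching the a2/a3/... pattern above.  */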
2511 static bool
2512 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2513 gimple *first_stmt)
2514 {
2515 struct loop *loop = (gimple_bb (phi))->loop_father;
2516 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2517 enum tree_code code;
2518 gimple *loop_use_stmt = NULL;
2519 stmt_vec_info use_stmt_info, current_stmt_info = NULL;
2520 tree lhs;
2521 imm_use_iterator imm_iter;
2522 use_operand_p use_p;
2523 int nloop_uses, size = 0, n_out_of_loop_uses;
2524 bool found = false;
2525
2526 if (loop != vect_loop)
2527 return false;
2528
2529 lhs = PHI_RESULT (phi);
2530 code = gimple_assign_rhs_code (first_stmt);
2531 while (1)
2532 {
2533 nloop_uses = 0;
2534 n_out_of_loop_uses = 0;
2535 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2536 {
2537 gimple *use_stmt = USE_STMT (use_p);
2538 if (is_gimple_debug (use_stmt))
2539 continue;
2540
2541 /* Check if we got back to the reduction phi. */
2542 if (use_stmt == phi)
2543 {
2544 loop_use_stmt = use_stmt;
2545 found = true;
2546 break;
2547 }
2548
2549 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2550 {
2551 loop_use_stmt = use_stmt;
2552 nloop_uses++;
2553 }
2554 else
2555 n_out_of_loop_uses++;
2556
2557 /* There can be either a single use in the loop or two uses in
2558 phi nodes. */
2559 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2560 return false;
2561 }
2562
2563 if (found)
2564 break;
2565
2566 /* We reached a statement with no loop uses. */
2567 if (nloop_uses == 0)
2568 return false;
2569
2570 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2571 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2572 return false;
2573
2574 if (!is_gimple_assign (loop_use_stmt)
2575 || code != gimple_assign_rhs_code (loop_use_stmt)
2576 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2577 return false;
2578
2579 /* Insert USE_STMT into reduction chain. */
2580 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2581 if (current_stmt_info)
2582 {
2583 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt_info;
2584 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2585 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2586 }
2587 else
2588 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt_info;
2589
2590 lhs = gimple_assign_lhs (loop_use_stmt);
2591 current_stmt_info = use_stmt_info;
2592 size++;
2593 }
2594
2595 if (!found || loop_use_stmt != phi || size < 2)
2596 return false;
2597
2598 /* Swap the operands, if needed, to make the reduction operand be the second
2599 operand. */
2600 lhs = PHI_RESULT (phi);
2601 stmt_vec_info next_stmt_info = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2602 while (next_stmt_info)
2603 {
2604 gassign *next_stmt = as_a <gassign *> (next_stmt_info->stmt);
2605 if (gimple_assign_rhs2 (next_stmt) == lhs)
2606 {
2607 tree op = gimple_assign_rhs1 (next_stmt);
2608 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2609
2610 /* Check that the other def is either defined in the loop
2611 ("vect_internal_def"), or it's an induction (defined by a
2612 loop-header phi-node). */
2613 if (def_stmt_info
2614 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2615 && vect_valid_reduction_input_p (def_stmt_info))
2616 {
2617 lhs = gimple_assign_lhs (next_stmt);
2618 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2619 continue;
2620 }
2621
2622 return false;
2623 }
2624 else
2625 {
2626 tree op = gimple_assign_rhs2 (next_stmt);
2627 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2628
2629 /* Check that the other def is either defined in the loop
2630 ("vect_internal_def"), or it's an induction (defined by a
2631 loop-header phi-node). */
2632 if (def_stmt_info
2633 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2634 && vect_valid_reduction_input_p (def_stmt_info))
2635 {
2636 if (dump_enabled_p ())
2637 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2638 next_stmt);
2639
2640 swap_ssa_operands (next_stmt,
2641 gimple_assign_rhs1_ptr (next_stmt),
2642 gimple_assign_rhs2_ptr (next_stmt));
2643 update_stmt (next_stmt);
2644
2645 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2646 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2647 }
2648 else
2649 return false;
2650 }
2651
2652 lhs = gimple_assign_lhs (next_stmt);
2653 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2654 }
2655
2656 /* Save the chain for further analysis in SLP detection. */
2657 stmt_vec_info first_stmt_info
2658 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2659 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first_stmt_info);
2660 REDUC_GROUP_SIZE (first_stmt_info) = size;
2661
2662 return true;
2663 }
2664
2665 /* Return true if we need an in-order reduction for operation CODE
2666 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2667 overflow must wrap. */
2668
2669 static bool
2670 needs_fold_left_reduction_p (tree type, tree_code code,
2671 bool need_wrapping_integral_overflow)
2672 {
2673 /* CHECKME: check for !flag_finite_math_only too? */
2674 if (SCALAR_FLOAT_TYPE_P (type))
2675 switch (code)
2676 {
2677 case MIN_EXPR:
2678 case MAX_EXPR:
2679 return false;
2680
2681 default:
2682 return !flag_associative_math;
2683 }
2684
2685 if (INTEGRAL_TYPE_P (type))
2686 {
2687 if (!operation_no_trapping_overflow (type, code))
2688 return true;
2689 if (need_wrapping_integral_overflow
2690 && !TYPE_OVERFLOW_WRAPS (type)
2691 && operation_can_overflow (code))
2692 return true;
2693 return false;
2694 }
2695
2696 if (SAT_FIXED_POINT_TYPE_P (type))
2697 return true;
2698
2699 return false;
2700 }
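/* Illustrative examples (not from the original sources) of the
   decision above: a float accumulation

       float s = 0.0f;
       for (int i = 0; i < n; i++)
	 s += a[i];

   needs an in-order (fold-left) reduction unless -fassociative-math
   allows reassociation; a signed sum compiled with -ftrapv likewise
   does, because overflow must trap in the original evaluation order,
   whereas an ordinary unsigned (wrapping) sum does not.  */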
2701
2702 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2703 reduction operation CODE has a handled computation expression. */
2704
2705 bool
2706 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2707 tree loop_arg, enum tree_code code)
2708 {
2709 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2710 auto_bitmap visited;
2711 tree lookfor = PHI_RESULT (phi);
2712 ssa_op_iter curri;
2713 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2714 while (USE_FROM_PTR (curr) != loop_arg)
2715 curr = op_iter_next_use (&curri);
2716 curri.i = curri.numops;
2717 do
2718 {
2719 path.safe_push (std::make_pair (curri, curr));
2720 tree use = USE_FROM_PTR (curr);
2721 if (use == lookfor)
2722 break;
2723 gimple *def = SSA_NAME_DEF_STMT (use);
2724 if (gimple_nop_p (def)
2725 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2726 {
2727 pop:
2728 do
2729 {
2730 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2731 curri = x.first;
2732 curr = x.second;
2733 do
2734 curr = op_iter_next_use (&curri);
2735 /* Skip already visited or non-SSA operands (from iterating
2736 over PHI args). */
2737 while (curr != NULL_USE_OPERAND_P
2738 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2739 || ! bitmap_set_bit (visited,
2740 SSA_NAME_VERSION
2741 (USE_FROM_PTR (curr)))));
2742 }
2743 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2744 if (curr == NULL_USE_OPERAND_P)
2745 break;
2746 }
2747 else
2748 {
2749 if (gimple_code (def) == GIMPLE_PHI)
2750 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2751 else
2752 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2753 while (curr != NULL_USE_OPERAND_P
2754 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2755 || ! bitmap_set_bit (visited,
2756 SSA_NAME_VERSION
2757 (USE_FROM_PTR (curr)))))
2758 curr = op_iter_next_use (&curri);
2759 if (curr == NULL_USE_OPERAND_P)
2760 goto pop;
2761 }
2762 }
2763 while (1);
2764 if (dump_file && (dump_flags & TDF_DETAILS))
2765 {
2766 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2767 unsigned i;
2768 std::pair<ssa_op_iter, use_operand_p> *x;
2769 FOR_EACH_VEC_ELT (path, i, x)
2770 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2771 dump_printf (MSG_NOTE, "\n");
2772 }
2773
2774 /* Check whether the reduction path detected is valid. */
2775 bool fail = path.length () == 0;
2776 bool neg = false;
2777 for (unsigned i = 1; i < path.length (); ++i)
2778 {
2779 gimple *use_stmt = USE_STMT (path[i].second);
2780 tree op = USE_FROM_PTR (path[i].second);
2781 if (! has_single_use (op)
2782 || ! is_gimple_assign (use_stmt))
2783 {
2784 fail = true;
2785 break;
2786 }
2787 if (gimple_assign_rhs_code (use_stmt) != code)
2788 {
2789 if (code == PLUS_EXPR
2790 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2791 {
2792 /* Track whether we negate the reduction value each iteration. */
2793 if (gimple_assign_rhs2 (use_stmt) == op)
2794 neg = ! neg;
2795 }
2796 else
2797 {
2798 fail = true;
2799 break;
2800 }
2801 }
2802 }
2803 return ! fail && ! neg;
2804 }
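/* Illustrative example (not from the original sources) of a path the
   function above accepts for code == PLUS_EXPR:

       for (int i = 0; i < n; i++)
	 res = res + a[i] - b[i];

   The MINUS_EXPR on the path is allowed because subtracting b[i] does
   not negate the running reduction value itself; a statement such as
   "res = a[i] - res", which negates it every iteration, makes the
   path invalid.  */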
2805
2806
2807 /* Function vect_is_simple_reduction
2808
2809 (1) Detect a cross-iteration def-use cycle that represents a simple
2810 reduction computation. We look for the following pattern:
2811
2812 loop_header:
2813 a1 = phi < a0, a2 >
2814 a3 = ...
2815 a2 = operation (a3, a1)
2816
2817 or
2818
2819 a3 = ...
2820 loop_header:
2821 a1 = phi < a0, a2 >
2822 a2 = operation (a3, a1)
2823
2824 such that:
2825 1. operation is commutative and associative and it is safe to
2826 change the order of the computation
2827 2. no uses for a2 in the loop (a2 is used out of the loop)
2828 3. no uses of a1 in the loop besides the reduction operation
2829 4. no uses of a1 outside the loop.
2830
2831 Conditions 1,4 are tested here.
2832 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2833
2834 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2835 nested cycles.
2836
2837 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2838 reductions:
2839
2840 a1 = phi < a0, a2 >
2841 inner loop (def of a3)
2842 a2 = phi < a3 >
2843
2844 (4) Detect condition expressions, i.e.:
2845 for (int i = 0; i < N; i++)
2846 if (a[i] < val)
2847 ret_val = a[i];
2848
2849 */
2850
2851 static stmt_vec_info
2852 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2853 bool *double_reduc,
2854 bool need_wrapping_integral_overflow,
2855 enum vect_reduction_type *v_reduc_type)
2856 {
2857 gphi *phi = as_a <gphi *> (phi_info->stmt);
2858 struct loop *loop = (gimple_bb (phi))->loop_father;
2859 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2860 gimple *phi_use_stmt = NULL;
2861 enum tree_code orig_code, code;
2862 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2863 tree type;
2864 int nloop_uses;
2865 tree name;
2866 imm_use_iterator imm_iter;
2867 use_operand_p use_p;
2868 bool phi_def;
2869
2870 *double_reduc = false;
2871 *v_reduc_type = TREE_CODE_REDUCTION;
2872
2873 tree phi_name = PHI_RESULT (phi);
2874 /* ??? If there are no uses of the PHI result the inner loop reduction
2875 won't be detected as possibly double-reduction by vectorizable_reduction
2876 because that tries to walk the PHI arg from the preheader edge which
2877 can be constant. See PR60382. */
2878 if (has_zero_uses (phi_name))
2879 return NULL;
2880 nloop_uses = 0;
2881 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2882 {
2883 gimple *use_stmt = USE_STMT (use_p);
2884 if (is_gimple_debug (use_stmt))
2885 continue;
2886
2887 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2888 {
2889 if (dump_enabled_p ())
2890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2891 "intermediate value used outside loop.\n");
2892
2893 return NULL;
2894 }
2895
2896 nloop_uses++;
2897 if (nloop_uses > 1)
2898 {
2899 if (dump_enabled_p ())
2900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2901 "reduction value used in loop.\n");
2902 return NULL;
2903 }
2904
2905 phi_use_stmt = use_stmt;
2906 }
2907
2908 edge latch_e = loop_latch_edge (loop);
2909 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2910 if (TREE_CODE (loop_arg) != SSA_NAME)
2911 {
2912 if (dump_enabled_p ())
2913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2914 "reduction: not ssa_name: %T\n", loop_arg);
2915 return NULL;
2916 }
2917
2918 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2919 if (!def_stmt_info
2920 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2921 return NULL;
2922
2923 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2924 {
2925 name = gimple_assign_lhs (def_stmt);
2926 phi_def = false;
2927 }
2928 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2929 {
2930 name = PHI_RESULT (def_stmt);
2931 phi_def = true;
2932 }
2933 else
2934 {
2935 if (dump_enabled_p ())
2936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2937 "reduction: unhandled reduction operation: %G",
2938 def_stmt_info->stmt);
2939 return NULL;
2940 }
2941
2942 nloop_uses = 0;
2943 auto_vec<gphi *, 3> lcphis;
2944 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2945 {
2946 gimple *use_stmt = USE_STMT (use_p);
2947 if (is_gimple_debug (use_stmt))
2948 continue;
2949 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2950 nloop_uses++;
2951 else
2952 /* We can have more than one loop-closed PHI. */
2953 lcphis.safe_push (as_a <gphi *> (use_stmt));
2954 if (nloop_uses > 1)
2955 {
2956 if (dump_enabled_p ())
2957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2958 "reduction used in loop.\n");
2959 return NULL;
2960 }
2961 }
2962
2963 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2964 defined in the inner loop. */
2965 if (phi_def)
2966 {
2967 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2968 op1 = PHI_ARG_DEF (def_stmt, 0);
2969
2970 if (gimple_phi_num_args (def_stmt) != 1
2971 || TREE_CODE (op1) != SSA_NAME)
2972 {
2973 if (dump_enabled_p ())
2974 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2975 "unsupported phi node definition.\n");
2976
2977 return NULL;
2978 }
2979
2980 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2981 if (gimple_bb (def1)
2982 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2983 && loop->inner
2984 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2985 && is_gimple_assign (def1)
2986 && is_a <gphi *> (phi_use_stmt)
2987 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2988 {
2989 if (dump_enabled_p ())
2990 report_vect_op (MSG_NOTE, def_stmt,
2991 "detected double reduction: ");
2992
2993 *double_reduc = true;
2994 return def_stmt_info;
2995 }
2996
2997 return NULL;
2998 }
2999
3000 /* If we are vectorizing an inner reduction we are executing that
3001 in the original order only in case we are not dealing with a
3002 double reduction. */
3003 bool check_reduction = true;
3004 if (flow_loop_nested_p (vect_loop, loop))
3005 {
3006 gphi *lcphi;
3007 unsigned i;
3008 check_reduction = false;
3009 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3010 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3011 {
3012 gimple *use_stmt = USE_STMT (use_p);
3013 if (is_gimple_debug (use_stmt))
3014 continue;
3015 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3016 check_reduction = true;
3017 }
3018 }
3019
3020 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3021 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3022 code = orig_code = gimple_assign_rhs_code (def_stmt);
3023
3024 /* We can handle "res -= x[i]", which is non-associative, by
3025 simply rewriting this into "res += -x[i]". Avoid changing the
3026 gimple instruction during the first simple tests and only do
3027 this if we're allowed to change the code at all. */
3028 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3029 code = PLUS_EXPR;
3030
3031 if (code == COND_EXPR)
3032 {
3033 if (! nested_in_vect_loop)
3034 *v_reduc_type = COND_REDUCTION;
3035
3036 op3 = gimple_assign_rhs1 (def_stmt);
3037 if (COMPARISON_CLASS_P (op3))
3038 {
3039 op4 = TREE_OPERAND (op3, 1);
3040 op3 = TREE_OPERAND (op3, 0);
3041 }
3042 if (op3 == phi_name || op4 == phi_name)
3043 {
3044 if (dump_enabled_p ())
3045 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3046 "reduction: condition depends on previous"
3047 " iteration: ");
3048 return NULL;
3049 }
3050
3051 op1 = gimple_assign_rhs2 (def_stmt);
3052 op2 = gimple_assign_rhs3 (def_stmt);
3053 }
3054 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3055 {
3056 if (dump_enabled_p ())
3057 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3058 "reduction: not commutative/associative: ");
3059 return NULL;
3060 }
3061 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3062 {
3063 op1 = gimple_assign_rhs1 (def_stmt);
3064 op2 = gimple_assign_rhs2 (def_stmt);
3065 }
3066 else
3067 {
3068 if (dump_enabled_p ())
3069 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3070 "reduction: not handled operation: ");
3071 return NULL;
3072 }
3073
3074 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3075 {
3076 if (dump_enabled_p ())
3077 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3078 "reduction: both uses not ssa_names: ");
3079
3080 return NULL;
3081 }
3082
3083 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3084 if ((TREE_CODE (op1) == SSA_NAME
3085 && !types_compatible_p (type,TREE_TYPE (op1)))
3086 || (TREE_CODE (op2) == SSA_NAME
3087 && !types_compatible_p (type, TREE_TYPE (op2)))
3088 || (op3 && TREE_CODE (op3) == SSA_NAME
3089 && !types_compatible_p (type, TREE_TYPE (op3)))
3090 || (op4 && TREE_CODE (op4) == SSA_NAME
3091 && !types_compatible_p (type, TREE_TYPE (op4))))
3092 {
3093 if (dump_enabled_p ())
3094 {
3095 dump_printf_loc (MSG_NOTE, vect_location,
3096 "reduction: multiple types: operation type: "
3097 "%T, operands types: %T,%T",
3098 type, TREE_TYPE (op1), TREE_TYPE (op2));
3099 if (op3)
3100 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3101
3102 if (op4)
3103 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3104 dump_printf (MSG_NOTE, "\n");
3105 }
3106
3107 return NULL;
3108 }
3109
3110 /* Check whether it's ok to change the order of the computation.
3111 Generally, when vectorizing a reduction we change the order of the
3112 computation. This may change the behavior of the program in some
3113 cases, so we need to check that this is ok. One exception is when
3114 vectorizing an outer-loop: the inner-loop is executed sequentially,
3115 and therefore vectorizing reductions in the inner-loop during
3116 outer-loop vectorization is safe. */
3117 if (check_reduction
3118 && *v_reduc_type == TREE_CODE_REDUCTION
3119 && needs_fold_left_reduction_p (type, code,
3120 need_wrapping_integral_overflow))
3121 *v_reduc_type = FOLD_LEFT_REDUCTION;
3122
3123 /* Reduction is safe. We're dealing with one of the following:
3124 1) integer arithmetic and no trapv
3125 2) floating point arithmetic, and special flags permit this optimization
3126 3) nested cycle (i.e., outer loop vectorization). */
3127 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3128 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3129 if (code != COND_EXPR && !def1_info && !def2_info)
3130 {
3131 if (dump_enabled_p ())
3132 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3133 return NULL;
3134 }
3135
3136 /* Check that one def is the reduction def, defined by PHI,
3137 the other def is either defined in the loop ("vect_internal_def"),
3138 or it's an induction (defined by a loop-header phi-node). */
3139
3140 if (def2_info
3141 && def2_info->stmt == phi
3142 && (code == COND_EXPR
3143 || !def1_info
3144 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3145 || vect_valid_reduction_input_p (def1_info)))
3146 {
3147 if (dump_enabled_p ())
3148 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3149 return def_stmt_info;
3150 }
3151
3152 if (def1_info
3153 && def1_info->stmt == phi
3154 && (code == COND_EXPR
3155 || !def2_info
3156 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3157 || vect_valid_reduction_input_p (def2_info)))
3158 {
3159 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3160 {
3161 /* Check if we can swap operands (just for simplicity - so that
3162 the rest of the code can assume that the reduction variable
3163 is always the last (second) argument). */
3164 if (code == COND_EXPR)
3165 {
3166 /* Swap cond_expr by inverting the condition. */
3167 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3168 enum tree_code invert_code = ERROR_MARK;
3169 enum tree_code cond_code = TREE_CODE (cond_expr);
3170
3171 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3172 {
3173 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3174 invert_code = invert_tree_comparison (cond_code, honor_nans);
3175 }
3176 if (invert_code != ERROR_MARK)
3177 {
3178 TREE_SET_CODE (cond_expr, invert_code);
3179 swap_ssa_operands (def_stmt,
3180 gimple_assign_rhs2_ptr (def_stmt),
3181 gimple_assign_rhs3_ptr (def_stmt));
3182 }
3183 else
3184 {
3185 if (dump_enabled_p ())
3186 report_vect_op (MSG_NOTE, def_stmt,
3187 "detected reduction: cannot swap operands "
3188 "for cond_expr");
3189 return NULL;
3190 }
3191 }
3192 else
3193 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3194 gimple_assign_rhs2_ptr (def_stmt));
3195
3196 if (dump_enabled_p ())
3197 report_vect_op (MSG_NOTE, def_stmt,
3198 "detected reduction: need to swap operands: ");
3199
3200 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3201 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3202 }
3203 else
3204 {
3205 if (dump_enabled_p ())
3206 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3207 }
3208
3209 return def_stmt_info;
3210 }
3211
3212 /* Try to find SLP reduction chain. */
3213 if (! nested_in_vect_loop
3214 && code != COND_EXPR
3215 && orig_code != MINUS_EXPR
3216 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3217 {
3218 if (dump_enabled_p ())
3219 report_vect_op (MSG_NOTE, def_stmt,
3220 "reduction: detected reduction chain: ");
3221
3222 return def_stmt_info;
3223 }
3224
3225 /* Dissolve any group that vect_is_slp_reduction may have half-built. */
3226 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (def_stmt_info);
3227 while (first)
3228 {
3229 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
3230 REDUC_GROUP_FIRST_ELEMENT (first) = NULL;
3231 REDUC_GROUP_NEXT_ELEMENT (first) = NULL;
3232 first = next;
3233 }
3234
3235 /* Look for the expression computing loop_arg from loop PHI result. */
3236 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3237 return def_stmt_info;
3238
3239 if (dump_enabled_p ())
3240 {
3241 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3242 "reduction: unknown pattern: ");
3243 }
3244
3245 return NULL;
3246 }
3247
3248 /* Wrapper around vect_is_simple_reduction, which will modify code
3249 in-place if it enables detection of more reductions. Arguments
3250 as there. */
3251
3252 stmt_vec_info
3253 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3254 bool *double_reduc,
3255 bool need_wrapping_integral_overflow)
3256 {
3257 enum vect_reduction_type v_reduc_type;
3258 stmt_vec_info def_info
3259 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3260 need_wrapping_integral_overflow,
3261 &v_reduc_type);
3262 if (def_info)
3263 {
3264 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3265 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3266 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3267 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3268 }
3269 return def_info;
3270 }
3271
3272 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3273 int
3274 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3275 int *peel_iters_epilogue,
3276 stmt_vector_for_cost *scalar_cost_vec,
3277 stmt_vector_for_cost *prologue_cost_vec,
3278 stmt_vector_for_cost *epilogue_cost_vec)
3279 {
3280 int retval = 0;
3281 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3282
3283 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3284 {
3285 *peel_iters_epilogue = assumed_vf / 2;
3286 if (dump_enabled_p ())
3287 dump_printf_loc (MSG_NOTE, vect_location,
3288 "cost model: epilogue peel iters set to vf/2 "
3289 "because loop iterations are unknown .\n");
3290
3291 /* If peeled iterations are known but the number of scalar loop
3292 iterations is unknown, count a taken branch per peeled loop. */
3293 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3294 NULL, 0, vect_prologue);
3295 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3296 NULL, 0, vect_epilogue);
3297 }
3298 else
3299 {
3300 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3301 peel_iters_prologue = niters < peel_iters_prologue ?
3302 niters : peel_iters_prologue;
3303 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3304 /* If we need to peel for gaps, but no peeling is required, we have to
3305 peel VF iterations. */
3306 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3307 *peel_iters_epilogue = assumed_vf;
3308 }
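 /* Worked example (made-up numbers) for the known-niters branch above:
    with niters = 23, peel_iters_prologue = 3 and assumed_vf = 4 the
    epilogue gets (23 - 3) % 4 = 0 iterations; if the loop also needs
    peeling for gaps, the epilogue is then forced to a full VF of 4
    iterations.  */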
3309
3310 stmt_info_for_cost *si;
3311 int j;
3312 if (peel_iters_prologue)
3313 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3314 retval += record_stmt_cost (prologue_cost_vec,
3315 si->count * peel_iters_prologue,
3316 si->kind, si->stmt_info, si->misalign,
3317 vect_prologue);
3318 if (*peel_iters_epilogue)
3319 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3320 retval += record_stmt_cost (epilogue_cost_vec,
3321 si->count * *peel_iters_epilogue,
3322 si->kind, si->stmt_info, si->misalign,
3323 vect_epilogue);
3324
3325 return retval;
3326 }
3327
3328 /* Function vect_estimate_min_profitable_iters
3329
3330 Return the number of iterations required for the vector version of the
3331 loop to be profitable relative to the cost of the scalar version of the
3332 loop.
3333
3334 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3335 of iterations for vectorization. -1 value means loop vectorization
3336 is not profitable. This returned value may be used for dynamic
3337 profitability check.
3338
3339 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3340 for static check against estimated number of iterations. */
3341
3342 static void
3343 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3344 int *ret_min_profitable_niters,
3345 int *ret_min_profitable_estimate)
3346 {
3347 int min_profitable_iters;
3348 int min_profitable_estimate;
3349 int peel_iters_prologue;
3350 int peel_iters_epilogue;
3351 unsigned vec_inside_cost = 0;
3352 int vec_outside_cost = 0;
3353 unsigned vec_prologue_cost = 0;
3354 unsigned vec_epilogue_cost = 0;
3355 int scalar_single_iter_cost = 0;
3356 int scalar_outside_cost = 0;
3357 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3358 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3359 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3360
3361 /* Cost model disabled. */
3362 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3363 {
3364 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3365 *ret_min_profitable_niters = 0;
3366 *ret_min_profitable_estimate = 0;
3367 return;
3368 }
3369
3370 /* Requires loop versioning tests to handle misalignment. */
3371 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3372 {
3373 /* FIXME: Make cost depend on complexity of individual check. */
3374 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3375 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3376 vect_prologue);
3377 dump_printf (MSG_NOTE,
3378 "cost model: Adding cost of checks for loop "
3379 "versioning to treat misalignment.\n");
3380 }
3381
3382 /* Requires loop versioning with alias checks. */
3383 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3384 {
3385 /* FIXME: Make cost depend on complexity of individual check. */
3386 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3387 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3388 vect_prologue);
3389 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3390 if (len)
3391 /* Count LEN - 1 ANDs and LEN comparisons. */
3392 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3393 NULL, 0, vect_prologue);
3394 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3395 if (len)
3396 {
3397 /* Count LEN - 1 ANDs and LEN comparisons. */
3398 unsigned int nstmts = len * 2 - 1;
3399 /* +1 for each bias that needs adding. */
3400 for (unsigned int i = 0; i < len; ++i)
3401 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3402 nstmts += 1;
3403 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3404 NULL, 0, vect_prologue);
3405 }
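 /* Worked example (made-up numbers): three lower-bound checks
    (len = 3), one of them signed, cost len * 2 - 1 = 5 scalar
    statements for the comparisons and ANDs plus one more for the bias
    that needs adding, i.e. nstmts = 6.  */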
3406 dump_printf (MSG_NOTE,
3407 "cost model: Adding cost of checks for loop "
3408 "versioning aliasing.\n");
3409 }
3410
3411 /* Requires loop versioning with niter checks. */
3412 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3413 {
3414 /* FIXME: Make cost depend on complexity of individual check. */
3415 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3416 vect_prologue);
3417 dump_printf (MSG_NOTE,
3418 "cost model: Adding cost of checks for loop "
3419 "versioning niters.\n");
3420 }
3421
3422 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3423 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3424 vect_prologue);
3425
3426 /* Count statements in scalar loop. Using this as scalar cost for a single
3427 iteration for now.
3428
3429 TODO: Add outer loop support.
3430
3431 TODO: Consider assigning different costs to different scalar
3432 statements. */
3433
3434 scalar_single_iter_cost
3435 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3436
3437 /* Add additional cost for the peeled instructions in prologue and epilogue
3438 loop. (For fully-masked loops there will be no peeling.)
3439
3440 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3441 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3442
3443 TODO: Build an expression that represents peel_iters for prologue and
3444 epilogue to be used in a run-time test. */
3445
3446 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3447 {
3448 peel_iters_prologue = 0;
3449 peel_iters_epilogue = 0;
3450
3451 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3452 {
3453 /* We need to peel exactly one iteration. */
3454 peel_iters_epilogue += 1;
3455 stmt_info_for_cost *si;
3456 int j;
3457 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3458 j, si)
3459 (void) add_stmt_cost (target_cost_data, si->count,
3460 si->kind, si->stmt_info, si->misalign,
3461 vect_epilogue);
3462 }
3463 }
3464 else if (npeel < 0)
3465 {
3466 peel_iters_prologue = assumed_vf / 2;
3467 dump_printf (MSG_NOTE, "cost model: "
3468 "prologue peel iters set to vf/2.\n");
3469
3470 /* If peeling for alignment is unknown, the loop bound of the main
3471 loop becomes unknown. */
3472 peel_iters_epilogue = assumed_vf / 2;
3473 dump_printf (MSG_NOTE, "cost model: "
3474 "epilogue peel iters set to vf/2 because "
3475 "peeling for alignment is unknown.\n");
3476
3477 /* If peeled iterations are unknown, count a taken branch and a not taken
3478 branch per peeled loop. Even if scalar loop iterations are known,
3479 vector iterations are not known since peeled prologue iterations are
3480 not known. Hence guards remain the same. */
3481 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3482 NULL, 0, vect_prologue);
3483 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3484 NULL, 0, vect_prologue);
3485 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3486 NULL, 0, vect_epilogue);
3487 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3488 NULL, 0, vect_epilogue);
3489 stmt_info_for_cost *si;
3490 int j;
3491 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3492 {
3493 (void) add_stmt_cost (target_cost_data,
3494 si->count * peel_iters_prologue,
3495 si->kind, si->stmt_info, si->misalign,
3496 vect_prologue);
3497 (void) add_stmt_cost (target_cost_data,
3498 si->count * peel_iters_epilogue,
3499 si->kind, si->stmt_info, si->misalign,
3500 vect_epilogue);
3501 }
3502 }
3503 else
3504 {
3505 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3506 stmt_info_for_cost *si;
3507 int j;
3508 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3509
3510 prologue_cost_vec.create (2);
3511 epilogue_cost_vec.create (2);
3512 peel_iters_prologue = npeel;
3513
3514 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3515 &peel_iters_epilogue,
3516 &LOOP_VINFO_SCALAR_ITERATION_COST
3517 (loop_vinfo),
3518 &prologue_cost_vec,
3519 &epilogue_cost_vec);
3520
3521 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3522 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3523 si->misalign, vect_prologue);
3524
3525 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3526 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3527 si->misalign, vect_epilogue);
3528
3529 prologue_cost_vec.release ();
3530 epilogue_cost_vec.release ();
3531 }
3532
3533 /* FORNOW: The scalar outside cost is incremented in one of the
3534 following ways:
3535
3536 1. The vectorizer checks for alignment and aliasing and generates
3537 a condition that allows dynamic vectorization. A cost model
3538 check is ANDed with the versioning condition. Hence the scalar code
3539 path now has the added cost of the versioning check.
3540
3541 if (cost > th & versioning_check)
3542 jmp to vector code
3543
3544 Hence the run-time scalar cost is incremented by the not-taken branch cost.
3545
3546 2. The vectorizer then checks if a prologue is required. If the
3547 cost model check was not done before during versioning, it has to
3548 be done before the prologue check.
3549
3550 if (cost <= th)
3551 prologue = scalar_iters
3552 if (prologue == 0)
3553 jmp to vector code
3554 else
3555 execute prologue
3556 if (prologue == num_iters)
3557 go to exit
3558
3559 Hence the run-time scalar cost is incremented by a taken branch,
3560 plus a not-taken branch, plus a taken branch cost.
3561
3562 3. The vectorizer then checks if an epilogue is required. If the
3563 cost model check was not done before during prologue check, it
3564 has to be done with the epilogue check.
3565
3566 if (prologue == 0)
3567 jmp to vector code
3568 else
3569 execute prologue
3570 if (prologue == num_iters)
3571 go to exit
3572 vector code:
3573 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3574 jmp to epilogue
3575
3576 Hence the run-time scalar cost should be incremented by 2 taken
3577 branches.
3578
3579 TODO: The back end may reorder the BBs differently and reverse
3580 conditions/branch directions. Change the estimates below to
3581 something more reasonable. */
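/* Illustrative numbers only: assuming the generic default branch costs of
   cond_branch_taken == 3 and cond_branch_not_taken == 1, the increments below
   are +1 when versioning is required, 2*3 + 1 == 7 when peeling for alignment
   is unknown, and 2*3 == 6 otherwise.  The real values come from
   vect_get_stmt_cost and are target-specific. */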
3582
3583 /* If the number of iterations is known and we do not do versioning, we can
3584 decide whether to vectorize at compile time. Hence the scalar version
3585 does not carry cost model guard costs. */
3586 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3587 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3588 {
3589 /* Cost model check occurs at versioning. */
3590 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3591 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3592 else
3593 {
3594 /* Cost model check occurs at prologue generation. */
3595 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3596 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3597 + vect_get_stmt_cost (cond_branch_not_taken);
3598 /* Cost model check occurs at epilogue generation. */
3599 else
3600 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3601 }
3602 }
3603
3604 /* Complete the target-specific cost calculations. */
3605 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3606 &vec_inside_cost, &vec_epilogue_cost);
3607
3608 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3609
3610 if (dump_enabled_p ())
3611 {
3612 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3613 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3614 vec_inside_cost);
3615 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3616 vec_prologue_cost);
3617 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3618 vec_epilogue_cost);
3619 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3620 scalar_single_iter_cost);
3621 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3622 scalar_outside_cost);
3623 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3624 vec_outside_cost);
3625 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3626 peel_iters_prologue);
3627 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3628 peel_iters_epilogue);
3629 }
3630
3631 /* Calculate number of iterations required to make the vector version
3632 profitable, relative to the loop bodies only. The following condition
3633 must hold true:
3634 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3635 where
3636 SIC = scalar iteration cost, VIC = vector iteration cost,
3637 VOC = vector outside cost, VF = vectorization factor,
3638 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3639 SOC = scalar outside cost for run time cost model check. */
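/* Worked example with illustrative numbers (no particular target):
   SIC = 4, VIC = 6, VOC = 20, SOC = 2, VF = 4, PL_ITERS = EP_ITERS = 2.
   The division below gives ((20 - 2) * 4 - 6*2 - 6*2) / (4*4 - 6) = 48/10 = 4,
   which is then bumped to 5 because at 4 iterations 4*4*4 = 64 is still
   <= 6*4 + (20 - 2)*4 = 96, i.e. the scalar version is not yet strictly
   more expensive. */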
3640
3641 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3642 {
3643 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3644 * assumed_vf
3645 - vec_inside_cost * peel_iters_prologue
3646 - vec_inside_cost * peel_iters_epilogue);
3647 if (min_profitable_iters <= 0)
3648 min_profitable_iters = 0;
3649 else
3650 {
3651 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3652 - vec_inside_cost);
3653
3654 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3655 <= (((int) vec_inside_cost * min_profitable_iters)
3656 + (((int) vec_outside_cost - scalar_outside_cost)
3657 * assumed_vf)))
3658 min_profitable_iters++;
3659 }
3660 }
3661 /* The vector version will never be profitable. */
3662 else
3663 {
3664 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3665 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3666 "vectorization did not happen for a simd loop");
3667
3668 if (dump_enabled_p ())
3669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3670 "cost model: the vector iteration cost = %d "
3671 "divided by the scalar iteration cost = %d "
3672 "is greater or equal to the vectorization factor = %d"
3673 ".\n",
3674 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3675 *ret_min_profitable_niters = -1;
3676 *ret_min_profitable_estimate = -1;
3677 return;
3678 }
3679
3680 dump_printf (MSG_NOTE,
3681 " Calculated minimum iters for profitability: %d\n",
3682 min_profitable_iters);
3683
3684 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3685 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3686 /* We want the vectorized loop to execute at least once. */
3687 min_profitable_iters = assumed_vf + peel_iters_prologue;
3688
3689 if (dump_enabled_p ())
3690 dump_printf_loc (MSG_NOTE, vect_location,
3691 " Runtime profitability threshold = %d\n",
3692 min_profitable_iters);
3693
3694 *ret_min_profitable_niters = min_profitable_iters;
3695
3696 /* Calculate number of iterations required to make the vector version
3697 profitable, relative to the loop bodies only.
3698
3699 The non-vectorized variant costs SIC * niters and it must win over the
3700 vector variant on the expected loop trip count, i.e. the following holds:
3701 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
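/* Rearranging the inequality gives
   niters > ((VOC + SOC) * VF - VIC * (PL_ITERS + EP_ITERS)) / (SIC * VF - VIC),
   which is what the division below computes.  With the illustrative numbers
   used above this is (22*4 - 6*4) / (4*4 - 6) = 64/10 = 6; the MAX below then
   keeps the estimate at or above min_profitable_iters. */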
3702
3703 if (vec_outside_cost <= 0)
3704 min_profitable_estimate = 0;
3705 else
3706 {
3707 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3708 * assumed_vf
3709 - vec_inside_cost * peel_iters_prologue
3710 - vec_inside_cost * peel_iters_epilogue)
3711 / ((scalar_single_iter_cost * assumed_vf)
3712 - vec_inside_cost);
3713 }
3714 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3715 if (dump_enabled_p ())
3716 dump_printf_loc (MSG_NOTE, vect_location,
3717 " Static estimate profitability threshold = %d\n",
3718 min_profitable_estimate);
3719
3720 *ret_min_profitable_estimate = min_profitable_estimate;
3721 }
3722
3723 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3724 vector elements (not bits) for a vector with NELT elements. */
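/* For example (purely illustrative), OFFSET == 2 and NELT == 8 push the
   series 2, 3, 4, which vec_perm_indices extends to 2, 3, ..., 9: output
   element J takes input element J + 2, and the last two output elements come
   from the start of the second vec_perm input. */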
3725 static void
3726 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3727 vec_perm_builder *sel)
3728 {
3729 /* The encoding is a single stepped pattern. Any wrap-around is handled
3730 by vec_perm_indices. */
3731 sel->new_vector (nelt, 1, 3);
3732 for (unsigned int i = 0; i < 3; i++)
3733 sel->quick_push (i + offset);
3734 }
3735
3736 /* Checks whether the target supports whole-vector shifts for vectors of mode
3737 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3738 it supports vec_perm_const with masks for all necessary shift amounts. */
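/* Note: the shift amounts that matter are NELT/2, NELT/4, ..., 1, i.e. the
   halving sequence used by the log2-style reduction epilogue; the loop below
   checks exactly those. */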
3739 static bool
3740 have_whole_vector_shift (machine_mode mode)
3741 {
3742 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3743 return true;
3744
3745 /* Variable-length vectors should be handled via the optab. */
3746 unsigned int nelt;
3747 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3748 return false;
3749
3750 vec_perm_builder sel;
3751 vec_perm_indices indices;
3752 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3753 {
3754 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3755 indices.new_vector (sel, 2, nelt);
3756 if (!can_vec_perm_const_p (mode, indices, false))
3757 return false;
3758 }
3759 return true;
3760 }
3761
3762 /* TODO: There is a close dependency between the vect_model_*_cost and
3763 vectorizable_* functions.  Design them better to avoid maintenance issues. */
3764
3765 /* Function vect_model_reduction_cost.
3766
3767 Models cost for a reduction operation, including the vector ops
3768 generated within the strip-mine loop, the initial definition before
3769 the loop, and the epilogue code that must be generated. */
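/* As an illustration of the buckets used below: a plain PLUS reduction with
   ncopies == 1, a target-supported REDUC_FN and no outer-loop nesting records
   one scalar_to_vec in the prologue, one vector_stmt in the loop body, and
   one vector_stmt plus one vec_to_scalar in the epilogue. */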
3770
3771 static void
3772 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3773 int ncopies, stmt_vector_for_cost *cost_vec)
3774 {
3775 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3776 enum tree_code code;
3777 optab optab;
3778 tree vectype;
3779 machine_mode mode;
3780 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3781 struct loop *loop = NULL;
3782
3783 if (loop_vinfo)
3784 loop = LOOP_VINFO_LOOP (loop_vinfo);
3785
3786 /* Condition reductions generate two reductions in the loop. */
3787 vect_reduction_type reduction_type
3788 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3789 if (reduction_type == COND_REDUCTION)
3790 ncopies *= 2;
3791
3792 vectype = STMT_VINFO_VECTYPE (stmt_info);
3793 mode = TYPE_MODE (vectype);
3794 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3795
3796 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3797
3798 if (reduction_type == EXTRACT_LAST_REDUCTION
3799 || reduction_type == FOLD_LEFT_REDUCTION)
3800 {
3801 /* No extra instructions needed in the prologue. */
3802 prologue_cost = 0;
3803
3804 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3805 /* Count one reduction-like operation per vector. */
3806 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3807 stmt_info, 0, vect_body);
3808 else
3809 {
3810 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3811 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3812 inside_cost = record_stmt_cost (cost_vec, nelements,
3813 vec_to_scalar, stmt_info, 0,
3814 vect_body);
3815 inside_cost += record_stmt_cost (cost_vec, nelements,
3816 scalar_stmt, stmt_info, 0,
3817 vect_body);
3818 }
3819 }
3820 else
3821 {
3822 /* Add in cost for initial definition.
3823 For cond reduction we have four vectors: initial index, step,
3824 initial result of the data reduction, initial value of the index
3825 reduction. */
3826 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3827 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3828 scalar_to_vec, stmt_info, 0,
3829 vect_prologue);
3830
3831 /* Cost of reduction op inside loop. */
3832 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3833 stmt_info, 0, vect_body);
3834 }
3835
3836 /* Determine cost of epilogue code.
3837
3838 We have a reduction operator that will reduce the vector in one statement.
3839 Also requires scalar extract. */
3840
3841 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3842 {
3843 if (reduc_fn != IFN_LAST)
3844 {
3845 if (reduction_type == COND_REDUCTION)
3846 {
3847 /* An EQ stmt and a COND_EXPR stmt. */
3848 epilogue_cost += record_stmt_cost (cost_vec, 2,
3849 vector_stmt, stmt_info, 0,
3850 vect_epilogue);
3851 /* Reduction of the max index and a reduction of the found
3852 values. */
3853 epilogue_cost += record_stmt_cost (cost_vec, 2,
3854 vec_to_scalar, stmt_info, 0,
3855 vect_epilogue);
3856 /* A broadcast of the max value. */
3857 epilogue_cost += record_stmt_cost (cost_vec, 1,
3858 scalar_to_vec, stmt_info, 0,
3859 vect_epilogue);
3860 }
3861 else
3862 {
3863 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3864 stmt_info, 0, vect_epilogue);
3865 epilogue_cost += record_stmt_cost (cost_vec, 1,
3866 vec_to_scalar, stmt_info, 0,
3867 vect_epilogue);
3868 }
3869 }
3870 else if (reduction_type == COND_REDUCTION)
3871 {
3872 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3873 /* Extraction of scalar elements. */
3874 epilogue_cost += record_stmt_cost (cost_vec,
3875 2 * estimated_nunits,
3876 vec_to_scalar, stmt_info, 0,
3877 vect_epilogue);
3878 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3879 epilogue_cost += record_stmt_cost (cost_vec,
3880 2 * estimated_nunits - 3,
3881 scalar_stmt, stmt_info, 0,
3882 vect_epilogue);
3883 }
3884 else if (reduction_type == EXTRACT_LAST_REDUCTION
3885 || reduction_type == FOLD_LEFT_REDUCTION)
3886 /* No extra instructions needed in the epilogue. */
3887 ;
3888 else
3889 {
3890 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3891 tree bitsize
3892 = TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3893 int element_bitsize = tree_to_uhwi (bitsize);
3894 int nelements = vec_size_in_bits / element_bitsize;
3895
3896 if (code == COND_EXPR)
3897 code = MAX_EXPR;
3898
3899 optab = optab_for_tree_code (code, vectype, optab_default);
3900
3901 /* We have a whole vector shift available. */
3902 if (optab != unknown_optab
3903 && VECTOR_MODE_P (mode)
3904 && optab_handler (optab, mode) != CODE_FOR_nothing
3905 && have_whole_vector_shift (mode))
3906 {
3907 /* Final reduction via vector shifts and the reduction operator.
3908 Also requires scalar extract. */
3909 epilogue_cost += record_stmt_cost (cost_vec,
3910 exact_log2 (nelements) * 2,
3911 vector_stmt, stmt_info, 0,
3912 vect_epilogue);
3913 epilogue_cost += record_stmt_cost (cost_vec, 1,
3914 vec_to_scalar, stmt_info, 0,
3915 vect_epilogue);
3916 }
3917 else
3918 /* Use extracts and reduction op for final reduction. For N
3919 elements, we have N extracts and N-1 reduction ops. */
3920 epilogue_cost += record_stmt_cost (cost_vec,
3921 nelements + nelements - 1,
3922 vector_stmt, stmt_info, 0,
3923 vect_epilogue);
3924 }
3925 }
3926
3927 if (dump_enabled_p ())
3928 dump_printf (MSG_NOTE,
3929 "vect_model_reduction_cost: inside_cost = %d, "
3930 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3931 prologue_cost, epilogue_cost);
3932 }
3933
3934
3935 /* Function vect_model_induction_cost.
3936
3937 Models cost for induction operations. */
3938
3939 static void
3940 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3941 stmt_vector_for_cost *cost_vec)
3942 {
3943 unsigned inside_cost, prologue_cost;
3944
3945 if (PURE_SLP_STMT (stmt_info))
3946 return;
3947
3948 /* Loop cost for vec_loop. */
3949 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3950 stmt_info, 0, vect_body);
3951
3952 /* Prologue cost for vec_init and vec_step. */
3953 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3954 stmt_info, 0, vect_prologue);
3955
3956 if (dump_enabled_p ())
3957 dump_printf_loc (MSG_NOTE, vect_location,
3958 "vect_model_induction_cost: inside_cost = %d, "
3959 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3960 }
3961
3962
3963
3964 /* Function get_initial_def_for_reduction
3965
3966 Input:
3967 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3968 INIT_VAL - the initial value of the reduction variable
3969
3970 Output:
3971 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3972 of the reduction (used for adjusting the epilog - see below).
3973 Return a vector variable, initialized according to the operation that
3974 STMT_VINFO performs. This vector will be used as the initial value
3975 of the vector of partial results.
3976
3977 Option1 (adjust in epilog): Initialize the vector as follows:
3978 add/bit or/xor: [0,0,...,0,0]
3979 mult/bit and: [1,1,...,1,1]
3980 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3981 and when necessary (e.g. add/mult case) let the caller know
3982 that it needs to adjust the result by init_val.
3983
3984 Option2: Initialize the vector as follows:
3985 add/bit or/xor: [init_val,0,0,...,0]
3986 mult/bit and: [init_val,1,1,...,1]
3987 min/max/cond_expr: [init_val,init_val,...,init_val]
3988 and no adjustments are needed.
3989
3990 For example, for the following code:
3991
3992 s = init_val;
3993 for (i=0;i<n;i++)
3994 s = s + a[i];
3995
3996 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
3997 For a vector of 4 units, we want to return either [0,0,0,init_val],
3998 or [0,0,0,0] and let the caller know that it needs to adjust
3999 the result at the end by 'init_val'.
4000
4001 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4002 is not NULL, because its initialization vector is simpler (the same
4003 element in all entries), and Option2 otherwise.
4004
4005 A cost model should help decide between these two schemes. */
4006
4007 tree
4008 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4009 tree *adjustment_def)
4010 {
4011 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4012 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4013 tree scalar_type = TREE_TYPE (init_val);
4014 tree vectype = get_vectype_for_scalar_type (scalar_type);
4015 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4016 tree def_for_init;
4017 tree init_def;
4018 REAL_VALUE_TYPE real_init_val = dconst0;
4019 int int_init_val = 0;
4020 gimple_seq stmts = NULL;
4021
4022 gcc_assert (vectype);
4023
4024 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4025 || SCALAR_FLOAT_TYPE_P (scalar_type));
4026
4027 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4028 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4029
4030 vect_reduction_type reduction_type
4031 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4032
4033 switch (code)
4034 {
4035 case WIDEN_SUM_EXPR:
4036 case DOT_PROD_EXPR:
4037 case SAD_EXPR:
4038 case PLUS_EXPR:
4039 case MINUS_EXPR:
4040 case BIT_IOR_EXPR:
4041 case BIT_XOR_EXPR:
4042 case MULT_EXPR:
4043 case BIT_AND_EXPR:
4044 {
4045 /* ADJUSTMENT_DEF is NULL when called from
4046 vect_create_epilog_for_reduction to vectorize double reduction. */
4047 if (adjustment_def)
4048 *adjustment_def = init_val;
4049
4050 if (code == MULT_EXPR)
4051 {
4052 real_init_val = dconst1;
4053 int_init_val = 1;
4054 }
4055
4056 if (code == BIT_AND_EXPR)
4057 int_init_val = -1;
4058
4059 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4060 def_for_init = build_real (scalar_type, real_init_val);
4061 else
4062 def_for_init = build_int_cst (scalar_type, int_init_val);
4063
4064 if (adjustment_def)
4065 /* Option1: the first element is '0' or '1' as well. */
4066 init_def = gimple_build_vector_from_val (&stmts, vectype,
4067 def_for_init);
4068 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4069 {
4070 /* Option2 (variable length): the first element is INIT_VAL. */
4071 init_def = gimple_build_vector_from_val (&stmts, vectype,
4072 def_for_init);
4073 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4074 vectype, init_def, init_val);
4075 }
4076 else
4077 {
4078 /* Option2: the first element is INIT_VAL. */
4079 tree_vector_builder elts (vectype, 1, 2);
4080 elts.quick_push (init_val);
4081 elts.quick_push (def_for_init);
4082 init_def = gimple_build_vector (&stmts, &elts);
4083 }
4084 }
4085 break;
4086
4087 case MIN_EXPR:
4088 case MAX_EXPR:
4089 case COND_EXPR:
4090 {
4091 if (adjustment_def)
4092 {
4093 *adjustment_def = NULL_TREE;
4094 if (reduction_type != COND_REDUCTION
4095 && reduction_type != EXTRACT_LAST_REDUCTION)
4096 {
4097 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4098 break;
4099 }
4100 }
4101 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4102 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4103 }
4104 break;
4105
4106 default:
4107 gcc_unreachable ();
4108 }
4109
4110 if (stmts)
4111 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4112 return init_def;
4113 }
4114
4115 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4116 NUMBER_OF_VECTORS is the number of vector defs to create.
4117 If NEUTRAL_OP is nonnull, introducing extra elements of that
4118 value will not change the result. */
4119
4120 static void
4121 get_initial_defs_for_reduction (slp_tree slp_node,
4122 vec<tree> *vec_oprnds,
4123 unsigned int number_of_vectors,
4124 bool reduc_chain, tree neutral_op)
4125 {
4126 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4127 stmt_vec_info stmt_vinfo = stmts[0];
4128 unsigned HOST_WIDE_INT nunits;
4129 unsigned j, number_of_places_left_in_vector;
4130 tree vector_type;
4131 tree vop;
4132 int group_size = stmts.length ();
4133 unsigned int vec_num, i;
4134 unsigned number_of_copies = 1;
4135 vec<tree> voprnds;
4136 voprnds.create (number_of_vectors);
4137 struct loop *loop;
4138 auto_vec<tree, 16> permute_results;
4139
4140 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4141
4142 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4143
4144 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4145 gcc_assert (loop);
4146 edge pe = loop_preheader_edge (loop);
4147
4148 gcc_assert (!reduc_chain || neutral_op);
4149
4150 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4151 created vectors. It is greater than 1 if unrolling is performed.
4152
4153 For example, we have two scalar operands, s1 and s2 (e.g., group of
4154 strided accesses of size two), while NUNITS is four (i.e., four scalars
4155 of this type can be packed in a vector). The output vector will contain
4156 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4157 will be 2).
4158
4159 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4160 vectors containing the operands.
4161
4162 For example, NUNITS is four as before, and the group size is 8
4163 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4164 {s5, s6, s7, s8}. */
4165
4166 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4167 nunits = group_size;
4168
4169 number_of_copies = nunits * number_of_vectors / group_size;
4170
4171 number_of_places_left_in_vector = nunits;
4172 bool constant_p = true;
4173 tree_vector_builder elts (vector_type, nunits, 1);
4174 elts.quick_grow (nunits);
4175 for (j = 0; j < number_of_copies; j++)
4176 {
4177 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
4178 {
4179 tree op;
4180 /* Get the def before the loop.  In a reduction chain we have only
4181 one initial value. */
4182 if ((j != (number_of_copies - 1)
4183 || (reduc_chain && i != 0))
4184 && neutral_op)
4185 op = neutral_op;
4186 else
4187 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4188
4189 /* Create 'vect_ = {op0,op1,...,opn}'. */
4190 number_of_places_left_in_vector--;
4191 elts[number_of_places_left_in_vector] = op;
4192 if (!CONSTANT_CLASS_P (op))
4193 constant_p = false;
4194
4195 if (number_of_places_left_in_vector == 0)
4196 {
4197 gimple_seq ctor_seq = NULL;
4198 tree init;
4199 if (constant_p && !neutral_op
4200 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4201 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4202 /* Build the vector directly from ELTS. */
4203 init = gimple_build_vector (&ctor_seq, &elts);
4204 else if (neutral_op)
4205 {
4206 /* Build a vector of the neutral value and shift the
4207 other elements into place. */
4208 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4209 neutral_op);
4210 int k = nunits;
4211 while (k > 0 && elts[k - 1] == neutral_op)
4212 k -= 1;
4213 while (k > 0)
4214 {
4215 k -= 1;
4216 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4217 vector_type, init, elts[k]);
4218 }
4219 }
4220 else
4221 {
4222 /* First time round, duplicate ELTS to fill the
4223 required number of vectors, then cherry pick the
4224 appropriate result for each iteration. */
4225 if (vec_oprnds->is_empty ())
4226 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4227 number_of_vectors,
4228 permute_results);
4229 init = permute_results[number_of_vectors - j - 1];
4230 }
4231 if (ctor_seq != NULL)
4232 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4233 voprnds.quick_push (init);
4234
4235 number_of_places_left_in_vector = nunits;
4236 elts.new_vector (vector_type, nunits, 1);
4237 elts.quick_grow (nunits);
4238 constant_p = true;
4239 }
4240 }
4241 }
4242
4243 /* Since the vectors are created in the reverse order, we should reverse
4244 them. */
4245 vec_num = voprnds.length ();
4246 for (j = vec_num; j != 0; j--)
4247 {
4248 vop = voprnds[j - 1];
4249 vec_oprnds->quick_push (vop);
4250 }
4251
4252 voprnds.release ();
4253
4254 /* In case the VF is greater than the unrolling factor needed for the SLP
4255 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4256 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4257 to replicate the vectors. */
4258 tree neutral_vec = NULL;
4259 while (number_of_vectors > vec_oprnds->length ())
4260 {
4261 if (neutral_op)
4262 {
4263 if (!neutral_vec)
4264 {
4265 gimple_seq ctor_seq = NULL;
4266 neutral_vec = gimple_build_vector_from_val
4267 (&ctor_seq, vector_type, neutral_op);
4268 if (ctor_seq != NULL)
4269 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4270 }
4271 vec_oprnds->quick_push (neutral_vec);
4272 }
4273 else
4274 {
4275 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4276 vec_oprnds->quick_push (vop);
4277 }
4278 }
4279 }
4280
4281
4282 /* Function vect_create_epilog_for_reduction
4283
4284 Create code at the loop-epilog to finalize the result of a reduction
4285 computation.
4286
4287 VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of the
4288 vector reduction statements.
4289 STMT_INFO is the scalar reduction stmt that is being vectorized.
4290 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4291 number of elements that we can fit in a vectype (nunits). In this case
4292 we have to generate more than one vector stmt, i.e., we need to "unroll"
4293 the vector stmt by a factor VF/nunits. For more details see documentation
4294 in vectorizable_operation.
4295 REDUC_FN is the internal function for the epilog reduction.
4296 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4297 computation.
4298 REDUC_INDEX is the index of the operand in the right hand side of the
4299 statement that is defined by REDUCTION_PHI.
4300 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4301 SLP_NODE is an SLP node containing a group of reduction statements. The
4302 first one in this group is STMT_INFO.
4303 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4304 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4305 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4306 any value of the IV in the loop.
4307 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4308 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4309 null if this is not an SLP reduction.
4310
4311 This function:
4312 1. Creates the reduction def-use cycles: sets the arguments for
4313 REDUCTION_PHIS:
4314 The loop-entry argument is the vectorized initial-value of the reduction.
4315 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4316 sums.
4317 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4318 by calling the function specified by REDUC_FN if available, or by
4319 other means (whole-vector shifts or a scalar loop).
4320 The function also creates a new phi node at the loop exit to preserve
4321 loop-closed form, as illustrated below.
4322
4323 The flow at the entry to this function:
4324
4325 loop:
4326 vec_def = phi <null, null> # REDUCTION_PHI
4327 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4328 s_loop = scalar_stmt # (scalar) STMT_INFO
4329 loop_exit:
4330 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4331 use <s_out0>
4332 use <s_out0>
4333
4334 The above is transformed by this function into:
4335
4336 loop:
4337 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4338 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4339 s_loop = scalar_stmt # (scalar) STMT_INFO
4340 loop_exit:
4341 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4342 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4343 v_out2 = reduce <v_out1>
4344 s_out3 = extract_field <v_out2, 0>
4345 s_out4 = adjust_result <s_out3>
4346 use <s_out4>
4347 use <s_out4>
4348 */
4349
4350 static void
4351 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4352 stmt_vec_info stmt_info,
4353 gimple *reduc_def_stmt,
4354 int ncopies, internal_fn reduc_fn,
4355 vec<stmt_vec_info> reduction_phis,
4356 bool double_reduc,
4357 slp_tree slp_node,
4358 slp_instance slp_node_instance,
4359 tree induc_val, enum tree_code induc_code,
4360 tree neutral_op)
4361 {
4362 stmt_vec_info prev_phi_info;
4363 tree vectype;
4364 machine_mode mode;
4365 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4366 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4367 basic_block exit_bb;
4368 tree scalar_dest;
4369 tree scalar_type;
4370 gimple *new_phi = NULL, *phi;
4371 stmt_vec_info phi_info;
4372 gimple_stmt_iterator exit_gsi;
4373 tree vec_dest;
4374 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4375 gimple *epilog_stmt = NULL;
4376 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4377 gimple *exit_phi;
4378 tree bitsize;
4379 tree adjustment_def = NULL;
4380 tree vec_initial_def = NULL;
4381 tree expr, def, initial_def = NULL;
4382 tree orig_name, scalar_result;
4383 imm_use_iterator imm_iter, phi_imm_iter;
4384 use_operand_p use_p, phi_use_p;
4385 gimple *use_stmt;
4386 stmt_vec_info reduction_phi_info = NULL;
4387 bool nested_in_vect_loop = false;
4388 auto_vec<gimple *> new_phis;
4389 auto_vec<stmt_vec_info> inner_phis;
4390 int j, i;
4391 auto_vec<tree> scalar_results;
4392 unsigned int group_size = 1, k, ratio;
4393 auto_vec<tree> vec_initial_defs;
4394 auto_vec<gimple *> phis;
4395 bool slp_reduc = false;
4396 bool direct_slp_reduc;
4397 tree new_phi_result;
4398 stmt_vec_info inner_phi = NULL;
4399 tree induction_index = NULL_TREE;
4400
4401 if (slp_node)
4402 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4403
4404 if (nested_in_vect_loop_p (loop, stmt_info))
4405 {
4406 outer_loop = loop;
4407 loop = loop->inner;
4408 nested_in_vect_loop = true;
4409 gcc_assert (!slp_node);
4410 }
4411
4412 vectype = STMT_VINFO_VECTYPE (stmt_info);
4413 gcc_assert (vectype);
4414 mode = TYPE_MODE (vectype);
4415
4416 /* 1. Create the reduction def-use cycle:
4417 Set the arguments of REDUCTION_PHIS, i.e., transform
4418
4419 loop:
4420 vec_def = phi <null, null> # REDUCTION_PHI
4421 VECT_DEF = vector_stmt # vectorized form of STMT
4422 ...
4423
4424 into:
4425
4426 loop:
4427 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4428 VECT_DEF = vector_stmt # vectorized form of STMT
4429 ...
4430
4431 (in case of SLP, do it for all the phis). */
4432
4433 /* Get the loop-entry arguments. */
4434 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4435 if (slp_node)
4436 {
4437 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4438 vec_initial_defs.reserve (vec_num);
4439 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4440 &vec_initial_defs, vec_num,
4441 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4442 neutral_op);
4443 }
4444 else
4445 {
4446 /* Get at the scalar def before the loop, that defines the initial value
4447 of the reduction variable. */
4448 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4449 loop_preheader_edge (loop));
4450 /* Optimize: for REDUC_MAX, if initial_def is smaller than induc_val and
4451 we can't use zero for induc_val, use initial_def as induc_val instead.
4452 Similarly for REDUC_MIN when initial_def is larger than induc_val. */
4453 if (TREE_CODE (initial_def) == INTEGER_CST
4454 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4455 == INTEGER_INDUC_COND_REDUCTION)
4456 && !integer_zerop (induc_val)
4457 && ((induc_code == MAX_EXPR
4458 && tree_int_cst_lt (initial_def, induc_val))
4459 || (induc_code == MIN_EXPR
4460 && tree_int_cst_lt (induc_val, initial_def))))
4461 induc_val = initial_def;
4462
4463 if (double_reduc)
4464 /* In case of double reduction we only create a vector variable
4465 to be put in the reduction phi node. The actual statement
4466 creation is done later in this function. */
4467 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4468 else if (nested_in_vect_loop)
4469 {
4470 /* Do not use an adjustment def as that case is not supported
4471 correctly if ncopies is not one. */
4472 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4473 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4474 stmt_info);
4475 }
4476 else
4477 vec_initial_def
4478 = get_initial_def_for_reduction (stmt_info, initial_def,
4479 &adjustment_def);
4480 vec_initial_defs.create (1);
4481 vec_initial_defs.quick_push (vec_initial_def);
4482 }
4483
4484 /* Set phi nodes arguments. */
4485 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4486 {
4487 tree vec_init_def = vec_initial_defs[i];
4488 tree def = vect_defs[i];
4489 for (j = 0; j < ncopies; j++)
4490 {
4491 if (j != 0)
4492 {
4493 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4494 if (nested_in_vect_loop)
4495 vec_init_def
4496 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4497 }
4498
4499 /* Set the loop-entry arg of the reduction-phi. */
4500
4501 gphi *phi = as_a <gphi *> (phi_info->stmt);
4502 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4503 == INTEGER_INDUC_COND_REDUCTION)
4504 {
4505 /* Initialise the reduction phi to zero.  This prevents non-zero
4506 initial values interfering with the reduction op. */
4507 gcc_assert (ncopies == 1);
4508 gcc_assert (i == 0);
4509
4510 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4511 tree induc_val_vec
4512 = build_vector_from_val (vec_init_def_type, induc_val);
4513
4514 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4515 UNKNOWN_LOCATION);
4516 }
4517 else
4518 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4519 UNKNOWN_LOCATION);
4520
4521 /* Set the loop-latch arg for the reduction-phi. */
4522 if (j > 0)
4523 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4524
4525 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4526
4527 if (dump_enabled_p ())
4528 dump_printf_loc (MSG_NOTE, vect_location,
4529 "transform reduction: created def-use cycle: %G%G",
4530 phi, SSA_NAME_DEF_STMT (def));
4531 }
4532 }
4533
4534 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4535 which is updated with the current index of the loop for every match of
4536 the original loop's cond_expr (VEC_STMT).  This results in a vector
4537 containing, per vector lane, the index of the last time the condition passed.
4538 The first match will be a 1 to allow 0 to be used for non-matching
4539 indexes. If there are no matches at all then the vector will be all
4540 zeroes. */
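/* Illustrative example (made-up trip count): with a 4-element vector and two
   vector iterations the index IV takes the values {1,2,3,4} and then
   {5,6,7,8}.  If the condition only matched for scalar elements 1 and 6, the
   final vector is {0, 2, 7, 0}: lane 1 last matched at index 2, lane 2 at
   index 7, and lanes 0 and 3 never matched. */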
4541 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4542 {
4543 tree indx_before_incr, indx_after_incr;
4544 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4545
4546 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4547 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4548
4549 int scalar_precision
4550 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4551 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4552 tree cr_index_vector_type = build_vector_type
4553 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4554
4555 /* First we create a simple vector induction variable which starts
4556 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4557 vector size (STEP). */
4558
4559 /* Create a {1,2,3,...} vector. */
4560 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4561
4562 /* Create a vector of the step value. */
4563 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4564 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4565
4566 /* Create an induction variable. */
4567 gimple_stmt_iterator incr_gsi;
4568 bool insert_after;
4569 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4570 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4571 insert_after, &indx_before_incr, &indx_after_incr);
4572
4573 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4574 filled with zeros (VEC_ZERO). */
4575
4576 /* Create a vector of 0s. */
4577 tree zero = build_zero_cst (cr_index_scalar_type);
4578 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4579
4580 /* Create a vector phi node. */
4581 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4582 new_phi = create_phi_node (new_phi_tree, loop->header);
4583 loop_vinfo->add_stmt (new_phi);
4584 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4585 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4586
4587 /* Now take the condition from the loop's original cond_expr
4588 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4589 every match uses values from the induction variable
4590 (INDEX_BEFORE_INCR) and otherwise uses values from the phi node
4591 (NEW_PHI_TREE).
4592 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4593 the new cond_expr (INDEX_COND_EXPR). */
4594
4595 /* Duplicate the condition from vec_stmt. */
4596 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4597
4598 /* Create a conditional where the condition is taken from vec_stmt
4599 (CCOMPARE), the "then" value is the induction index (INDEX_BEFORE_INCR)
4600 and the "else" value is the phi (NEW_PHI_TREE). */
4601 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4602 ccompare, indx_before_incr,
4603 new_phi_tree);
4604 induction_index = make_ssa_name (cr_index_vector_type);
4605 gimple *index_condition = gimple_build_assign (induction_index,
4606 index_cond_expr);
4607 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4608 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4609 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4610
4611 /* Update the phi with the vec cond. */
4612 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4613 loop_latch_edge (loop), UNKNOWN_LOCATION);
4614 }
4615
4616 /* 2. Create epilog code.
4617 The reduction epilog code operates across the elements of the vector
4618 of partial results computed by the vectorized loop.
4619 The reduction epilog code consists of:
4620
4621 step 1: compute the scalar result in a vector (v_out2)
4622 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4623 step 3: adjust the scalar result (s_out3) if needed.
4624
4625 Step 1 can be accomplished using one of the following three schemes:
4626 (scheme 1) using reduc_fn, if available.
4627 (scheme 2) using whole-vector shifts, if available.
4628 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4629 combined.
4630
4631 The overall epilog code looks like this:
4632
4633 s_out0 = phi <s_loop> # original EXIT_PHI
4634 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4635 v_out2 = reduce <v_out1> # step 1
4636 s_out3 = extract_field <v_out2, 0> # step 2
4637 s_out4 = adjust_result <s_out3> # step 3
4638
4639 (step 3 is optional, and steps 1 and 2 may be combined).
4640 Lastly, the uses of s_out0 are replaced by s_out4. */
4641
4642
4643 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4644 v_out1 = phi <VECT_DEF>
4645 Store them in NEW_PHIS. */
4646
4647 exit_bb = single_exit (loop)->dest;
4648 prev_phi_info = NULL;
4649 new_phis.create (vect_defs.length ());
4650 FOR_EACH_VEC_ELT (vect_defs, i, def)
4651 {
4652 for (j = 0; j < ncopies; j++)
4653 {
4654 tree new_def = copy_ssa_name (def);
4655 phi = create_phi_node (new_def, exit_bb);
4656 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4657 if (j == 0)
4658 new_phis.quick_push (phi);
4659 else
4660 {
4661 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4662 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4663 }
4664
4665 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4666 prev_phi_info = phi_info;
4667 }
4668 }
4669
4670 /* The epilogue is created for the outer-loop, i.e., for the loop being
4671 vectorized. Create exit phis for the outer loop. */
4672 if (double_reduc)
4673 {
4674 loop = outer_loop;
4675 exit_bb = single_exit (loop)->dest;
4676 inner_phis.create (vect_defs.length ());
4677 FOR_EACH_VEC_ELT (new_phis, i, phi)
4678 {
4679 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4680 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4681 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4682 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4683 PHI_RESULT (phi));
4684 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4685 inner_phis.quick_push (phi_info);
4686 new_phis[i] = outer_phi;
4687 while (STMT_VINFO_RELATED_STMT (phi_info))
4688 {
4689 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4690 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4691 outer_phi = create_phi_node (new_result, exit_bb);
4692 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4693 PHI_RESULT (phi_info->stmt));
4694 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4695 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4696 prev_phi_info = outer_phi_info;
4697 }
4698 }
4699 }
4700
4701 exit_gsi = gsi_after_labels (exit_bb);
4702
4703 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4704 (i.e. when reduc_fn is not available) and in the final adjustment
4705 code (if needed). Also get the original scalar reduction variable as
4706 defined in the loop.  In case STMT is a "pattern-stmt" (i.e., it
4707 represents a reduction pattern), the tree-code and scalar-def are
4708 taken from the original stmt that the pattern-stmt (STMT) replaces.
4709 Otherwise (it is a regular reduction) the tree-code and scalar-def
4710 are taken from STMT. */
4711
4712 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4713 if (orig_stmt_info != stmt_info)
4714 {
4715 /* Reduction pattern */
4716 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4717 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4718 }
4719
4720 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4721 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4722 partial results are added and not subtracted. */
4723 if (code == MINUS_EXPR)
4724 code = PLUS_EXPR;
4725
4726 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4727 scalar_type = TREE_TYPE (scalar_dest);
4728 scalar_results.create (group_size);
4729 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4730 bitsize = TYPE_SIZE (scalar_type);
4731
4732 /* In case this is a reduction in an inner-loop while vectorizing an outer
4733 loop - we don't need to extract a single scalar result at the end of the
4734 inner-loop (unless it is double reduction, i.e., the use of reduction is
4735 outside the outer-loop). The final vector of partial results will be used
4736 in the vectorized outer-loop, or reduced to a scalar result at the end of
4737 the outer-loop. */
4738 if (nested_in_vect_loop && !double_reduc)
4739 goto vect_finalize_reduction;
4740
4741 /* SLP reduction without reduction chain, e.g.,
4742 # a1 = phi <a2, a0>
4743 # b1 = phi <b2, b0>
4744 a2 = operation (a1)
4745 b2 = operation (b1) */
4746 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4747
4748 /* True if we should implement SLP_REDUC using native reduction operations
4749 instead of scalar operations. */
4750 direct_slp_reduc = (reduc_fn != IFN_LAST
4751 && slp_reduc
4752 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4753
4754 /* In case of reduction chain, e.g.,
4755 # a1 = phi <a3, a0>
4756 a2 = operation (a1)
4757 a3 = operation (a2),
4758
4759 we may end up with more than one vector result. Here we reduce them to
4760 one vector. */
4761 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4762 {
4763 tree first_vect = PHI_RESULT (new_phis[0]);
4764 gassign *new_vec_stmt = NULL;
4765 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4766 for (k = 1; k < new_phis.length (); k++)
4767 {
4768 gimple *next_phi = new_phis[k];
4769 tree second_vect = PHI_RESULT (next_phi);
4770 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4771 new_vec_stmt = gimple_build_assign (tem, code,
4772 first_vect, second_vect);
4773 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4774 first_vect = tem;
4775 }
4776
4777 new_phi_result = first_vect;
4778 if (new_vec_stmt)
4779 {
4780 new_phis.truncate (0);
4781 new_phis.safe_push (new_vec_stmt);
4782 }
4783 }
4784 /* Likewise if we couldn't use a single def-use cycle. */
4785 else if (ncopies > 1)
4786 {
4787 gcc_assert (new_phis.length () == 1);
4788 tree first_vect = PHI_RESULT (new_phis[0]);
4789 gassign *new_vec_stmt = NULL;
4790 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4791 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4792 for (int k = 1; k < ncopies; ++k)
4793 {
4794 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4795 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4796 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4797 new_vec_stmt = gimple_build_assign (tem, code,
4798 first_vect, second_vect);
4799 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4800 first_vect = tem;
4801 }
4802 new_phi_result = first_vect;
4803 new_phis.truncate (0);
4804 new_phis.safe_push (new_vec_stmt);
4805 }
4806 else
4807 new_phi_result = PHI_RESULT (new_phis[0]);
4808
4809 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4810 && reduc_fn != IFN_LAST)
4811 {
4812 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4813 various data values where the condition matched and another vector
4814 (INDUCTION_INDEX) containing all the indexes of those matches. We
4815 need to extract the last matching index (which will be the index with
4816 highest value) and use this to index into the data vector.
4817 For the case where there were no matches, the data vector will contain
4818 all default values and the index vector will be all zeros. */
4819
4820 /* Get various versions of the type of the vector of indexes. */
4821 tree index_vec_type = TREE_TYPE (induction_index);
4822 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4823 tree index_scalar_type = TREE_TYPE (index_vec_type);
4824 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4825 (index_vec_type);
4826
4827 /* Get an unsigned integer version of the type of the data vector. */
4828 int scalar_precision
4829 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4830 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4831 tree vectype_unsigned = build_vector_type
4832 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4833
4834 /* First we need to create a vector (ZERO_VEC) of zeros and another
4835 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4836 can create using a MAX reduction and then expanding.
4837 In the case where the loop never made any matches, the max index will
4838 be zero. */
4839
4840 /* Vector of {0, 0, 0,...}. */
4841 tree zero_vec = make_ssa_name (vectype);
4842 tree zero_vec_rhs = build_zero_cst (vectype);
4843 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4844 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4845
4846 /* Find maximum value from the vector of found indexes. */
4847 tree max_index = make_ssa_name (index_scalar_type);
4848 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4849 1, induction_index);
4850 gimple_call_set_lhs (max_index_stmt, max_index);
4851 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4852
4853 /* Vector of {max_index, max_index, max_index,...}. */
4854 tree max_index_vec = make_ssa_name (index_vec_type);
4855 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4856 max_index);
4857 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4858 max_index_vec_rhs);
4859 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4860
4861 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4862 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4863 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4864 otherwise. Only one value should match, resulting in a vector
4865 (VEC_COND) with one data value and the rest zeros.
4866 In the case where the loop never made any matches, every index will
4867 match, resulting in a vector with all data values (which will all be
4868 the default value). */
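/* Continuing the illustrative example above: INDUCTION_INDEX is {0, 2, 7, 0},
   so MAX_INDEX is 7 and MAX_INDEX_VEC is {7, 7, 7, 7}; the comparison below
   therefore selects only lane 2 of NEW_PHI_RESULT, and the final unsigned MAX
   reduction extracts that lane's data value. */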
4869
4870 /* Compare the max index vector to the vector of found indexes to find
4871 the position of the max value. */
4872 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4873 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4874 induction_index,
4875 max_index_vec);
4876 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4877
4878 /* Use the compare to choose either values from the data vector or
4879 zero. */
4880 tree vec_cond = make_ssa_name (vectype);
4881 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4882 vec_compare, new_phi_result,
4883 zero_vec);
4884 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4885
4886 /* Finally we need to extract the data value from the vector (VEC_COND)
4887 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4888 reduction, but because this doesn't exist, we can use a MAX reduction
4889 instead. The data value might be signed or a float so we need to cast
4890 it first.
4891 In the case where the loop never made any matches, the data values are
4892 all identical, and so will reduce down correctly. */
4893
4894 /* Make the matched data values unsigned. */
4895 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4896 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4897 vec_cond);
4898 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4899 VIEW_CONVERT_EXPR,
4900 vec_cond_cast_rhs);
4901 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4902
4903 /* Reduce down to a scalar value. */
4904 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4905 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4906 1, vec_cond_cast);
4907 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4908 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4909
4910 /* Convert the reduced value back to the result type and set as the
4911 result. */
4912 gimple_seq stmts = NULL;
4913 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4914 data_reduc);
4915 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4916 scalar_results.safe_push (new_temp);
4917 }
4918 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4919 && reduc_fn == IFN_LAST)
4920 {
4921 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4922 idx_val = induction_index[0];
4923 val = data_reduc[0];
4924 for (i = 1; i < nelts; ++i)
4926 if (induction_index[i] > idx_val)
4927 val = data_reduc[i], idx_val = induction_index[i];
4928 return val; */
4929
4930 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4931 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4932 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4933 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4934 /* Enforced by vectorizable_reduction, which ensures we have target
4935 support before allowing a conditional reduction on variable-length
4936 vectors. */
4937 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4938 tree idx_val = NULL_TREE, val = NULL_TREE;
4939 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4940 {
4941 tree old_idx_val = idx_val;
4942 tree old_val = val;
4943 idx_val = make_ssa_name (idx_eltype);
4944 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4945 build3 (BIT_FIELD_REF, idx_eltype,
4946 induction_index,
4947 bitsize_int (el_size),
4948 bitsize_int (off)));
4949 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4950 val = make_ssa_name (data_eltype);
4951 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4952 build3 (BIT_FIELD_REF,
4953 data_eltype,
4954 new_phi_result,
4955 bitsize_int (el_size),
4956 bitsize_int (off)));
4957 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4958 if (off != 0)
4959 {
4960 tree new_idx_val = idx_val;
4961 tree new_val = val;
4962 if (off != v_size - el_size)
4963 {
4964 new_idx_val = make_ssa_name (idx_eltype);
4965 epilog_stmt = gimple_build_assign (new_idx_val,
4966 MAX_EXPR, idx_val,
4967 old_idx_val);
4968 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4969 }
4970 new_val = make_ssa_name (data_eltype);
4971 epilog_stmt = gimple_build_assign (new_val,
4972 COND_EXPR,
4973 build2 (GT_EXPR,
4974 boolean_type_node,
4975 idx_val,
4976 old_idx_val),
4977 val, old_val);
4978 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4979 idx_val = new_idx_val;
4980 val = new_val;
4981 }
4982 }
4983 /* Convert the reduced value back to the result type and set as the
4984 result. */
4985 gimple_seq stmts = NULL;
4986 val = gimple_convert (&stmts, scalar_type, val);
4987 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4988 scalar_results.safe_push (val);
4989 }
4990
4991 /* 2.3 Create the reduction code, using one of the three schemes described
4992 above. In SLP we simply need to extract all the elements from the
4993 vector (without reducing them), so we use scalar shifts. */
4994 else if (reduc_fn != IFN_LAST && !slp_reduc)
4995 {
4996 tree tmp;
4997 tree vec_elem_type;
4998
4999 /* Case 1: Create:
5000 v_out2 = reduc_expr <v_out1> */
5001
5002 if (dump_enabled_p ())
5003 dump_printf_loc (MSG_NOTE, vect_location,
5004 "Reduce using direct vector reduction.\n");
5005
5006 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5007 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5008 {
5009 tree tmp_dest
5010 = vect_create_destination_var (scalar_dest, vec_elem_type);
5011 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5012 new_phi_result);
5013 gimple_set_lhs (epilog_stmt, tmp_dest);
5014 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5015 gimple_set_lhs (epilog_stmt, new_temp);
5016 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5017
5018 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5019 new_temp);
5020 }
5021 else
5022 {
5023 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5024 new_phi_result);
5025 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5026 }
5027
5028 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5029 gimple_set_lhs (epilog_stmt, new_temp);
5030 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5031
5032 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5033 == INTEGER_INDUC_COND_REDUCTION)
5034 && !operand_equal_p (initial_def, induc_val, 0))
5035 {
5036 /* Earlier we set the initial value to be a vector of induc_val
5037 values.  Check the result and if it is induc_val then replace it
5038 with the original initial value, unless induc_val is
5039 the same as initial_def already. */
5040 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5041 induc_val);
5042
5043 tmp = make_ssa_name (new_scalar_dest);
5044 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5045 initial_def, new_temp);
5046 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5047 new_temp = tmp;
5048 }
5049
5050 scalar_results.safe_push (new_temp);
5051 }
5052 else if (direct_slp_reduc)
5053 {
5054 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5055 with the elements for other SLP statements replaced with the
5056 neutral value. We can then do a normal reduction on each vector. */
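      /* An illustrative sketch, not literal output of this code: with
         REDUC_GROUP_SIZE == 2, a PLUS reduction and an interleaved
         accumulator vector { a0, b0, a1, b1 }, the masked copies built
         below would be
           { a0, 0, a1, 0 }  and  { 0, b0, 0, b1 }
         (0 being the neutral value for PLUS), and reducing each copy
         yields the scalar results a0 + a1 and b0 + b1.  */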
5057
5058 /* Enforced by vectorizable_reduction. */
5059 gcc_assert (new_phis.length () == 1);
5060 gcc_assert (pow2p_hwi (group_size));
5061
5062 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5063 vec<stmt_vec_info> orig_phis
5064 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5065 gimple_seq seq = NULL;
5066
5067 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5068 and the same element size as VECTYPE. */
5069 tree index = build_index_vector (vectype, 0, 1);
5070 tree index_type = TREE_TYPE (index);
5071 tree index_elt_type = TREE_TYPE (index_type);
5072 tree mask_type = build_same_sized_truth_vector_type (index_type);
5073
5074 /* Create a vector that, for each element, identifies which of
5075 the REDUC_GROUP_SIZE results should use it. */
5076 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5077 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5078 build_vector_from_val (index_type, index_mask));
5079
5080 /* Get a neutral vector value. This is simply a splat of the neutral
5081 scalar value if we have one, otherwise the initial scalar value
5082 is itself a neutral value. */
5083 tree vector_identity = NULL_TREE;
5084 if (neutral_op)
5085 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5086 neutral_op);
5087 for (unsigned int i = 0; i < group_size; ++i)
5088 {
5089 /* If there's no universal neutral value, we can use the
5090 initial scalar value from the original PHI.  This is used
5091 for MIN and MAX reductions, for example.  */
5092 if (!neutral_op)
5093 {
5094 tree scalar_value
5095 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5096 loop_preheader_edge (loop));
5097 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5098 scalar_value);
5099 }
5100
5101 /* Calculate the equivalent of:
5102
5103 sel[j] = (index[j] == i);
5104
5105 which selects the elements of NEW_PHI_RESULT that should
5106 be included in the result. */
5107 tree compare_val = build_int_cst (index_elt_type, i);
5108 compare_val = build_vector_from_val (index_type, compare_val);
5109 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5110 index, compare_val);
5111
5112 /* Calculate the equivalent of:
5113
5114 vec = sel ? new_phi_result : vector_identity;
5115
5116 VEC is now suitable for a full vector reduction. */
5117 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5118 sel, new_phi_result, vector_identity);
5119
5120 /* Do the reduction and convert it to the appropriate type. */
5121 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5122 TREE_TYPE (vectype), vec);
5123 scalar = gimple_convert (&seq, scalar_type, scalar);
5124 scalar_results.safe_push (scalar);
5125 }
5126 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5127 }
5128 else
5129 {
5130 bool reduce_with_shift;
5131 tree vec_temp;
5132
5133 /* COND reductions all do the final reduction with MAX_EXPR
5134 or MIN_EXPR. */
5135 if (code == COND_EXPR)
5136 {
5137 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5138 == INTEGER_INDUC_COND_REDUCTION)
5139 code = induc_code;
5140 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5141 == CONST_COND_REDUCTION)
5142 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5143 else
5144 code = MAX_EXPR;
5145 }
5146
5147 /* See if the target wants to do the final (shift) reduction
5148 in a vector mode of smaller size and first reduce upper/lower
5149 halves against each other. */
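      /* Purely for illustration: if, say, the accumulator mode is 64 bytes
         and the target's split_reduction hook returns a 32-byte mode, the
         loop below first extracts the low and high 32-byte halves and
         combines them with CODE, and only the resulting narrower vector is
         fed to the final shift or scalar reduction.  */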
5150 enum machine_mode mode1 = mode;
5151 tree vectype1 = vectype;
5152 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5153 unsigned sz1 = sz;
5154 if (!slp_reduc
5155 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5156 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5157
5158 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5159 reduce_with_shift = have_whole_vector_shift (mode1);
5160 if (!VECTOR_MODE_P (mode1))
5161 reduce_with_shift = false;
5162 else
5163 {
5164 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5165 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5166 reduce_with_shift = false;
5167 }
5168
5169 /* First reduce the vector to the desired vector size on which we
5170 should do the shift reduction, by combining upper and lower halves.  */
5171 new_temp = new_phi_result;
5172 while (sz > sz1)
5173 {
5174 gcc_assert (!slp_reduc);
5175 sz /= 2;
5176 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5177
5178 /* The target has to make sure we support lowpart/highpart
5179 extraction, either via direct vector extract or through
5180 integer mode punning.  */
5181 tree dst1, dst2;
5182 if (convert_optab_handler (vec_extract_optab,
5183 TYPE_MODE (TREE_TYPE (new_temp)),
5184 TYPE_MODE (vectype1))
5185 != CODE_FOR_nothing)
5186 {
5187 /* Extract sub-vectors directly once vec_extract becomes
5188 a conversion optab. */
5189 dst1 = make_ssa_name (vectype1);
5190 epilog_stmt
5191 = gimple_build_assign (dst1, BIT_FIELD_REF,
5192 build3 (BIT_FIELD_REF, vectype1,
5193 new_temp, TYPE_SIZE (vectype1),
5194 bitsize_int (0)));
5195 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5196 dst2 = make_ssa_name (vectype1);
5197 epilog_stmt
5198 = gimple_build_assign (dst2, BIT_FIELD_REF,
5199 build3 (BIT_FIELD_REF, vectype1,
5200 new_temp, TYPE_SIZE (vectype1),
5201 bitsize_int (sz * BITS_PER_UNIT)));
5202 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5203 }
5204 else
5205 {
5206 /* Extract via punning to an appropriately sized integer mode
5207 vector.  */
5208 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5209 1);
5210 tree etype = build_vector_type (eltype, 2);
5211 gcc_assert (convert_optab_handler (vec_extract_optab,
5212 TYPE_MODE (etype),
5213 TYPE_MODE (eltype))
5214 != CODE_FOR_nothing);
5215 tree tem = make_ssa_name (etype);
5216 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5217 build1 (VIEW_CONVERT_EXPR,
5218 etype, new_temp));
5219 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5220 new_temp = tem;
5221 tem = make_ssa_name (eltype);
5222 epilog_stmt
5223 = gimple_build_assign (tem, BIT_FIELD_REF,
5224 build3 (BIT_FIELD_REF, eltype,
5225 new_temp, TYPE_SIZE (eltype),
5226 bitsize_int (0)));
5227 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5228 dst1 = make_ssa_name (vectype1);
5229 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5230 build1 (VIEW_CONVERT_EXPR,
5231 vectype1, tem));
5232 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5233 tem = make_ssa_name (eltype);
5234 epilog_stmt
5235 = gimple_build_assign (tem, BIT_FIELD_REF,
5236 build3 (BIT_FIELD_REF, eltype,
5237 new_temp, TYPE_SIZE (eltype),
5238 bitsize_int (sz * BITS_PER_UNIT)));
5239 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5240 dst2 = make_ssa_name (vectype1);
5241 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5242 build1 (VIEW_CONVERT_EXPR,
5243 vectype1, tem));
5244 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5245 }
5246
5247 new_temp = make_ssa_name (vectype1);
5248 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5249 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5250 }
5251
5252 if (reduce_with_shift && !slp_reduc)
5253 {
5254 int element_bitsize = tree_to_uhwi (bitsize);
5255 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5256 for variable-length vectors and also requires direct target support
5257 for loop reductions. */
5258 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5259 int nelements = vec_size_in_bits / element_bitsize;
5260 vec_perm_builder sel;
5261 vec_perm_indices indices;
5262
5263 int elt_offset;
5264
5265 tree zero_vec = build_zero_cst (vectype1);
5266 /* Case 2: Create:
5267 for (offset = nelements/2; offset >= 1; offset/=2)
5268 {
5269 Create: va' = vec_shift <va, offset>
5270 Create: va = vop <va, va'>
5271 } */
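	  /* A hypothetical walk-through, assuming a four-element accumulator
	     { a, b, c, d } and a PLUS reduction (not emitted literally):
	     the first iteration shifts in zeros to give { c, d, 0, 0 } and
	     adds, so element 0 holds a+c and element 1 holds b+d; the second
	     iteration shifts by one element and adds, leaving a+b+c+d in
	     element 0, which is extracted in step 2.4 below.  */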
5272
5273 tree rhs;
5274
5275 if (dump_enabled_p ())
5276 dump_printf_loc (MSG_NOTE, vect_location,
5277 "Reduce using vector shifts\n");
5278
5279 mode1 = TYPE_MODE (vectype1);
5280 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5281 for (elt_offset = nelements / 2;
5282 elt_offset >= 1;
5283 elt_offset /= 2)
5284 {
5285 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5286 indices.new_vector (sel, 2, nelements);
5287 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5288 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5289 new_temp, zero_vec, mask);
5290 new_name = make_ssa_name (vec_dest, epilog_stmt);
5291 gimple_assign_set_lhs (epilog_stmt, new_name);
5292 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5293
5294 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5295 new_temp);
5296 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5297 gimple_assign_set_lhs (epilog_stmt, new_temp);
5298 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5299 }
5300
5301 /* 2.4 Extract the final scalar result. Create:
5302 s_out3 = extract_field <v_out2, bitpos> */
5303
5304 if (dump_enabled_p ())
5305 dump_printf_loc (MSG_NOTE, vect_location,
5306 "extract scalar result\n");
5307
5308 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5309 bitsize, bitsize_zero_node);
5310 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5311 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5312 gimple_assign_set_lhs (epilog_stmt, new_temp);
5313 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5314 scalar_results.safe_push (new_temp);
5315 }
5316 else
5317 {
5318 /* Case 3: Create:
5319 s = extract_field <v_out2, 0>
5320 for (offset = element_size;
5321 offset < vector_size;
5322 offset += element_size)
5323 {
5324 Create: s' = extract_field <v_out2, offset>
5325 Create: s = op <s, s'> // For non SLP cases
5326 } */
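	  /* Roughly, for a four-element vector v and a PLUS reduction in the
	     non-SLP case this amounts to the scalar sequence
	       s = v[0]; s = s + v[1]; s = s + v[2]; s = s + v[3];
	     whereas in the SLP case each extracted element is pushed to
	     SCALAR_RESULTS without being combined.  */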
5327
5328 if (dump_enabled_p ())
5329 dump_printf_loc (MSG_NOTE, vect_location,
5330 "Reduce using scalar code.\n");
5331
5332 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5333 int element_bitsize = tree_to_uhwi (bitsize);
5334 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5335 {
5336 int bit_offset;
5337 if (gimple_code (new_phi) == GIMPLE_PHI)
5338 vec_temp = PHI_RESULT (new_phi);
5339 else
5340 vec_temp = gimple_assign_lhs (new_phi);
5341 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5342 bitsize_zero_node);
5343 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5344 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5345 gimple_assign_set_lhs (epilog_stmt, new_temp);
5346 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5347
5348 /* In SLP we don't need to apply the reduction operation, so we just
5349 collect the s' values in SCALAR_RESULTS.  */
5350 if (slp_reduc)
5351 scalar_results.safe_push (new_temp);
5352
5353 for (bit_offset = element_bitsize;
5354 bit_offset < vec_size_in_bits;
5355 bit_offset += element_bitsize)
5356 {
5357 tree bitpos = bitsize_int (bit_offset);
5358 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5359 bitsize, bitpos);
5360
5361 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5362 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5363 gimple_assign_set_lhs (epilog_stmt, new_name);
5364 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5365
5366 if (slp_reduc)
5367 {
5368 /* In SLP we don't need to apply the reduction operation, so
5369 we just collect the s' values in SCALAR_RESULTS.  */
5370 new_temp = new_name;
5371 scalar_results.safe_push (new_name);
5372 }
5373 else
5374 {
5375 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5376 new_name, new_temp);
5377 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5378 gimple_assign_set_lhs (epilog_stmt, new_temp);
5379 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5380 }
5381 }
5382 }
5383
5384 /* The only case in which we need to reduce scalar results in SLP is
5385 unrolling.  If the size of SCALAR_RESULTS is greater than
5386 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5387 REDUC_GROUP_SIZE.  */
5388 if (slp_reduc)
5389 {
5390 tree res, first_res, new_res;
5391 gimple *new_stmt;
5392
5393 /* Reduce multiple scalar results in case of SLP unrolling. */
5394 for (j = group_size; scalar_results.iterate (j, &res);
5395 j++)
5396 {
5397 first_res = scalar_results[j % group_size];
5398 new_stmt = gimple_build_assign (new_scalar_dest, code,
5399 first_res, res);
5400 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5401 gimple_assign_set_lhs (new_stmt, new_res);
5402 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5403 scalar_results[j % group_size] = new_res;
5404 }
5405 }
5406 else
5407 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5408 scalar_results.safe_push (new_temp);
5409 }
5410
5411 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5412 == INTEGER_INDUC_COND_REDUCTION)
5413 && !operand_equal_p (initial_def, induc_val, 0))
5414 {
5415 /* Earlier we set the initial value to be a vector of induc_val
5416 values.  Check the result and if it is induc_val then replace
5417 it with the original initial value, unless induc_val is
5418 the same as initial_def already.  */
5419 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5420 induc_val);
5421
5422 tree tmp = make_ssa_name (new_scalar_dest);
5423 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5424 initial_def, new_temp);
5425 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5426 scalar_results[0] = tmp;
5427 }
5428 }
5429
5430 vect_finalize_reduction:
5431
5432 if (double_reduc)
5433 loop = loop->inner;
5434
5435 /* 2.5 Adjust the final result by the initial value of the reduction
5436 variable. (When such adjustment is not needed, then
5437 'adjustment_def' is zero). For example, if code is PLUS we create:
5438 new_temp = loop_exit_def + adjustment_def */
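  /* A schematic example, under the assumption that ADJUSTMENT_DEF was
     produced by get_initial_def_for_reduction: for
       s = 10;  for (...)  s += a[i];
     the vector accumulator is seeded with the neutral value { 0, ..., 0 }
     and the original initial value 10 is carried in ADJUSTMENT_DEF, so it
     is added back to the reduced result here.  */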
5439
5440 if (adjustment_def)
5441 {
5442 gcc_assert (!slp_reduc);
5443 if (nested_in_vect_loop)
5444 {
5445 new_phi = new_phis[0];
5446 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5447 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5448 new_dest = vect_create_destination_var (scalar_dest, vectype);
5449 }
5450 else
5451 {
5452 new_temp = scalar_results[0];
5453 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5454 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5455 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5456 }
5457
5458 epilog_stmt = gimple_build_assign (new_dest, expr);
5459 new_temp = make_ssa_name (new_dest, epilog_stmt);
5460 gimple_assign_set_lhs (epilog_stmt, new_temp);
5461 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5462 if (nested_in_vect_loop)
5463 {
5464 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5465 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5466 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5467
5468 if (!double_reduc)
5469 scalar_results.quick_push (new_temp);
5470 else
5471 scalar_results[0] = new_temp;
5472 }
5473 else
5474 scalar_results[0] = new_temp;
5475
5476 new_phis[0] = epilog_stmt;
5477 }
5478
5479 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5480 phis with new adjusted scalar results, i.e., replace use <s_out0>
5481 with use <s_out4>.
5482
5483 Transform:
5484 loop_exit:
5485 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5486 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5487 v_out2 = reduce <v_out1>
5488 s_out3 = extract_field <v_out2, 0>
5489 s_out4 = adjust_result <s_out3>
5490 use <s_out0>
5491 use <s_out0>
5492
5493 into:
5494
5495 loop_exit:
5496 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5497 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5498 v_out2 = reduce <v_out1>
5499 s_out3 = extract_field <v_out2, 0>
5500 s_out4 = adjust_result <s_out3>
5501 use <s_out4>
5502 use <s_out4> */
5503
5504
5505 /* In an SLP reduction chain we reduce the vector results into one vector
5506 if necessary, hence we set REDUC_GROUP_SIZE to 1 here.  SCALAR_DEST is
5507 the LHS of the last stmt in the reduction chain, since we are looking
5508 for the loop exit phi node.  */
5509 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5510 {
5511 stmt_vec_info dest_stmt_info
5512 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5513 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5514 group_size = 1;
5515 }
5516
5517 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5518 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5519 Therefore, we need to match SCALAR_RESULTS with the corresponding
5520 statements.  The first (REDUC_GROUP_SIZE / number of new vector stmts)
5521 scalar results correspond to the first vector stmt, etc.
5522 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
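  /* E.g. with REDUC_GROUP_SIZE == 4 and two new vector stmts, RATIO is 2:
     scalar_results[0] and [1] correspond to new_phis[0], while
     scalar_results[2] and [3] correspond to new_phis[1].  */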
5523 if (group_size > new_phis.length ())
5524 {
5525 ratio = group_size / new_phis.length ();
5526 gcc_assert (!(group_size % new_phis.length ()));
5527 }
5528 else
5529 ratio = 1;
5530
5531 stmt_vec_info epilog_stmt_info = NULL;
5532 for (k = 0; k < group_size; k++)
5533 {
5534 if (k % ratio == 0)
5535 {
5536 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5537 reduction_phi_info = reduction_phis[k / ratio];
5538 if (double_reduc)
5539 inner_phi = inner_phis[k / ratio];
5540 }
5541
5542 if (slp_reduc)
5543 {
5544 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5545
5546 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5547 /* SLP statements can't participate in patterns. */
5548 gcc_assert (!orig_stmt_info);
5549 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5550 }
5551
5552 phis.create (3);
5553 /* Find the loop-closed-use at the loop exit of the original scalar
5554 result. (The reduction result is expected to have two immediate uses -
5555 one at the latch block, and one at the loop exit). */
5556 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5557 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5558 && !is_gimple_debug (USE_STMT (use_p)))
5559 phis.safe_push (USE_STMT (use_p));
5560
5561 /* While we expect to have found an exit_phi because of loop-closed-ssa
5562 form, we can end up without one if the scalar cycle is dead.  */
5563
5564 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5565 {
5566 if (outer_loop)
5567 {
5568 stmt_vec_info exit_phi_vinfo
5569 = loop_vinfo->lookup_stmt (exit_phi);
5570 gphi *vect_phi;
5571
5572 /* FORNOW.  Currently we do not support the case in which an inner-loop
5573 reduction is not used in the outer-loop (but only outside the
5574 outer-loop), unless it is a double reduction.  */
5575 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5576 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5577 || double_reduc);
5578
5579 if (double_reduc)
5580 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5581 else
5582 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5583 if (!double_reduc
5584 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5585 != vect_double_reduction_def)
5586 continue;
5587
5588 /* Handle double reduction:
5589
5590 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5591 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5592 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5593 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5594
5595 At that point the regular reduction (stmt2 and stmt3) is
5596 already vectorized, as well as the exit phi node, stmt4.
5597 Here we vectorize the phi node of double reduction, stmt1, and
5598 update all relevant statements. */
5599
5600 /* Go through all the uses of s2 to find double reduction phi
5601 node, i.e., stmt1 above. */
5602 orig_name = PHI_RESULT (exit_phi);
5603 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5604 {
5605 stmt_vec_info use_stmt_vinfo;
5606 tree vect_phi_init, preheader_arg, vect_phi_res;
5607 basic_block bb = gimple_bb (use_stmt);
5608
5609 /* Check that USE_STMT is really a double reduction phi
5610 node.  */
5611 if (gimple_code (use_stmt) != GIMPLE_PHI
5612 || gimple_phi_num_args (use_stmt) != 2
5613 || bb->loop_father != outer_loop)
5614 continue;
5615 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5616 if (!use_stmt_vinfo
5617 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5618 != vect_double_reduction_def)
5619 continue;
5620
5621 /* Create vector phi node for double reduction:
5622 vs1 = phi <vs0, vs2>
5623 vs1 was created previously in this function by a call to
5624 vect_get_vec_def_for_operand and is stored in
5625 vec_initial_def;
5626 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5627 vs0 is created here. */
5628
5629 /* Create vector phi node. */
5630 vect_phi = create_phi_node (vec_initial_def, bb);
5631 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5632
5633 /* Create vs0 - initial def of the double reduction phi. */
5634 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5635 loop_preheader_edge (outer_loop));
5636 vect_phi_init = get_initial_def_for_reduction
5637 (stmt_info, preheader_arg, NULL);
5638
5639 /* Update phi node arguments with vs0 and vs2. */
5640 add_phi_arg (vect_phi, vect_phi_init,
5641 loop_preheader_edge (outer_loop),
5642 UNKNOWN_LOCATION);
5643 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5644 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5645 if (dump_enabled_p ())
5646 dump_printf_loc (MSG_NOTE, vect_location,
5647 "created double reduction phi node: %G",
5648 vect_phi);
5649
5650 vect_phi_res = PHI_RESULT (vect_phi);
5651
5652 /* Replace the use, i.e., set the correct vs1 in the regular
5653 reduction phi node. FORNOW, NCOPIES is always 1, so the
5654 loop is redundant. */
5655 stmt_vec_info use_info = reduction_phi_info;
5656 for (j = 0; j < ncopies; j++)
5657 {
5658 edge pr_edge = loop_preheader_edge (loop);
5659 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5660 pr_edge->dest_idx, vect_phi_res);
5661 use_info = STMT_VINFO_RELATED_STMT (use_info);
5662 }
5663 }
5664 }
5665 }
5666
5667 phis.release ();
5668 if (nested_in_vect_loop)
5669 {
5670 if (double_reduc)
5671 loop = outer_loop;
5672 else
5673 continue;
5674 }
5675
5676 phis.create (3);
5677 /* Find the loop-closed-use at the loop exit of the original scalar
5678 result. (The reduction result is expected to have two immediate uses,
5679 one at the latch block, and one at the loop exit). For double
5680 reductions we are looking for exit phis of the outer loop. */
5681 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5682 {
5683 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5684 {
5685 if (!is_gimple_debug (USE_STMT (use_p)))
5686 phis.safe_push (USE_STMT (use_p));
5687 }
5688 else
5689 {
5690 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5691 {
5692 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5693
5694 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5695 {
5696 if (!flow_bb_inside_loop_p (loop,
5697 gimple_bb (USE_STMT (phi_use_p)))
5698 && !is_gimple_debug (USE_STMT (phi_use_p)))
5699 phis.safe_push (USE_STMT (phi_use_p));
5700 }
5701 }
5702 }
5703 }
5704
5705 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5706 {
5707 /* Replace the uses: */
5708 orig_name = PHI_RESULT (exit_phi);
5709 scalar_result = scalar_results[k];
5710 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5711 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5712 SET_USE (use_p, scalar_result);
5713 }
5714
5715 phis.release ();
5716 }
5717 }
5718
5719 /* Return a vector of type VECTYPE that is equal to the vector select
5720 operation "MASK ? VEC : IDENTITY". Insert the select statements
5721 before GSI. */
5722
5723 static tree
5724 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5725 tree vec, tree identity)
5726 {
5727 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5728 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5729 mask, vec, identity);
5730 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5731 return cond;
5732 }
5733
5734 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5735 order, starting with LHS. Insert the extraction statements before GSI and
5736 associate the new scalar SSA names with variable SCALAR_DEST.
5737 Return the SSA name for the result. */
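/* For illustration only: assuming CODE is PLUS_EXPR and VECTOR_RHS has four
   elements v[0..3], the expansion computes

     result = (((LHS + v[0]) + v[1]) + v[2]) + v[3]

   one scalar statement at a time, preserving strict left-to-right order.  */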
5738
5739 static tree
5740 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5741 tree_code code, tree lhs, tree vector_rhs)
5742 {
5743 tree vectype = TREE_TYPE (vector_rhs);
5744 tree scalar_type = TREE_TYPE (vectype);
5745 tree bitsize = TYPE_SIZE (scalar_type);
5746 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5747 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5748
5749 for (unsigned HOST_WIDE_INT bit_offset = 0;
5750 bit_offset < vec_size_in_bits;
5751 bit_offset += element_bitsize)
5752 {
5753 tree bitpos = bitsize_int (bit_offset);
5754 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5755 bitsize, bitpos);
5756
5757 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5758 rhs = make_ssa_name (scalar_dest, stmt);
5759 gimple_assign_set_lhs (stmt, rhs);
5760 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5761
5762 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5763 tree new_name = make_ssa_name (scalar_dest, stmt);
5764 gimple_assign_set_lhs (stmt, new_name);
5765 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5766 lhs = new_name;
5767 }
5768 return lhs;
5769 }
5770
5771 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5772 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5773 statement. CODE is the operation performed by STMT_INFO and OPS are
5774 its scalar operands. REDUC_INDEX is the index of the operand in
5775 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5776 implements in-order reduction, or IFN_LAST if we should open-code it.
5777 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5778 that should be used to control the operation in a fully-masked loop. */
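/* Illustrative sketch of the intent (not a description of the generated
   statements): a loop such as

     double s = init;
     for (int i = 0; i < n; ++i)
       s += a[i];

   compiled without -ffast-math must not reassociate the additions, so
   rather than accumulating independent partial sums per vector lane we
   either call an in-order reduction function such as IFN_FOLD_LEFT_PLUS
   (when REDUC_FN is available) or open-code the element-by-element
   folding via vect_expand_fold_left.  */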
5779
5780 static bool
5781 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5782 gimple_stmt_iterator *gsi,
5783 stmt_vec_info *vec_stmt, slp_tree slp_node,
5784 gimple *reduc_def_stmt,
5785 tree_code code, internal_fn reduc_fn,
5786 tree ops[3], tree vectype_in,
5787 int reduc_index, vec_loop_masks *masks)
5788 {
5789 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5790 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5791 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5792 stmt_vec_info new_stmt_info = NULL;
5793
5794 int ncopies;
5795 if (slp_node)
5796 ncopies = 1;
5797 else
5798 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5799
5800 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5801 gcc_assert (ncopies == 1);
5802 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5803 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5804 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5805 == FOLD_LEFT_REDUCTION);
5806
5807 if (slp_node)
5808 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5809 TYPE_VECTOR_SUBPARTS (vectype_in)));
5810
5811 tree op0 = ops[1 - reduc_index];
5812
5813 int group_size = 1;
5814 stmt_vec_info scalar_dest_def_info;
5815 auto_vec<tree> vec_oprnds0;
5816 if (slp_node)
5817 {
5818 vect_get_vec_defs (op0, NULL_TREE, stmt_info, &vec_oprnds0, NULL,
5819 slp_node);
5820 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5821 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5822 }
5823 else
5824 {
5825 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5826 vec_oprnds0.create (1);
5827 vec_oprnds0.quick_push (loop_vec_def0);
5828 scalar_dest_def_info = stmt_info;
5829 }
5830
5831 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5832 tree scalar_type = TREE_TYPE (scalar_dest);
5833 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5834
5835 int vec_num = vec_oprnds0.length ();
5836 gcc_assert (vec_num == 1 || slp_node);
5837 tree vec_elem_type = TREE_TYPE (vectype_out);
5838 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5839
5840 tree vector_identity = NULL_TREE;
5841 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5842 vector_identity = build_zero_cst (vectype_out);
5843
5844 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5845 int i;
5846 tree def0;
5847 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5848 {
5849 gimple *new_stmt;
5850 tree mask = NULL_TREE;
5851 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5852 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5853
5854 /* Handle MINUS by adding the negative. */
5855 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5856 {
5857 tree negated = make_ssa_name (vectype_out);
5858 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5859 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5860 def0 = negated;
5861 }
5862
5863 if (mask)
5864 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5865 vector_identity);
5866
5867 /* On the first iteration the input is simply the scalar phi
5868 result, and for subsequent iterations it is the output of
5869 the preceding operation. */
5870 if (reduc_fn != IFN_LAST)
5871 {
5872 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5873 /* For chained SLP reductions the output of the previous reduction
5874 operation serves as the input of the next. For the final statement
5875 the output cannot be a temporary - we reuse the original
5876 scalar destination of the last statement. */
5877 if (i != vec_num - 1)
5878 {
5879 gimple_set_lhs (new_stmt, scalar_dest_var);
5880 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5881 gimple_set_lhs (new_stmt, reduc_var);
5882 }
5883 }
5884 else
5885 {
5886 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5887 reduc_var, def0);
5888 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5889 /* Remove the statement, so that we can use the same code paths
5890 as for statements that we've just created. */
5891 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5892 gsi_remove (&tmp_gsi, false);
5893 }
5894
5895 if (i == vec_num - 1)
5896 {
5897 gimple_set_lhs (new_stmt, scalar_dest);
5898 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5899 new_stmt);
5900 }
5901 else
5902 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5903 new_stmt, gsi);
5904
5905 if (slp_node)
5906 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5907 }
5908
5909 if (!slp_node)
5910 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5911
5912 return true;
5913 }
5914
5915 /* Function is_nonwrapping_integer_induction.
5916
5917 Check if the induction defined by STMT_VINFO (which is part of loop
5918 LOOP) both increments and does not overflow.  */
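/* A worked example of the check below, for illustration only: for an
   8-bit unsigned IV with base 250, step 1 and at most 10 iterations,
   the largest value reached is 250 + 1 * 10 = 260, which needs 9 bits
   of precision and may therefore wrap, so we return false.  */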
5919
5920 static bool
5921 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5922 {
5923 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5924 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5925 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5926 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5927 widest_int ni, max_loop_value, lhs_max;
5928 wi::overflow_type overflow = wi::OVF_NONE;
5929
5930 /* Make sure the loop is integer based. */
5931 if (TREE_CODE (base) != INTEGER_CST
5932 || TREE_CODE (step) != INTEGER_CST)
5933 return false;
5934
5935 /* Check that the maximum value reached by the IV will not wrap.  */
5936
5937 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5938 return true;
5939
5940 if (! max_stmt_executions (loop, &ni))
5941 return false;
5942
5943 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5944 &overflow);
5945 if (overflow)
5946 return false;
5947
5948 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5949 TYPE_SIGN (lhs_type), &overflow);
5950 if (overflow)
5951 return false;
5952
5953 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5954 <= TYPE_PRECISION (lhs_type));
5955 }
5956
5957 /* Function vectorizable_reduction.
5958
5959 Check if STMT_INFO performs a reduction operation that can be vectorized.
5960 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5961 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5962 Return true if STMT_INFO is vectorizable in this way.
5963
5964 This function also handles reduction idioms (patterns) that have been
5965 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5966 may be of this form:
5967 X = pattern_expr (arg0, arg1, ..., X)
5968 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5969 sequence that had been detected and replaced by the pattern-stmt
5970 (STMT_INFO).
5971
5972 This function also handles reduction of condition expressions, for example:
5973 for (int i = 0; i < N; i++)
5974 if (a[i] < value)
5975 last = a[i];
5976 This is handled by vectorising the loop and creating an additional vector
5977 containing the loop indexes for which "a[i] < value" was true. In the
5978 function epilogue this is reduced to a single max value and then used to
5979 index into the vector of results.
5980
5981 In some cases of reduction patterns, the type of the reduction variable X is
5982 different than the type of the other arguments of STMT_INFO.
5983 In such cases, the vectype that is used when transforming STMT_INFO into
5984 a vector stmt is different than the vectype that is used to determine the
5985 vectorization factor, because it consists of a different number of elements
5986 than the actual number of elements that are being operated upon in parallel.
5987
5988 For example, consider an accumulation of shorts into an int accumulator.
5989 On some targets it's possible to vectorize this pattern operating on 8
5990 shorts at a time (hence, the vectype for purposes of determining the
5991 vectorization factor should be V8HI); on the other hand, the vectype that
5992 is used to create the vector form is actually V4SI (the type of the result).
5993
5994 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5995 indicates what is the actual level of parallelism (V8HI in the example), so
5996 that the right vectorization factor would be derived. This vectype
5997 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5998 be used to create the vectorized stmt. The right vectype for the vectorized
5999 stmt is obtained from the type of the result X:
6000 get_vectype_for_scalar_type (TREE_TYPE (X))
6001
6002 This means that, contrary to "regular" reductions (or "regular" stmts in
6003 general), the following equation:
6004 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6005 does *NOT* necessarily hold for reduction patterns. */
6006
6007 bool
6008 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6009 stmt_vec_info *vec_stmt, slp_tree slp_node,
6010 slp_instance slp_node_instance,
6011 stmt_vector_for_cost *cost_vec)
6012 {
6013 tree vec_dest;
6014 tree scalar_dest;
6015 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6016 tree vectype_in = NULL_TREE;
6017 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6018 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6019 enum tree_code code, orig_code;
6020 internal_fn reduc_fn;
6021 machine_mode vec_mode;
6022 int op_type;
6023 optab optab;
6024 tree new_temp = NULL_TREE;
6025 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6026 stmt_vec_info cond_stmt_vinfo = NULL;
6027 enum tree_code cond_reduc_op_code = ERROR_MARK;
6028 tree scalar_type;
6029 bool is_simple_use;
6030 int i;
6031 int ncopies;
6032 int epilog_copies;
6033 stmt_vec_info prev_stmt_info, prev_phi_info;
6034 bool single_defuse_cycle = false;
6035 stmt_vec_info new_stmt_info = NULL;
6036 int j;
6037 tree ops[3];
6038 enum vect_def_type dts[3];
6039 bool nested_cycle = false, found_nested_cycle_def = false;
6040 bool double_reduc = false;
6041 basic_block def_bb;
6042 struct loop * def_stmt_loop;
6043 tree def_arg;
6044 auto_vec<tree> vec_oprnds0;
6045 auto_vec<tree> vec_oprnds1;
6046 auto_vec<tree> vec_oprnds2;
6047 auto_vec<tree> vect_defs;
6048 auto_vec<stmt_vec_info> phis;
6049 int vec_num;
6050 tree def0, tem;
6051 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6052 tree cond_reduc_val = NULL_TREE;
6053
6054 /* Make sure it was already recognized as a reduction computation. */
6055 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6056 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6057 return false;
6058
6059 if (nested_in_vect_loop_p (loop, stmt_info))
6060 {
6061 loop = loop->inner;
6062 nested_cycle = true;
6063 }
6064
6065 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6066 gcc_assert (slp_node
6067 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6068
6069 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6070 {
6071 tree phi_result = gimple_phi_result (phi);
6072 /* Analysis is fully done on the reduction stmt invocation. */
6073 if (! vec_stmt)
6074 {
6075 if (slp_node)
6076 slp_node_instance->reduc_phis = slp_node;
6077
6078 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6079 return true;
6080 }
6081
6082 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6083 /* Leave the scalar phi in place. Note that checking
6084 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6085 for reductions involving a single statement. */
6086 return true;
6087
6088 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6089 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6090
6091 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6092 == EXTRACT_LAST_REDUCTION)
6093 /* Leave the scalar phi in place. */
6094 return true;
6095
6096 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6097 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6098 {
6099 tree op = gimple_op (reduc_stmt, k);
6100 if (op == phi_result)
6101 continue;
6102 if (k == 1
6103 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6104 continue;
6105 if (!vectype_in
6106 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6107 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6108 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6109 break;
6110 }
6111 gcc_assert (vectype_in);
6112
6113 if (slp_node)
6114 ncopies = 1;
6115 else
6116 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6117
6118 stmt_vec_info use_stmt_info;
6119 if (ncopies > 1
6120 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6121 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6122 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6123 single_defuse_cycle = true;
6124
6125 /* Create the destination vector */
6126 scalar_dest = gimple_assign_lhs (reduc_stmt);
6127 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6128
6129 if (slp_node)
6130 /* The size vect_schedule_slp_instance computes is off for us. */
6131 vec_num = vect_get_num_vectors
6132 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6133 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6134 vectype_in);
6135 else
6136 vec_num = 1;
6137
6138 /* Generate the reduction PHIs upfront. */
6139 prev_phi_info = NULL;
6140 for (j = 0; j < ncopies; j++)
6141 {
6142 if (j == 0 || !single_defuse_cycle)
6143 {
6144 for (i = 0; i < vec_num; i++)
6145 {
6146 /* Create the reduction-phi that defines the reduction
6147 operand. */
6148 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6149 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6150
6151 if (slp_node)
6152 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6153 else
6154 {
6155 if (j == 0)
6156 STMT_VINFO_VEC_STMT (stmt_info)
6157 = *vec_stmt = new_phi_info;
6158 else
6159 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6160 prev_phi_info = new_phi_info;
6161 }
6162 }
6163 }
6164 }
6165
6166 return true;
6167 }
6168
6169 /* 1. Is vectorizable reduction? */
6170 /* Not supportable if the reduction variable is used in the loop, unless
6171 it's a reduction chain. */
6172 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6173 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6174 return false;
6175
6176 /* Reductions that are not used even in an enclosing outer-loop
6177 are expected to be "live" (used out of the loop). */
6178 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6179 && !STMT_VINFO_LIVE_P (stmt_info))
6180 return false;
6181
6182 /* 2. Has this been recognized as a reduction pattern?
6183
6184 Check if STMT represents a pattern that has been recognized
6185 in earlier analysis stages. For stmts that represent a pattern,
6186 the STMT_VINFO_RELATED_STMT field records the last stmt in
6187 the original sequence that constitutes the pattern. */
6188
6189 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6190 if (orig_stmt_info)
6191 {
6192 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6193 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6194 }
6195
6196 /* 3. Check the operands of the operation. The first operands are defined
6197 inside the loop body. The last operand is the reduction variable,
6198 which is defined by the loop-header-phi. */
6199
6200 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6201
6202 /* Flatten RHS. */
6203 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6204 {
6205 case GIMPLE_BINARY_RHS:
6206 code = gimple_assign_rhs_code (stmt);
6207 op_type = TREE_CODE_LENGTH (code);
6208 gcc_assert (op_type == binary_op);
6209 ops[0] = gimple_assign_rhs1 (stmt);
6210 ops[1] = gimple_assign_rhs2 (stmt);
6211 break;
6212
6213 case GIMPLE_TERNARY_RHS:
6214 code = gimple_assign_rhs_code (stmt);
6215 op_type = TREE_CODE_LENGTH (code);
6216 gcc_assert (op_type == ternary_op);
6217 ops[0] = gimple_assign_rhs1 (stmt);
6218 ops[1] = gimple_assign_rhs2 (stmt);
6219 ops[2] = gimple_assign_rhs3 (stmt);
6220 break;
6221
6222 case GIMPLE_UNARY_RHS:
6223 return false;
6224
6225 default:
6226 gcc_unreachable ();
6227 }
6228
6229 if (code == COND_EXPR && slp_node)
6230 return false;
6231
6232 scalar_dest = gimple_assign_lhs (stmt);
6233 scalar_type = TREE_TYPE (scalar_dest);
6234 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6235 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6236 return false;
6237
6238 /* Do not try to vectorize bit-precision reductions. */
6239 if (!type_has_mode_precision_p (scalar_type))
6240 return false;
6241
6242 /* All uses but the last are expected to be defined in the loop.
6243 The last use is the reduction variable.  In case of a nested cycle this
6244 assumption is not true: we use reduc_index to record the index of the
6245 reduction variable. */
6246 stmt_vec_info reduc_def_info = NULL;
6247 int reduc_index = -1;
6248 for (i = 0; i < op_type; i++)
6249 {
6250 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6251 if (i == 0 && code == COND_EXPR)
6252 continue;
6253
6254 stmt_vec_info def_stmt_info;
6255 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6256 &def_stmt_info);
6257 dt = dts[i];
6258 gcc_assert (is_simple_use);
6259 if (dt == vect_reduction_def)
6260 {
6261 reduc_def_info = def_stmt_info;
6262 reduc_index = i;
6263 continue;
6264 }
6265 else if (tem)
6266 {
6267 /* To properly compute ncopies we are interested in the widest
6268 input type in case we're looking at a widening accumulation. */
6269 if (!vectype_in
6270 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6271 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6272 vectype_in = tem;
6273 }
6274
6275 if (dt != vect_internal_def
6276 && dt != vect_external_def
6277 && dt != vect_constant_def
6278 && dt != vect_induction_def
6279 && !(dt == vect_nested_cycle && nested_cycle))
6280 return false;
6281
6282 if (dt == vect_nested_cycle)
6283 {
6284 found_nested_cycle_def = true;
6285 reduc_def_info = def_stmt_info;
6286 reduc_index = i;
6287 }
6288
6289 if (i == 1 && code == COND_EXPR)
6290 {
6291 /* Record how the value of the COND_EXPR is defined. */
6292 if (dt == vect_constant_def)
6293 {
6294 cond_reduc_dt = dt;
6295 cond_reduc_val = ops[i];
6296 }
6297 if (dt == vect_induction_def
6298 && def_stmt_info
6299 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6300 {
6301 cond_reduc_dt = dt;
6302 cond_stmt_vinfo = def_stmt_info;
6303 }
6304 }
6305 }
6306
6307 if (!vectype_in)
6308 vectype_in = vectype_out;
6309
6310 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6311 directly used in the stmt.  */
6312 if (reduc_index == -1)
6313 {
6314 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6315 {
6316 if (dump_enabled_p ())
6317 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6318 "in-order reduction chain without SLP.\n");
6319 return false;
6320 }
6321
6322 if (orig_stmt_info)
6323 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6324 else
6325 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6326 }
6327
6328 if (! reduc_def_info)
6329 return false;
6330
6331 gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt);
6332 if (!reduc_def_phi)
6333 return false;
6334
6335 if (!(reduc_index == -1
6336 || dts[reduc_index] == vect_reduction_def
6337 || dts[reduc_index] == vect_nested_cycle
6338 || ((dts[reduc_index] == vect_internal_def
6339 || dts[reduc_index] == vect_external_def
6340 || dts[reduc_index] == vect_constant_def
6341 || dts[reduc_index] == vect_induction_def)
6342 && nested_cycle && found_nested_cycle_def)))
6343 {
6344 /* For pattern-recognized stmts, orig_stmt might be a reduction,
6345 but some helper statements for the pattern might not be, or
6346 might be COND_EXPRs with reduction uses in the condition. */
6347 gcc_assert (orig_stmt_info);
6348 return false;
6349 }
6350
6351 /* PHIs should not participate in patterns. */
6352 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6353 enum vect_reduction_type v_reduc_type
6354 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6355 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6356
6357 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6358 /* If we have a condition reduction, see if we can simplify it further. */
6359 if (v_reduc_type == COND_REDUCTION)
6360 {
6361 /* TODO: We can't yet handle reduction chains, since we need to treat
6362 each COND_EXPR in the chain specially, not just the last one.
6363 E.g. for:
6364
6365 x_1 = PHI <x_3, ...>
6366 x_2 = a_2 ? ... : x_1;
6367 x_3 = a_3 ? ... : x_2;
6368
6369 we're interested in the last element in x_3 for which a_2 || a_3
6370 is true, whereas the current reduction chain handling would
6371 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6372 as a reduction operation. */
6373 if (reduc_index == -1)
6374 {
6375 if (dump_enabled_p ())
6376 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6377 "conditional reduction chains not supported\n");
6378 return false;
6379 }
6380
6381 /* vect_is_simple_reduction ensured that operand 2 is the
6382 loop-carried operand. */
6383 gcc_assert (reduc_index == 2);
6384
6385 /* Loop peeling modifies the initial value of the reduction PHI, which
6386 makes the reduction stmt to be transformed different from the
6387 original stmt analyzed.  We need to record the reduction code for a
6388 CONST_COND_REDUCTION-type reduction at the analysis stage so that
6389 it can be used directly at the transform stage. */
6390 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6391 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6392 {
6393 /* Also set the reduction type to CONST_COND_REDUCTION. */
6394 gcc_assert (cond_reduc_dt == vect_constant_def);
6395 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6396 }
6397 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6398 vectype_in, OPTIMIZE_FOR_SPEED))
6399 {
6400 if (dump_enabled_p ())
6401 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6402 "optimizing condition reduction with"
6403 " FOLD_EXTRACT_LAST.\n");
6404 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6405 }
6406 else if (cond_reduc_dt == vect_induction_def)
6407 {
6408 tree base
6409 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6410 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6411
6412 gcc_assert (TREE_CODE (base) == INTEGER_CST
6413 && TREE_CODE (step) == INTEGER_CST);
6414 cond_reduc_val = NULL_TREE;
6415 /* Find a suitable value: below base for MAX_EXPR, above base for
6416 MIN_EXPR; for now, punt if base is the minimum value of the type
6417 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6418 if (tree_int_cst_sgn (step) == -1)
6419 {
6420 cond_reduc_op_code = MIN_EXPR;
6421 if (tree_int_cst_sgn (base) == -1)
6422 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6423 else if (tree_int_cst_lt (base,
6424 TYPE_MAX_VALUE (TREE_TYPE (base))))
6425 cond_reduc_val
6426 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6427 }
6428 else
6429 {
6430 cond_reduc_op_code = MAX_EXPR;
6431 if (tree_int_cst_sgn (base) == 1)
6432 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6433 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6434 base))
6435 cond_reduc_val
6436 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6437 }
6438 if (cond_reduc_val)
6439 {
6440 if (dump_enabled_p ())
6441 dump_printf_loc (MSG_NOTE, vect_location,
6442 "condition expression based on "
6443 "integer induction.\n");
6444 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6445 = INTEGER_INDUC_COND_REDUCTION;
6446 }
6447 }
6448 else if (cond_reduc_dt == vect_constant_def)
6449 {
6450 enum vect_def_type cond_initial_dt;
6451 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6452 tree cond_initial_val
6453 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6454
6455 gcc_assert (cond_reduc_val != NULL_TREE);
6456 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6457 if (cond_initial_dt == vect_constant_def
6458 && types_compatible_p (TREE_TYPE (cond_initial_val),
6459 TREE_TYPE (cond_reduc_val)))
6460 {
6461 tree e = fold_binary (LE_EXPR, boolean_type_node,
6462 cond_initial_val, cond_reduc_val);
6463 if (e && (integer_onep (e) || integer_zerop (e)))
6464 {
6465 if (dump_enabled_p ())
6466 dump_printf_loc (MSG_NOTE, vect_location,
6467 "condition expression based on "
6468 "compile time constant.\n");
6469 /* Record reduction code at analysis stage. */
6470 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6471 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6472 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6473 = CONST_COND_REDUCTION;
6474 }
6475 }
6476 }
6477 }
6478
6479 if (orig_stmt_info)
6480 gcc_assert (tmp == orig_stmt_info
6481 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6482 else
6483 /* We changed STMT to be the first stmt in the reduction chain, hence
6484 we check that in this case the first element in the chain is STMT. */
6485 gcc_assert (tmp == stmt_info
6486 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6487
6488 if (STMT_VINFO_LIVE_P (reduc_def_info))
6489 return false;
6490
6491 if (slp_node)
6492 ncopies = 1;
6493 else
6494 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6495
6496 gcc_assert (ncopies >= 1);
6497
6498 vec_mode = TYPE_MODE (vectype_in);
6499 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6500
6501 if (code == COND_EXPR)
6502 {
6503 /* Only call during the analysis stage, otherwise we'll lose
6504 STMT_VINFO_TYPE. */
6505 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6506 ops[reduc_index], 0, NULL,
6507 cost_vec))
6508 {
6509 if (dump_enabled_p ())
6510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6511 "unsupported condition in reduction\n");
6512 return false;
6513 }
6514 }
6515 else
6516 {
6517 /* 4. Supportable by target? */
6518
6519 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6520 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6521 {
6522 /* Shifts and rotates are only supported by vectorizable_shift,
6523 not vectorizable_reduction. */
6524 if (dump_enabled_p ())
6525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6526 "unsupported shift or rotation.\n");
6527 return false;
6528 }
6529
6530 /* 4.1. check support for the operation in the loop */
6531 optab = optab_for_tree_code (code, vectype_in, optab_default);
6532 if (!optab)
6533 {
6534 if (dump_enabled_p ())
6535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6536 "no optab.\n");
6537
6538 return false;
6539 }
6540
6541 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6542 {
6543 if (dump_enabled_p ())
6544 dump_printf (MSG_NOTE, "op not supported by target.\n");
6545
6546 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6547 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6548 return false;
6549
6550 if (dump_enabled_p ())
6551 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6552 }
6553
6554 /* Worthwhile without SIMD support? */
6555 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6556 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6557 {
6558 if (dump_enabled_p ())
6559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6560 "not worthwhile without SIMD support.\n");
6561
6562 return false;
6563 }
6564 }
6565
6566 /* 4.2. Check support for the epilog operation.
6567
6568 If STMT represents a reduction pattern, then the type of the
6569 reduction variable may be different than the type of the rest
6570 of the arguments. For example, consider the case of accumulation
6571 of shorts into an int accumulator.  The original code:
6572 S1: int_a = (int) short_a;
6573 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6574
6575 was replaced with:
6576 STMT: int_acc = widen_sum <short_a, int_acc>
6577
6578 This means that:
6579 1. The tree-code that is used to create the vector operation in the
6580 epilog code (that reduces the partial results) is not the
6581 tree-code of STMT, but is rather the tree-code of the original
6582 stmt from the pattern that STMT is replacing.  I.e., in the example
6583 above we want to use 'widen_sum' in the loop, but 'plus' in the
6584 epilog.
6585 2. The type (mode) we use to check available target support
6586 for the vector operation to be created in the *epilog*, is
6587 determined by the type of the reduction variable (in the example
6588 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6589 However the type (mode) we use to check available target support
6590 for the vector operation to be created *inside the loop*, is
6591 determined by the type of the other arguments to STMT (in the
6592 example we'd check this: optab_handler (widen_sum_optab,
6593 vect_short_mode)).
6594
6595 This is contrary to "regular" reductions, in which the types of all
6596 the arguments are the same as the type of the reduction variable.
6597 For "regular" reductions we can therefore use the same vector type
6598 (and also the same tree-code) when generating the epilog code and
6599 when generating the code inside the loop. */
6600
6601 vect_reduction_type reduction_type
6602 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6603 if (orig_stmt_info
6604 && (reduction_type == TREE_CODE_REDUCTION
6605 || reduction_type == FOLD_LEFT_REDUCTION))
6606 {
6607 /* This is a reduction pattern: get the vectype from the type of the
6608 reduction variable, and get the tree-code from orig_stmt. */
6609 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6610 gcc_assert (vectype_out);
6611 vec_mode = TYPE_MODE (vectype_out);
6612 }
6613 else
6614 {
6615 /* Regular reduction: the same vectype and tree-code that are used for
6616 the vector code inside the loop can also be used for the epilog code. */
6617 orig_code = code;
6618
6619 if (code == MINUS_EXPR)
6620 orig_code = PLUS_EXPR;
6621
6622 /* For simple condition reductions, replace with the actual expression
6623 we want to base our reduction around. */
6624 if (reduction_type == CONST_COND_REDUCTION)
6625 {
6626 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6627 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6628 }
6629 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6630 orig_code = cond_reduc_op_code;
6631 }
6632
6633 if (nested_cycle)
6634 {
6635 def_bb = gimple_bb (reduc_def_phi);
6636 def_stmt_loop = def_bb->loop_father;
6637 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6638 loop_preheader_edge (def_stmt_loop));
6639 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6640 if (def_arg_stmt_info
6641 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6642 == vect_double_reduction_def))
6643 double_reduc = true;
6644 }
6645
6646 reduc_fn = IFN_LAST;
6647
6648 if (reduction_type == TREE_CODE_REDUCTION
6649 || reduction_type == FOLD_LEFT_REDUCTION
6650 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6651 || reduction_type == CONST_COND_REDUCTION)
6652 {
6653 if (reduction_type == FOLD_LEFT_REDUCTION
6654 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6655 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6656 {
6657 if (reduc_fn != IFN_LAST
6658 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6659 OPTIMIZE_FOR_SPEED))
6660 {
6661 if (dump_enabled_p ())
6662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6663 "reduc op not supported by target.\n");
6664
6665 reduc_fn = IFN_LAST;
6666 }
6667 }
6668 else
6669 {
6670 if (!nested_cycle || double_reduc)
6671 {
6672 if (dump_enabled_p ())
6673 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6674 "no reduc code for scalar code.\n");
6675
6676 return false;
6677 }
6678 }
6679 }
6680 else if (reduction_type == COND_REDUCTION)
6681 {
6682 int scalar_precision
6683 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6684 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6685 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6686 nunits_out);
6687
6688 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6689 OPTIMIZE_FOR_SPEED))
6690 reduc_fn = IFN_REDUC_MAX;
6691 }
6692
6693 if (reduction_type != EXTRACT_LAST_REDUCTION
6694 && (!nested_cycle || double_reduc)
6695 && reduc_fn == IFN_LAST
6696 && !nunits_out.is_constant ())
6697 {
6698 if (dump_enabled_p ())
6699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6700 "missing target support for reduction on"
6701 " variable-length vectors.\n");
6702 return false;
6703 }
6704
6705 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6706 && ncopies > 1)
6707 {
6708 if (dump_enabled_p ())
6709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6710 "multiple types in double reduction or condition "
6711 "reduction.\n");
6712 return false;
6713 }
6714
6715 /* For SLP reductions, see if there is a neutral value we can use. */
6716 tree neutral_op = NULL_TREE;
6717 if (slp_node)
6718 neutral_op = neutral_op_for_slp_reduction
6719 (slp_node_instance->reduc_phis, code,
6720 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
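 /* Illustrative note (added commentary, not part of the original sources):
    for a chained SLP summation such as

      s += a[2*i] + a[2*i + 1];

    the neutral value would be 0, and for a product it would be 1 - a value
    that can pad the initial vector without changing the result.  For
    operations with no such constant, neutral_op may remain NULL_TREE.  */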
6721
6722 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6723 {
6724 /* We can't support in-order reductions of code such as this:
6725
6726 for (int i = 0; i < n1; ++i)
6727 for (int j = 0; j < n2; ++j)
6728 l += a[j];
6729
6730 since GCC effectively transforms the loop when vectorizing:
6731
6732 for (int i = 0; i < n1 / VF; ++i)
6733 for (int j = 0; j < n2; ++j)
6734 for (int k = 0; k < VF; ++k)
6735 l += a[j];
6736
6737 which is a reassociation of the original operation. */
6738 if (dump_enabled_p ())
6739 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6740 "in-order double reduction not supported.\n");
6741
6742 return false;
6743 }
6744
6745 if (reduction_type == FOLD_LEFT_REDUCTION
6746 && slp_node
6747 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6748 {
6749 /* We cannot use in-order reductions in this case because there is
6750 an implicit reassociation of the operations involved. */
6751 if (dump_enabled_p ())
6752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6753 "in-order unchained SLP reductions not supported.\n");
6754 return false;
6755 }
6756
6757 /* For double reductions, and for SLP reductions with a neutral value,
6758 we construct a variable-length initial vector by loading a vector
6759 full of the neutral value and then shift-and-inserting the start
6760 values into the low-numbered elements. */
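 /* Illustrative sketch (added commentary): with start value S and neutral
    value 0, the initial vector for a variable-length sum would be built
    roughly as

      init = { 0, 0, ..., 0 };          // splat of the neutral value
      init = VEC_SHL_INSERT (init, S);  // -> { S, 0, ..., 0 }

    which is why IFN_VEC_SHL_INSERT support is required below.  */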
6761 if ((double_reduc || neutral_op)
6762 && !nunits_out.is_constant ()
6763 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6764 vectype_out, OPTIMIZE_FOR_SPEED))
6765 {
6766 if (dump_enabled_p ())
6767 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6768 "reduction on variable-length vectors requires"
6769 " target support for a vector-shift-and-insert"
6770 " operation.\n");
6771 return false;
6772 }
6773
6774 /* Check extra constraints for variable-length unchained SLP reductions. */
6775 if (STMT_SLP_TYPE (stmt_info)
6776 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6777 && !nunits_out.is_constant ())
6778 {
6779 /* We checked above that we could build the initial vector when
6780 there's a neutral element value. Check here for the case in
6781 which each SLP statement has its own initial value and in which
6782 that value needs to be repeated for every instance of the
6783 statement within the initial vector. */
6784 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6785 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6786 if (!neutral_op
6787 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6788 {
6789 if (dump_enabled_p ())
6790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6791 "unsupported form of SLP reduction for"
6792 " variable-length vectors: cannot build"
6793 " initial vector.\n");
6794 return false;
6795 }
6796 /* The epilogue code relies on the number of elements being a multiple
6797 of the group size. The duplicate-and-interleave approach to setting
6798 up the initial vector does too. */
6799 if (!multiple_p (nunits_out, group_size))
6800 {
6801 if (dump_enabled_p ())
6802 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6803 "unsupported form of SLP reduction for"
6804 " variable-length vectors: the vector size"
6805 " is not a multiple of the number of results.\n");
6806 return false;
6807 }
6808 }
6809
6810 /* In case of widening multiplication by a constant, we update the type
6811 of the constant to be the type of the other operand. We check that the
6812 constant fits the type in the pattern recognition pass. */
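 /* Hypothetical example (added commentary): in a dot-product pattern along
    the lines of

      sum += (int) b[i] * 3;

    ops[] holds the two narrow multiplication operands, so the INTEGER_CST 3
    would be converted here to the type of the other (narrow) operand.  */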
6813 if (code == DOT_PROD_EXPR
6814 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6815 {
6816 if (TREE_CODE (ops[0]) == INTEGER_CST)
6817 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6818 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6819 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6820 else
6821 {
6822 if (dump_enabled_p ())
6823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6824 "invalid types in dot-prod\n");
6825
6826 return false;
6827 }
6828 }
6829
6830 if (reduction_type == COND_REDUCTION)
6831 {
6832 widest_int ni;
6833
6834 if (! max_loop_iterations (loop, &ni))
6835 {
6836 if (dump_enabled_p ())
6837 dump_printf_loc (MSG_NOTE, vect_location,
6838 "loop count not known, cannot create cond "
6839 "reduction.\n");
6840 return false;
6841 }
6842 /* Convert backedges to iterations. */
6843 ni += 1;
6844
6845 /* The additional index will be the same type as the condition. Check
6846 that the iteration count fits into this type less one (because we'll
6847 use up the zero slot for when there are no matches). */
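 /* Worked example (added commentary): if cr_index_scalar_type ends up as a
    16-bit unsigned type, max_index is 65535 and the check below rejects
    loops with 65535 or more iterations, keeping index 0 free for the
    "no match" case.  */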
6848 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6849 if (wi::geu_p (ni, wi::to_widest (max_index)))
6850 {
6851 if (dump_enabled_p ())
6852 dump_printf_loc (MSG_NOTE, vect_location,
6853 "loop size is greater than data size.\n");
6854 return false;
6855 }
6856 }
6857
6858 /* In case the vectorization factor (VF) is bigger than the number
6859 of elements that we can fit in a vectype (nunits), we have to generate
6860 more than one vector stmt - i.e., we need to "unroll" the
6861 vector stmt by a factor of VF/nunits. For more details see documentation
6862 in vectorizable_operation. */
6863
6864 /* If the reduction is used in an outer loop we need to generate
6865 VF intermediate results, like so (e.g. for ncopies=2):
6866 r0 = phi (init, r0)
6867 r1 = phi (init, r1)
6868 r0 = x0 + r0;
6869 r1 = x1 + r1;
6870 (i.e. we generate VF results in 2 registers).
6871 In this case we have a separate def-use cycle for each copy, and therefore
6872 for each copy we get the vector def for the reduction variable from the
6873 respective phi node created for this copy.
6874
6875 Otherwise (the reduction is unused in the loop nest), we can combine
6876 together intermediate results, like so (e.g. for ncopies=2):
6877 r = phi (init, r)
6878 r = x0 + r;
6879 r = x1 + r;
6880 (i.e. we generate VF/2 results in a single register).
6881 In this case for each copy we get the vector def for the reduction variable
6882 from the vectorized reduction operation generated in the previous iteration.
6883
6884 This only works when we see both the reduction PHI and its only consumer
6885 in vectorizable_reduction and there are no intermediate stmts
6886 participating. */
6887 stmt_vec_info use_stmt_info;
6888 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6889 if (ncopies > 1
6890 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6891 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6892 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6893 {
6894 single_defuse_cycle = true;
6895 epilog_copies = 1;
6896 }
6897 else
6898 epilog_copies = ncopies;
6899
6900 /* If the reduction stmt is one of the patterns that have a lane-reducing
6901 operation embedded, we cannot handle the case of !single_defuse_cycle. */
6902 if ((ncopies > 1
6903 && ! single_defuse_cycle)
6904 && (code == DOT_PROD_EXPR
6905 || code == WIDEN_SUM_EXPR
6906 || code == SAD_EXPR))
6907 {
6908 if (dump_enabled_p ())
6909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6910 "multi def-use cycle not possible for lane-reducing "
6911 "reduction operation\n");
6912 return false;
6913 }
6914
6915 if (slp_node)
6916 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6917 else
6918 vec_num = 1;
6919
6920 internal_fn cond_fn = get_conditional_internal_fn (code);
6921 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6922
6923 if (!vec_stmt) /* transformation not required. */
6924 {
6925 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6926 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6927 {
6928 if (reduction_type != FOLD_LEFT_REDUCTION
6929 && (cond_fn == IFN_LAST
6930 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6931 OPTIMIZE_FOR_SPEED)))
6932 {
6933 if (dump_enabled_p ())
6934 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6935 "can't use a fully-masked loop because no"
6936 " conditional operation is available.\n");
6937 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6938 }
6939 else if (reduc_index == -1)
6940 {
6941 if (dump_enabled_p ())
6942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6943 "can't use a fully-masked loop for chained"
6944 " reductions.\n");
6945 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6946 }
6947 else
6948 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6949 vectype_in);
6950 }
6951 if (dump_enabled_p ()
6952 && reduction_type == FOLD_LEFT_REDUCTION)
6953 dump_printf_loc (MSG_NOTE, vect_location,
6954 "using an in-order (fold-left) reduction.\n");
6955 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6956 return true;
6957 }
6958
6959 /* Transform. */
6960
6961 if (dump_enabled_p ())
6962 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6963
6964 /* FORNOW: Multiple types are not supported for condition. */
6965 if (code == COND_EXPR)
6966 gcc_assert (ncopies == 1);
6967
6968 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6969
6970 if (reduction_type == FOLD_LEFT_REDUCTION)
6971 return vectorize_fold_left_reduction
6972 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6973 reduc_fn, ops, vectype_in, reduc_index, masks);
6974
6975 if (reduction_type == EXTRACT_LAST_REDUCTION)
6976 {
6977 gcc_assert (!slp_node);
6978 return vectorizable_condition (stmt_info, gsi, vec_stmt,
6979 NULL, reduc_index, NULL, NULL);
6980 }
6981
6982 /* Create the destination vector */
6983 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6984
6985 prev_stmt_info = NULL;
6986 prev_phi_info = NULL;
6987 if (!slp_node)
6988 {
6989 vec_oprnds0.create (1);
6990 vec_oprnds1.create (1);
6991 if (op_type == ternary_op)
6992 vec_oprnds2.create (1);
6993 }
6994
6995 phis.create (vec_num);
6996 vect_defs.create (vec_num);
6997 if (!slp_node)
6998 vect_defs.quick_push (NULL_TREE);
6999
7000 if (slp_node)
7001 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7002 else
7003 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7004
7005 for (j = 0; j < ncopies; j++)
7006 {
7007 if (code == COND_EXPR)
7008 {
7009 gcc_assert (!slp_node);
7010 vectorizable_condition (stmt_info, gsi, vec_stmt,
7011 PHI_RESULT (phis[0]->stmt),
7012 reduc_index, NULL, NULL);
7013 /* Multiple types are not supported for condition. */
7014 break;
7015 }
7016
7017 /* Handle uses. */
7018 if (j == 0)
7019 {
7020 if (slp_node)
7021 {
7022 /* Get vec defs for all the operands except the reduction index,
7023 ensuring the ordering of the ops in the vector is kept. */
7024 auto_vec<tree, 3> slp_ops;
7025 auto_vec<vec<tree>, 3> vec_defs;
7026
7027 slp_ops.quick_push (ops[0]);
7028 slp_ops.quick_push (ops[1]);
7029 if (op_type == ternary_op)
7030 slp_ops.quick_push (ops[2]);
7031
7032 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7033
7034 vec_oprnds0.safe_splice (vec_defs[0]);
7035 vec_defs[0].release ();
7036 vec_oprnds1.safe_splice (vec_defs[1]);
7037 vec_defs[1].release ();
7038 if (op_type == ternary_op)
7039 {
7040 vec_oprnds2.safe_splice (vec_defs[2]);
7041 vec_defs[2].release ();
7042 }
7043 }
7044 else
7045 {
7046 vec_oprnds0.quick_push
7047 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7048 vec_oprnds1.quick_push
7049 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7050 if (op_type == ternary_op)
7051 vec_oprnds2.quick_push
7052 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7053 }
7054 }
7055 else
7056 {
7057 if (!slp_node)
7058 {
7059 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7060
7061 if (single_defuse_cycle && reduc_index == 0)
7062 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7063 else
7064 vec_oprnds0[0]
7065 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7066 vec_oprnds0[0]);
7067 if (single_defuse_cycle && reduc_index == 1)
7068 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7069 else
7070 vec_oprnds1[0]
7071 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7072 vec_oprnds1[0]);
7073 if (op_type == ternary_op)
7074 {
7075 if (single_defuse_cycle && reduc_index == 2)
7076 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7077 else
7078 vec_oprnds2[0]
7079 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7080 vec_oprnds2[0]);
7081 }
7082 }
7083 }
7084
7085 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7086 {
7087 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7088 if (masked_loop_p)
7089 {
7090 /* Make sure that the reduction accumulator is vop[0]. */
7091 if (reduc_index == 1)
7092 {
7093 gcc_assert (commutative_tree_code (code));
7094 std::swap (vop[0], vop[1]);
7095 }
7096 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7097 vectype_in, i * ncopies + j);
7098 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7099 vop[0], vop[1],
7100 vop[0]);
7101 new_temp = make_ssa_name (vec_dest, call);
7102 gimple_call_set_lhs (call, new_temp);
7103 gimple_call_set_nothrow (call, true);
7104 new_stmt_info
7105 = vect_finish_stmt_generation (stmt_info, call, gsi);
7106 }
7107 else
7108 {
7109 if (op_type == ternary_op)
7110 vop[2] = vec_oprnds2[i];
7111
7112 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7113 vop[0], vop[1], vop[2]);
7114 new_temp = make_ssa_name (vec_dest, new_stmt);
7115 gimple_assign_set_lhs (new_stmt, new_temp);
7116 new_stmt_info
7117 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7118 }
7119
7120 if (slp_node)
7121 {
7122 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7123 vect_defs.quick_push (new_temp);
7124 }
7125 else
7126 vect_defs[0] = new_temp;
7127 }
7128
7129 if (slp_node)
7130 continue;
7131
7132 if (j == 0)
7133 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7134 else
7135 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7136
7137 prev_stmt_info = new_stmt_info;
7138 }
7139
7140 /* Finalize the reduction-phi (set its arguments) and create the
7141 epilog reduction code. */
7142 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7143 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7144
7145 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7146 epilog_copies, reduc_fn, phis,
7147 double_reduc, slp_node, slp_node_instance,
7148 cond_reduc_val, cond_reduc_op_code,
7149 neutral_op);
7150
7151 return true;
7152 }
7153
7154 /* Function vect_min_worthwhile_factor.
7155
7156 For a loop where we could vectorize the operation indicated by CODE,
7157 return the minimum vectorization factor that makes it worthwhile
7158 to use generic vectors. */
7159 static unsigned int
7160 vect_min_worthwhile_factor (enum tree_code code)
7161 {
7162 switch (code)
7163 {
7164 case PLUS_EXPR:
7165 case MINUS_EXPR:
7166 case NEGATE_EXPR:
7167 return 4;
7168
7169 case BIT_AND_EXPR:
7170 case BIT_IOR_EXPR:
7171 case BIT_XOR_EXPR:
7172 case BIT_NOT_EXPR:
7173 return 2;
7174
7175 default:
7176 return INT_MAX;
7177 }
7178 }
7179
7180 /* Return true if VINFO indicates we are doing loop vectorization and if
7181 it is worth decomposing CODE operations into scalar operations for
7182 that loop's vectorization factor. */
7183
7184 bool
7185 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7186 {
7187 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7188 unsigned HOST_WIDE_INT value;
7189 return (loop_vinfo
7190 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7191 && value >= vect_min_worthwhile_factor (code));
7192 }
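 /* Usage note (added commentary): e.g. for code == PLUS_EXPR,
    vect_min_worthwhile_factor returns 4, so a loop with a known
    vectorization factor of 4 or more is considered worthwhile even
    without SIMD support, while a factor of 2 is not.  */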
7193
7194 /* Function vectorizable_induction
7195
7196 Check if STMT_INFO performs an induction computation that can be vectorized.
7197 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7198 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7199 Return true if STMT_INFO is vectorizable in this way. */
7200
7201 bool
7202 vectorizable_induction (stmt_vec_info stmt_info,
7203 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7204 stmt_vec_info *vec_stmt, slp_tree slp_node,
7205 stmt_vector_for_cost *cost_vec)
7206 {
7207 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7208 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7209 unsigned ncopies;
7210 bool nested_in_vect_loop = false;
7211 struct loop *iv_loop;
7212 tree vec_def;
7213 edge pe = loop_preheader_edge (loop);
7214 basic_block new_bb;
7215 tree new_vec, vec_init, vec_step, t;
7216 tree new_name;
7217 gimple *new_stmt;
7218 gphi *induction_phi;
7219 tree induc_def, vec_dest;
7220 tree init_expr, step_expr;
7221 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7222 unsigned i;
7223 tree expr;
7224 gimple_seq stmts;
7225 imm_use_iterator imm_iter;
7226 use_operand_p use_p;
7227 gimple *exit_phi;
7228 edge latch_e;
7229 tree loop_arg;
7230 gimple_stmt_iterator si;
7231
7232 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7233 if (!phi)
7234 return false;
7235
7236 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7237 return false;
7238
7239 /* Make sure it was recognized as induction computation. */
7240 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7241 return false;
7242
7243 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7244 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7245
7246 if (slp_node)
7247 ncopies = 1;
7248 else
7249 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7250 gcc_assert (ncopies >= 1);
7251
7252 /* FORNOW. These restrictions should be relaxed. */
7253 if (nested_in_vect_loop_p (loop, stmt_info))
7254 {
7255 imm_use_iterator imm_iter;
7256 use_operand_p use_p;
7257 gimple *exit_phi;
7258 edge latch_e;
7259 tree loop_arg;
7260
7261 if (ncopies > 1)
7262 {
7263 if (dump_enabled_p ())
7264 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7265 "multiple types in nested loop.\n");
7266 return false;
7267 }
7268
7269 /* FORNOW: outer loop induction with SLP not supported. */
7270 if (STMT_SLP_TYPE (stmt_info))
7271 return false;
7272
7273 exit_phi = NULL;
7274 latch_e = loop_latch_edge (loop->inner);
7275 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7276 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7277 {
7278 gimple *use_stmt = USE_STMT (use_p);
7279 if (is_gimple_debug (use_stmt))
7280 continue;
7281
7282 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7283 {
7284 exit_phi = use_stmt;
7285 break;
7286 }
7287 }
7288 if (exit_phi)
7289 {
7290 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7291 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7292 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7293 {
7294 if (dump_enabled_p ())
7295 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7296 "inner-loop induction only used outside "
7297 "of the outer vectorized loop.\n");
7298 return false;
7299 }
7300 }
7301
7302 nested_in_vect_loop = true;
7303 iv_loop = loop->inner;
7304 }
7305 else
7306 iv_loop = loop;
7307 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7308
7309 if (slp_node && !nunits.is_constant ())
7310 {
7311 /* The current SLP code creates the initial value element-by-element. */
7312 if (dump_enabled_p ())
7313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7314 "SLP induction not supported for variable-length"
7315 " vectors.\n");
7316 return false;
7317 }
7318
7319 if (!vec_stmt) /* transformation not required. */
7320 {
7321 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7322 DUMP_VECT_SCOPE ("vectorizable_induction");
7323 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7324 return true;
7325 }
7326
7327 /* Transform. */
7328
7329 /* Compute a vector variable, initialized with the first VF values of
7330 the induction variable. E.g., for an iv with IV_PHI='X' and
7331 evolution S, for a vector of 4 units, we want to compute:
7332 [X, X + S, X + 2*S, X + 3*S]. */
7333
7334 if (dump_enabled_p ())
7335 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7336
7337 latch_e = loop_latch_edge (iv_loop);
7338 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7339
7340 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7341 gcc_assert (step_expr != NULL_TREE);
7342
7343 pe = loop_preheader_edge (iv_loop);
7344 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7345 loop_preheader_edge (iv_loop));
7346
7347 stmts = NULL;
7348 if (!nested_in_vect_loop)
7349 {
7350 /* Convert the initial value to the desired type. */
7351 tree new_type = TREE_TYPE (vectype);
7352 init_expr = gimple_convert (&stmts, new_type, init_expr);
7353
7354 /* If we are using the loop mask to "peel" for alignment then we need
7355 to adjust the start value here. */
7356 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7357 if (skip_niters != NULL_TREE)
7358 {
7359 if (FLOAT_TYPE_P (vectype))
7360 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7361 skip_niters);
7362 else
7363 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7364 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7365 skip_niters, step_expr);
7366 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7367 init_expr, skip_step);
7368 }
7369 }
7370
7371 /* Convert the step to the desired type. */
7372 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7373
7374 if (stmts)
7375 {
7376 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7377 gcc_assert (!new_bb);
7378 }
7379
7380 /* Find the first insertion point in the BB. */
7381 basic_block bb = gimple_bb (phi);
7382 si = gsi_after_labels (bb);
7383
7384 /* For SLP induction we have to generate several IVs as for example
7385 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7386 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7387 [VF*S, VF*S, VF*S, VF*S] for all. */
7388 if (slp_node)
7389 {
7390 /* Enforced above. */
7391 unsigned int const_nunits = nunits.to_constant ();
7392
7393 /* Generate [VF*S, VF*S, ... ]. */
7394 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7395 {
7396 expr = build_int_cst (integer_type_node, vf);
7397 expr = fold_convert (TREE_TYPE (step_expr), expr);
7398 }
7399 else
7400 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7401 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7402 expr, step_expr);
7403 if (! CONSTANT_CLASS_P (new_name))
7404 new_name = vect_init_vector (stmt_info, new_name,
7405 TREE_TYPE (step_expr), NULL);
7406 new_vec = build_vector_from_val (vectype, new_name);
7407 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7408
7409 /* Now generate the IVs. */
7410 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7411 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7412 unsigned elts = const_nunits * nvects;
7413 unsigned nivs = least_common_multiple (group_size,
7414 const_nunits) / const_nunits;
7415 gcc_assert (elts % group_size == 0);
7416 tree elt = init_expr;
7417 unsigned ivn;
7418 for (ivn = 0; ivn < nivs; ++ivn)
7419 {
7420 tree_vector_builder elts (vectype, const_nunits, 1);
7421 stmts = NULL;
7422 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7423 {
7424 if (ivn*const_nunits + eltn >= group_size
7425 && (ivn * const_nunits + eltn) % group_size == 0)
7426 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7427 elt, step_expr);
7428 elts.quick_push (elt);
7429 }
7430 vec_init = gimple_build_vector (&stmts, &elts);
7431 if (stmts)
7432 {
7433 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7434 gcc_assert (!new_bb);
7435 }
7436
7437 /* Create the induction-phi that defines the induction-operand. */
7438 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7439 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7440 stmt_vec_info induction_phi_info
7441 = loop_vinfo->add_stmt (induction_phi);
7442 induc_def = PHI_RESULT (induction_phi);
7443
7444 /* Create the iv update inside the loop */
7445 vec_def = make_ssa_name (vec_dest);
7446 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7447 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7448 loop_vinfo->add_stmt (new_stmt);
7449
7450 /* Set the arguments of the phi node: */
7451 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7452 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7453 UNKNOWN_LOCATION);
7454
7455 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7456 }
7457
7458 /* Re-use IVs when we can. */
7459 if (ivn < nvects)
7460 {
7461 unsigned vfp
7462 = least_common_multiple (group_size, const_nunits) / group_size;
7463 /* Generate [VF'*S, VF'*S, ... ]. */
7464 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7465 {
7466 expr = build_int_cst (integer_type_node, vfp);
7467 expr = fold_convert (TREE_TYPE (step_expr), expr);
7468 }
7469 else
7470 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7471 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7472 expr, step_expr);
7473 if (! CONSTANT_CLASS_P (new_name))
7474 new_name = vect_init_vector (stmt_info, new_name,
7475 TREE_TYPE (step_expr), NULL);
7476 new_vec = build_vector_from_val (vectype, new_name);
7477 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7478 for (; ivn < nvects; ++ivn)
7479 {
7480 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7481 tree def;
7482 if (gimple_code (iv) == GIMPLE_PHI)
7483 def = gimple_phi_result (iv);
7484 else
7485 def = gimple_assign_lhs (iv);
7486 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7487 PLUS_EXPR,
7488 def, vec_step);
7489 if (gimple_code (iv) == GIMPLE_PHI)
7490 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7491 else
7492 {
7493 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7494 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7495 }
7496 SLP_TREE_VEC_STMTS (slp_node).quick_push
7497 (loop_vinfo->add_stmt (new_stmt));
7498 }
7499 }
7500
7501 return true;
7502 }
7503
7504 /* Create the vector that holds the initial_value of the induction. */
7505 if (nested_in_vect_loop)
7506 {
7507 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7508 been created during vectorization of previous stmts. We obtain it
7509 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7510 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7511 /* If the initial value is not of proper type, convert it. */
7512 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7513 {
7514 new_stmt
7515 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7516 vect_simple_var,
7517 "vec_iv_"),
7518 VIEW_CONVERT_EXPR,
7519 build1 (VIEW_CONVERT_EXPR, vectype,
7520 vec_init));
7521 vec_init = gimple_assign_lhs (new_stmt);
7522 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7523 new_stmt);
7524 gcc_assert (!new_bb);
7525 loop_vinfo->add_stmt (new_stmt);
7526 }
7527 }
7528 else
7529 {
7530 /* iv_loop is the loop to be vectorized. Create:
7531 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7532 stmts = NULL;
7533 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7534
7535 unsigned HOST_WIDE_INT const_nunits;
7536 if (nunits.is_constant (&const_nunits))
7537 {
7538 tree_vector_builder elts (vectype, const_nunits, 1);
7539 elts.quick_push (new_name);
7540 for (i = 1; i < const_nunits; i++)
7541 {
7542 /* Create: new_name_i = new_name + step_expr */
7543 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7544 new_name, step_expr);
7545 elts.quick_push (new_name);
7546 }
7547 /* Create a vector from [new_name_0, new_name_1, ...,
7548 new_name_nunits-1] */
7549 vec_init = gimple_build_vector (&stmts, &elts);
7550 }
7551 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7552 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7553 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7554 new_name, step_expr);
7555 else
7556 {
7557 /* Build:
7558 [base, base, base, ...]
7559 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7560 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7561 gcc_assert (flag_associative_math);
7562 tree index = build_index_vector (vectype, 0, 1);
7563 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7564 new_name);
7565 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7566 step_expr);
7567 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7568 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7569 vec_init, step_vec);
7570 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7571 vec_init, base_vec);
7572 }
7573
7574 if (stmts)
7575 {
7576 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7577 gcc_assert (!new_bb);
7578 }
7579 }
7580
7581
7582 /* Create the vector that holds the step of the induction. */
7583 if (nested_in_vect_loop)
7584 /* iv_loop is nested in the loop to be vectorized. Generate:
7585 vec_step = [S, S, S, S] */
7586 new_name = step_expr;
7587 else
7588 {
7589 /* iv_loop is the loop to be vectorized. Generate:
7590 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7591 gimple_seq seq = NULL;
7592 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7593 {
7594 expr = build_int_cst (integer_type_node, vf);
7595 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7596 }
7597 else
7598 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7599 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7600 expr, step_expr);
7601 if (seq)
7602 {
7603 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7604 gcc_assert (!new_bb);
7605 }
7606 }
7607
7608 t = unshare_expr (new_name);
7609 gcc_assert (CONSTANT_CLASS_P (new_name)
7610 || TREE_CODE (new_name) == SSA_NAME);
7611 new_vec = build_vector_from_val (vectype, t);
7612 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7613
7614
7615 /* Create the following def-use cycle:
7616 loop prolog:
7617 vec_init = ...
7618 vec_step = ...
7619 loop:
7620 vec_iv = PHI <vec_init, vec_loop>
7621 ...
7622 STMT
7623 ...
7624 vec_loop = vec_iv + vec_step; */
7625
7626 /* Create the induction-phi that defines the induction-operand. */
7627 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7628 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7629 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7630 induc_def = PHI_RESULT (induction_phi);
7631
7632 /* Create the iv update inside the loop */
7633 vec_def = make_ssa_name (vec_dest);
7634 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7635 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7636 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7637
7638 /* Set the arguments of the phi node: */
7639 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7640 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7641 UNKNOWN_LOCATION);
7642
7643 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7644
7645 /* In case the vectorization factor (VF) is bigger than the number
7646 of elements that we can fit in a vectype (nunits), we have to generate
7647 more than one vector stmt - i.e., we need to "unroll" the
7648 vector stmt by a factor of VF/nunits. For more details see documentation
7649 in vectorizable_operation. */
7650
7651 if (ncopies > 1)
7652 {
7653 gimple_seq seq = NULL;
7654 stmt_vec_info prev_stmt_vinfo;
7655 /* FORNOW. This restriction should be relaxed. */
7656 gcc_assert (!nested_in_vect_loop);
7657
7658 /* Create the vector that holds the step of the induction. */
7659 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7660 {
7661 expr = build_int_cst (integer_type_node, nunits);
7662 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7663 }
7664 else
7665 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7666 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7667 expr, step_expr);
7668 if (seq)
7669 {
7670 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7671 gcc_assert (!new_bb);
7672 }
7673
7674 t = unshare_expr (new_name);
7675 gcc_assert (CONSTANT_CLASS_P (new_name)
7676 || TREE_CODE (new_name) == SSA_NAME);
7677 new_vec = build_vector_from_val (vectype, t);
7678 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7679
7680 vec_def = induc_def;
7681 prev_stmt_vinfo = induction_phi_info;
7682 for (i = 1; i < ncopies; i++)
7683 {
7684 /* vec_i = vec_prev + vec_step */
7685 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7686 vec_def, vec_step);
7687 vec_def = make_ssa_name (vec_dest, new_stmt);
7688 gimple_assign_set_lhs (new_stmt, vec_def);
7689
7690 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7691 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7692 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7693 prev_stmt_vinfo = new_stmt_info;
7694 }
7695 }
7696
7697 if (nested_in_vect_loop)
7698 {
7699 /* Find the loop-closed exit-phi of the induction, and record
7700 the final vector of induction results: */
7701 exit_phi = NULL;
7702 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7703 {
7704 gimple *use_stmt = USE_STMT (use_p);
7705 if (is_gimple_debug (use_stmt))
7706 continue;
7707
7708 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7709 {
7710 exit_phi = use_stmt;
7711 break;
7712 }
7713 }
7714 if (exit_phi)
7715 {
7716 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7717 /* FORNOW. Currently not supporting the case that an inner-loop induction
7718 is not used in the outer-loop (i.e. only outside the outer-loop). */
7719 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7720 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7721
7722 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7723 if (dump_enabled_p ())
7724 dump_printf_loc (MSG_NOTE, vect_location,
7725 "vector of inductions after inner-loop:%G",
7726 new_stmt);
7727 }
7728 }
7729
7730
7731 if (dump_enabled_p ())
7732 dump_printf_loc (MSG_NOTE, vect_location,
7733 "transform induction: created def-use cycle: %G%G",
7734 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7735
7736 return true;
7737 }
7738
7739 /* Function vectorizable_live_operation.
7740
7741 STMT_INFO computes a value that is used outside the loop. Check if
7742 it can be supported. */
7743
7744 bool
7745 vectorizable_live_operation (stmt_vec_info stmt_info,
7746 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7747 slp_tree slp_node, int slp_index,
7748 stmt_vec_info *vec_stmt,
7749 stmt_vector_for_cost *)
7750 {
7751 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7752 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7753 imm_use_iterator imm_iter;
7754 tree lhs, lhs_type, bitsize, vec_bitsize;
7755 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7756 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7757 int ncopies;
7758 gimple *use_stmt;
7759 auto_vec<tree> vec_oprnds;
7760 int vec_entry = 0;
7761 poly_uint64 vec_index = 0;
7762
7763 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7764
7765 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7766 return false;
7767
7768 /* FORNOW. CHECKME. */
7769 if (nested_in_vect_loop_p (loop, stmt_info))
7770 return false;
7771
7772 /* If STMT is not relevant and it is a simple assignment and its inputs are
7773 invariant then it can remain in place, unvectorized. The original last
7774 scalar value that it computes will be used. */
7775 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7776 {
7777 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7778 if (dump_enabled_p ())
7779 dump_printf_loc (MSG_NOTE, vect_location,
7780 "statement is simple and uses invariant. Leaving in "
7781 "place.\n");
7782 return true;
7783 }
7784
7785 if (slp_node)
7786 ncopies = 1;
7787 else
7788 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7789
7790 if (slp_node)
7791 {
7792 gcc_assert (slp_index >= 0);
7793
7794 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7795 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7796
7797 /* Get the last occurrence of the scalar index from the concatenation of
7798 all the slp vectors. Calculate which slp vector it is and the index
7799 within. */
7800 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
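 /* Worked example (added commentary): with num_vec = 2 vectors of
    nunits = 4 lanes holding num_scalar = 3 scalar results, slp_index 1
    gives pos = 2*4 - 3 + 1 = 6, i.e. vec_entry = 1 and vec_index = 2
    after the division below.  */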
7801
7802 /* Calculate which vector contains the result, and which lane of
7803 that vector we need. */
7804 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7805 {
7806 if (dump_enabled_p ())
7807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7808 "Cannot determine which vector holds the"
7809 " final result.\n");
7810 return false;
7811 }
7812 }
7813
7814 if (!vec_stmt)
7815 {
7816 /* No transformation required. */
7817 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7818 {
7819 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7820 OPTIMIZE_FOR_SPEED))
7821 {
7822 if (dump_enabled_p ())
7823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7824 "can't use a fully-masked loop because "
7825 "the target doesn't support extract last "
7826 "reduction.\n");
7827 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7828 }
7829 else if (slp_node)
7830 {
7831 if (dump_enabled_p ())
7832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7833 "can't use a fully-masked loop because an "
7834 "SLP statement is live after the loop.\n");
7835 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7836 }
7837 else if (ncopies > 1)
7838 {
7839 if (dump_enabled_p ())
7840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7841 "can't use a fully-masked loop because"
7842 " ncopies is greater than 1.\n");
7843 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7844 }
7845 else
7846 {
7847 gcc_assert (ncopies == 1 && !slp_node);
7848 vect_record_loop_mask (loop_vinfo,
7849 &LOOP_VINFO_MASKS (loop_vinfo),
7850 1, vectype);
7851 }
7852 }
7853 return true;
7854 }
7855
7856 /* Use the lhs of the original scalar statement. */
7857 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7858
7859 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7860 : gimple_get_lhs (stmt);
7861 lhs_type = TREE_TYPE (lhs);
7862
7863 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7864 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7865 : TYPE_SIZE (TREE_TYPE (vectype)));
7866 vec_bitsize = TYPE_SIZE (vectype);
7867
7868 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7869 tree vec_lhs, bitstart;
7870 if (slp_node)
7871 {
7872 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7873
7874 /* Get the correct slp vectorized stmt. */
7875 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7876 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7877 vec_lhs = gimple_phi_result (phi);
7878 else
7879 vec_lhs = gimple_get_lhs (vec_stmt);
7880
7881 /* Get entry to use. */
7882 bitstart = bitsize_int (vec_index);
7883 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7884 }
7885 else
7886 {
7887 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7888 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7889 gcc_checking_assert (ncopies == 1
7890 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7891
7892 /* For multiple copies, get the last copy. */
7893 for (int i = 1; i < ncopies; ++i)
7894 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7895
7896 /* Get the last lane in the vector. */
7897 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7898 }
7899
7900 gimple_seq stmts = NULL;
7901 tree new_tree;
7902 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7903 {
7904 /* Emit:
7905
7906 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7907
7908 where VEC_LHS is the vectorized live-out result and MASK is
7909 the loop mask for the final iteration. */
7910 gcc_assert (ncopies == 1 && !slp_node);
7911 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7912 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7913 1, vectype, 0);
7914 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7915 scalar_type, mask, vec_lhs);
7916
7917 /* Convert the extracted vector element to the required scalar type. */
7918 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7919 }
7920 else
7921 {
7922 tree bftype = TREE_TYPE (vectype);
7923 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7924 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7925 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7926 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7927 &stmts, true, NULL_TREE);
7928 }
7929
7930 if (stmts)
7931 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7932
7933 /* Replace each use of lhs with the newly computed result. If the use stmt
7934 is a single-argument PHI, just replace all uses of the PHI result. This is
7935 necessary because the lcssa PHI defining lhs may precede the new stmt. */
7936 use_operand_p use_p;
7937 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7938 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7939 && !is_gimple_debug (use_stmt))
7940 {
7941 if (gimple_code (use_stmt) == GIMPLE_PHI
7942 && gimple_phi_num_args (use_stmt) == 1)
7943 {
7944 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7945 }
7946 else
7947 {
7948 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7949 SET_USE (use_p, new_tree);
7950 }
7951 update_stmt (use_stmt);
7952 }
7953
7954 return true;
7955 }
7956
7957 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
7958
7959 static void
7960 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
7961 {
7962 ssa_op_iter op_iter;
7963 imm_use_iterator imm_iter;
7964 def_operand_p def_p;
7965 gimple *ustmt;
7966
7967 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
7968 {
7969 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7970 {
7971 basic_block bb;
7972
7973 if (!is_gimple_debug (ustmt))
7974 continue;
7975
7976 bb = gimple_bb (ustmt);
7977
7978 if (!flow_bb_inside_loop_p (loop, bb))
7979 {
7980 if (gimple_debug_bind_p (ustmt))
7981 {
7982 if (dump_enabled_p ())
7983 dump_printf_loc (MSG_NOTE, vect_location,
7984 "killing debug use\n");
7985
7986 gimple_debug_bind_reset_value (ustmt);
7987 update_stmt (ustmt);
7988 }
7989 else
7990 gcc_unreachable ();
7991 }
7992 }
7993 }
7994 }
7995
7996 /* Given loop represented by LOOP_VINFO, return true if computation of
7997 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7998 otherwise. */
7999
8000 static bool
8001 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8002 {
8003 /* Constant case. */
8004 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8005 {
8006 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8007 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8008
8009 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8010 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8011 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8012 return true;
8013 }
8014
8015 widest_int max;
8016 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8017 /* Check the upper bound of loop niters. */
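 /* Illustrative example (added commentary): if the niters type is a 32-bit
    unsigned type and the latch is known to execute at most 2^32 - 2 times,
    then max < type_max below and NITERSM1 + 1 cannot wrap around to zero.  */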
8018 if (get_max_loop_iterations (loop, &max))
8019 {
8020 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8021 signop sgn = TYPE_SIGN (type);
8022 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8023 if (max < type_max)
8024 return true;
8025 }
8026 return false;
8027 }
8028
8029 /* Return a mask type with half the number of elements as TYPE. */
8030
8031 tree
8032 vect_halve_mask_nunits (tree type)
8033 {
8034 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8035 return build_truth_vector_type (nunits, current_vector_size);
8036 }
8037
8038 /* Return a mask type with twice as many elements as TYPE. */
8039
8040 tree
8041 vect_double_mask_nunits (tree type)
8042 {
8043 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8044 return build_truth_vector_type (nunits, current_vector_size);
8045 }
8046
8047 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8048 contain a sequence of NVECTORS masks that each control a vector of type
8049 VECTYPE. */
8050
8051 void
8052 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8053 unsigned int nvectors, tree vectype)
8054 {
8055 gcc_assert (nvectors != 0);
8056 if (masks->length () < nvectors)
8057 masks->safe_grow_cleared (nvectors);
8058 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8059 /* The number of scalars per iteration and the number of vectors are
8060 both compile-time constants. */
8061 unsigned int nscalars_per_iter
8062 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8063 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
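 /* Worked example (added commentary): for an rgroup that controls
    NVECTORS = 2 vectors of 8 elements in a loop with VF = 8, each scalar
    iteration produces 2 * 8 / 8 = 2 scalars, so nscalars_per_iter is 2.  */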
8064 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8065 {
8066 rgm->max_nscalars_per_iter = nscalars_per_iter;
8067 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8068 }
8069 }
8070
8071 /* Given a complete set of masks MASKS, extract mask number INDEX
8072 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8073 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8074
8075 See the comment above vec_loop_masks for more details about the mask
8076 arrangement. */
8077
8078 tree
8079 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8080 unsigned int nvectors, tree vectype, unsigned int index)
8081 {
8082 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8083 tree mask_type = rgm->mask_type;
8084
8085 /* Populate the rgroup's mask array, if this is the first time we've
8086 used it. */
8087 if (rgm->masks.is_empty ())
8088 {
8089 rgm->masks.safe_grow_cleared (nvectors);
8090 for (unsigned int i = 0; i < nvectors; ++i)
8091 {
8092 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8093 /* Provide a dummy definition until the real one is available. */
8094 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8095 rgm->masks[i] = mask;
8096 }
8097 }
8098
8099 tree mask = rgm->masks[index];
8100 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8101 TYPE_VECTOR_SUBPARTS (vectype)))
8102 {
8103 /* A loop mask for data type X can be reused for data type Y
8104 if X has N times more elements than Y and if Y's elements
8105 are N times bigger than X's. In this case each sequence
8106 of N elements in the loop mask will be all-zero or all-one.
8107 We can then view-convert the mask so that each sequence of
8108 N elements is replaced by a single element. */
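 /* Hypothetical example (added commentary): a mask created for 16
    8-bit elements can serve a vector of 8 16-bit elements, because
    each pair of mask elements is known to be all-zero or all-one and
    can be view-converted into a single wider mask element.  */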
8109 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8110 TYPE_VECTOR_SUBPARTS (vectype)));
8111 gimple_seq seq = NULL;
8112 mask_type = build_same_sized_truth_vector_type (vectype);
8113 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8114 if (seq)
8115 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8116 }
8117 return mask;
8118 }
8119
8120 /* Scale profiling counters by estimation for LOOP which is vectorized
8121 by factor VF. */
8122
8123 static void
8124 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8125 {
8126 edge preheader = loop_preheader_edge (loop);
8127 /* Reduce loop iterations by the vectorization factor. */
8128 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8129 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8130
8131 if (freq_h.nonzero_p ())
8132 {
8133 profile_probability p;
8134
8135 /* Avoid dropping loop body profile counter to 0 because of zero count
8136 in loop's preheader. */
8137 if (!(freq_e == profile_count::zero ()))
8138 freq_e = freq_e.force_nonzero ();
8139 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8140 scale_loop_frequencies (loop, p);
8141 }
8142
8143 edge exit_e = single_exit (loop);
8144 exit_e->probability = profile_probability::always ()
8145 .apply_scale (1, new_est_niter + 1);
8146
8147 edge exit_l = single_pred_edge (loop->latch);
8148 profile_probability prob = exit_l->probability;
8149 exit_l->probability = exit_e->probability.invert ();
8150 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8151 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8152 }
8153
8154 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8155 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8156 stmt_vec_info. */
8157
8158 static void
8159 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8160 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8161 {
8162 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8163 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8164
8165 if (dump_enabled_p ())
8166 dump_printf_loc (MSG_NOTE, vect_location,
8167 "------>vectorizing statement: %G", stmt_info->stmt);
8168
8169 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8170 vect_loop_kill_debug_uses (loop, stmt_info);
8171
8172 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8173 && !STMT_VINFO_LIVE_P (stmt_info))
8174 return;
8175
8176 if (STMT_VINFO_VECTYPE (stmt_info))
8177 {
8178 poly_uint64 nunits
8179 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8180 if (!STMT_SLP_TYPE (stmt_info)
8181 && maybe_ne (nunits, vf)
8182 && dump_enabled_p ())
8183 /* For SLP, VF is set according to the unrolling factor, and not
8184 to the vector size, hence for SLP this print is not valid. */
8185 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8186 }
8187
8188 /* Pure SLP statements have already been vectorized. We still need
8189 to apply loop vectorization to hybrid SLP statements. */
8190 if (PURE_SLP_STMT (stmt_info))
8191 return;
8192
8193 if (dump_enabled_p ())
8194 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8195
8196 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8197 *seen_store = stmt_info;
8198 }
8199
8200 /* Function vect_transform_loop.
8201
8202 The analysis phase has determined that the loop is vectorizable.
8203 Vectorize the loop - create vectorized stmts to replace the scalar
8204 stmts in the loop, and update the loop exit condition.
8205 Returns the scalar epilogue loop, if any. */
8206
8207 struct loop *
8208 vect_transform_loop (loop_vec_info loop_vinfo)
8209 {
8210 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8211 struct loop *epilogue = NULL;
8212 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8213 int nbbs = loop->num_nodes;
8214 int i;
8215 tree niters_vector = NULL_TREE;
8216 tree step_vector = NULL_TREE;
8217 tree niters_vector_mult_vf = NULL_TREE;
8218 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8219 unsigned int lowest_vf = constant_lower_bound (vf);
8220 gimple *stmt;
8221 bool check_profitability = false;
8222 unsigned int th;
8223
8224 DUMP_VECT_SCOPE ("vec_transform_loop");
8225
8226 loop_vinfo->shared->check_datarefs ();
8227
8228 /* Use the more conservative vectorization threshold. If the number
8229 of iterations is constant, assume the cost check has been performed
8230 by our caller. If the threshold makes all loops profitable that
8231 run at least the (estimated) vectorization factor number of times,
8232 checking is pointless, too. */
8233 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8234 if (th >= vect_vf_for_cost (loop_vinfo)
8235 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8236 {
8237 if (dump_enabled_p ())
8238 dump_printf_loc (MSG_NOTE, vect_location,
8239 "Profitability threshold is %d loop iterations.\n",
8240 th);
8241 check_profitability = true;
8242 }
8243
8244 /* Make sure there exists a single-predecessor exit bb. Do this before
8245 versioning. */
8246 edge e = single_exit (loop);
8247 if (! single_pred_p (e->dest))
8248 {
8249 split_loop_exit_edge (e);
8250 if (dump_enabled_p ())
8251 dump_printf (MSG_NOTE, "split exit edge\n");
8252 }
8253
8254 /* Version the loop first, if required, so the profitability check
8255 comes first. */
8256
8257 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8258 {
8259 poly_uint64 versioning_threshold
8260 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8261 if (check_profitability
8262 && ordered_p (poly_uint64 (th), versioning_threshold))
8263 {
8264 versioning_threshold = ordered_max (poly_uint64 (th),
8265 versioning_threshold);
8266 check_profitability = false;
8267 }
8268 vect_loop_versioning (loop_vinfo, th, check_profitability,
8269 versioning_threshold);
8270 check_profitability = false;
8271 }
8272
8273 /* Make sure there exists a single-predecessor exit bb also on the
8274 scalar loop copy. Do this after versioning but before peeling,
8275 so the CFG structure is fine for both the scalar and the if-converted
8276 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8277 loop-closed PHI nodes on the exit. */
8278 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8279 {
8280 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8281 if (! single_pred_p (e->dest))
8282 {
8283 split_loop_exit_edge (e);
8284 if (dump_enabled_p ())
8285 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8286 }
8287 }
8288
8289 tree niters = vect_build_loop_niters (loop_vinfo);
8290 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8291 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8292 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8293 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8294 &step_vector, &niters_vector_mult_vf, th,
8295 check_profitability, niters_no_overflow);
8296
8297 if (niters_vector == NULL_TREE)
8298 {
8299 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8300 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8301 && known_eq (lowest_vf, vf))
8302 {
8303 niters_vector
8304 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8305 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8306 step_vector = build_one_cst (TREE_TYPE (niters));
8307 }
8308 else
8309 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8310 &step_vector, niters_no_overflow);
8311 }
8312
8313 /* 1) Make sure the loop header has exactly two entries
8314 2) Make sure we have a preheader basic block. */
8315
8316 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8317
8318 split_edge (loop_preheader_edge (loop));
8319
8320 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8321 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8322 /* This will deal with any possible peeling. */
8323 vect_prepare_for_masked_peels (loop_vinfo);
8324
8325 /* Schedule the SLP instances first, then handle loop vectorization
8326 below. */
8327 if (!loop_vinfo->slp_instances.is_empty ())
8328 {
8329 DUMP_VECT_SCOPE ("scheduling SLP instances");
8330 vect_schedule_slp (loop_vinfo);
8331 }
8332
8333 /* FORNOW: the vectorizer supports only loops whose body consists
8334 of one basic block (header + empty latch). When the vectorizer
8335 supports more involved loop forms, the order in which the BBs are
8336 traversed needs to be reconsidered. */
8337
8338 for (i = 0; i < nbbs; i++)
8339 {
8340 basic_block bb = bbs[i];
8341 stmt_vec_info stmt_info;
8342
8343 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8344 gsi_next (&si))
8345 {
8346 gphi *phi = si.phi ();
8347 if (dump_enabled_p ())
8348 dump_printf_loc (MSG_NOTE, vect_location,
8349 "------>vectorizing phi: %G", phi);
8350 stmt_info = loop_vinfo->lookup_stmt (phi);
8351 if (!stmt_info)
8352 continue;
8353
8354 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8355 vect_loop_kill_debug_uses (loop, stmt_info);
8356
8357 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8358 && !STMT_VINFO_LIVE_P (stmt_info))
8359 continue;
8360
8361 if (STMT_VINFO_VECTYPE (stmt_info)
8362 && (maybe_ne
8363 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8364 && dump_enabled_p ())
8365 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8366
8367 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8368 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8369 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8370 && ! PURE_SLP_STMT (stmt_info))
8371 {
8372 if (dump_enabled_p ())
8373 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8374 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8375 }
8376 }
8377
8378 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8379 !gsi_end_p (si);)
8380 {
8381 stmt = gsi_stmt (si);
8382 /* During vectorization remove existing clobber stmts. */
8383 if (gimple_clobber_p (stmt))
8384 {
8385 unlink_stmt_vdef (stmt);
8386 gsi_remove (&si, true);
8387 release_defs (stmt);
8388 }
8389 else
8390 {
8391 stmt_info = loop_vinfo->lookup_stmt (stmt);
8392
8393 /* vector stmts created in the outer-loop during vectorization of
8394 stmts in an inner-loop may not have a stmt_info, and do not
8395 need to be vectorized. */
8396 stmt_vec_info seen_store = NULL;
8397 if (stmt_info)
8398 {
8399 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8400 {
8401 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8402 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8403 !gsi_end_p (subsi); gsi_next (&subsi))
8404 {
8405 stmt_vec_info pat_stmt_info
8406 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8407 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8408 &si, &seen_store);
8409 }
8410 stmt_vec_info pat_stmt_info
8411 = STMT_VINFO_RELATED_STMT (stmt_info);
8412 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8413 &seen_store);
8414 }
8415 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8416 &seen_store);
8417 }
8418 gsi_next (&si);
8419 if (seen_store)
8420 {
8421 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8422 /* Interleaving. The vectorization of the whole
8423 interleaving chain was completed - free all
8424 the stores in the chain. */
8425 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8426 else
8427 /* Free the attached stmt_vec_info and remove the stmt. */
8428 loop_vinfo->remove_stmt (stmt_info);
8429 }
8430 }
8431 }
8432
8433 /* Stub out scalar statements that must not survive vectorization.
8434 Doing this here helps with grouped statements, or statements that
8435 are involved in patterns. */
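/* As an illustrative example, a left-over scalar statement such as
     _1 = .MASK_LOAD (ptr_2, 0B, cond_3);
   whose result is not a vector is replaced below by the trivial
   assignment _1 = 0, since the scalar masked load must not survive
   vectorization. */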
8436 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8437 !gsi_end_p (gsi); gsi_next (&gsi))
8438 {
8439 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8440 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8441 {
8442 tree lhs = gimple_get_lhs (call);
8443 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8444 {
8445 tree zero = build_zero_cst (TREE_TYPE (lhs));
8446 gimple *new_stmt = gimple_build_assign (lhs, zero);
8447 gsi_replace (&gsi, new_stmt, true);
8448 }
8449 }
8450 }
8451 } /* BBs in loop */
8452
8453 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8454 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8455 if (integer_onep (step_vector))
8456 niters_no_overflow = true;
8457 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8458 niters_vector_mult_vf, !niters_no_overflow);
8459
8460 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8461 scale_profile_for_vect_loop (loop, assumed_vf);
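/* (Roughly speaking: with an assumed VF of 4 the vector loop body is
   expected to execute about a quarter as often as the scalar body did,
   and the profile is scaled accordingly.) */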
8462
8463 /* True if the final iteration might not handle a full vector's
8464 worth of scalar iterations. */
8465 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8466 /* The minimum number of iterations performed by the epilogue. This
8467 is 1 when peeling for gaps because we always need a final scalar
8468 iteration. */
8469 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8470 /* +1 to convert latch counts to loop iteration counts,
8471 -min_epilogue_iters to remove iterations that cannot be performed
8472 by the vector code. */
8473 int bias_for_lowest = 1 - min_epilogue_iters;
8474 int bias_for_assumed = bias_for_lowest;
8475 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8476 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8477 {
8478 /* When the amount of peeling is known at compile time, the first
8479 iteration will have exactly alignment_npeels active elements.
8480 In the worst case it will have at least one. */
8481 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8482 bias_for_lowest += lowest_vf - min_first_active;
8483 bias_for_assumed += assumed_vf - min_first_active;
8484 }
8485 /* In these calculations the "- 1" converts loop iteration counts
8486 back to latch counts. */
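/* Worked example (hypothetical values): with LOWEST_VF = 4, no peeling
   for gaps and no full masking, BIAS_FOR_LOWEST is 1, so an upper bound
   of 11 latch iterations (12 scalar iterations) becomes
   (11 + 1) / 4 - 1 = 2 latch iterations of the vector loop. */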
8487 if (loop->any_upper_bound)
8488 loop->nb_iterations_upper_bound
8489 = (final_iter_may_be_partial
8490 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8491 lowest_vf) - 1
8492 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8493 lowest_vf) - 1);
8494 if (loop->any_likely_upper_bound)
8495 loop->nb_iterations_likely_upper_bound
8496 = (final_iter_may_be_partial
8497 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8498 + bias_for_lowest, lowest_vf) - 1
8499 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8500 + bias_for_lowest, lowest_vf) - 1);
8501 if (loop->any_estimate)
8502 loop->nb_iterations_estimate
8503 = (final_iter_may_be_partial
8504 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8505 assumed_vf) - 1
8506 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8507 assumed_vf) - 1);
8508
8509 if (dump_enabled_p ())
8510 {
8511 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8512 {
8513 dump_printf_loc (MSG_NOTE, vect_location,
8514 "LOOP VECTORIZED\n");
8515 if (loop->inner)
8516 dump_printf_loc (MSG_NOTE, vect_location,
8517 "OUTER LOOP VECTORIZED\n");
8518 dump_printf (MSG_NOTE, "\n");
8519 }
8520 else
8521 {
8522 dump_printf_loc (MSG_NOTE, vect_location,
8523 "LOOP EPILOGUE VECTORIZED (VS=");
8524 dump_dec (MSG_NOTE, current_vector_size);
8525 dump_printf (MSG_NOTE, ")\n");
8526 }
8527 }
8528
8529 /* Free SLP instances here because otherwise stmt reference counting
8530 won't work. */
8531 slp_instance instance;
8532 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8533 vect_free_slp_instance (instance, true);
8534 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8535 /* Clear the safelen field since its value is invalid after vectorization:
8536 the vectorized loop can have loop-carried dependences. */
8537 loop->safelen = 0;
8538
8539 /* Don't vectorize the epilogue of an epilogue loop. */
8540 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8541 epilogue = NULL;
8542
8543 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8544 epilogue = NULL;
8545
8546 if (epilogue)
8547 {
8548 auto_vector_sizes vector_sizes;
8549 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8550 unsigned int next_size = 0;
8551
8552 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8553 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8554 && known_eq (vf, lowest_vf))
8555 {
8556 unsigned int eiters
8557 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8558 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8559 eiters = eiters % lowest_vf;
8560 epilogue->nb_iterations_upper_bound = eiters - 1;
8561
8562 unsigned int ratio;
8563 while (next_size < vector_sizes.length ()
8564 && !(constant_multiple_p (current_vector_size,
8565 vector_sizes[next_size], &ratio)
8566 && eiters >= lowest_vf / ratio))
8567 next_size += 1;
8568 }
8569 else
8570 while (next_size < vector_sizes.length ()
8571 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8572 next_size += 1;
8573
8574 if (next_size == vector_sizes.length ())
8575 epilogue = NULL;
8576 }
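/* Illustrative example (assuming the target advertises vector sizes
   { 64, 32, 16 } bytes, as some x86 configurations do): if the main loop
   used 32-byte vectors with LOWEST_VF = 8 and 6 scalar iterations remain,
   then 32 / 16 = 2 gives 8 / 2 = 4 <= 6, so a 16-byte epilogue loop is
   tried next; if no suitable smaller size exists, EPILOGUE is dropped. */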
8577
8578 if (epilogue)
8579 {
8580 epilogue->force_vectorize = loop->force_vectorize;
8581 epilogue->safelen = loop->safelen;
8582 epilogue->dont_vectorize = false;
8583
8584 /* We may need to if-convert the epilogue to vectorize it. */
8585 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8586 tree_if_conversion (epilogue);
8587 }
8588
8589 return epilogue;
8590 }
8591
8592 /* The code below performs a simple optimization - it reverts
8593 if-conversion for masked stores: if the mask of a store is zero, neither
8594 the store nor (where possible) the producers of the stored values are executed.
8595 For example,
8596 for (i=0; i<n; i++)
8597 if (c[i])
8598 {
8599 p1[i] += 1;
8600 p2[i] = p3[i] + 2;
8601 }
8602 this transformation will produce the following semi-hammock:
8603
8604 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
8605 {
8606 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8607 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8608 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8609 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8610 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8611 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8612 }
8613 */
8614
8615 void
8616 optimize_mask_stores (struct loop *loop)
8617 {
8618 basic_block *bbs = get_loop_body (loop);
8619 unsigned nbbs = loop->num_nodes;
8620 unsigned i;
8621 basic_block bb;
8622 struct loop *bb_loop;
8623 gimple_stmt_iterator gsi;
8624 gimple *stmt;
8625 auto_vec<gimple *> worklist;
8626
8627 vect_location = find_loop_location (loop);
8628 /* Pick up all masked stores in the loop, if any. */
8629 for (i = 0; i < nbbs; i++)
8630 {
8631 bb = bbs[i];
8632 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8633 gsi_next (&gsi))
8634 {
8635 stmt = gsi_stmt (gsi);
8636 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8637 worklist.safe_push (stmt);
8638 }
8639 }
8640
8641 free (bbs);
8642 if (worklist.is_empty ())
8643 return;
8644
8645 /* Loop has masked stores. */
8646 while (!worklist.is_empty ())
8647 {
8648 gimple *last, *last_store;
8649 edge e, efalse;
8650 tree mask;
8651 basic_block store_bb, join_bb;
8652 gimple_stmt_iterator gsi_to;
8653 tree vdef, new_vdef;
8654 gphi *phi;
8655 tree vectype;
8656 tree zero;
8657
8658 last = worklist.pop ();
8659 mask = gimple_call_arg (last, 2);
8660 bb = gimple_bb (last);
8661 /* Create STORE_BB and the if-then structure in the CFG; STORE_BB
8662 belongs to the same loop as BB. That loop can differ from LOOP when
8663 a two-level loop nest is vectorized and the masked store belongs to
8664 the inner loop. */
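/* Sketch of the CFG built below (illustrative):

       BB:  ...  if (mask == { 0, ... })  --true-->  JOIN_BB
         \                                               ^
          `--false-->  STORE_BB  ------------------------'

   STORE_BB receives the masked stores (and, where possible, the
   producers of the stored values) sunk out of BB. */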
8665 e = split_block (bb, last);
8666 bb_loop = bb->loop_father;
8667 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8668 join_bb = e->dest;
8669 store_bb = create_empty_bb (bb);
8670 add_bb_to_loop (store_bb, bb_loop);
8671 e->flags = EDGE_TRUE_VALUE;
8672 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8673 /* Put STORE_BB on the unlikely (cold) path. */
8674 efalse->probability = profile_probability::unlikely ();
8675 store_bb->count = efalse->count ();
8676 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8677 if (dom_info_available_p (CDI_DOMINATORS))
8678 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8679 if (dump_enabled_p ())
8680 dump_printf_loc (MSG_NOTE, vect_location,
8681 "Create new block %d to sink mask stores.",
8682 store_bb->index);
8683 /* Create vector comparison with boolean result. */
8684 vectype = TREE_TYPE (mask);
8685 zero = build_zero_cst (vectype);
8686 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8687 gsi = gsi_last_bb (bb);
8688 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8689 /* Create new PHI node for vdef of the last masked store:
8690 .MEM_2 = VDEF <.MEM_1>
8691 will be converted to
8692 .MEM.3 = VDEF <.MEM_1>
8693 and new PHI node will be created in join bb
8694 .MEM_2 = PHI <.MEM_1, .MEM_3>
8695 */
8696 vdef = gimple_vdef (last);
8697 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8698 gimple_set_vdef (last, new_vdef);
8699 phi = create_phi_node (vdef, join_bb);
8700 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8701
8702 /* Put all masked stores with the same mask to STORE_BB if possible. */
8703 while (true)
8704 {
8705 gimple_stmt_iterator gsi_from;
8706 gimple *stmt1 = NULL;
8707
8708 /* Move masked store to STORE_BB. */
8709 last_store = last;
8710 gsi = gsi_for_stmt (last);
8711 gsi_from = gsi;
8712 /* Shift GSI to the previous stmt for further traversal. */
8713 gsi_prev (&gsi);
8714 gsi_to = gsi_start_bb (store_bb);
8715 gsi_move_before (&gsi_from, &gsi_to);
8716 /* Set GSI_TO to the start of the now non-empty STORE_BB. */
8717 gsi_to = gsi_start_bb (store_bb);
8718 if (dump_enabled_p ())
8719 dump_printf_loc (MSG_NOTE, vect_location,
8720 "Move stmt to created bb\n%G", last);
8721 /* Move all stored value producers if possible. */
8722 while (!gsi_end_p (gsi))
8723 {
8724 tree lhs;
8725 imm_use_iterator imm_iter;
8726 use_operand_p use_p;
8727 bool res;
8728
8729 /* Skip debug statements. */
8730 if (is_gimple_debug (gsi_stmt (gsi)))
8731 {
8732 gsi_prev (&gsi);
8733 continue;
8734 }
8735 stmt1 = gsi_stmt (gsi);
8736 /* Do not consider statements writing to memory or having a
8737 volatile operand. */
8738 if (gimple_vdef (stmt1)
8739 || gimple_has_volatile_ops (stmt1))
8740 break;
8741 gsi_from = gsi;
8742 gsi_prev (&gsi);
8743 lhs = gimple_get_lhs (stmt1);
8744 if (!lhs)
8745 break;
8746
8747 /* LHS of vectorized stmt must be SSA_NAME. */
8748 if (TREE_CODE (lhs) != SSA_NAME)
8749 break;
8750
8751 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8752 {
8753 /* Remove dead scalar statement. */
8754 if (has_zero_uses (lhs))
8755 {
8756 gsi_remove (&gsi_from, true);
8757 continue;
8758 }
8759 }
8760
8761 /* Check that LHS does not have uses outside of STORE_BB. */
8762 res = true;
8763 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8764 {
8765 gimple *use_stmt;
8766 use_stmt = USE_STMT (use_p);
8767 if (is_gimple_debug (use_stmt))
8768 continue;
8769 if (gimple_bb (use_stmt) != store_bb)
8770 {
8771 res = false;
8772 break;
8773 }
8774 }
8775 if (!res)
8776 break;
8777
8778 if (gimple_vuse (stmt1)
8779 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8780 break;
8781
8782 /* Can move STMT1 to STORE_BB. */
8783 if (dump_enabled_p ())
8784 dump_printf_loc (MSG_NOTE, vect_location,
8785 "Move stmt to created bb\n%G", stmt1);
8786 gsi_move_before (&gsi_from, &gsi_to);
8787 /* Shift GSI_TO for further insertion. */
8788 gsi_prev (&gsi_to);
8789 }
8790 /* Put other masked stores with the same mask to STORE_BB. */
8791 if (worklist.is_empty ()
8792 || gimple_call_arg (worklist.last (), 2) != mask
8793 || worklist.last () != stmt1)
8794 break;
8795 last = worklist.pop ();
8796 }
8797 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8798 }
8799 }