[12/46] Make vect_finish_stmt_generation return a stmt_vec_info
gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it were manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target-specific information used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different vector sizes will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
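
/* For illustration, the optab query mentioned above amounts to a check
   of the form

       if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
         ... give up on vectorizing the stmt ...

   where add_optab and V8HImode stand for whichever operation and
   vector mode the scalar stmt at hand maps to.  */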
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info> *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
183
184 if (stmt_vectype)
185 {
186 if (STMT_VINFO_VECTYPE (stmt_info))
187 /* The only case in which a vectype has already been set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return true;
203 }
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
211
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info> *mask_producers)
215 {
216 vec_info *vinfo = stmt_info->vinfo;
217 if (dump_enabled_p ())
218 {
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
220 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
221 }
222 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
223 return false;
224
225 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
226 && STMT_VINFO_RELATED_STMT (stmt_info))
227 {
228 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
229 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
230
231 /* If a pattern statement has def stmts, analyze them too. */
232 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
233 !gsi_end_p (si); gsi_next (&si))
234 {
235 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
236 if (dump_enabled_p ())
237 {
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: ");
240 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
241 def_stmt_info->stmt, 0);
242 }
243 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
244 vf, mask_producers))
245 return false;
246 }
247
248 if (dump_enabled_p ())
249 {
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: ");
252 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
253 }
254 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
255 return false;
256 }
257
258 return true;
259 }
260
261 /* Function vect_determine_vectorization_factor
262
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4-byte elements,
266 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
267 elements can fit in a single vector register.
268
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
273
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
278 }
279
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
283 }
284 */
285
286 static bool
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 {
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
299
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301
302 for (i = 0; i < nbbs; i++)
303 {
304 basic_block bb = bbs[i];
305
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
308 {
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 {
313 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
314 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
315 }
316
317 gcc_assert (stmt_info);
318
319 if (STMT_VINFO_RELEVANT_P (stmt_info)
320 || STMT_VINFO_LIVE_P (stmt_info))
321 {
322 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
323 scalar_type = TREE_TYPE (PHI_RESULT (phi));
324
325 if (dump_enabled_p ())
326 {
327 dump_printf_loc (MSG_NOTE, vect_location,
328 "get vectype for scalar type: ");
329 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
330 dump_printf (MSG_NOTE, "\n");
331 }
332
333 vectype = get_vectype_for_scalar_type (scalar_type);
334 if (!vectype)
335 {
336 if (dump_enabled_p ())
337 {
338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
339 "not vectorized: unsupported "
340 "data-type ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
342 scalar_type);
343 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
344 }
345 return false;
346 }
347 STMT_VINFO_VECTYPE (stmt_info) = vectype;
348
349 if (dump_enabled_p ())
350 {
351 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
353 dump_printf (MSG_NOTE, "\n");
354 }
355
356 if (dump_enabled_p ())
357 {
358 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
359 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
360 dump_printf (MSG_NOTE, "\n");
361 }
362
363 vect_update_max_nunits (&vectorization_factor, vectype);
364 }
365 }
366
367 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
368 gsi_next (&si))
369 {
370 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
371 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
372 &mask_producers))
373 return false;
374 }
375 }
376
377 /* TODO: Analyze cost. Decide if worth while to vectorize. */
378 if (dump_enabled_p ())
379 {
380 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
381 dump_dec (MSG_NOTE, vectorization_factor);
382 dump_printf (MSG_NOTE, "\n");
383 }
384
385 if (known_le (vectorization_factor, 1U))
386 {
387 if (dump_enabled_p ())
388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
389 "not vectorized: unsupported data-type\n");
390 return false;
391 }
392 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
393
394 for (i = 0; i < mask_producers.length (); i++)
395 {
396 stmt_info = mask_producers[i];
397 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
398 if (!mask_type)
399 return false;
400 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
401 }
402
403 return true;
404 }
405
406
407 /* Function vect_is_simple_iv_evolution.
408
409 FORNOW: A simple evolution of an induction variable in the loop is
410 considered a polynomial evolution. */
411
412 static bool
413 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
414 tree * step)
415 {
416 tree init_expr;
417 tree step_expr;
418 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
419 basic_block bb;
420
421 /* When there is no evolution in this loop, the evolution function
422 is not "simple". */
423 if (evolution_part == NULL_TREE)
424 return false;
425
426 /* When the evolution is a polynomial of degree >= 2,
427 the evolution function is not "simple". */
428 if (tree_is_chrec (evolution_part))
429 return false;
430
431 step_expr = evolution_part;
432 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
433
434 if (dump_enabled_p ())
435 {
436 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
437 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
438 dump_printf (MSG_NOTE, ", init: ");
439 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
440 dump_printf (MSG_NOTE, "\n");
441 }
442
443 *init = init_expr;
444 *step = step_expr;
445
446 if (TREE_CODE (step_expr) != INTEGER_CST
447 && (TREE_CODE (step_expr) != SSA_NAME
448 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
449 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
450 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
451 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
452 || !flag_associative_math)))
453 && (TREE_CODE (step_expr) != REAL_CST
454 || !flag_associative_math))
455 {
456 if (dump_enabled_p ())
457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
458 "step unknown.\n");
459 return false;
460 }
461
462 return true;
463 }
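
/* For illustration, in the chrec notation used by the scalar evolution
   analyzer, an affine evolution such as {0, +, 4}_1 (initial value 0,
   constant step 4, in loop 1) is "simple" in the sense above, whereas
   an evolution whose step is itself a chrec, e.g. roughly
   {0, +, {0, +, 1}_1}_1 for a variable updated by "j += i" inside the
   loop, is a polynomial of degree >= 2 and is rejected by the
   tree_is_chrec check on the evolution part.  */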
464
465 /* Function vect_analyze_scalar_cycles_1.
466
467 Examine the cross iteration def-use cycles of scalar variables
468 in LOOP. LOOP_VINFO represents the loop that is now being
469 considered for vectorization (can be LOOP, or an outer-loop
470 enclosing LOOP). */
471
472 static void
473 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
474 {
475 basic_block bb = loop->header;
476 tree init, step;
477 auto_vec<gimple *, 64> worklist;
478 gphi_iterator gsi;
479 bool double_reduc;
480
481 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
482
483 /* First - identify all inductions. Reduction detection assumes that all the
484 inductions have been identified; therefore, this order must not be
485 changed. */
486 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
487 {
488 gphi *phi = gsi.phi ();
489 tree access_fn = NULL;
490 tree def = PHI_RESULT (phi);
491 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
492
493 if (dump_enabled_p ())
494 {
495 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
496 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
497 }
498
499 /* Skip virtual phi's. The data dependences that are associated with
500 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
501 if (virtual_operand_p (def))
502 continue;
503
504 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
505
506 /* Analyze the evolution function. */
507 access_fn = analyze_scalar_evolution (loop, def);
508 if (access_fn)
509 {
510 STRIP_NOPS (access_fn);
511 if (dump_enabled_p ())
512 {
513 dump_printf_loc (MSG_NOTE, vect_location,
514 "Access function of PHI: ");
515 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
516 dump_printf (MSG_NOTE, "\n");
517 }
518 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
519 = initial_condition_in_loop_num (access_fn, loop->num);
520 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
521 = evolution_part_in_loop_num (access_fn, loop->num);
522 }
523
524 if (!access_fn
525 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
526 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
527 && TREE_CODE (step) != INTEGER_CST))
528 {
529 worklist.safe_push (phi);
530 continue;
531 }
532
533 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
534 != NULL_TREE);
535 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
536
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
539 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
540 }
541
542
543 /* Second - identify all reductions and nested cycles. */
544 while (worklist.length () > 0)
545 {
546 gimple *phi = worklist.pop ();
547 tree def = PHI_RESULT (phi);
548 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
549 gimple *reduc_stmt;
550
551 if (dump_enabled_p ())
552 {
553 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
554 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
555 }
556
557 gcc_assert (!virtual_operand_p (def)
558 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
559
560 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
561 &double_reduc, false);
562 if (reduc_stmt)
563 {
564 if (double_reduc)
565 {
566 if (dump_enabled_p ())
567 dump_printf_loc (MSG_NOTE, vect_location,
568 "Detected double reduction.\n");
569
570 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
571 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
572 vect_double_reduction_def;
573 }
574 else
575 {
576 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
577 {
578 if (dump_enabled_p ())
579 dump_printf_loc (MSG_NOTE, vect_location,
580 "Detected vectorizable nested cycle.\n");
581
582 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
583 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
584 vect_nested_cycle;
585 }
586 else
587 {
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_NOTE, vect_location,
590 "Detected reduction.\n");
591
592 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
593 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
594 vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as a reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
600 }
601 }
602 }
603 else
604 if (dump_enabled_p ())
605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
606 "Unknown def-use cycle pattern.\n");
607 }
608 }
609
610
611 /* Function vect_analyze_scalar_cycles.
612
613 Examine the cross iteration def-use cycles of scalar variables, by
614 analyzing the loop-header PHIs of scalar variables. Classify each
615 cycle as one of the following: invariant, induction, reduction, unknown.
616 We do that for the loop represented by LOOP_VINFO, and also for its
617 inner-loop, if it exists.
618 Examples for scalar cycles:
619
620 Example1: reduction:
621
622 loop1:
623 for (i=0; i<N; i++)
624 sum += a[i];
625
626 Example2: induction:
627
628 loop2:
629 for (i=0; i<N; i++)
630 a[i] = i; */
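
/* A further illustrative case is the double reduction detected by
   vect_analyze_scalar_cycles_1 above, which arises when the outer loop
   of a nest such as

     for (j = 0; j < M; j++)
       for (i = 0; i < N; i++)
         sum += a[j][i];

   is the loop being vectorized: the inner loop carries an ordinary
   reduction of "sum", and the outer-loop phi of "sum" forms the
   enclosing cycle that is classified as vect_double_reduction_def.  */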
631
632 static void
633 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
634 {
635 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
636
637 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
638
639 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
640 Reductions in such an inner-loop therefore have different properties than
641 the reductions in the nest that gets vectorized:
642 1. When vectorized, they are executed in the same order as in the original
643 scalar loop, so we can't change the order of computation when
644 vectorizing them.
645 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
646 current checks are too strict. */
647
648 if (loop->inner)
649 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
650 }
651
652 /* Transfer group and reduction information from STMT to its pattern stmt. */
653
654 static void
655 vect_fixup_reduc_chain (gimple *stmt)
656 {
657 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
658 gimple *stmtp;
659 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
660 && REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
661 REDUC_GROUP_SIZE (vinfo_for_stmt (firstp))
662 = REDUC_GROUP_SIZE (vinfo_for_stmt (stmt));
663 do
664 {
665 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
666 REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
667 stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
668 if (stmt)
669 REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
670 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
671 }
672 while (stmt);
673 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
674 }
675
676 /* Fixup scalar cycles that now have their stmts detected as patterns. */
677
678 static void
679 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
680 {
681 gimple *first;
682 unsigned i;
683
684 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
685 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
686 {
687 gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
688 while (next)
689 {
690 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
691 break;
692 next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
693 }
694 /* If not all stmts in the chain are patterns, try to handle
695 the chain without patterns. */
696 if (! next)
697 {
698 vect_fixup_reduc_chain (first);
699 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
700 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
701 }
702 }
703 }
704
705 /* Function vect_get_loop_niters.
706
707 Determine the number of iterations the loop executes and place it
708 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
709 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
710 niter information holds in ASSUMPTIONS.
711
712 Return the loop exit condition. */
713
714
715 static gcond *
716 vect_get_loop_niters (struct loop *loop, tree *assumptions,
717 tree *number_of_iterations, tree *number_of_iterationsm1)
718 {
719 edge exit = single_exit (loop);
720 struct tree_niter_desc niter_desc;
721 tree niter_assumptions, niter, may_be_zero;
722 gcond *cond = get_loop_exit_condition (loop);
723
724 *assumptions = boolean_true_node;
725 *number_of_iterationsm1 = chrec_dont_know;
726 *number_of_iterations = chrec_dont_know;
727 DUMP_VECT_SCOPE ("get_loop_niters");
728
729 if (!exit)
730 return cond;
731
732 niter = chrec_dont_know;
733 may_be_zero = NULL_TREE;
734 niter_assumptions = boolean_true_node;
735 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
736 || chrec_contains_undetermined (niter_desc.niter))
737 return cond;
738
739 niter_assumptions = niter_desc.assumptions;
740 may_be_zero = niter_desc.may_be_zero;
741 niter = niter_desc.niter;
742
743 if (may_be_zero && integer_zerop (may_be_zero))
744 may_be_zero = NULL_TREE;
745
746 if (may_be_zero)
747 {
748 if (COMPARISON_CLASS_P (may_be_zero))
749 {
750 /* Try to combine may_be_zero with assumptions; this can simplify
751 the computation of the niter expression. */
752 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
753 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
754 niter_assumptions,
755 fold_build1 (TRUTH_NOT_EXPR,
756 boolean_type_node,
757 may_be_zero));
758 else
759 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
760 build_int_cst (TREE_TYPE (niter), 0),
761 rewrite_to_non_trapping_overflow (niter));
762
763 may_be_zero = NULL_TREE;
764 }
765 else if (integer_nonzerop (may_be_zero))
766 {
767 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
768 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
769 return cond;
770 }
771 else
772 return cond;
773 }
774
775 *assumptions = niter_assumptions;
776 *number_of_iterationsm1 = niter;
777
778 /* We want the number of loop header executions which is the number
779 of latch executions plus one.
780 ??? For UINT_MAX latch executions this number overflows to zero
781 for loops like do { n++; } while (n != 0); */
782 if (niter && !chrec_contains_undetermined (niter))
783 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
784 build_int_cst (TREE_TYPE (niter), 1));
785 *number_of_iterations = niter;
786
787 return cond;
788 }
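
/* For illustration: a loop whose body executes 4 times has its latch
   executed 3 times, so vect_get_loop_niters returns 3 in
   NUMBER_OF_ITERATIONSM1 and 3 + 1 = 4 (the number of header
   executions) in NUMBER_OF_ITERATIONS, modulo the UINT_MAX overflow
   caveat noted in the function above.  */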
789
790 /* Function bb_in_loop_p
791
792 Used as predicate for dfs order traversal of the loop bbs. */
793
794 static bool
795 bb_in_loop_p (const_basic_block bb, const void *data)
796 {
797 const struct loop *const loop = (const struct loop *)data;
798 if (flow_bb_inside_loop_p (loop, bb))
799 return true;
800 return false;
801 }
802
803
804 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
805 stmt_vec_info structs for all the stmts in LOOP_IN. */
806
807 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
808 : vec_info (vec_info::loop, init_cost (loop_in), shared),
809 loop (loop_in),
810 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
811 num_itersm1 (NULL_TREE),
812 num_iters (NULL_TREE),
813 num_iters_unchanged (NULL_TREE),
814 num_iters_assumptions (NULL_TREE),
815 th (0),
816 versioning_threshold (0),
817 vectorization_factor (0),
818 max_vectorization_factor (0),
819 mask_skip_niters (NULL_TREE),
820 mask_compare_type (NULL_TREE),
821 unaligned_dr (NULL),
822 peeling_for_alignment (0),
823 ptr_mask (0),
824 ivexpr_map (NULL),
825 slp_unrolling_factor (1),
826 single_scalar_iteration_cost (0),
827 vectorizable (false),
828 can_fully_mask_p (true),
829 fully_masked_p (false),
830 peeling_for_gaps (false),
831 peeling_for_niter (false),
832 operands_swapped (false),
833 no_data_dependencies (false),
834 has_mask_store (false),
835 scalar_loop (NULL),
836 orig_loop_info (NULL)
837 {
838 /* Create/Update stmt_info for all stmts in the loop. */
839 basic_block *body = get_loop_body (loop);
840 for (unsigned int i = 0; i < loop->num_nodes; i++)
841 {
842 basic_block bb = body[i];
843 gimple_stmt_iterator si;
844
845 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
846 {
847 gimple *phi = gsi_stmt (si);
848 gimple_set_uid (phi, 0);
849 add_stmt (phi);
850 }
851
852 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
853 {
854 gimple *stmt = gsi_stmt (si);
855 gimple_set_uid (stmt, 0);
856 add_stmt (stmt);
857 }
858 }
859 free (body);
860
861 /* CHECKME: We want to visit all BBs before their successors (except for
862 latch blocks, for which this assertion wouldn't hold). In the simple
863 case of the loop forms we allow, a dfs order of the BBs would be the same
864 as reversed postorder traversal, so we are safe. */
865
866 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
867 bbs, loop->num_nodes, loop);
868 gcc_assert (nbbs == loop->num_nodes);
869 }
870
871 /* Free all levels of MASKS. */
872
873 void
874 release_vec_loop_masks (vec_loop_masks *masks)
875 {
876 rgroup_masks *rgm;
877 unsigned int i;
878 FOR_EACH_VEC_ELT (*masks, i, rgm)
879 rgm->masks.release ();
880 masks->release ();
881 }
882
883 /* Free all memory used by the _loop_vec_info, as well as all the
884 stmt_vec_info structs of all the stmts in the loop. */
885
886 _loop_vec_info::~_loop_vec_info ()
887 {
888 int nbbs;
889 gimple_stmt_iterator si;
890 int j;
891
892 /* ??? We're releasing loop_vinfos en bloc. */
893 set_stmt_vec_info_vec (&stmt_vec_infos);
894 nbbs = loop->num_nodes;
895 for (j = 0; j < nbbs; j++)
896 {
897 basic_block bb = bbs[j];
898 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
899 free_stmt_vec_info (gsi_stmt (si));
900
901 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
902 {
903 gimple *stmt = gsi_stmt (si);
904
905 /* We may have broken canonical form by moving a constant
906 into RHS1 of a commutative op. Fix such occurrences. */
907 if (operands_swapped && is_gimple_assign (stmt))
908 {
909 enum tree_code code = gimple_assign_rhs_code (stmt);
910
911 if ((code == PLUS_EXPR
912 || code == POINTER_PLUS_EXPR
913 || code == MULT_EXPR)
914 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
915 swap_ssa_operands (stmt,
916 gimple_assign_rhs1_ptr (stmt),
917 gimple_assign_rhs2_ptr (stmt));
918 else if (code == COND_EXPR
919 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
920 {
921 tree cond_expr = gimple_assign_rhs1 (stmt);
922 enum tree_code cond_code = TREE_CODE (cond_expr);
923
924 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
925 {
926 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
927 0));
928 cond_code = invert_tree_comparison (cond_code,
929 honor_nans);
930 if (cond_code != ERROR_MARK)
931 {
932 TREE_SET_CODE (cond_expr, cond_code);
933 swap_ssa_operands (stmt,
934 gimple_assign_rhs2_ptr (stmt),
935 gimple_assign_rhs3_ptr (stmt));
936 }
937 }
938 }
939 }
940
941 /* Free stmt_vec_info. */
942 free_stmt_vec_info (stmt);
943 gsi_next (&si);
944 }
945 }
946
947 free (bbs);
948
949 release_vec_loop_masks (&masks);
950 delete ivexpr_map;
951
952 loop->aux = NULL;
953 }
954
955 /* Return an invariant or register for EXPR and emit necessary
956 computations in the LOOP_VINFO loop preheader. */
957
958 tree
959 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
960 {
961 if (is_gimple_reg (expr)
962 || is_gimple_min_invariant (expr))
963 return expr;
964
965 if (! loop_vinfo->ivexpr_map)
966 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
967 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
968 if (! cached)
969 {
970 gimple_seq stmts = NULL;
971 cached = force_gimple_operand (unshare_expr (expr),
972 &stmts, true, NULL_TREE);
973 if (stmts)
974 {
975 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
976 gsi_insert_seq_on_edge_immediate (e, stmts);
977 }
978 }
979 return cached;
980 }
981
982 /* Return true if we can use CMP_TYPE as the comparison type to produce
983 all masks required to mask LOOP_VINFO. */
984
985 static bool
986 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
987 {
988 rgroup_masks *rgm;
989 unsigned int i;
990 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
991 if (rgm->mask_type != NULL_TREE
992 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
993 cmp_type, rgm->mask_type,
994 OPTIMIZE_FOR_SPEED))
995 return false;
996 return true;
997 }
998
999 /* Calculate the maximum number of scalars per iteration for every
1000 rgroup in LOOP_VINFO. */
1001
1002 static unsigned int
1003 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1004 {
1005 unsigned int res = 1;
1006 unsigned int i;
1007 rgroup_masks *rgm;
1008 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1009 res = MAX (res, rgm->max_nscalars_per_iter);
1010 return res;
1011 }
1012
1013 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1014 whether we can actually generate the masks required. Return true if so,
1015 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1016
1017 static bool
1018 vect_verify_full_masking (loop_vec_info loop_vinfo)
1019 {
1020 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1021 unsigned int min_ni_width;
1022
1023 /* Use a normal loop if there are no statements that need masking.
1024 This only happens in rare degenerate cases: it means that the loop
1025 has no loads, no stores, and no live-out values. */
1026 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1027 return false;
1028
1029 /* Get the maximum number of iterations that is representable
1030 in the counter type. */
1031 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1032 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1033
1034 /* Get a more refined estimate for the number of iterations. */
1035 widest_int max_back_edges;
1036 if (max_loop_iterations (loop, &max_back_edges))
1037 max_ni = wi::smin (max_ni, max_back_edges + 1);
1038
1039 /* Account for rgroup masks, in which each bit is replicated N times. */
1040 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1041
1042 /* Work out how many bits we need to represent the limit. */
1043 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1044
1045 /* Find a scalar mode for which WHILE_ULT is supported. */
1046 opt_scalar_int_mode cmp_mode_iter;
1047 tree cmp_type = NULL_TREE;
1048 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1049 {
1050 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1051 if (cmp_bits >= min_ni_width
1052 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1053 {
1054 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1055 if (this_type
1056 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1057 {
1058 /* Although we could stop as soon as we find a valid mode,
1059 it's often better to continue until we hit Pmode, since the
1060 operands to the WHILE are more likely to be reusable in
1061 address calculations. */
1062 cmp_type = this_type;
1063 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1064 break;
1065 }
1066 }
1067 }
1068
1069 if (!cmp_type)
1070 return false;
1071
1072 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1073 return true;
1074 }
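
/* Conceptually (a rough sketch, not the exact GIMPLE that is emitted),
   a fully-masked loop processes its n scalar iterations in
   ceil (n / VF) vector iterations:

       for (i = 0; i < n; i += VF)
         {
           mask = WHILE_ULT (i, n);    lane j is active iff i + j < n
           ... loads, stores and live stmts predicated on "mask" ...
         }

   The comparison type chosen above (LOOP_VINFO_MASK_COMPARE_TYPE) is
   the scalar type of "i" and "n" in that WHILE_ULT comparison, which
   is why it must have at least the number of bits computed above.  */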
1075
1076 /* Calculate the cost of one scalar iteration of the loop. */
1077 static void
1078 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1079 {
1080 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1081 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1082 int nbbs = loop->num_nodes, factor;
1083 int innerloop_iters, i;
1084
1085 /* Gather costs for statements in the scalar loop. */
1086
1087 /* FORNOW. */
1088 innerloop_iters = 1;
1089 if (loop->inner)
1090 innerloop_iters = 50; /* FIXME */
1091
1092 for (i = 0; i < nbbs; i++)
1093 {
1094 gimple_stmt_iterator si;
1095 basic_block bb = bbs[i];
1096
1097 if (bb->loop_father == loop->inner)
1098 factor = innerloop_iters;
1099 else
1100 factor = 1;
1101
1102 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1103 {
1104 gimple *stmt = gsi_stmt (si);
1105 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1106
1107 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1108 continue;
1109
1110 /* Skip stmts that are not vectorized inside the loop. */
1111 if (stmt_info
1112 && !STMT_VINFO_RELEVANT_P (stmt_info)
1113 && (!STMT_VINFO_LIVE_P (stmt_info)
1114 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1115 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1116 continue;
1117
1118 vect_cost_for_stmt kind;
1119 if (STMT_VINFO_DATA_REF (stmt_info))
1120 {
1121 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1122 kind = scalar_load;
1123 else
1124 kind = scalar_store;
1125 }
1126 else
1127 kind = scalar_stmt;
1128
1129 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1130 factor, kind, stmt_info, 0, vect_prologue);
1131 }
1132 }
1133
1134 /* Now accumulate cost. */
1135 void *target_cost_data = init_cost (loop);
1136 stmt_info_for_cost *si;
1137 int j;
1138 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1139 j, si)
1140 {
1141 struct _stmt_vec_info *stmt_info
1142 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
1143 (void) add_stmt_cost (target_cost_data, si->count,
1144 si->kind, stmt_info, si->misalign,
1145 vect_body);
1146 }
1147 unsigned dummy, body_cost = 0;
1148 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1149 destroy_cost_data (target_cost_data);
1150 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1151 }
1152
1153
1154 /* Function vect_analyze_loop_form_1.
1155
1156 Verify that certain CFG restrictions hold, including:
1157 - the loop has a pre-header
1158 - the loop has a single entry and exit
1159 - the loop exit condition is simple enough
1160 - the number of iterations can be analyzed, i.e., it is a countable loop. The
1161 niter could be analyzed under some assumptions. */
1162
1163 bool
1164 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1165 tree *assumptions, tree *number_of_iterationsm1,
1166 tree *number_of_iterations, gcond **inner_loop_cond)
1167 {
1168 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1169
1170 /* Different restrictions apply when we are considering an inner-most loop,
1171 vs. an outer (nested) loop.
1172 (FORNOW. May want to relax some of these restrictions in the future). */
1173
1174 if (!loop->inner)
1175 {
1176 /* Inner-most loop. We currently require that the number of BBs is
1177 exactly 2 (the header and latch). Vectorizable inner-most loops
1178 look like this:
1179
1180 (pre-header)
1181 |
1182 header <--------+
1183 | | |
1184 | +--> latch --+
1185 |
1186 (exit-bb) */
1187
1188 if (loop->num_nodes != 2)
1189 {
1190 if (dump_enabled_p ())
1191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1192 "not vectorized: control flow in loop.\n");
1193 return false;
1194 }
1195
1196 if (empty_block_p (loop->header))
1197 {
1198 if (dump_enabled_p ())
1199 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1200 "not vectorized: empty loop.\n");
1201 return false;
1202 }
1203 }
1204 else
1205 {
1206 struct loop *innerloop = loop->inner;
1207 edge entryedge;
1208
1209 /* Nested loop. We currently require that the loop is doubly-nested,
1210 contains a single inner loop, and the number of BBs is exactly 5.
1211 Vectorizable outer-loops look like this:
1212
1213 (pre-header)
1214 |
1215 header <---+
1216 | |
1217 inner-loop |
1218 | |
1219 tail ------+
1220 |
1221 (exit-bb)
1222
1223 The inner-loop has the properties expected of inner-most loops
1224 as described above. */
1225
1226 if ((loop->inner)->inner || (loop->inner)->next)
1227 {
1228 if (dump_enabled_p ())
1229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1230 "not vectorized: multiple nested loops.\n");
1231 return false;
1232 }
1233
1234 if (loop->num_nodes != 5)
1235 {
1236 if (dump_enabled_p ())
1237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1238 "not vectorized: control flow in loop.\n");
1239 return false;
1240 }
1241
1242 entryedge = loop_preheader_edge (innerloop);
1243 if (entryedge->src != loop->header
1244 || !single_exit (innerloop)
1245 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1246 {
1247 if (dump_enabled_p ())
1248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249 "not vectorized: unsupported outerloop form.\n");
1250 return false;
1251 }
1252
1253 /* Analyze the inner-loop. */
1254 tree inner_niterm1, inner_niter, inner_assumptions;
1255 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1256 &inner_assumptions, &inner_niterm1,
1257 &inner_niter, NULL)
1258 /* Don't support analyzing niter under assumptions for inner
1259 loop. */
1260 || !integer_onep (inner_assumptions))
1261 {
1262 if (dump_enabled_p ())
1263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1264 "not vectorized: Bad inner loop.\n");
1265 return false;
1266 }
1267
1268 if (!expr_invariant_in_loop_p (loop, inner_niter))
1269 {
1270 if (dump_enabled_p ())
1271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1272 "not vectorized: inner-loop count not"
1273 " invariant.\n");
1274 return false;
1275 }
1276
1277 if (dump_enabled_p ())
1278 dump_printf_loc (MSG_NOTE, vect_location,
1279 "Considering outer-loop vectorization.\n");
1280 }
1281
1282 if (!single_exit (loop)
1283 || EDGE_COUNT (loop->header->preds) != 2)
1284 {
1285 if (dump_enabled_p ())
1286 {
1287 if (!single_exit (loop))
1288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1289 "not vectorized: multiple exits.\n");
1290 else if (EDGE_COUNT (loop->header->preds) != 2)
1291 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1292 "not vectorized: too many incoming edges.\n");
1293 }
1294 return false;
1295 }
1296
1297 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1298 that the loop is represented as a do-while (with a proper if-guard
1299 before the loop if needed), where the loop header contains all the
1300 executable statements, and the latch is empty. */
1301 if (!empty_block_p (loop->latch)
1302 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1303 {
1304 if (dump_enabled_p ())
1305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1306 "not vectorized: latch block not empty.\n");
1307 return false;
1308 }
1309
1310 /* Make sure the exit is not abnormal. */
1311 edge e = single_exit (loop);
1312 if (e->flags & EDGE_ABNORMAL)
1313 {
1314 if (dump_enabled_p ())
1315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1316 "not vectorized: abnormal loop exit edge.\n");
1317 return false;
1318 }
1319
1320 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1321 number_of_iterationsm1);
1322 if (!*loop_cond)
1323 {
1324 if (dump_enabled_p ())
1325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1326 "not vectorized: complicated exit condition.\n");
1327 return false;
1328 }
1329
1330 if (integer_zerop (*assumptions)
1331 || !*number_of_iterations
1332 || chrec_contains_undetermined (*number_of_iterations))
1333 {
1334 if (dump_enabled_p ())
1335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1336 "not vectorized: number of iterations cannot be "
1337 "computed.\n");
1338 return false;
1339 }
1340
1341 if (integer_zerop (*number_of_iterations))
1342 {
1343 if (dump_enabled_p ())
1344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1345 "not vectorized: number of iterations = 0.\n");
1346 return false;
1347 }
1348
1349 return true;
1350 }
1351
1352 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1353
1354 loop_vec_info
1355 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1356 {
1357 tree assumptions, number_of_iterations, number_of_iterationsm1;
1358 gcond *loop_cond, *inner_loop_cond = NULL;
1359
1360 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1361 &assumptions, &number_of_iterationsm1,
1362 &number_of_iterations, &inner_loop_cond))
1363 return NULL;
1364
1365 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1366 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1367 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1368 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1369 if (!integer_onep (assumptions))
1370 {
1371 /* We consider vectorizing this loop by versioning it under
1372 some assumptions. In order to do this, we need to clear
1373 existing information computed by scev and niter analyzer. */
1374 scev_reset_htab ();
1375 free_numbers_of_iterations_estimates (loop);
1376 /* Also set flag for this loop so that following scev and niter
1377 analysis are done under the assumptions. */
1378 loop_constraint_set (loop, LOOP_C_FINITE);
1379 /* Also record the assumptions for versioning. */
1380 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1381 }
1382
1383 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1384 {
1385 if (dump_enabled_p ())
1386 {
1387 dump_printf_loc (MSG_NOTE, vect_location,
1388 "Symbolic number of iterations is ");
1389 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1390 dump_printf (MSG_NOTE, "\n");
1391 }
1392 }
1393
1394 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1395 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1396 if (inner_loop_cond)
1397 {
1398 stmt_vec_info inner_loop_cond_info
1399 = loop_vinfo->lookup_stmt (inner_loop_cond);
1400 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1401 }
1402
1403 gcc_assert (!loop->aux);
1404 loop->aux = loop_vinfo;
1405 return loop_vinfo;
1406 }
1407
1408
1409
1410 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1411 statements, update the vectorization factor. */
1412
1413 static void
1414 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1415 {
1416 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1417 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1418 int nbbs = loop->num_nodes;
1419 poly_uint64 vectorization_factor;
1420 int i;
1421
1422 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1423
1424 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1425 gcc_assert (known_ne (vectorization_factor, 0U));
1426
1427 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1428 vectorization factor of the loop is the unrolling factor required by
1429 the SLP instances. If that unrolling factor is 1, we say that we
1430 perform pure SLP on the loop - cross-iteration parallelism is not
1431 exploited. */
1432 bool only_slp_in_loop = true;
1433 for (i = 0; i < nbbs; i++)
1434 {
1435 basic_block bb = bbs[i];
1436 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1437 gsi_next (&si))
1438 {
1439 gimple *stmt = gsi_stmt (si);
1440 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1441 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1442 && STMT_VINFO_RELATED_STMT (stmt_info))
1443 {
1444 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1445 stmt_info = vinfo_for_stmt (stmt);
1446 }
1447 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1448 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1449 && !PURE_SLP_STMT (stmt_info))
1450 /* STMT needs both SLP and loop-based vectorization. */
1451 only_slp_in_loop = false;
1452 }
1453 }
1454
1455 if (only_slp_in_loop)
1456 {
1457 dump_printf_loc (MSG_NOTE, vect_location,
1458 "Loop contains only SLP stmts\n");
1459 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1460 }
1461 else
1462 {
1463 dump_printf_loc (MSG_NOTE, vect_location,
1464 "Loop contains SLP and non-SLP stmts\n");
1465 /* Both the vectorization factor and unroll factor have the form
1466 current_vector_size * X for some rational X, so they must have
1467 a common multiple. */
1468 vectorization_factor
1469 = force_common_multiple (vectorization_factor,
1470 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1471 }
1472
1473 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1474 if (dump_enabled_p ())
1475 {
1476 dump_printf_loc (MSG_NOTE, vect_location,
1477 "Updating vectorization factor to ");
1478 dump_dec (MSG_NOTE, vectorization_factor);
1479 dump_printf (MSG_NOTE, ".\n");
1480 }
1481 }
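
/* For example, the force_common_multiple call above keeps a loop-based
   vectorization factor of 4 unchanged when combined with an SLP
   unrolling factor of 2, whereas an SLP unrolling factor of 8 raises
   the vectorization factor to 8.  */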
1482
1483 /* Return true if STMT_INFO describes a double reduction phi and if
1484 the other phi in the reduction is also relevant for vectorization.
1485 This rejects cases such as:
1486
1487 outer1:
1488 x_1 = PHI <x_3(outer2), ...>;
1489 ...
1490
1491 inner:
1492 x_2 = ...;
1493 ...
1494
1495 outer2:
1496 x_3 = PHI <x_2(inner)>;
1497
1498 if nothing in x_2 or elsewhere makes x_1 relevant. */
1499
1500 static bool
1501 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1502 {
1503 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1504 return false;
1505
1506 gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1507 return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1508 }
1509
1510 /* Function vect_analyze_loop_operations.
1511
1512 Scan the loop stmts and make sure they are all vectorizable. */
1513
1514 static bool
1515 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1516 {
1517 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1518 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1519 int nbbs = loop->num_nodes;
1520 int i;
1521 stmt_vec_info stmt_info;
1522 bool need_to_vectorize = false;
1523 bool ok;
1524
1525 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1526
1527 stmt_vector_for_cost cost_vec;
1528 cost_vec.create (2);
1529
1530 for (i = 0; i < nbbs; i++)
1531 {
1532 basic_block bb = bbs[i];
1533
1534 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1535 gsi_next (&si))
1536 {
1537 gphi *phi = si.phi ();
1538 ok = true;
1539
1540 stmt_info = loop_vinfo->lookup_stmt (phi);
1541 if (dump_enabled_p ())
1542 {
1543 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1544 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1545 }
1546 if (virtual_operand_p (gimple_phi_result (phi)))
1547 continue;
1548
1549 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1550 (i.e., a phi in the tail of the outer-loop). */
1551 if (! is_loop_header_bb_p (bb))
1552 {
1553 /* FORNOW: we currently don't support the case that these phis
1554 are not used in the outer loop (unless it is a double reduction,
1555 i.e., this phi is vect_reduction_def), because this case
1556 requires us to actually do something here. */
1557 if (STMT_VINFO_LIVE_P (stmt_info)
1558 && !vect_active_double_reduction_p (stmt_info))
1559 {
1560 if (dump_enabled_p ())
1561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1562 "Unsupported loop-closed phi in "
1563 "outer-loop.\n");
1564 return false;
1565 }
1566
1567 /* If PHI is used in the outer loop, we check that its operand
1568 is defined in the inner loop. */
1569 if (STMT_VINFO_RELEVANT_P (stmt_info))
1570 {
1571 tree phi_op;
1572
1573 if (gimple_phi_num_args (phi) != 1)
1574 return false;
1575
1576 phi_op = PHI_ARG_DEF (phi, 0);
1577 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1578 if (!op_def_info)
1579 return false;
1580
1581 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1582 && (STMT_VINFO_RELEVANT (op_def_info)
1583 != vect_used_in_outer_by_reduction))
1584 return false;
1585 }
1586
1587 continue;
1588 }
1589
1590 gcc_assert (stmt_info);
1591
1592 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1593 || STMT_VINFO_LIVE_P (stmt_info))
1594 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1595 {
1596 /* A scalar-dependence cycle that we don't support. */
1597 if (dump_enabled_p ())
1598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1599 "not vectorized: scalar dependence cycle.\n");
1600 return false;
1601 }
1602
1603 if (STMT_VINFO_RELEVANT_P (stmt_info))
1604 {
1605 need_to_vectorize = true;
1606 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1607 && ! PURE_SLP_STMT (stmt_info))
1608 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1609 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1610 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1611 && ! PURE_SLP_STMT (stmt_info))
1612 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1613 &cost_vec);
1614 }
1615
1616 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1617 if (ok
1618 && STMT_VINFO_LIVE_P (stmt_info)
1619 && !PURE_SLP_STMT (stmt_info))
1620 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1621 &cost_vec);
1622
1623 if (!ok)
1624 {
1625 if (dump_enabled_p ())
1626 {
1627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1628 "not vectorized: relevant phi not "
1629 "supported: ");
1630 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1631 }
1632 return false;
1633 }
1634 }
1635
1636 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1637 gsi_next (&si))
1638 {
1639 gimple *stmt = gsi_stmt (si);
1640 if (!gimple_clobber_p (stmt)
1641 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1642 &cost_vec))
1643 return false;
1644 }
1645 } /* bbs */
1646
1647 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1648 cost_vec.release ();
1649
1650 /* All operations in the loop are either irrelevant (they deal with loop
1651 control, or are dead), or only used outside the loop and can be moved
1652 out of the loop (e.g. invariants, inductions). The loop can be
1653 optimized away by scalar optimizations. We're better off not
1654 touching this loop. */
1655 if (!need_to_vectorize)
1656 {
1657 if (dump_enabled_p ())
1658 dump_printf_loc (MSG_NOTE, vect_location,
1659 "All the computation can be taken out of the loop.\n");
1660 if (dump_enabled_p ())
1661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1662 "not vectorized: redundant loop. no profit to "
1663 "vectorize.\n");
1664 return false;
1665 }
1666
1667 return true;
1668 }
1669
1670 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1671 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1672 definitely no, or -1 if it's worth retrying. */
1673
1674 static int
1675 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1676 {
1677 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1678 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1679
1680 /* Only fully-masked loops can have iteration counts less than the
1681 vectorization factor. */
1682 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1683 {
1684 HOST_WIDE_INT max_niter;
1685
1686 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1687 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1688 else
1689 max_niter = max_stmt_executions_int (loop);
1690
1691 if (max_niter != -1
1692 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1693 {
1694 if (dump_enabled_p ())
1695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1696 "not vectorized: iteration count smaller than "
1697 "vectorization factor.\n");
1698 return 0;
1699 }
1700 }
1701
1702 int min_profitable_iters, min_profitable_estimate;
1703 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1704 &min_profitable_estimate);
1705
1706 if (min_profitable_iters < 0)
1707 {
1708 if (dump_enabled_p ())
1709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1710 "not vectorized: vectorization not profitable.\n");
1711 if (dump_enabled_p ())
1712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1713 "not vectorized: vector version will never be "
1714 "profitable.\n");
1715 return -1;
1716 }
1717
1718 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1719 * assumed_vf);
1720
1721 /* Use the cost model only if it is more conservative than the user-specified
1722 threshold. */
1723 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1724 min_profitable_iters);
1725
1726 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1727
1728 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1729 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1730 {
1731 if (dump_enabled_p ())
1732 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1733 "not vectorized: vectorization not profitable.\n");
1734 if (dump_enabled_p ())
1735 dump_printf_loc (MSG_NOTE, vect_location,
1736 "not vectorized: iteration count smaller than user "
1737 "specified loop bound parameter or minimum profitable "
1738 "iterations (whichever is more conservative).\n");
1739 return 0;
1740 }
1741
1742 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1743 if (estimated_niter == -1)
1744 estimated_niter = likely_max_stmt_executions_int (loop);
1745 if (estimated_niter != -1
1746 && ((unsigned HOST_WIDE_INT) estimated_niter
1747 < MAX (th, (unsigned) min_profitable_estimate)))
1748 {
1749 if (dump_enabled_p ())
1750 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1751 "not vectorized: estimated iteration count too "
1752 "small.\n");
1753 if (dump_enabled_p ())
1754 dump_printf_loc (MSG_NOTE, vect_location,
1755 "not vectorized: estimated iteration count smaller "
1756 "than specified loop bound parameter or minimum "
1757 "profitable iterations (whichever is more "
1758 "conservative).\n");
1759 return -1;
1760 }
1761
1762 return 1;
1763 }
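
/* For example, with --param min-vect-loop-bound=2 and an assumed VF
   of 4, min_scalar_loop_bound above is 2 * 4 = 8; if the cost model
   then reports a min_profitable_iters of 10, the threshold th becomes
   MAX (8, 10) = 10, and a loop known to execute fewer than 10
   iterations is rejected by the check above.  */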
1764
1765 static bool
1766 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1767 vec<data_reference_p> *datarefs,
1768 unsigned int *n_stmts)
1769 {
1770 *n_stmts = 0;
1771 for (unsigned i = 0; i < loop->num_nodes; i++)
1772 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1773 !gsi_end_p (gsi); gsi_next (&gsi))
1774 {
1775 gimple *stmt = gsi_stmt (gsi);
1776 if (is_gimple_debug (stmt))
1777 continue;
1778 ++(*n_stmts);
1779 if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1780 {
1781 if (is_gimple_call (stmt) && loop->safelen)
1782 {
1783 tree fndecl = gimple_call_fndecl (stmt), op;
1784 if (fndecl != NULL_TREE)
1785 {
1786 cgraph_node *node = cgraph_node::get (fndecl);
1787 if (node != NULL && node->simd_clones != NULL)
1788 {
1789 unsigned int j, n = gimple_call_num_args (stmt);
1790 for (j = 0; j < n; j++)
1791 {
1792 op = gimple_call_arg (stmt, j);
1793 if (DECL_P (op)
1794 || (REFERENCE_CLASS_P (op)
1795 && get_base_address (op)))
1796 break;
1797 }
1798 op = gimple_call_lhs (stmt);
1799 /* Ignore #pragma omp declare simd functions
1800 if they don't have data references in the
1801 call stmt itself. */
1802 if (j == n
1803 && !(op
1804 && (DECL_P (op)
1805 || (REFERENCE_CLASS_P (op)
1806 && get_base_address (op)))))
1807 continue;
1808 }
1809 }
1810 }
1811 return false;
1812 }
1813 /* If dependence analysis would give up due to the limit on the
1814 number of datarefs, stop here and fail fatally. */
1815 if (datarefs->length ()
1816 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1817 return false;
1818 }
1819 return true;
1820 }
1821
1822 /* Function vect_analyze_loop_2.
1823
1824 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1825 for it. The different analyses will record information in the
1826 loop_vec_info struct. */
1827 static bool
1828 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1829 {
1830 bool ok;
1831 int res;
1832 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1833 poly_uint64 min_vf = 2;
1834
1835 /* The first group of checks is independent of the vector size. */
1836 fatal = true;
1837
1838 /* Find all data references in the loop (which correspond to vdefs/vuses)
1839 and analyze their evolution in the loop. */
1840
1841 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1842
1843 /* Gather the data references and count stmts in the loop. */
1844 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1845 {
1846 if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1847 &LOOP_VINFO_DATAREFS (loop_vinfo),
1848 n_stmts))
1849 {
1850 if (dump_enabled_p ())
1851 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1852 "not vectorized: loop contains function "
1853 "calls or data references that cannot "
1854 "be analyzed\n");
1855 return false;
1856 }
1857 loop_vinfo->shared->save_datarefs ();
1858 }
1859 else
1860 loop_vinfo->shared->check_datarefs ();
1861
1862 /* Analyze the data references and also adjust the minimal
1863 vectorization factor according to the loads and stores. */
1864
1865 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1866 if (!ok)
1867 {
1868 if (dump_enabled_p ())
1869 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1870 "bad data references.\n");
1871 return false;
1872 }
1873
1874 /* Classify all cross-iteration scalar data-flow cycles.
1875 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1876 vect_analyze_scalar_cycles (loop_vinfo);
1877
1878 vect_pattern_recog (loop_vinfo);
1879
1880 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1881
1882 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1883 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1884
1885 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1886 if (!ok)
1887 {
1888 if (dump_enabled_p ())
1889 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1890 "bad data access.\n");
1891 return false;
1892 }
1893
1894 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1895
1896 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1897 if (!ok)
1898 {
1899 if (dump_enabled_p ())
1900 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1901 "unexpected pattern.\n");
1902 return false;
1903 }
1904
1905 /* The rest of the analysis below depends on the vector size in some way, so failures beyond this point are not fatal. */
1906 fatal = false;
1907
1908 /* Analyze data dependences between the data-refs in the loop
1909 and adjust the maximum vectorization factor according to
1910 the dependences.
1911 FORNOW: fail at the first data dependence that we encounter. */
1912
1913 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1914 if (!ok
1915 || (max_vf != MAX_VECTORIZATION_FACTOR
1916 && maybe_lt (max_vf, min_vf)))
1917 {
1918 if (dump_enabled_p ())
1919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1920 "bad data dependence.\n");
1921 return false;
1922 }
1923 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1924
1925 ok = vect_determine_vectorization_factor (loop_vinfo);
1926 if (!ok)
1927 {
1928 if (dump_enabled_p ())
1929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1930 "can't determine vectorization factor.\n");
1931 return false;
1932 }
1933 if (max_vf != MAX_VECTORIZATION_FACTOR
1934 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1935 {
1936 if (dump_enabled_p ())
1937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1938 "bad data dependence.\n");
1939 return false;
1940 }
1941
1942 /* Compute the scalar iteration cost. */
1943 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1944
1945 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1946 unsigned th;
1947
1948 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1949 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1950 if (!ok)
1951 return false;
1952
1953 /* If there are any SLP instances mark them as pure_slp. */
1954 bool slp = vect_make_slp_decision (loop_vinfo);
1955 if (slp)
1956 {
1957 /* Find stmts that need to be both vectorized and SLPed. */
1958 vect_detect_hybrid_slp (loop_vinfo);
1959
1960 /* Update the vectorization factor based on the SLP decision. */
1961 vect_update_vf_for_slp (loop_vinfo);
1962 }
1963
1964 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1965
1966 /* We don't expect to have to roll back to anything other than an empty
1967 set of rgroups. */
1968 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1969
1970 /* This is the point where we can re-start analysis with SLP forced off. */
1971 start_over:
1972
1973 /* Now the vectorization factor is final. */
1974 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1975 gcc_assert (known_ne (vectorization_factor, 0U));
1976
1977 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1978 {
1979 dump_printf_loc (MSG_NOTE, vect_location,
1980 "vectorization_factor = ");
1981 dump_dec (MSG_NOTE, vectorization_factor);
1982 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1983 LOOP_VINFO_INT_NITERS (loop_vinfo));
1984 }
1985
1986 HOST_WIDE_INT max_niter
1987 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1988
1989 /* Analyze the alignment of the data-refs in the loop.
1990 Fail if a data reference is found that cannot be vectorized. */
1991
1992 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1993 if (!ok)
1994 {
1995 if (dump_enabled_p ())
1996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1997 "bad data alignment.\n");
1998 return false;
1999 }
2000
2001 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2002 It is important to call pruning after vect_analyze_data_ref_accesses,
2003 since we use grouping information gathered by interleaving analysis. */
2004 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2005 if (!ok)
2006 return false;
2007
2008 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2009 vectorization. */
2010 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2011 {
2012 /* This pass will decide on using loop versioning and/or loop peeling in
2013 order to enhance the alignment of data references in the loop. */
2014 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2015 if (!ok)
2016 {
2017 if (dump_enabled_p ())
2018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2019 "bad data alignment.\n");
2020 return false;
2021 }
2022 }
2023
2024 if (slp)
2025 {
2026 /* Analyze operations in the SLP instances. Note this may
2027 remove unsupported SLP instances which makes the above
2028 SLP kind detection invalid. */
2029 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2030 vect_slp_analyze_operations (loop_vinfo);
2031 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2032 goto again;
2033 }
2034
2035 /* Scan all the remaining operations in the loop that are not subject
2036 to SLP and make sure they are vectorizable. */
2037 ok = vect_analyze_loop_operations (loop_vinfo);
2038 if (!ok)
2039 {
2040 if (dump_enabled_p ())
2041 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2042 "bad operation or unsupported loop bound.\n");
2043 return false;
2044 }
2045
2046 /* Decide whether to use a fully-masked loop for this vectorization
2047 factor. */
2048 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2049 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2050 && vect_verify_full_masking (loop_vinfo));
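/* Roughly speaking, in a fully-masked loop every iteration, including the
   final partial one, executes under a loop mask, which is why such loops
   need no scalar epilogue and may have iteration counts below the
   vectorization factor (see the checks above and below).  */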
2051 if (dump_enabled_p ())
2052 {
2053 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2054 dump_printf_loc (MSG_NOTE, vect_location,
2055 "using a fully-masked loop.\n");
2056 else
2057 dump_printf_loc (MSG_NOTE, vect_location,
2058 "not using a fully-masked loop.\n");
2059 }
2060
2061 /* If an epilogue loop is required because of data accesses with gaps,
2062 one additional iteration needs to be peeled. Check if there are
2063 enough iterations for vectorization. */
2064 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2065 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2066 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2067 {
2068 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2069 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2070
2071 if (known_lt (wi::to_widest (scalar_niters), vf))
2072 {
2073 if (dump_enabled_p ())
2074 dump_printf_loc (MSG_NOTE, vect_location,
2075 "loop has no enough iterations to support"
2076 " peeling for gaps.\n");
2077 return false;
2078 }
2079 }
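/* Illustrative example (hypothetical numbers): with VF == 4 and peeling for
   gaps, a loop whose scalar iteration count is at most 4 (NITERSM1 < VF)
   leaves no complete vector iteration once one iteration is reserved for
   the gap, so vectorization is rejected above.  */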
2080
2081 /* Check the costings of the loop make vectorizing worthwhile. */
2082 res = vect_analyze_loop_costing (loop_vinfo);
2083 if (res < 0)
2084 goto again;
2085 if (!res)
2086 {
2087 if (dump_enabled_p ())
2088 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2089 "Loop costings not worthwhile.\n");
2090 return false;
2091 }
2092
2093 /* Decide whether we need to create an epilogue loop to handle
2094 remaining scalar iterations. */
2095 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2096
2097 unsigned HOST_WIDE_INT const_vf;
2098 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2099 /* The main loop handles all iterations. */
2100 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2101 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2102 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2103 {
2104 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2105 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2106 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2107 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2108 }
2109 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2110 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2111 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2112 < (unsigned) exact_log2 (const_vf))
2113 /* In case of versioning, check if the maximum number of
2114 iterations is greater than th. If not, the epilogue
2115 is unnecessary. */
2116 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2117 || ((unsigned HOST_WIDE_INT) max_niter
2118 > (th / const_vf) * const_vf))))
2119 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
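/* Illustrative example (hypothetical numbers): with const_vf == 8 and no
   peeling for alignment, a loop whose iteration count NITERS is known to
   have at least exact_log2 (8) == 3 trailing zero bits (i.e. is a multiple
   of 8) needs no epilogue here; otherwise an epilogue loop handles the
   remaining NITERS % 8 scalar iterations.  */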
2120
2121 /* If an epilogue loop is required make sure we can create one. */
2122 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2123 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2124 {
2125 if (dump_enabled_p ())
2126 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2127 if (!vect_can_advance_ivs_p (loop_vinfo)
2128 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2129 single_exit (LOOP_VINFO_LOOP
2130 (loop_vinfo))))
2131 {
2132 if (dump_enabled_p ())
2133 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2134 "not vectorized: can't create required "
2135 "epilog loop\n");
2136 goto again;
2137 }
2138 }
2139
2140 /* During peeling, we need to check whether the number of loop iterations
2141 is enough for both the peeled prolog loop and the vector loop. This
2142 check can be merged with the threshold check of loop versioning, so
2143 increase the threshold for this case if necessary. */
2144 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2145 {
2146 poly_uint64 niters_th = 0;
2147
2148 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2149 {
2150 /* Niters for peeled prolog loop. */
2151 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2152 {
2153 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2154 tree vectype
2155 = STMT_VINFO_VECTYPE (vinfo_for_stmt (vect_dr_stmt (dr)));
2156 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2157 }
2158 else
2159 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2160 }
2161
2162 /* Niters for at least one iteration of vectorized loop. */
2163 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2164 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2165 /* One additional iteration because of peeling for gap. */
2166 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2167 niters_th += 1;
2168 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2169 }
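/* Worked example (hypothetical numbers): with 3 prologue iterations peeled
   for alignment, VF == 4 and peeling for gaps required, the versioning
   threshold computed above is niters_th = 3 + 4 + 1 = 8.  */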
2170
2171 gcc_assert (known_eq (vectorization_factor,
2172 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2173
2174 /* Ok to vectorize! */
2175 return true;
2176
2177 again:
2178 /* Try again with SLP forced off, but if we didn't do any SLP there is
2179 no point in re-trying. */
2180 if (!slp)
2181 return false;
2182
2183 /* If there are reduction chains re-trying will fail anyway. */
2184 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2185 return false;
2186
2187 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2188 via interleaving or lane instructions. */
2189 slp_instance instance;
2190 slp_tree node;
2191 unsigned i, j;
2192 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2193 {
2194 stmt_vec_info vinfo;
2195 vinfo = vinfo_for_stmt
2196 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2197 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2198 continue;
2199 vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2200 unsigned int size = DR_GROUP_SIZE (vinfo);
2201 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2202 if (! vect_store_lanes_supported (vectype, size, false)
2203 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2204 && ! vect_grouped_store_supported (vectype, size))
2205 return false;
2206 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2207 {
2208 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2209 vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2210 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2211 size = DR_GROUP_SIZE (vinfo);
2212 vectype = STMT_VINFO_VECTYPE (vinfo);
2213 if (! vect_load_lanes_supported (vectype, size, false)
2214 && ! vect_grouped_load_supported (vectype, single_element_p,
2215 size))
2216 return false;
2217 }
2218 }
2219
2220 if (dump_enabled_p ())
2221 dump_printf_loc (MSG_NOTE, vect_location,
2222 "re-trying with SLP disabled\n");
2223
2224 /* Roll back state appropriately. No SLP this time. */
2225 slp = false;
2226 /* Restore the vectorization factor as it would be without SLP. */
2227 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2228 /* Free the SLP instances. */
2229 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2230 vect_free_slp_instance (instance, false);
2231 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2232 /* Reset SLP type to loop_vect on all stmts. */
2233 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2234 {
2235 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2236 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2237 !gsi_end_p (si); gsi_next (&si))
2238 {
2239 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2240 STMT_SLP_TYPE (stmt_info) = loop_vect;
2241 }
2242 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2243 !gsi_end_p (si); gsi_next (&si))
2244 {
2245 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2246 STMT_SLP_TYPE (stmt_info) = loop_vect;
2247 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2248 {
2249 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2250 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2251 STMT_SLP_TYPE (stmt_info) = loop_vect;
2252 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2253 !gsi_end_p (pi); gsi_next (&pi))
2254 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2255 = loop_vect;
2256 }
2257 }
2258 }
2259 /* Free optimized alias test DDRS. */
2260 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2261 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2262 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2263 /* Reset target cost data. */
2264 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2265 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2266 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2267 /* Reset accumulated rgroup information. */
2268 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2269 /* Reset assorted flags. */
2270 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2271 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2272 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2273 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2274 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2275
2276 goto start_over;
2277 }
2278
2279 /* Function vect_analyze_loop.
2280
2281 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2282 for it. The different analyses will record information in the
2283 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, LOOP is the
2284 epilogue of the loop described by ORIG_LOOP_VINFO and must be vectorized. */
2285 loop_vec_info
2286 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2287 vec_info_shared *shared)
2288 {
2289 loop_vec_info loop_vinfo;
2290 auto_vector_sizes vector_sizes;
2291
2292 /* Autodetect first vector size we try. */
2293 current_vector_size = 0;
2294 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2295 unsigned int next_size = 0;
2296
2297 DUMP_VECT_SCOPE ("analyze_loop_nest");
2298
2299 if (loop_outer (loop)
2300 && loop_vec_info_for_loop (loop_outer (loop))
2301 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2302 {
2303 if (dump_enabled_p ())
2304 dump_printf_loc (MSG_NOTE, vect_location,
2305 "outer-loop already vectorized.\n");
2306 return NULL;
2307 }
2308
2309 if (!find_loop_nest (loop, &shared->loop_nest))
2310 {
2311 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2313 "not vectorized: loop nest containing two "
2314 "or more consecutive inner loops cannot be "
2315 "vectorized\n");
2316 return NULL;
2317 }
2318
2319 unsigned n_stmts = 0;
2320 poly_uint64 autodetected_vector_size = 0;
2321 while (1)
2322 {
2323 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2324 loop_vinfo = vect_analyze_loop_form (loop, shared);
2325 if (!loop_vinfo)
2326 {
2327 if (dump_enabled_p ())
2328 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2329 "bad loop form.\n");
2330 return NULL;
2331 }
2332
2333 bool fatal = false;
2334
2335 if (orig_loop_vinfo)
2336 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2337
2338 if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2339 {
2340 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2341
2342 return loop_vinfo;
2343 }
2344
2345 delete loop_vinfo;
2346
2347 if (next_size == 0)
2348 autodetected_vector_size = current_vector_size;
2349
2350 if (next_size < vector_sizes.length ()
2351 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2352 next_size += 1;
2353
2354 if (fatal
2355 || next_size == vector_sizes.length ()
2356 || known_eq (current_vector_size, 0U))
2357 return NULL;
2358
2359 /* Try the next biggest vector size. */
2360 current_vector_size = vector_sizes[next_size++];
2361 if (dump_enabled_p ())
2362 {
2363 dump_printf_loc (MSG_NOTE, vect_location,
2364 "***** Re-trying analysis with "
2365 "vector size ");
2366 dump_dec (MSG_NOTE, current_vector_size);
2367 dump_printf (MSG_NOTE, "\n");
2368 }
2369 }
2370 }
2371
2372 /* Return true if there is an in-order reduction function for CODE, storing
2373 it in *REDUC_FN if so. */
2374
2375 static bool
2376 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2377 {
2378 switch (code)
2379 {
2380 case PLUS_EXPR:
2381 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2382 return true;
2383
2384 default:
2385 return false;
2386 }
2387 }
2388
2389 /* Function reduction_fn_for_scalar_code
2390
2391 Input:
2392 CODE - tree_code of a reduction operation.
2393
2394 Output:
2395 REDUC_FN - the corresponding internal function to be used to reduce the
2396 vector of partial results into a single scalar result, or IFN_LAST
2397 if the operation is a supported reduction operation, but does not have
2398 such an internal function.
2399
2400 Return FALSE if CODE currently cannot be vectorized as reduction. */
2401
2402 static bool
2403 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2404 {
2405 switch (code)
2406 {
2407 case MAX_EXPR:
2408 *reduc_fn = IFN_REDUC_MAX;
2409 return true;
2410
2411 case MIN_EXPR:
2412 *reduc_fn = IFN_REDUC_MIN;
2413 return true;
2414
2415 case PLUS_EXPR:
2416 *reduc_fn = IFN_REDUC_PLUS;
2417 return true;
2418
2419 case BIT_AND_EXPR:
2420 *reduc_fn = IFN_REDUC_AND;
2421 return true;
2422
2423 case BIT_IOR_EXPR:
2424 *reduc_fn = IFN_REDUC_IOR;
2425 return true;
2426
2427 case BIT_XOR_EXPR:
2428 *reduc_fn = IFN_REDUC_XOR;
2429 return true;
2430
2431 case MULT_EXPR:
2432 case MINUS_EXPR:
2433 *reduc_fn = IFN_LAST;
2434 return true;
2435
2436 default:
2437 return false;
2438 }
2439 }
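/* For example, MULT_EXPR above is a supported reduction operation but has
   no corresponding IFN_REDUC_* function, so *REDUC_FN is set to IFN_LAST
   and the final reduction of the vector of partial results has to be
   generated by other means.  */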
2440
2441 /* If there is a neutral value X such that SLP reduction NODE would not
2442 be affected by the introduction of additional X elements, return that X,
2443 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2444 is true if the SLP statements perform a single reduction, false if each
2445 statement performs an independent reduction. */
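/* For instance, appending 0 elements leaves a PLUS_EXPR reduction unchanged
   and appending 1 elements leaves a MULT_EXPR reduction unchanged, which is
   what the switch below returns for those codes.  */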
2446
2447 static tree
2448 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2449 bool reduc_chain)
2450 {
2451 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2452 gimple *stmt = stmts[0];
2453 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2454 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2455 tree scalar_type = TREE_TYPE (vector_type);
2456 struct loop *loop = gimple_bb (stmt)->loop_father;
2457 gcc_assert (loop);
2458
2459 switch (code)
2460 {
2461 case WIDEN_SUM_EXPR:
2462 case DOT_PROD_EXPR:
2463 case SAD_EXPR:
2464 case PLUS_EXPR:
2465 case MINUS_EXPR:
2466 case BIT_IOR_EXPR:
2467 case BIT_XOR_EXPR:
2468 return build_zero_cst (scalar_type);
2469
2470 case MULT_EXPR:
2471 return build_one_cst (scalar_type);
2472
2473 case BIT_AND_EXPR:
2474 return build_all_ones_cst (scalar_type);
2475
2476 case MAX_EXPR:
2477 case MIN_EXPR:
2478 /* For MIN/MAX the initial values are neutral. A reduction chain
2479 has only a single initial value, so that value is neutral for
2480 all statements. */
2481 if (reduc_chain)
2482 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2483 return NULL_TREE;
2484
2485 default:
2486 return NULL_TREE;
2487 }
2488 }
2489
2490 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2491 STMT is printed with a message MSG. */
2492
2493 static void
2494 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2495 {
2496 dump_printf_loc (msg_type, vect_location, "%s", msg);
2497 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2498 }
2499
2500 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2501 operation. Return true if the results of DEF_STMT_INFO are something
2502 that can be accumulated by such a reduction. */
2503
2504 static bool
2505 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2506 {
2507 return (is_gimple_assign (def_stmt_info->stmt)
2508 || is_gimple_call (def_stmt_info->stmt)
2509 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2510 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2511 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2512 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2513 }
2514
2515 /* Detect SLP reduction of the form:
2516
2517 #a1 = phi <a5, a0>
2518 a2 = operation (a1)
2519 a3 = operation (a2)
2520 a4 = operation (a3)
2521 a5 = operation (a4)
2522
2523 #a = phi <a5>
2524
2525 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2526 FIRST_STMT is the first reduction stmt in the chain
2527 (a2 = operation (a1)).
2528
2529 Return TRUE if a reduction chain was detected. */
2530
2531 static bool
2532 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2533 gimple *first_stmt)
2534 {
2535 struct loop *loop = (gimple_bb (phi))->loop_father;
2536 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2537 enum tree_code code;
2538 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2539 stmt_vec_info use_stmt_info, current_stmt_info;
2540 tree lhs;
2541 imm_use_iterator imm_iter;
2542 use_operand_p use_p;
2543 int nloop_uses, size = 0, n_out_of_loop_uses;
2544 bool found = false;
2545
2546 if (loop != vect_loop)
2547 return false;
2548
2549 lhs = PHI_RESULT (phi);
2550 code = gimple_assign_rhs_code (first_stmt);
2551 while (1)
2552 {
2553 nloop_uses = 0;
2554 n_out_of_loop_uses = 0;
2555 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2556 {
2557 gimple *use_stmt = USE_STMT (use_p);
2558 if (is_gimple_debug (use_stmt))
2559 continue;
2560
2561 /* Check if we got back to the reduction phi. */
2562 if (use_stmt == phi)
2563 {
2564 loop_use_stmt = use_stmt;
2565 found = true;
2566 break;
2567 }
2568
2569 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2570 {
2571 loop_use_stmt = use_stmt;
2572 nloop_uses++;
2573 }
2574 else
2575 n_out_of_loop_uses++;
2576
2577 /* There can be either a single use in the loop or two uses in
2578 phi nodes. */
2579 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2580 return false;
2581 }
2582
2583 if (found)
2584 break;
2585
2586 /* We reached a statement with no loop uses. */
2587 if (nloop_uses == 0)
2588 return false;
2589
2590 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2591 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2592 return false;
2593
2594 if (!is_gimple_assign (loop_use_stmt)
2595 || code != gimple_assign_rhs_code (loop_use_stmt)
2596 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2597 return false;
2598
2599 /* Insert USE_STMT into reduction chain. */
2600 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2601 if (current_stmt)
2602 {
2603 current_stmt_info = vinfo_for_stmt (current_stmt);
2604 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2605 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2606 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2607 }
2608 else
2609 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2610
2611 lhs = gimple_assign_lhs (loop_use_stmt);
2612 current_stmt = loop_use_stmt;
2613 size++;
2614 }
2615
2616 if (!found || loop_use_stmt != phi || size < 2)
2617 return false;
2618
2619 /* Swap the operands, if needed, to make the reduction operand be the second
2620 operand. */
2621 lhs = PHI_RESULT (phi);
2622 next_stmt = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2623 while (next_stmt)
2624 {
2625 if (gimple_assign_rhs2 (next_stmt) == lhs)
2626 {
2627 tree op = gimple_assign_rhs1 (next_stmt);
2628 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2629
2630 /* Check that the other def is either defined in the loop
2631 ("vect_internal_def"), or it's an induction (defined by a
2632 loop-header phi-node). */
2633 if (def_stmt_info
2634 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2635 && vect_valid_reduction_input_p (def_stmt_info))
2636 {
2637 lhs = gimple_assign_lhs (next_stmt);
2638 next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2639 continue;
2640 }
2641
2642 return false;
2643 }
2644 else
2645 {
2646 tree op = gimple_assign_rhs2 (next_stmt);
2647 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2648
2649 /* Check that the other def is either defined in the loop
2650 ("vect_internal_def"), or it's an induction (defined by a
2651 loop-header phi-node). */
2652 if (def_stmt_info
2653 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2654 && vect_valid_reduction_input_p (def_stmt_info))
2655 {
2656 if (dump_enabled_p ())
2657 {
2658 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2659 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2660 }
2661
2662 swap_ssa_operands (next_stmt,
2663 gimple_assign_rhs1_ptr (next_stmt),
2664 gimple_assign_rhs2_ptr (next_stmt));
2665 update_stmt (next_stmt);
2666
2667 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2668 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2669 }
2670 else
2671 return false;
2672 }
2673
2674 lhs = gimple_assign_lhs (next_stmt);
2675 next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2676 }
2677
2678 /* Save the chain for further analysis in SLP detection. */
2679 first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2680 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2681 REDUC_GROUP_SIZE (vinfo_for_stmt (first)) = size;
2682
2683 return true;
2684 }
2685
2686 /* Return true if we need an in-order reduction for operation CODE
2687 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2688 overflow must wrap. */
2689
2690 static bool
2691 needs_fold_left_reduction_p (tree type, tree_code code,
2692 bool need_wrapping_integral_overflow)
2693 {
2694 /* CHECKME: check for !flag_finite_math_only too? */
2695 if (SCALAR_FLOAT_TYPE_P (type))
2696 switch (code)
2697 {
2698 case MIN_EXPR:
2699 case MAX_EXPR:
2700 return false;
2701
2702 default:
2703 return !flag_associative_math;
2704 }
2705
2706 if (INTEGRAL_TYPE_P (type))
2707 {
2708 if (!operation_no_trapping_overflow (type, code))
2709 return true;
2710 if (need_wrapping_integral_overflow
2711 && !TYPE_OVERFLOW_WRAPS (type)
2712 && operation_can_overflow (code))
2713 return true;
2714 return false;
2715 }
2716
2717 if (SAT_FIXED_POINT_TYPE_P (type))
2718 return true;
2719
2720 return false;
2721 }
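/* For example, floating-point addition is not associative, so without
   -fassociative-math a float PLUS_EXPR reduction must be computed in the
   original (fold-left) order, whereas float MIN_EXPR/MAX_EXPR reductions
   are reported above as not needing in-order evaluation.  */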
2722
2723 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2724 reduction operation CODE has a handled computation expression. */
2725
2726 bool
2727 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2728 tree loop_arg, enum tree_code code)
2729 {
2730 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2731 auto_bitmap visited;
2732 tree lookfor = PHI_RESULT (phi);
2733 ssa_op_iter curri;
2734 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2735 while (USE_FROM_PTR (curr) != loop_arg)
2736 curr = op_iter_next_use (&curri);
2737 curri.i = curri.numops;
2738 do
2739 {
2740 path.safe_push (std::make_pair (curri, curr));
2741 tree use = USE_FROM_PTR (curr);
2742 if (use == lookfor)
2743 break;
2744 gimple *def = SSA_NAME_DEF_STMT (use);
2745 if (gimple_nop_p (def)
2746 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2747 {
2748 pop:
2749 do
2750 {
2751 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2752 curri = x.first;
2753 curr = x.second;
2754 do
2755 curr = op_iter_next_use (&curri);
2756 /* Skip already visited or non-SSA operands (from iterating
2757 over PHI args). */
2758 while (curr != NULL_USE_OPERAND_P
2759 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2760 || ! bitmap_set_bit (visited,
2761 SSA_NAME_VERSION
2762 (USE_FROM_PTR (curr)))));
2763 }
2764 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2765 if (curr == NULL_USE_OPERAND_P)
2766 break;
2767 }
2768 else
2769 {
2770 if (gimple_code (def) == GIMPLE_PHI)
2771 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2772 else
2773 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2774 while (curr != NULL_USE_OPERAND_P
2775 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2776 || ! bitmap_set_bit (visited,
2777 SSA_NAME_VERSION
2778 (USE_FROM_PTR (curr)))))
2779 curr = op_iter_next_use (&curri);
2780 if (curr == NULL_USE_OPERAND_P)
2781 goto pop;
2782 }
2783 }
2784 while (1);
2785 if (dump_file && (dump_flags & TDF_DETAILS))
2786 {
2787 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2788 unsigned i;
2789 std::pair<ssa_op_iter, use_operand_p> *x;
2790 FOR_EACH_VEC_ELT (path, i, x)
2791 {
2792 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2793 dump_printf (MSG_NOTE, " ");
2794 }
2795 dump_printf (MSG_NOTE, "\n");
2796 }
2797
2798 /* Check whether the reduction path detected is valid. */
2799 bool fail = path.length () == 0;
2800 bool neg = false;
2801 for (unsigned i = 1; i < path.length (); ++i)
2802 {
2803 gimple *use_stmt = USE_STMT (path[i].second);
2804 tree op = USE_FROM_PTR (path[i].second);
2805 if (! has_single_use (op)
2806 || ! is_gimple_assign (use_stmt))
2807 {
2808 fail = true;
2809 break;
2810 }
2811 if (gimple_assign_rhs_code (use_stmt) != code)
2812 {
2813 if (code == PLUS_EXPR
2814 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2815 {
2816 /* Track whether we negate the reduction value each iteration. */
2817 if (gimple_assign_rhs2 (use_stmt) == op)
2818 neg = ! neg;
2819 }
2820 else
2821 {
2822 fail = true;
2823 break;
2824 }
2825 }
2826 }
2827 return ! fail && ! neg;
2828 }
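/* For instance, for "sum += a[i]" the only statement between the reduction
   PHI and its latch argument is "sum_2 = sum_1 + a[i]", so the path is
   accepted.  A statement on the path whose code differs from CODE or whose
   result has more than one use invalidates the path, except that a
   MINUS_EXPR is tolerated for a PLUS_EXPR reduction as long as the
   accumulator is not negated an odd number of times overall.  */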
2829
2830
2831 /* Function vect_is_simple_reduction
2832
2833 (1) Detect a cross-iteration def-use cycle that represents a simple
2834 reduction computation. We look for the following pattern:
2835
2836 loop_header:
2837 a1 = phi < a0, a2 >
2838 a3 = ...
2839 a2 = operation (a3, a1)
2840
2841 or
2842
2843 a3 = ...
2844 loop_header:
2845 a1 = phi < a0, a2 >
2846 a2 = operation (a3, a1)
2847
2848 such that:
2849 1. operation is commutative and associative and it is safe to
2850 change the order of the computation
2851 2. no uses for a2 in the loop (a2 is used out of the loop)
2852 3. no uses of a1 in the loop besides the reduction operation
2853 4. no uses of a1 outside the loop.
2854
2855 Conditions 1,4 are tested here.
2856 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2857
2858 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2859 nested cycles.
2860
2861 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2862 reductions:
2863
2864 a1 = phi < a0, a2 >
2865 inner loop (def of a3)
2866 a2 = phi < a3 >
2867
2868 (4) Detect condition expressions, i.e.:
2869 for (int i = 0; i < N; i++)
2870 if (a[i] < val)
2871 ret_val = a[i];
2872
2873 */
2874
2875 static gimple *
2876 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2877 bool *double_reduc,
2878 bool need_wrapping_integral_overflow,
2879 enum vect_reduction_type *v_reduc_type)
2880 {
2881 struct loop *loop = (gimple_bb (phi))->loop_father;
2882 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2883 gimple *def_stmt, *phi_use_stmt = NULL;
2884 enum tree_code orig_code, code;
2885 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2886 tree type;
2887 int nloop_uses;
2888 tree name;
2889 imm_use_iterator imm_iter;
2890 use_operand_p use_p;
2891 bool phi_def;
2892
2893 *double_reduc = false;
2894 *v_reduc_type = TREE_CODE_REDUCTION;
2895
2896 tree phi_name = PHI_RESULT (phi);
2897 /* ??? If there are no uses of the PHI result the inner loop reduction
2898 won't be detected as a possible double reduction by vectorizable_reduction
2899 because that tries to walk the PHI arg from the preheader edge, which
2900 can be constant. See PR60382. */
2901 if (has_zero_uses (phi_name))
2902 return NULL;
2903 nloop_uses = 0;
2904 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2905 {
2906 gimple *use_stmt = USE_STMT (use_p);
2907 if (is_gimple_debug (use_stmt))
2908 continue;
2909
2910 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2911 {
2912 if (dump_enabled_p ())
2913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2914 "intermediate value used outside loop.\n");
2915
2916 return NULL;
2917 }
2918
2919 nloop_uses++;
2920 if (nloop_uses > 1)
2921 {
2922 if (dump_enabled_p ())
2923 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2924 "reduction value used in loop.\n");
2925 return NULL;
2926 }
2927
2928 phi_use_stmt = use_stmt;
2929 }
2930
2931 edge latch_e = loop_latch_edge (loop);
2932 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2933 if (TREE_CODE (loop_arg) != SSA_NAME)
2934 {
2935 if (dump_enabled_p ())
2936 {
2937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2938 "reduction: not ssa_name: ");
2939 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2940 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2941 }
2942 return NULL;
2943 }
2944
2945 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2946 if (is_gimple_assign (def_stmt))
2947 {
2948 name = gimple_assign_lhs (def_stmt);
2949 phi_def = false;
2950 }
2951 else if (gimple_code (def_stmt) == GIMPLE_PHI)
2952 {
2953 name = PHI_RESULT (def_stmt);
2954 phi_def = true;
2955 }
2956 else
2957 {
2958 if (dump_enabled_p ())
2959 {
2960 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2961 "reduction: unhandled reduction operation: ");
2962 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2963 }
2964 return NULL;
2965 }
2966
2967 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2968 return NULL;
2969
2970 nloop_uses = 0;
2971 auto_vec<gphi *, 3> lcphis;
2972 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2973 {
2974 gimple *use_stmt = USE_STMT (use_p);
2975 if (is_gimple_debug (use_stmt))
2976 continue;
2977 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2978 nloop_uses++;
2979 else
2980 /* We can have more than one loop-closed PHI. */
2981 lcphis.safe_push (as_a <gphi *> (use_stmt));
2982 if (nloop_uses > 1)
2983 {
2984 if (dump_enabled_p ())
2985 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2986 "reduction used in loop.\n");
2987 return NULL;
2988 }
2989 }
2990
2991 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2992 defined in the inner loop. */
2993 if (phi_def)
2994 {
2995 op1 = PHI_ARG_DEF (def_stmt, 0);
2996
2997 if (gimple_phi_num_args (def_stmt) != 1
2998 || TREE_CODE (op1) != SSA_NAME)
2999 {
3000 if (dump_enabled_p ())
3001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3002 "unsupported phi node definition.\n");
3003
3004 return NULL;
3005 }
3006
3007 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3008 if (gimple_bb (def1)
3009 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3010 && loop->inner
3011 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3012 && is_gimple_assign (def1)
3013 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3014 {
3015 if (dump_enabled_p ())
3016 report_vect_op (MSG_NOTE, def_stmt,
3017 "detected double reduction: ");
3018
3019 *double_reduc = true;
3020 return def_stmt;
3021 }
3022
3023 return NULL;
3024 }
3025
3026 /* If we are vectorizing an inner reduction, it is executed in the
3027 original order only when we are not dealing with a double
3028 reduction. */
3029 bool check_reduction = true;
3030 if (flow_loop_nested_p (vect_loop, loop))
3031 {
3032 gphi *lcphi;
3033 unsigned i;
3034 check_reduction = false;
3035 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3036 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3037 {
3038 gimple *use_stmt = USE_STMT (use_p);
3039 if (is_gimple_debug (use_stmt))
3040 continue;
3041 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3042 check_reduction = true;
3043 }
3044 }
3045
3046 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3047 code = orig_code = gimple_assign_rhs_code (def_stmt);
3048
3049 /* We can handle "res -= x[i]", which is non-associative, by
3050 simply rewriting it into "res += -x[i]". Avoid changing the
3051 gimple instruction for the first simple tests and only do this
3052 if we're allowed to change the code at all. */
3053 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3054 code = PLUS_EXPR;
3055
3056 if (code == COND_EXPR)
3057 {
3058 if (! nested_in_vect_loop)
3059 *v_reduc_type = COND_REDUCTION;
3060
3061 op3 = gimple_assign_rhs1 (def_stmt);
3062 if (COMPARISON_CLASS_P (op3))
3063 {
3064 op4 = TREE_OPERAND (op3, 1);
3065 op3 = TREE_OPERAND (op3, 0);
3066 }
3067 if (op3 == phi_name || op4 == phi_name)
3068 {
3069 if (dump_enabled_p ())
3070 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3071 "reduction: condition depends on previous"
3072 " iteration: ");
3073 return NULL;
3074 }
3075
3076 op1 = gimple_assign_rhs2 (def_stmt);
3077 op2 = gimple_assign_rhs3 (def_stmt);
3078 }
3079 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3080 {
3081 if (dump_enabled_p ())
3082 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3083 "reduction: not commutative/associative: ");
3084 return NULL;
3085 }
3086 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3087 {
3088 op1 = gimple_assign_rhs1 (def_stmt);
3089 op2 = gimple_assign_rhs2 (def_stmt);
3090 }
3091 else
3092 {
3093 if (dump_enabled_p ())
3094 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3095 "reduction: not handled operation: ");
3096 return NULL;
3097 }
3098
3099 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3100 {
3101 if (dump_enabled_p ())
3102 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3103 "reduction: both uses not ssa_names: ");
3104
3105 return NULL;
3106 }
3107
3108 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3109 if ((TREE_CODE (op1) == SSA_NAME
3110 && !types_compatible_p (type,TREE_TYPE (op1)))
3111 || (TREE_CODE (op2) == SSA_NAME
3112 && !types_compatible_p (type, TREE_TYPE (op2)))
3113 || (op3 && TREE_CODE (op3) == SSA_NAME
3114 && !types_compatible_p (type, TREE_TYPE (op3)))
3115 || (op4 && TREE_CODE (op4) == SSA_NAME
3116 && !types_compatible_p (type, TREE_TYPE (op4))))
3117 {
3118 if (dump_enabled_p ())
3119 {
3120 dump_printf_loc (MSG_NOTE, vect_location,
3121 "reduction: multiple types: operation type: ");
3122 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3123 dump_printf (MSG_NOTE, ", operands types: ");
3124 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3125 TREE_TYPE (op1));
3126 dump_printf (MSG_NOTE, ",");
3127 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3128 TREE_TYPE (op2));
3129 if (op3)
3130 {
3131 dump_printf (MSG_NOTE, ",");
3132 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3133 TREE_TYPE (op3));
3134 }
3135
3136 if (op4)
3137 {
3138 dump_printf (MSG_NOTE, ",");
3139 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3140 TREE_TYPE (op4));
3141 }
3142 dump_printf (MSG_NOTE, "\n");
3143 }
3144
3145 return NULL;
3146 }
3147
3148 /* Check whether it's ok to change the order of the computation.
3149 Generally, when vectorizing a reduction we change the order of the
3150 computation. This may change the behavior of the program in some
3151 cases, so we need to check that this is ok. One exception is when
3152 vectorizing an outer-loop: the inner-loop is executed sequentially,
3153 and therefore vectorizing reductions in the inner-loop during
3154 outer-loop vectorization is safe. */
3155 if (check_reduction
3156 && *v_reduc_type == TREE_CODE_REDUCTION
3157 && needs_fold_left_reduction_p (type, code,
3158 need_wrapping_integral_overflow))
3159 *v_reduc_type = FOLD_LEFT_REDUCTION;
3160
3161 /* Reduction is safe. We're dealing with one of the following:
3162 1) integer arithmetic and no trapv
3163 2) floating point arithmetic, and special flags permit this optimization
3164 3) nested cycle (i.e., outer loop vectorization). */
3165 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3166 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3167 if (code != COND_EXPR && !def1_info && !def2_info)
3168 {
3169 if (dump_enabled_p ())
3170 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3171 return NULL;
3172 }
3173
3174 /* Check that one def is the reduction def, defined by PHI,
3175 the other def is either defined in the loop ("vect_internal_def"),
3176 or it's an induction (defined by a loop-header phi-node). */
3177
3178 if (def2_info
3179 && def2_info->stmt == phi
3180 && (code == COND_EXPR
3181 || !def1_info
3182 || vect_valid_reduction_input_p (def1_info)))
3183 {
3184 if (dump_enabled_p ())
3185 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3186 return def_stmt;
3187 }
3188
3189 if (def1_info
3190 && def1_info->stmt == phi
3191 && (code == COND_EXPR
3192 || !def2_info
3193 || vect_valid_reduction_input_p (def2_info)))
3194 {
3195 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3196 {
3197 /* Check if we can swap operands (just for simplicity - so that
3198 the rest of the code can assume that the reduction variable
3199 is always the last (second) argument). */
3200 if (code == COND_EXPR)
3201 {
3202 /* Swap cond_expr by inverting the condition. */
3203 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3204 enum tree_code invert_code = ERROR_MARK;
3205 enum tree_code cond_code = TREE_CODE (cond_expr);
3206
3207 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3208 {
3209 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3210 invert_code = invert_tree_comparison (cond_code, honor_nans);
3211 }
3212 if (invert_code != ERROR_MARK)
3213 {
3214 TREE_SET_CODE (cond_expr, invert_code);
3215 swap_ssa_operands (def_stmt,
3216 gimple_assign_rhs2_ptr (def_stmt),
3217 gimple_assign_rhs3_ptr (def_stmt));
3218 }
3219 else
3220 {
3221 if (dump_enabled_p ())
3222 report_vect_op (MSG_NOTE, def_stmt,
3223 "detected reduction: cannot swap operands "
3224 "for cond_expr");
3225 return NULL;
3226 }
3227 }
3228 else
3229 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3230 gimple_assign_rhs2_ptr (def_stmt));
3231
3232 if (dump_enabled_p ())
3233 report_vect_op (MSG_NOTE, def_stmt,
3234 "detected reduction: need to swap operands: ");
3235
3236 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3237 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3238 }
3239 else
3240 {
3241 if (dump_enabled_p ())
3242 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3243 }
3244
3245 return def_stmt;
3246 }
3247
3248 /* Try to find SLP reduction chain. */
3249 if (! nested_in_vect_loop
3250 && code != COND_EXPR
3251 && orig_code != MINUS_EXPR
3252 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3253 {
3254 if (dump_enabled_p ())
3255 report_vect_op (MSG_NOTE, def_stmt,
3256 "reduction: detected reduction chain: ");
3257
3258 return def_stmt;
3259 }
3260
3261 /* Dissolve any group possibly left half-built by vect_is_slp_reduction. */
3262 gimple *first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3263 while (first)
3264 {
3265 gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3266 REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3267 REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3268 first = next;
3269 }
3270
3271 /* Look for the expression computing loop_arg from loop PHI result. */
3272 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3273 code))
3274 return def_stmt;
3275
3276 if (dump_enabled_p ())
3277 {
3278 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3279 "reduction: unknown pattern: ");
3280 }
3281
3282 return NULL;
3283 }
3284
3285 /* Wrapper around vect_is_simple_reduction, which will modify code
3286 in-place if it enables detection of more reductions. Arguments
3287 as there. */
3288
3289 gimple *
3290 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3291 bool *double_reduc,
3292 bool need_wrapping_integral_overflow)
3293 {
3294 enum vect_reduction_type v_reduc_type;
3295 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3296 need_wrapping_integral_overflow,
3297 &v_reduc_type);
3298 if (def)
3299 {
3300 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3301 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3302 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3303 reduc_def_info = vinfo_for_stmt (def);
3304 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3305 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3306 }
3307 return def;
3308 }
3309
3310 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3311 int
3312 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3313 int *peel_iters_epilogue,
3314 stmt_vector_for_cost *scalar_cost_vec,
3315 stmt_vector_for_cost *prologue_cost_vec,
3316 stmt_vector_for_cost *epilogue_cost_vec)
3317 {
3318 int retval = 0;
3319 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3320
3321 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3322 {
3323 *peel_iters_epilogue = assumed_vf / 2;
3324 if (dump_enabled_p ())
3325 dump_printf_loc (MSG_NOTE, vect_location,
3326 "cost model: epilogue peel iters set to vf/2 "
3327 "because loop iterations are unknown .\n");
3328
3329 /* If the peeled iteration counts are known but the number of scalar
3330 loop iterations is unknown, count a taken branch per peeled loop. */
3331 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3332 NULL, 0, vect_prologue);
3333 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3334 NULL, 0, vect_epilogue);
3335 }
3336 else
3337 {
3338 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3339 peel_iters_prologue = niters < peel_iters_prologue ?
3340 niters : peel_iters_prologue;
3341 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3342 /* If we need to peel for gaps, but no peeling is required, we have to
3343 peel VF iterations. */
3344 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3345 *peel_iters_epilogue = assumed_vf;
3346 }
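/* Worked example (hypothetical numbers): with niters == 23, assumed_vf == 8
   and peel_iters_prologue == 3, the epilogue runs (23 - 3) % 8 == 4
   iterations; if peeling for gaps were required and that remainder were 0,
   a full assumed_vf iterations would be counted for the epilogue instead.  */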
3347
3348 stmt_info_for_cost *si;
3349 int j;
3350 if (peel_iters_prologue)
3351 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3352 {
3353 stmt_vec_info stmt_info
3354 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3355 retval += record_stmt_cost (prologue_cost_vec,
3356 si->count * peel_iters_prologue,
3357 si->kind, stmt_info, si->misalign,
3358 vect_prologue);
3359 }
3360 if (*peel_iters_epilogue)
3361 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3362 {
3363 stmt_vec_info stmt_info
3364 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3365 retval += record_stmt_cost (epilogue_cost_vec,
3366 si->count * *peel_iters_epilogue,
3367 si->kind, stmt_info, si->misalign,
3368 vect_epilogue);
3369 }
3370
3371 return retval;
3372 }
3373
3374 /* Function vect_estimate_min_profitable_iters
3375
3376 Return the number of iterations required for the vector version of the
3377 loop to be profitable relative to the cost of the scalar version of the
3378 loop.
3379
3380 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3381 of iterations for vectorization. -1 value means loop vectorization
3382 is not profitable. This returned value may be used for dynamic
3383 profitability check.
3384
3385 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3386 for static check against estimated number of iterations. */
3387
3388 static void
3389 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3390 int *ret_min_profitable_niters,
3391 int *ret_min_profitable_estimate)
3392 {
3393 int min_profitable_iters;
3394 int min_profitable_estimate;
3395 int peel_iters_prologue;
3396 int peel_iters_epilogue;
3397 unsigned vec_inside_cost = 0;
3398 int vec_outside_cost = 0;
3399 unsigned vec_prologue_cost = 0;
3400 unsigned vec_epilogue_cost = 0;
3401 int scalar_single_iter_cost = 0;
3402 int scalar_outside_cost = 0;
3403 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3404 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3405 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3406
3407 /* Cost model disabled. */
3408 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3409 {
3410 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3411 *ret_min_profitable_niters = 0;
3412 *ret_min_profitable_estimate = 0;
3413 return;
3414 }
3415
3416 /* Requires loop versioning tests to handle misalignment. */
3417 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3418 {
3419 /* FIXME: Make cost depend on complexity of individual check. */
3420 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3421 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3422 vect_prologue);
3423 dump_printf (MSG_NOTE,
3424 "cost model: Adding cost of checks for loop "
3425 "versioning to treat misalignment.\n");
3426 }
3427
3428 /* Requires loop versioning with alias checks. */
3429 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3430 {
3431 /* FIXME: Make cost depend on complexity of individual check. */
3432 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3433 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3434 vect_prologue);
3435 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3436 if (len)
3437 /* Count LEN - 1 ANDs and LEN comparisons. */
3438 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3439 NULL, 0, vect_prologue);
3440 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3441 if (len)
3442 {
3443 /* Count LEN - 1 ANDs and LEN comparisons. */
3444 unsigned int nstmts = len * 2 - 1;
3445 /* +1 for each bias that needs adding. */
3446 for (unsigned int i = 0; i < len; ++i)
3447 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3448 nstmts += 1;
3449 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3450 NULL, 0, vect_prologue);
3451 }
3452 dump_printf (MSG_NOTE,
3453 "cost model: Adding cost of checks for loop "
3454 "versioning aliasing.\n");
3455 }
3456
3457 /* Requires loop versioning with niter checks. */
3458 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3459 {
3460 /* FIXME: Make cost depend on complexity of individual check. */
3461 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3462 vect_prologue);
3463 dump_printf (MSG_NOTE,
3464 "cost model: Adding cost of checks for loop "
3465 "versioning niters.\n");
3466 }
3467
3468 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3469 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3470 vect_prologue);
3471
3472 /* Count statements in scalar loop. Using this as scalar cost for a single
3473 iteration for now.
3474
3475 TODO: Add outer loop support.
3476
3477 TODO: Consider assigning different costs to different scalar
3478 statements. */
3479
3480 scalar_single_iter_cost
3481 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3482
3483 /* Add additional cost for the peeled instructions in prologue and epilogue
3484 loop. (For fully-masked loops there will be no peeling.)
3485
3486 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3487 at compile time, we assume it's vf/2 (the worst case would be vf-1).
3488
3489 TODO: Build an expression that represents peel_iters for prologue and
3490 epilogue to be used in a run-time test. */
3491
3492 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3493 {
3494 peel_iters_prologue = 0;
3495 peel_iters_epilogue = 0;
3496
3497 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3498 {
3499 /* We need to peel exactly one iteration. */
3500 peel_iters_epilogue += 1;
3501 stmt_info_for_cost *si;
3502 int j;
3503 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3504 j, si)
3505 {
3506 struct _stmt_vec_info *stmt_info
3507 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3508 (void) add_stmt_cost (target_cost_data, si->count,
3509 si->kind, stmt_info, si->misalign,
3510 vect_epilogue);
3511 }
3512 }
3513 }
3514 else if (npeel < 0)
3515 {
3516 peel_iters_prologue = assumed_vf / 2;
3517 dump_printf (MSG_NOTE, "cost model: "
3518 "prologue peel iters set to vf/2.\n");
3519
3520 /* If the amount of peeling for alignment is unknown, the loop bound of
3521 the main loop becomes unknown. */
3522 peel_iters_epilogue = assumed_vf / 2;
3523 dump_printf (MSG_NOTE, "cost model: "
3524 "epilogue peel iters set to vf/2 because "
3525 "peeling for alignment is unknown.\n");
3526
3527 /* If peeled iterations are unknown, count a taken branch and a not taken
3528 branch per peeled loop. Even if scalar loop iterations are known,
3529 vector iterations are not known since peeled prologue iterations are
3530 not known. Hence guards remain the same. */
3531 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3532 NULL, 0, vect_prologue);
3533 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3534 NULL, 0, vect_prologue);
3535 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3536 NULL, 0, vect_epilogue);
3537 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3538 NULL, 0, vect_epilogue);
3539 stmt_info_for_cost *si;
3540 int j;
3541 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3542 {
3543 struct _stmt_vec_info *stmt_info
3544 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3545 (void) add_stmt_cost (target_cost_data,
3546 si->count * peel_iters_prologue,
3547 si->kind, stmt_info, si->misalign,
3548 vect_prologue);
3549 (void) add_stmt_cost (target_cost_data,
3550 si->count * peel_iters_epilogue,
3551 si->kind, stmt_info, si->misalign,
3552 vect_epilogue);
3553 }
3554 }
3555 else
3556 {
3557 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3558 stmt_info_for_cost *si;
3559 int j;
3560 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3561
3562 prologue_cost_vec.create (2);
3563 epilogue_cost_vec.create (2);
3564 peel_iters_prologue = npeel;
3565
3566 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3567 &peel_iters_epilogue,
3568 &LOOP_VINFO_SCALAR_ITERATION_COST
3569 (loop_vinfo),
3570 &prologue_cost_vec,
3571 &epilogue_cost_vec);
3572
3573 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3574 {
3575 struct _stmt_vec_info *stmt_info
3576 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3577 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3578 si->misalign, vect_prologue);
3579 }
3580
3581 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3582 {
3583 struct _stmt_vec_info *stmt_info
3584 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3585 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3586 si->misalign, vect_epilogue);
3587 }
3588
3589 prologue_cost_vec.release ();
3590 epilogue_cost_vec.release ();
3591 }
3592
3593 /* FORNOW: The scalar outside cost is incremented in one of the
3594 following ways:
3595
3596 1. The vectorizer checks for alignment and aliasing and generates
3597 a condition that allows dynamic vectorization. A cost model
3598 check is ANDED with the versioning condition. Hence scalar code
3599 path now has the added cost of the versioning check.
3600
3601 if (cost > th & versioning_check)
3602 jmp to vector code
3603
3604 Hence run-time scalar is incremented by not-taken branch cost.
3605
3606 2. The vectorizer then checks if a prologue is required. If the
3607 cost model check was not done before during versioning, it has to
3608 be done before the prologue check.
3609
3610 if (cost <= th)
3611 prologue = scalar_iters
3612 if (prologue == 0)
3613 jmp to vector code
3614 else
3615 execute prologue
3616 if (prologue == num_iters)
3617 go to exit
3618
3619 Hence the run-time scalar cost is incremented by a taken branch,
3620 plus a not-taken branch, plus a taken branch cost.
3621
3622 3. The vectorizer then checks if an epilogue is required. If the
3623 cost model check was not done before during prologue check, it
3624 has to be done with the epilogue check.
3625
3626 if (prologue == 0)
3627 jmp to vector code
3628 else
3629 execute prologue
3630 if (prologue == num_iters)
3631 go to exit
3632 vector code:
3633 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3634 jmp to epilogue
3635
3636 Hence the run-time scalar cost should be incremented by 2 taken
3637 branches.
3638
3639 TODO: The back end may reorder the BBs differently and reverse
3640 conditions/branch directions. Change the estimates below to
3641 something more reasonable. */
3642
3643 /* If the number of iterations is known and we do not do versioning, we can
3644 decide whether to vectorize at compile time. Hence the scalar version
3645 does not carry cost model guard costs. */
3646 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3647 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3648 {
3649 /* Cost model check occurs at versioning. */
3650 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3651 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3652 else
3653 {
3654 /* Cost model check occurs at prologue generation. */
3655 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3656 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3657 + vect_get_stmt_cost (cond_branch_not_taken);
3658 /* Cost model check occurs at epilogue generation. */
3659 else
3660 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3661 }
3662 }
3663
3664 /* Complete the target-specific cost calculations. */
3665 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3666 &vec_inside_cost, &vec_epilogue_cost);
3667
3668 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3669
3670 if (dump_enabled_p ())
3671 {
3672 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3673 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3674 vec_inside_cost);
3675 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3676 vec_prologue_cost);
3677 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3678 vec_epilogue_cost);
3679 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3680 scalar_single_iter_cost);
3681 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3682 scalar_outside_cost);
3683 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3684 vec_outside_cost);
3685 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3686 peel_iters_prologue);
3687 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3688 peel_iters_epilogue);
3689 }
3690
3691 /* Calculate number of iterations required to make the vector version
3692 profitable, relative to the loop bodies only. The following condition
3693 must hold true:
3694 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3695 where
3696 SIC = scalar iteration cost, VIC = vector iteration cost,
3697 VOC = vector outside cost, VF = vectorization factor,
3698 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3699 SOC = scalar outside cost for run time cost model check. */
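  /* For example (purely illustrative numbers): with SIC = 4, VIC = 6,
     VOC = 20, SOC = 4, VF = 4 and no peeling, the computation below gives
       ((20 - 4) * 4 - 0 - 0) / (4 * 4 - 6) = 64 / 10 = 6
     and, since 4 * 4 * 6 = 96 <= 6 * 6 + (20 - 4) * 4 = 100, rounds this
     up to 7 iterations.  */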
3700
3701 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3702 {
3703 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3704 * assumed_vf
3705 - vec_inside_cost * peel_iters_prologue
3706 - vec_inside_cost * peel_iters_epilogue);
3707 if (min_profitable_iters <= 0)
3708 min_profitable_iters = 0;
3709 else
3710 {
3711 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3712 - vec_inside_cost);
3713
3714 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3715 <= (((int) vec_inside_cost * min_profitable_iters)
3716 + (((int) vec_outside_cost - scalar_outside_cost)
3717 * assumed_vf)))
3718 min_profitable_iters++;
3719 }
3720 }
3721 /* The vector version will never be profitable. */
3722 else
3723 {
3724 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3725 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3726 "vectorization did not happen for a simd loop");
3727
3728 if (dump_enabled_p ())
3729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3730 "cost model: the vector iteration cost = %d "
3731 "divided by the scalar iteration cost = %d "
3732 "is greater or equal to the vectorization factor = %d"
3733 ".\n",
3734 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3735 *ret_min_profitable_niters = -1;
3736 *ret_min_profitable_estimate = -1;
3737 return;
3738 }
3739
3740 dump_printf (MSG_NOTE,
3741 " Calculated minimum iters for profitability: %d\n",
3742 min_profitable_iters);
3743
3744 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3745 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3746 /* We want the vectorized loop to execute at least once. */
3747 min_profitable_iters = assumed_vf + peel_iters_prologue;
3748
3749 if (dump_enabled_p ())
3750 dump_printf_loc (MSG_NOTE, vect_location,
3751 " Runtime profitability threshold = %d\n",
3752 min_profitable_iters);
3753
3754 *ret_min_profitable_niters = min_profitable_iters;
3755
3756 /* Calculate number of iterations required to make the vector version
3757 profitable, relative to the loop bodies only.
3758
3759 Non-vectorized variant is SIC * niters and it must win over vector
3760 variant on the expected loop trip count. The following condition must hold true:
3761 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
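  /* Unlike the runtime threshold above, SOC is charged to the vector side
     here: the scalar variant is costed as just SIC * niters, with no
     cost-model guard.  With the same illustrative numbers as above
     (SIC = 4, VIC = 6, VOC = 20, SOC = 4, VF = 4, no peeling) this gives
       ((20 + 4) * 4) / (4 * 4 - 6) = 96 / 10 = 9
     before taking the maximum with the runtime threshold below.  */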
3762
3763 if (vec_outside_cost <= 0)
3764 min_profitable_estimate = 0;
3765 else
3766 {
3767 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3768 * assumed_vf
3769 - vec_inside_cost * peel_iters_prologue
3770 - vec_inside_cost * peel_iters_epilogue)
3771 / ((scalar_single_iter_cost * assumed_vf)
3772 - vec_inside_cost);
3773 }
3774 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3775 if (dump_enabled_p ())
3776 dump_printf_loc (MSG_NOTE, vect_location,
3777 " Static estimate profitability threshold = %d\n",
3778 min_profitable_estimate);
3779
3780 *ret_min_profitable_estimate = min_profitable_estimate;
3781 }
3782
3783 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3784 vector elements (not bits) for a vector with NELT elements. */
3785 static void
3786 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3787 vec_perm_builder *sel)
3788 {
3789 /* The encoding is a single stepped pattern. Any wrap-around is handled
3790 by vec_perm_indices. */
3791 sel->new_vector (nelt, 1, 3);
3792 for (unsigned int i = 0; i < 3; i++)
3793 sel->quick_push (i + offset);
3794 }
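/* Example for calc_vec_perm_mask_for_shift (illustrative): OFFSET = 2 and
   NELT = 8 push the series {2, 3, 4}; vec_perm_indices extends this single
   stepped pattern to {2, 3, 4, 5, 6, 7, 8, 9}, where indices 8 and 9 select
   elements of the second vector of a two-input permutation.  */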
3795
3796 /* Checks whether the target supports whole-vector shifts for vectors of mode
3797 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3798 it supports vec_perm_const with masks for all necessary shift amounts. */
3799 static bool
3800 have_whole_vector_shift (machine_mode mode)
3801 {
3802 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3803 return true;
3804
3805 /* Variable-length vectors should be handled via the optab. */
3806 unsigned int nelt;
3807 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3808 return false;
3809
3810 vec_perm_builder sel;
3811 vec_perm_indices indices;
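  /* Check a mask for each shift amount that can be needed, i.e. shifts by
     nelt/2, nelt/4, ..., 1 elements.  */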
3812 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3813 {
3814 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3815 indices.new_vector (sel, 2, nelt);
3816 if (!can_vec_perm_const_p (mode, indices, false))
3817 return false;
3818 }
3819 return true;
3820 }
3821
3822 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3823 functions. Design better to avoid maintenance issues. */
3824
3825 /* Function vect_model_reduction_cost.
3826
3827 Models cost for a reduction operation, including the vector ops
3828 generated within the strip-mine loop, the initial definition before
3829 the loop, and the epilogue code that must be generated. */
3830
3831 static void
3832 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3833 int ncopies, stmt_vector_for_cost *cost_vec)
3834 {
3835 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3836 enum tree_code code;
3837 optab optab;
3838 tree vectype;
3839 gimple *orig_stmt;
3840 machine_mode mode;
3841 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3842 struct loop *loop = NULL;
3843
3844 if (loop_vinfo)
3845 loop = LOOP_VINFO_LOOP (loop_vinfo);
3846
3847 /* Condition reductions generate two reductions in the loop. */
3848 vect_reduction_type reduction_type
3849 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3850 if (reduction_type == COND_REDUCTION)
3851 ncopies *= 2;
3852
3853 vectype = STMT_VINFO_VECTYPE (stmt_info);
3854 mode = TYPE_MODE (vectype);
3855 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3856
3857 if (!orig_stmt)
3858 orig_stmt = STMT_VINFO_STMT (stmt_info);
3859
3860 code = gimple_assign_rhs_code (orig_stmt);
3861
3862 if (reduction_type == EXTRACT_LAST_REDUCTION
3863 || reduction_type == FOLD_LEFT_REDUCTION)
3864 {
3865 /* No extra instructions needed in the prologue. */
3866 prologue_cost = 0;
3867
3868 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3869 /* Count one reduction-like operation per vector. */
3870 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3871 stmt_info, 0, vect_body);
3872 else
3873 {
3874 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3875 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3876 inside_cost = record_stmt_cost (cost_vec, nelements,
3877 vec_to_scalar, stmt_info, 0,
3878 vect_body);
3879 inside_cost += record_stmt_cost (cost_vec, nelements,
3880 scalar_stmt, stmt_info, 0,
3881 vect_body);
3882 }
3883 }
3884 else
3885 {
3886 /* Add in cost for initial definition.
3887 For cond reduction we have four vectors: initial index, step,
3888 initial result of the data reduction, initial value of the index
3889 reduction. */
3890 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3891 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3892 scalar_to_vec, stmt_info, 0,
3893 vect_prologue);
3894
3895 /* Cost of reduction op inside loop. */
3896 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3897 stmt_info, 0, vect_body);
3898 }
3899
3900 /* Determine cost of epilogue code.
3901
3902 We have a reduction operator that will reduce the vector in one statement.
3903 Also requires scalar extract. */
3904
3905 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3906 {
3907 if (reduc_fn != IFN_LAST)
3908 {
3909 if (reduction_type == COND_REDUCTION)
3910 {
3911 /* An EQ stmt and a COND_EXPR stmt. */
3912 epilogue_cost += record_stmt_cost (cost_vec, 2,
3913 vector_stmt, stmt_info, 0,
3914 vect_epilogue);
3915 /* Reduction of the max index and a reduction of the found
3916 values. */
3917 epilogue_cost += record_stmt_cost (cost_vec, 2,
3918 vec_to_scalar, stmt_info, 0,
3919 vect_epilogue);
3920 /* A broadcast of the max value. */
3921 epilogue_cost += record_stmt_cost (cost_vec, 1,
3922 scalar_to_vec, stmt_info, 0,
3923 vect_epilogue);
3924 }
3925 else
3926 {
3927 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3928 stmt_info, 0, vect_epilogue);
3929 epilogue_cost += record_stmt_cost (cost_vec, 1,
3930 vec_to_scalar, stmt_info, 0,
3931 vect_epilogue);
3932 }
3933 }
3934 else if (reduction_type == COND_REDUCTION)
3935 {
3936 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3937 /* Extraction of scalar elements. */
3938 epilogue_cost += record_stmt_cost (cost_vec,
3939 2 * estimated_nunits,
3940 vec_to_scalar, stmt_info, 0,
3941 vect_epilogue);
3942 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3943 epilogue_cost += record_stmt_cost (cost_vec,
3944 2 * estimated_nunits - 3,
3945 scalar_stmt, stmt_info, 0,
3946 vect_epilogue);
3947 }
3948 else if (reduction_type == EXTRACT_LAST_REDUCTION
3949 || reduction_type == FOLD_LEFT_REDUCTION)
3950 /* No extra instructions needed in the epilogue. */
3951 ;
3952 else
3953 {
3954 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3955 tree bitsize =
3956 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3957 int element_bitsize = tree_to_uhwi (bitsize);
3958 int nelements = vec_size_in_bits / element_bitsize;
3959
3960 if (code == COND_EXPR)
3961 code = MAX_EXPR;
3962
3963 optab = optab_for_tree_code (code, vectype, optab_default);
3964
3965 /* We have a whole vector shift available. */
3966 if (optab != unknown_optab
3967 && VECTOR_MODE_P (mode)
3968 && optab_handler (optab, mode) != CODE_FOR_nothing
3969 && have_whole_vector_shift (mode))
3970 {
3971 /* Final reduction via vector shifts and the reduction operator.
3972 Also requires scalar extract. */
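	      /* E.g. (illustrative) for 8 elements: exact_log2 (8) = 3
		 shift/operate pairs, i.e. 6 vector statements, plus the
		 single extract costed below.  */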
3973 epilogue_cost += record_stmt_cost (cost_vec,
3974 exact_log2 (nelements) * 2,
3975 vector_stmt, stmt_info, 0,
3976 vect_epilogue);
3977 epilogue_cost += record_stmt_cost (cost_vec, 1,
3978 vec_to_scalar, stmt_info, 0,
3979 vect_epilogue);
3980 }
3981 else
3982 /* Use extracts and reduction op for final reduction. For N
3983 elements, we have N extracts and N-1 reduction ops. */
3984 epilogue_cost += record_stmt_cost (cost_vec,
3985 nelements + nelements - 1,
3986 vector_stmt, stmt_info, 0,
3987 vect_epilogue);
3988 }
3989 }
3990
3991 if (dump_enabled_p ())
3992 dump_printf (MSG_NOTE,
3993 "vect_model_reduction_cost: inside_cost = %d, "
3994 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3995 prologue_cost, epilogue_cost);
3996 }
3997
3998
3999 /* Function vect_model_induction_cost.
4000
4001 Models cost for induction operations. */
4002
4003 static void
4004 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4005 stmt_vector_for_cost *cost_vec)
4006 {
4007 unsigned inside_cost, prologue_cost;
4008
4009 if (PURE_SLP_STMT (stmt_info))
4010 return;
4011
4012 /* loop cost for vec_loop. */
4013 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4014 stmt_info, 0, vect_body);
4015
4016 /* prologue cost for vec_init and vec_step. */
4017 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4018 stmt_info, 0, vect_prologue);
4019
4020 if (dump_enabled_p ())
4021 dump_printf_loc (MSG_NOTE, vect_location,
4022 "vect_model_induction_cost: inside_cost = %d, "
4023 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4024 }
4025
4026
4027
4028 /* Function get_initial_def_for_reduction
4029
4030 Input:
4031 STMT - a stmt that performs a reduction operation in the loop.
4032 INIT_VAL - the initial value of the reduction variable
4033
4034 Output:
4035 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4036 of the reduction (used for adjusting the epilog - see below).
4037 Return a vector variable, initialized according to the operation that STMT
4038 performs. This vector will be used as the initial value of the
4039 vector of partial results.
4040
4041 Option1 (adjust in epilog): Initialize the vector as follows:
4042 add/bit or/xor: [0,0,...,0,0]
4043 mult/bit and: [1,1,...,1,1]
4044 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4045 and when necessary (e.g. add/mult case) let the caller know
4046 that it needs to adjust the result by init_val.
4047
4048 Option2: Initialize the vector as follows:
4049 add/bit or/xor: [init_val,0,0,...,0]
4050 mult/bit and: [init_val,1,1,...,1]
4051 min/max/cond_expr: [init_val,init_val,...,init_val]
4052 and no adjustments are needed.
4053
4054 For example, for the following code:
4055
4056 s = init_val;
4057 for (i=0;i<n;i++)
4058 s = s + a[i];
4059
4060 STMT is 's = s + a[i]', and the reduction variable is 's'.
4061 For a vector of 4 units, we want to return either [0,0,0,init_val],
4062 or [0,0,0,0] and let the caller know that it needs to adjust
4063 the result at the end by 'init_val'.
4064
4065 FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4066 initialization vector is simpler (same element in all entries), if
4067 ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4068
4069 A cost model should help decide between these two schemes. */
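/* For instance (illustrative), for a MULT_EXPR reduction with
   init_val = 5 and four elements per vector: Option1 builds {1,1,1,1}
   and returns 5 in ADJUSTMENT_DEF for the caller to fold in after the
   loop, while Option2 builds {5,1,1,1} and needs no adjustment.  */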
4070
4071 tree
4072 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4073 tree *adjustment_def)
4074 {
4075 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4076 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4077 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4078 tree scalar_type = TREE_TYPE (init_val);
4079 tree vectype = get_vectype_for_scalar_type (scalar_type);
4080 enum tree_code code = gimple_assign_rhs_code (stmt);
4081 tree def_for_init;
4082 tree init_def;
4083 REAL_VALUE_TYPE real_init_val = dconst0;
4084 int int_init_val = 0;
4085 gimple_seq stmts = NULL;
4086
4087 gcc_assert (vectype);
4088
4089 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4090 || SCALAR_FLOAT_TYPE_P (scalar_type));
4091
4092 gcc_assert (nested_in_vect_loop_p (loop, stmt)
4093 || loop == (gimple_bb (stmt))->loop_father);
4094
4095 vect_reduction_type reduction_type
4096 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4097
4098 switch (code)
4099 {
4100 case WIDEN_SUM_EXPR:
4101 case DOT_PROD_EXPR:
4102 case SAD_EXPR:
4103 case PLUS_EXPR:
4104 case MINUS_EXPR:
4105 case BIT_IOR_EXPR:
4106 case BIT_XOR_EXPR:
4107 case MULT_EXPR:
4108 case BIT_AND_EXPR:
4109 {
4110 /* ADJUSTMENT_DEF is NULL when called from
4111 vect_create_epilog_for_reduction to vectorize double reduction. */
4112 if (adjustment_def)
4113 *adjustment_def = init_val;
4114
4115 if (code == MULT_EXPR)
4116 {
4117 real_init_val = dconst1;
4118 int_init_val = 1;
4119 }
4120
4121 if (code == BIT_AND_EXPR)
4122 int_init_val = -1;
4123
4124 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4125 def_for_init = build_real (scalar_type, real_init_val);
4126 else
4127 def_for_init = build_int_cst (scalar_type, int_init_val);
4128
4129 if (adjustment_def)
4130 /* Option1: the first element is '0' or '1' as well. */
4131 init_def = gimple_build_vector_from_val (&stmts, vectype,
4132 def_for_init);
4133 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4134 {
4135 /* Option2 (variable length): the first element is INIT_VAL. */
4136 init_def = gimple_build_vector_from_val (&stmts, vectype,
4137 def_for_init);
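	  /* Insert INIT_VAL into the splat of DEF_FOR_INIT so that, as
	     noted above, the first element is INIT_VAL and the remaining
	     elements stay DEF_FOR_INIT.  */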
4138 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4139 vectype, init_def, init_val);
4140 }
4141 else
4142 {
4143 /* Option2: the first element is INIT_VAL. */
4144 tree_vector_builder elts (vectype, 1, 2);
4145 elts.quick_push (init_val);
4146 elts.quick_push (def_for_init);
4147 init_def = gimple_build_vector (&stmts, &elts);
4148 }
4149 }
4150 break;
4151
4152 case MIN_EXPR:
4153 case MAX_EXPR:
4154 case COND_EXPR:
4155 {
4156 if (adjustment_def)
4157 {
4158 *adjustment_def = NULL_TREE;
4159 if (reduction_type != COND_REDUCTION
4160 && reduction_type != EXTRACT_LAST_REDUCTION)
4161 {
4162 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4163 break;
4164 }
4165 }
4166 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4167 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4168 }
4169 break;
4170
4171 default:
4172 gcc_unreachable ();
4173 }
4174
4175 if (stmts)
4176 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4177 return init_def;
4178 }
4179
4180 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4181 NUMBER_OF_VECTORS is the number of vector defs to create.
4182 If NEUTRAL_OP is nonnull, introducing extra elements of that
4183 value will not change the result. */
4184
4185 static void
4186 get_initial_defs_for_reduction (slp_tree slp_node,
4187 vec<tree> *vec_oprnds,
4188 unsigned int number_of_vectors,
4189 bool reduc_chain, tree neutral_op)
4190 {
4191 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4192 gimple *stmt = stmts[0];
4193 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4194 unsigned HOST_WIDE_INT nunits;
4195 unsigned j, number_of_places_left_in_vector;
4196 tree vector_type;
4197 tree vop;
4198 int group_size = stmts.length ();
4199 unsigned int vec_num, i;
4200 unsigned number_of_copies = 1;
4201 vec<tree> voprnds;
4202 voprnds.create (number_of_vectors);
4203 struct loop *loop;
4204 auto_vec<tree, 16> permute_results;
4205
4206 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4207
4208 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4209
4210 loop = (gimple_bb (stmt))->loop_father;
4211 gcc_assert (loop);
4212 edge pe = loop_preheader_edge (loop);
4213
4214 gcc_assert (!reduc_chain || neutral_op);
4215
4216 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4217 created vectors. It is greater than 1 if unrolling is performed.
4218
4219 For example, we have two scalar operands, s1 and s2 (e.g., group of
4220 strided accesses of size two), while NUNITS is four (i.e., four scalars
4221 of this type can be packed in a vector). The output vector will contain
4222 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4223 will be 2).
4224
4225 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4226 vectors containing the operands.
4227
4228 For example, NUNITS is four as before, and the group size is 8
4229 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4230 {s5, s6, s7, s8}. */
4231
4232 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4233 nunits = group_size;
4234
4235 number_of_copies = nunits * number_of_vectors / group_size;
4236
4237 number_of_places_left_in_vector = nunits;
4238 bool constant_p = true;
4239 tree_vector_builder elts (vector_type, nunits, 1);
4240 elts.quick_grow (nunits);
4241 for (j = 0; j < number_of_copies; j++)
4242 {
4243 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4244 {
4245 tree op;
4246 /* Get the def before the loop. In a reduction chain we have only
4247 one initial value. */
4248 if ((j != (number_of_copies - 1)
4249 || (reduc_chain && i != 0))
4250 && neutral_op)
4251 op = neutral_op;
4252 else
4253 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4254
4255 /* Create 'vect_ = {op0,op1,...,opn}'. */
4256 number_of_places_left_in_vector--;
4257 elts[number_of_places_left_in_vector] = op;
4258 if (!CONSTANT_CLASS_P (op))
4259 constant_p = false;
4260
4261 if (number_of_places_left_in_vector == 0)
4262 {
4263 gimple_seq ctor_seq = NULL;
4264 tree init;
4265 if (constant_p && !neutral_op
4266 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4267 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4268 /* Build the vector directly from ELTS. */
4269 init = gimple_build_vector (&ctor_seq, &elts);
4270 else if (neutral_op)
4271 {
4272 /* Build a vector of the neutral value and shift the
4273 other elements into place. */
4274 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4275 neutral_op);
4276 int k = nunits;
4277 while (k > 0 && elts[k - 1] == neutral_op)
4278 k -= 1;
4279 while (k > 0)
4280 {
4281 k -= 1;
4282 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4283 vector_type, init, elts[k]);
4284 }
4285 }
4286 else
4287 {
4288 /* First time round, duplicate ELTS to fill the
4289 required number of vectors, then cherry pick the
4290 appropriate result for each iteration. */
4291 if (vec_oprnds->is_empty ())
4292 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4293 number_of_vectors,
4294 permute_results);
4295 init = permute_results[number_of_vectors - j - 1];
4296 }
4297 if (ctor_seq != NULL)
4298 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4299 voprnds.quick_push (init);
4300
4301 number_of_places_left_in_vector = nunits;
4302 elts.new_vector (vector_type, nunits, 1);
4303 elts.quick_grow (nunits);
4304 constant_p = true;
4305 }
4306 }
4307 }
4308
4309 /* Since the vectors are created in the reverse order, we should invert
4310 them. */
4311 vec_num = voprnds.length ();
4312 for (j = vec_num; j != 0; j--)
4313 {
4314 vop = voprnds[j - 1];
4315 vec_oprnds->quick_push (vop);
4316 }
4317
4318 voprnds.release ();
4319
4320 /* In case that VF is greater than the unrolling factor needed for the SLP
4321 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4322 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4323 to replicate the vectors. */
4324 tree neutral_vec = NULL;
4325 while (number_of_vectors > vec_oprnds->length ())
4326 {
4327 if (neutral_op)
4328 {
4329 if (!neutral_vec)
4330 {
4331 gimple_seq ctor_seq = NULL;
4332 neutral_vec = gimple_build_vector_from_val
4333 (&ctor_seq, vector_type, neutral_op);
4334 if (ctor_seq != NULL)
4335 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4336 }
4337 vec_oprnds->quick_push (neutral_vec);
4338 }
4339 else
4340 {
4341 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4342 vec_oprnds->quick_push (vop);
4343 }
4344 }
4345 }
4346
4347
4348 /* Function vect_create_epilog_for_reduction
4349
4350 Create code at the loop-epilog to finalize the result of a reduction
4351 computation.
4352
4353 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4354 reduction statements.
4355 STMT is the scalar reduction stmt that is being vectorized.
4356 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4357 number of elements that we can fit in a vectype (nunits). In this case
4358 we have to generate more than one vector stmt - i.e - we need to "unroll"
4359 the vector stmt by a factor VF/nunits. For more details see documentation
4360 in vectorizable_operation.
4361 REDUC_FN is the internal function for the epilog reduction.
4362 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4363 computation.
4364 REDUC_INDEX is the index of the operand in the right hand side of the
4365 statement that is defined by REDUCTION_PHI.
4366 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4367 SLP_NODE is an SLP node containing a group of reduction statements. The
4368 first one in this group is STMT.
4369 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4370 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4371 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4372 any value of the IV in the loop.
4373 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4374 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4375 null if this is not an SLP reduction
4376
4377 This function:
4378 1. Creates the reduction def-use cycles: sets the arguments for
4379 REDUCTION_PHIS:
4380 The loop-entry argument is the vectorized initial-value of the reduction.
4381 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4382 sums.
4383 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4384 by calling the function specified by REDUC_FN if available, or by
4385 other means (whole-vector shifts or a scalar loop).
4386 The function also creates a new phi node at the loop exit to preserve
4387 loop-closed form, as illustrated below.
4388
4389 The flow at the entry to this function:
4390
4391 loop:
4392 vec_def = phi <null, null> # REDUCTION_PHI
4393 VECT_DEF = vector_stmt # vectorized form of STMT
4394 s_loop = scalar_stmt # (scalar) STMT
4395 loop_exit:
4396 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4397 use <s_out0>
4398 use <s_out0>
4399
4400 The above is transformed by this function into:
4401
4402 loop:
4403 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4404 VECT_DEF = vector_stmt # vectorized form of STMT
4405 s_loop = scalar_stmt # (scalar) STMT
4406 loop_exit:
4407 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4408 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4409 v_out2 = reduce <v_out1>
4410 s_out3 = extract_field <v_out2, 0>
4411 s_out4 = adjust_result <s_out3>
4412 use <s_out4>
4413 use <s_out4>
4414 */
4415
4416 static void
4417 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4418 gimple *reduc_def_stmt,
4419 int ncopies, internal_fn reduc_fn,
4420 vec<gimple *> reduction_phis,
4421 bool double_reduc,
4422 slp_tree slp_node,
4423 slp_instance slp_node_instance,
4424 tree induc_val, enum tree_code induc_code,
4425 tree neutral_op)
4426 {
4427 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4428 stmt_vec_info prev_phi_info;
4429 tree vectype;
4430 machine_mode mode;
4431 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4432 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4433 basic_block exit_bb;
4434 tree scalar_dest;
4435 tree scalar_type;
4436 gimple *new_phi = NULL, *phi;
4437 gimple_stmt_iterator exit_gsi;
4438 tree vec_dest;
4439 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4440 gimple *epilog_stmt = NULL;
4441 enum tree_code code = gimple_assign_rhs_code (stmt);
4442 gimple *exit_phi;
4443 tree bitsize;
4444 tree adjustment_def = NULL;
4445 tree vec_initial_def = NULL;
4446 tree expr, def, initial_def = NULL;
4447 tree orig_name, scalar_result;
4448 imm_use_iterator imm_iter, phi_imm_iter;
4449 use_operand_p use_p, phi_use_p;
4450 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4451 bool nested_in_vect_loop = false;
4452 auto_vec<gimple *> new_phis;
4453 auto_vec<gimple *> inner_phis;
4454 enum vect_def_type dt = vect_unknown_def_type;
4455 int j, i;
4456 auto_vec<tree> scalar_results;
4457 unsigned int group_size = 1, k, ratio;
4458 auto_vec<tree> vec_initial_defs;
4459 auto_vec<gimple *> phis;
4460 bool slp_reduc = false;
4461 bool direct_slp_reduc;
4462 tree new_phi_result;
4463 gimple *inner_phi = NULL;
4464 tree induction_index = NULL_TREE;
4465
4466 if (slp_node)
4467 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4468
4469 if (nested_in_vect_loop_p (loop, stmt))
4470 {
4471 outer_loop = loop;
4472 loop = loop->inner;
4473 nested_in_vect_loop = true;
4474 gcc_assert (!slp_node);
4475 }
4476
4477 vectype = STMT_VINFO_VECTYPE (stmt_info);
4478 gcc_assert (vectype);
4479 mode = TYPE_MODE (vectype);
4480
4481 /* 1. Create the reduction def-use cycle:
4482 Set the arguments of REDUCTION_PHIS, i.e., transform
4483
4484 loop:
4485 vec_def = phi <null, null> # REDUCTION_PHI
4486 VECT_DEF = vector_stmt # vectorized form of STMT
4487 ...
4488
4489 into:
4490
4491 loop:
4492 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4493 VECT_DEF = vector_stmt # vectorized form of STMT
4494 ...
4495
4496 (in case of SLP, do it for all the phis). */
4497
4498 /* Get the loop-entry arguments. */
4499 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4500 if (slp_node)
4501 {
4502 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4503 vec_initial_defs.reserve (vec_num);
4504 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4505 &vec_initial_defs, vec_num,
4506 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4507 neutral_op);
4508 }
4509 else
4510 {
4511 /* Get at the scalar def before the loop, that defines the initial value
4512 of the reduction variable. */
4513 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4514 loop_preheader_edge (loop));
4515 /* Optimize: for REDUC_MAX, if initial_def is smaller than the base
4516 and we can't use zero for induc_val, use initial_def. Similarly
4517 for REDUC_MIN with initial_def larger than the base. */
4518 if (TREE_CODE (initial_def) == INTEGER_CST
4519 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4520 == INTEGER_INDUC_COND_REDUCTION)
4521 && !integer_zerop (induc_val)
4522 && ((induc_code == MAX_EXPR
4523 && tree_int_cst_lt (initial_def, induc_val))
4524 || (induc_code == MIN_EXPR
4525 && tree_int_cst_lt (induc_val, initial_def))))
4526 induc_val = initial_def;
4527
4528 if (double_reduc)
4529 /* In case of double reduction we only create a vector variable
4530 to be put in the reduction phi node. The actual statement
4531 creation is done later in this function. */
4532 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4533 else if (nested_in_vect_loop)
4534 {
4535 /* Do not use an adjustment def as that case is not supported
4536 correctly if ncopies is not one. */
4537 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4538 vec_initial_def = vect_get_vec_def_for_operand (initial_def, stmt);
4539 }
4540 else
4541 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4542 &adjustment_def);
4543 vec_initial_defs.create (1);
4544 vec_initial_defs.quick_push (vec_initial_def);
4545 }
4546
4547 /* Set phi nodes arguments. */
4548 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4549 {
4550 tree vec_init_def = vec_initial_defs[i];
4551 tree def = vect_defs[i];
4552 for (j = 0; j < ncopies; j++)
4553 {
4554 if (j != 0)
4555 {
4556 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4557 if (nested_in_vect_loop)
4558 vec_init_def
4559 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4560 vec_init_def);
4561 }
4562
4563 /* Set the loop-entry arg of the reduction-phi. */
4564
4565 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4566 == INTEGER_INDUC_COND_REDUCTION)
4567 {
4568 /* Initialise the reduction phi to zero. This prevents non-zero
4569 initial values from interfering with the reduction op. */
4570 gcc_assert (ncopies == 1);
4571 gcc_assert (i == 0);
4572
4573 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4574 tree induc_val_vec
4575 = build_vector_from_val (vec_init_def_type, induc_val);
4576
4577 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4578 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4579 }
4580 else
4581 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4582 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4583
4584 /* Set the loop-latch arg for the reduction-phi. */
4585 if (j > 0)
4586 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4587
4588 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4589 UNKNOWN_LOCATION);
4590
4591 if (dump_enabled_p ())
4592 {
4593 dump_printf_loc (MSG_NOTE, vect_location,
4594 "transform reduction: created def-use cycle: ");
4595 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4596 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4597 }
4598 }
4599 }
4600
4601 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4602 which is updated with the current index of the loop for every match of
4603 the original loop's cond_expr (VEC_STMT). This results in a vector
4604 containing, for each lane, the index of the last time the condition passed.
4605 The first match will be a 1 to allow 0 to be used for non-matching
4606 indexes. If there are no matches at all then the vector will be all
4607 zeroes. */
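  /* For example (illustrative): with four lanes and matches only at scalar
     iteration indexes 2 and 7, the final vector is {0, 2, 7, 0}; its
     maximum element identifies the lane holding the last match.  */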
4608 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4609 {
4610 tree indx_before_incr, indx_after_incr;
4611 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4612
4613 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4614 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4615
4616 int scalar_precision
4617 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4618 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4619 tree cr_index_vector_type = build_vector_type
4620 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4621
4622 /* First we create a simple vector induction variable which starts
4623 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4624 vector size (STEP). */
4625
4626 /* Create a {1,2,3,...} vector. */
4627 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4628
4629 /* Create a vector of the step value. */
4630 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4631 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4632
4633 /* Create an induction variable. */
4634 gimple_stmt_iterator incr_gsi;
4635 bool insert_after;
4636 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4637 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4638 insert_after, &indx_before_incr, &indx_after_incr);
4639
4640 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4641 filled with zeros (VEC_ZERO). */
4642
4643 /* Create a vector of 0s. */
4644 tree zero = build_zero_cst (cr_index_scalar_type);
4645 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4646
4647 /* Create a vector phi node. */
4648 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4649 new_phi = create_phi_node (new_phi_tree, loop->header);
4650 loop_vinfo->add_stmt (new_phi);
4651 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4652 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4653
4654 /* Now take the condition from the loop's original cond_expr
4655 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4656 every match uses values from the induction variable
4657 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4658 (NEW_PHI_TREE).
4659 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4660 the new cond_expr (INDEX_COND_EXPR). */
4661
4662 /* Duplicate the condition from vec_stmt. */
4663 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4664
4665 /* Create a conditional, where the condition is taken from vec_stmt
4666 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4667 else is the phi (NEW_PHI_TREE). */
4668 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4669 ccompare, indx_before_incr,
4670 new_phi_tree);
4671 induction_index = make_ssa_name (cr_index_vector_type);
4672 gimple *index_condition = gimple_build_assign (induction_index,
4673 index_cond_expr);
4674 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4675 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4676 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4677
4678 /* Update the phi with the vec cond. */
4679 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4680 loop_latch_edge (loop), UNKNOWN_LOCATION);
4681 }
4682
4683 /* 2. Create epilog code.
4684 The reduction epilog code operates across the elements of the vector
4685 of partial results computed by the vectorized loop.
4686 The reduction epilog code consists of:
4687
4688 step 1: compute the scalar result in a vector (v_out2)
4689 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4690 step 3: adjust the scalar result (s_out3) if needed.
4691
4692 Step 1 can be accomplished using one of the following three schemes:
4693 (scheme 1) using reduc_fn, if available.
4694 (scheme 2) using whole-vector shifts, if available.
4695 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4696 combined.
4697
4698 The overall epilog code looks like this:
4699
4700 s_out0 = phi <s_loop> # original EXIT_PHI
4701 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4702 v_out2 = reduce <v_out1> # step 1
4703 s_out3 = extract_field <v_out2, 0> # step 2
4704 s_out4 = adjust_result <s_out3> # step 3
4705
4706 (step 3 is optional, and steps 1 and 2 may be combined).
4707 Lastly, the uses of s_out0 are replaced by s_out4. */
4708
4709
4710 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4711 v_out1 = phi <VECT_DEF>
4712 Store them in NEW_PHIS. */
4713
4714 exit_bb = single_exit (loop)->dest;
4715 prev_phi_info = NULL;
4716 new_phis.create (vect_defs.length ());
4717 FOR_EACH_VEC_ELT (vect_defs, i, def)
4718 {
4719 for (j = 0; j < ncopies; j++)
4720 {
4721 tree new_def = copy_ssa_name (def);
4722 phi = create_phi_node (new_def, exit_bb);
4723 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4724 if (j == 0)
4725 new_phis.quick_push (phi);
4726 else
4727 {
4728 def = vect_get_vec_def_for_stmt_copy (dt, def);
4729 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4730 }
4731
4732 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4733 prev_phi_info = phi_info;
4734 }
4735 }
4736
4737 /* The epilogue is created for the outer-loop, i.e., for the loop being
4738 vectorized. Create exit phis for the outer loop. */
4739 if (double_reduc)
4740 {
4741 loop = outer_loop;
4742 exit_bb = single_exit (loop)->dest;
4743 inner_phis.create (vect_defs.length ());
4744 FOR_EACH_VEC_ELT (new_phis, i, phi)
4745 {
4746 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4747 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4748 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4749 PHI_RESULT (phi));
4750 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4751 inner_phis.quick_push (phi);
4752 new_phis[i] = outer_phi;
4753 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4754 {
4755 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4756 new_result = copy_ssa_name (PHI_RESULT (phi));
4757 outer_phi = create_phi_node (new_result, exit_bb);
4758 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4759 PHI_RESULT (phi));
4760 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4761 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4762 prev_phi_info = outer_phi_info;
4763 }
4764 }
4765 }
4766
4767 exit_gsi = gsi_after_labels (exit_bb);
4768
4769 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4770 (i.e. when reduc_fn is not available) and in the final adjustment
4771 code (if needed). Also get the original scalar reduction variable as
4772 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4773 represents a reduction pattern), the tree-code and scalar-def are
4774 taken from the original stmt that the pattern-stmt (STMT) replaces.
4775 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4776 are taken from STMT. */
4777
4778 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4779 if (!orig_stmt)
4780 {
4781 /* Regular reduction */
4782 orig_stmt = stmt;
4783 }
4784 else
4785 {
4786 /* Reduction pattern */
4787 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4788 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4789 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4790 }
4791
4792 code = gimple_assign_rhs_code (orig_stmt);
4793 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4794 partial results are added and not subtracted. */
4795 if (code == MINUS_EXPR)
4796 code = PLUS_EXPR;
4797
4798 scalar_dest = gimple_assign_lhs (orig_stmt);
4799 scalar_type = TREE_TYPE (scalar_dest);
4800 scalar_results.create (group_size);
4801 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4802 bitsize = TYPE_SIZE (scalar_type);
4803
4804 /* In case this is a reduction in an inner-loop while vectorizing an outer
4805 loop - we don't need to extract a single scalar result at the end of the
4806 inner-loop (unless it is double reduction, i.e., the use of reduction is
4807 outside the outer-loop). The final vector of partial results will be used
4808 in the vectorized outer-loop, or reduced to a scalar result at the end of
4809 the outer-loop. */
4810 if (nested_in_vect_loop && !double_reduc)
4811 goto vect_finalize_reduction;
4812
4813 /* SLP reduction without reduction chain, e.g.,
4814 # a1 = phi <a2, a0>
4815 # b1 = phi <b2, b0>
4816 a2 = operation (a1)
4817 b2 = operation (b1) */
4818 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4819
4820 /* True if we should implement SLP_REDUC using native reduction operations
4821 instead of scalar operations. */
4822 direct_slp_reduc = (reduc_fn != IFN_LAST
4823 && slp_reduc
4824 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4825
4826 /* In case of reduction chain, e.g.,
4827 # a1 = phi <a3, a0>
4828 a2 = operation (a1)
4829 a3 = operation (a2),
4830
4831 we may end up with more than one vector result. Here we reduce them to
4832 one vector. */
4833 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4834 {
4835 tree first_vect = PHI_RESULT (new_phis[0]);
4836 gassign *new_vec_stmt = NULL;
4837 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4838 for (k = 1; k < new_phis.length (); k++)
4839 {
4840 gimple *next_phi = new_phis[k];
4841 tree second_vect = PHI_RESULT (next_phi);
4842 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4843 new_vec_stmt = gimple_build_assign (tem, code,
4844 first_vect, second_vect);
4845 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4846 first_vect = tem;
4847 }
4848
4849 new_phi_result = first_vect;
4850 if (new_vec_stmt)
4851 {
4852 new_phis.truncate (0);
4853 new_phis.safe_push (new_vec_stmt);
4854 }
4855 }
4856 /* Likewise if we couldn't use a single def-use cycle. */
4857 else if (ncopies > 1)
4858 {
4859 gcc_assert (new_phis.length () == 1);
4860 tree first_vect = PHI_RESULT (new_phis[0]);
4861 gassign *new_vec_stmt = NULL;
4862 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4863 gimple *next_phi = new_phis[0];
4864 for (int k = 1; k < ncopies; ++k)
4865 {
4866 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4867 tree second_vect = PHI_RESULT (next_phi);
4868 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4869 new_vec_stmt = gimple_build_assign (tem, code,
4870 first_vect, second_vect);
4871 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4872 first_vect = tem;
4873 }
4874 new_phi_result = first_vect;
4875 new_phis.truncate (0);
4876 new_phis.safe_push (new_vec_stmt);
4877 }
4878 else
4879 new_phi_result = PHI_RESULT (new_phis[0]);
4880
4881 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4882 && reduc_fn != IFN_LAST)
4883 {
4884 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4885 various data values where the condition matched and another vector
4886 (INDUCTION_INDEX) containing all the indexes of those matches. We
4887 need to extract the last matching index (which will be the index with
4888 highest value) and use this to index into the data vector.
4889 For the case where there were no matches, the data vector will contain
4890 all default values and the index vector will be all zeros. */
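      /* Illustrative example: NEW_PHI_RESULT = {10, 20, 30, 40} and
	 INDUCTION_INDEX = {0, 2, 7, 0}.  The max index is 7, the comparison
	 below selects lane 2, the VEC_COND keeps {0, 0, 30, 0}, and the
	 final unsigned max reduction extracts 30.  */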
4891
4892 /* Get various versions of the type of the vector of indexes. */
4893 tree index_vec_type = TREE_TYPE (induction_index);
4894 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4895 tree index_scalar_type = TREE_TYPE (index_vec_type);
4896 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4897 (index_vec_type);
4898
4899 /* Get an unsigned integer version of the type of the data vector. */
4900 int scalar_precision
4901 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4902 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4903 tree vectype_unsigned = build_vector_type
4904 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4905
4906 /* First we need to create a vector (ZERO_VEC) of zeros and another
4907 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4908 can create using a MAX reduction and then expanding.
4909 In the case where the loop never made any matches, the max index will
4910 be zero. */
4911
4912 /* Vector of {0, 0, 0,...}. */
4913 tree zero_vec = make_ssa_name (vectype);
4914 tree zero_vec_rhs = build_zero_cst (vectype);
4915 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4916 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4917
4918 /* Find maximum value from the vector of found indexes. */
4919 tree max_index = make_ssa_name (index_scalar_type);
4920 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4921 1, induction_index);
4922 gimple_call_set_lhs (max_index_stmt, max_index);
4923 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4924
4925 /* Vector of {max_index, max_index, max_index,...}. */
4926 tree max_index_vec = make_ssa_name (index_vec_type);
4927 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4928 max_index);
4929 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4930 max_index_vec_rhs);
4931 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4932
4933 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4934 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4935 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4936 otherwise. Only one value should match, resulting in a vector
4937 (VEC_COND) with one data value and the rest zeros.
4938 In the case where the loop never made any matches, every index will
4939 match, resulting in a vector with all data values (which will all be
4940 the default value). */
4941
4942 /* Compare the max index vector to the vector of found indexes to find
4943 the position of the max value. */
4944 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4945 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4946 induction_index,
4947 max_index_vec);
4948 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4949
4950 /* Use the compare to choose either values from the data vector or
4951 zero. */
4952 tree vec_cond = make_ssa_name (vectype);
4953 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4954 vec_compare, new_phi_result,
4955 zero_vec);
4956 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4957
4958 /* Finally we need to extract the data value from the vector (VEC_COND)
4959 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4960 reduction, but because this doesn't exist, we can use a MAX reduction
4961 instead. The data value might be signed or a float so we need to cast
4962 it first.
4963 In the case where the loop never made any matches, the data values are
4964 all identical, and so will reduce down correctly. */
4965
4966 /* Make the matched data values unsigned. */
4967 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4968 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4969 vec_cond);
4970 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4971 VIEW_CONVERT_EXPR,
4972 vec_cond_cast_rhs);
4973 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4974
4975 /* Reduce down to a scalar value. */
4976 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4977 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4978 1, vec_cond_cast);
4979 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4980 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4981
4982 /* Convert the reduced value back to the result type and set as the
4983 result. */
4984 gimple_seq stmts = NULL;
4985 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4986 data_reduc);
4987 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4988 scalar_results.safe_push (new_temp);
4989 }
4990 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4991 && reduc_fn == IFN_LAST)
4992 {
4993 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4994 idx = 0;
4995 idx_val = induction_index[0];
4996 val = data_reduc[0];
4997 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4998 if (induction_index[i] > idx_val)
4999 val = data_reduc[i], idx_val = induction_index[i];
5000 return val; */
5001
5002 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5003 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5004 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5005 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5006 /* Enforced by vectorizable_reduction, which ensures we have target
5007 support before allowing a conditional reduction on variable-length
5008 vectors. */
5009 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5010 tree idx_val = NULL_TREE, val = NULL_TREE;
5011 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5012 {
5013 tree old_idx_val = idx_val;
5014 tree old_val = val;
5015 idx_val = make_ssa_name (idx_eltype);
5016 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5017 build3 (BIT_FIELD_REF, idx_eltype,
5018 induction_index,
5019 bitsize_int (el_size),
5020 bitsize_int (off)));
5021 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5022 val = make_ssa_name (data_eltype);
5023 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5024 build3 (BIT_FIELD_REF,
5025 data_eltype,
5026 new_phi_result,
5027 bitsize_int (el_size),
5028 bitsize_int (off)));
5029 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5030 if (off != 0)
5031 {
5032 tree new_idx_val = idx_val;
5033 tree new_val = val;
5034 if (off != v_size - el_size)
5035 {
5036 new_idx_val = make_ssa_name (idx_eltype);
5037 epilog_stmt = gimple_build_assign (new_idx_val,
5038 MAX_EXPR, idx_val,
5039 old_idx_val);
5040 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5041 }
5042 new_val = make_ssa_name (data_eltype);
5043 epilog_stmt = gimple_build_assign (new_val,
5044 COND_EXPR,
5045 build2 (GT_EXPR,
5046 boolean_type_node,
5047 idx_val,
5048 old_idx_val),
5049 val, old_val);
5050 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5051 idx_val = new_idx_val;
5052 val = new_val;
5053 }
5054 }
5055 /* Convert the reduced value back to the result type and set as the
5056 result. */
5057 gimple_seq stmts = NULL;
5058 val = gimple_convert (&stmts, scalar_type, val);
5059 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5060 scalar_results.safe_push (val);
5061 }
5062
5063 /* 2.3 Create the reduction code, using one of the three schemes described
5064 above. In SLP we simply need to extract all the elements from the
5065 vector (without reducing them), so we use scalar shifts. */
5066 else if (reduc_fn != IFN_LAST && !slp_reduc)
5067 {
5068 tree tmp;
5069 tree vec_elem_type;
5070
5071 /* Case 1: Create:
5072 v_out2 = reduc_expr <v_out1> */
5073
5074 if (dump_enabled_p ())
5075 dump_printf_loc (MSG_NOTE, vect_location,
5076 "Reduce using direct vector reduction.\n");
5077
5078 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5079 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5080 {
5081 tree tmp_dest
5082 = vect_create_destination_var (scalar_dest, vec_elem_type);
5083 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5084 new_phi_result);
5085 gimple_set_lhs (epilog_stmt, tmp_dest);
5086 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5087 gimple_set_lhs (epilog_stmt, new_temp);
5088 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5089
5090 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5091 new_temp);
5092 }
5093 else
5094 {
5095 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5096 new_phi_result);
5097 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5098 }
5099
5100 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5101 gimple_set_lhs (epilog_stmt, new_temp);
5102 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5103
5104 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5105 == INTEGER_INDUC_COND_REDUCTION)
5106 && !operand_equal_p (initial_def, induc_val, 0))
5107 {
5108 		  /* Earlier we set the initial value to be a vector of induc_val
5109 		     values.  Check the result and if it is induc_val then replace
5110 		     it with the original initial value, unless induc_val is
5111 		     the same as initial_def already. */
5112 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5113 induc_val);
5114
5115 tmp = make_ssa_name (new_scalar_dest);
5116 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5117 initial_def, new_temp);
5118 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5119 new_temp = tmp;
5120 }
5121
5122 scalar_results.safe_push (new_temp);
5123 }
5124 else if (direct_slp_reduc)
5125 {
5126 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5127 with the elements for other SLP statements replaced with the
5128 neutral value. We can then do a normal reduction on each vector. */
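      /* As an illustration (group size and values chosen arbitrarily): for a
	 group of two plus-reductions A and B whose lanes are interleaved as
	 { a0, b0, a1, b1 } in NEW_PHI_RESULT, with neutral value 0, we build
	 { a0, 0, a1, 0 } and { 0, b0, 0, b1 } and reduce each of them with
	 REDUC_FN.  */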
5129
5130 /* Enforced by vectorizable_reduction. */
5131 gcc_assert (new_phis.length () == 1);
5132 gcc_assert (pow2p_hwi (group_size));
5133
5134 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5135 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5136 gimple_seq seq = NULL;
5137
5138 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5139 and the same element size as VECTYPE. */
5140 tree index = build_index_vector (vectype, 0, 1);
5141 tree index_type = TREE_TYPE (index);
5142 tree index_elt_type = TREE_TYPE (index_type);
5143 tree mask_type = build_same_sized_truth_vector_type (index_type);
5144
5145 /* Create a vector that, for each element, identifies which of
5146 the REDUC_GROUP_SIZE results should use it. */
5147 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5148 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5149 build_vector_from_val (index_type, index_mask));
5150
5151 /* Get a neutral vector value. This is simply a splat of the neutral
5152 scalar value if we have one, otherwise the initial scalar value
5153 is itself a neutral value. */
5154 tree vector_identity = NULL_TREE;
5155 if (neutral_op)
5156 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5157 neutral_op);
5158 for (unsigned int i = 0; i < group_size; ++i)
5159 {
5160 	  /* If there's no universal neutral value, we can use the
5161 initial scalar value from the original PHI. This is used
5162 for MIN and MAX reduction, for example. */
5163 if (!neutral_op)
5164 {
5165 tree scalar_value
5166 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5167 loop_preheader_edge (loop));
5168 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5169 scalar_value);
5170 }
5171
5172 /* Calculate the equivalent of:
5173
5174 sel[j] = (index[j] == i);
5175
5176 which selects the elements of NEW_PHI_RESULT that should
5177 be included in the result. */
5178 tree compare_val = build_int_cst (index_elt_type, i);
5179 compare_val = build_vector_from_val (index_type, compare_val);
5180 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5181 index, compare_val);
5182
5183 /* Calculate the equivalent of:
5184
5185 vec = seq ? new_phi_result : vector_identity;
5186
5187 VEC is now suitable for a full vector reduction. */
5188 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5189 sel, new_phi_result, vector_identity);
5190
5191 /* Do the reduction and convert it to the appropriate type. */
5192 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5193 TREE_TYPE (vectype), vec);
5194 scalar = gimple_convert (&seq, scalar_type, scalar);
5195 scalar_results.safe_push (scalar);
5196 }
5197 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5198 }
5199 else
5200 {
5201 bool reduce_with_shift;
5202 tree vec_temp;
5203
5204 /* COND reductions all do the final reduction with MAX_EXPR
5205 or MIN_EXPR. */
5206 if (code == COND_EXPR)
5207 {
5208 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5209 == INTEGER_INDUC_COND_REDUCTION)
5210 code = induc_code;
5211 else
5212 code = MAX_EXPR;
5213 }
5214
5215 /* See if the target wants to do the final (shift) reduction
5216 in a vector mode of smaller size and first reduce upper/lower
5217 halves against each other. */
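      /* As an illustration (modes chosen arbitrarily): with a V8SI accumulator
	 and a target whose split_reduction hook prefers a 128-bit mode, the
	 loop below first extracts the low and high V4SI halves and combines
	 them with CODE; the shift-based or scalar reduction then operates on
	 the resulting V4SI vector.  */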
5218 enum machine_mode mode1 = mode;
5219 tree vectype1 = vectype;
5220 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5221 unsigned sz1 = sz;
5222 if (!slp_reduc
5223 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5224 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5225
5226 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5227 reduce_with_shift = have_whole_vector_shift (mode1);
5228 if (!VECTOR_MODE_P (mode1))
5229 reduce_with_shift = false;
5230 else
5231 {
5232 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5233 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5234 reduce_with_shift = false;
5235 }
5236
5237       /* First reduce the vector to the desired vector size on which we
5238 	 should do the shift reduction, by combining upper and lower halves. */
5239 new_temp = new_phi_result;
5240 while (sz > sz1)
5241 {
5242 gcc_assert (!slp_reduc);
5243 sz /= 2;
5244 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5245
5246 /* The target has to make sure we support lowpart/highpart
5247 extraction, either via direct vector extract or through
5248 an integer mode punning. */
5249 tree dst1, dst2;
5250 if (convert_optab_handler (vec_extract_optab,
5251 TYPE_MODE (TREE_TYPE (new_temp)),
5252 TYPE_MODE (vectype1))
5253 != CODE_FOR_nothing)
5254 {
5255 /* Extract sub-vectors directly once vec_extract becomes
5256 a conversion optab. */
5257 dst1 = make_ssa_name (vectype1);
5258 epilog_stmt
5259 = gimple_build_assign (dst1, BIT_FIELD_REF,
5260 build3 (BIT_FIELD_REF, vectype1,
5261 new_temp, TYPE_SIZE (vectype1),
5262 bitsize_int (0)));
5263 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5264 dst2 = make_ssa_name (vectype1);
5265 epilog_stmt
5266 = gimple_build_assign (dst2, BIT_FIELD_REF,
5267 build3 (BIT_FIELD_REF, vectype1,
5268 new_temp, TYPE_SIZE (vectype1),
5269 bitsize_int (sz * BITS_PER_UNIT)));
5270 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5271 }
5272 else
5273 {
5274 /* Extract via punning to appropriately sized integer mode
5275 vector. */
5276 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5277 1);
5278 tree etype = build_vector_type (eltype, 2);
5279 gcc_assert (convert_optab_handler (vec_extract_optab,
5280 TYPE_MODE (etype),
5281 TYPE_MODE (eltype))
5282 != CODE_FOR_nothing);
5283 tree tem = make_ssa_name (etype);
5284 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5285 build1 (VIEW_CONVERT_EXPR,
5286 etype, new_temp));
5287 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5288 new_temp = tem;
5289 tem = make_ssa_name (eltype);
5290 epilog_stmt
5291 = gimple_build_assign (tem, BIT_FIELD_REF,
5292 build3 (BIT_FIELD_REF, eltype,
5293 new_temp, TYPE_SIZE (eltype),
5294 bitsize_int (0)));
5295 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5296 dst1 = make_ssa_name (vectype1);
5297 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5298 build1 (VIEW_CONVERT_EXPR,
5299 vectype1, tem));
5300 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5301 tem = make_ssa_name (eltype);
5302 epilog_stmt
5303 = gimple_build_assign (tem, BIT_FIELD_REF,
5304 build3 (BIT_FIELD_REF, eltype,
5305 new_temp, TYPE_SIZE (eltype),
5306 bitsize_int (sz * BITS_PER_UNIT)));
5307 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5308 dst2 = make_ssa_name (vectype1);
5309 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5310 build1 (VIEW_CONVERT_EXPR,
5311 vectype1, tem));
5312 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5313 }
5314
5315 new_temp = make_ssa_name (vectype1);
5316 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5317 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5318 }
5319
5320 if (reduce_with_shift && !slp_reduc)
5321 {
5322 int element_bitsize = tree_to_uhwi (bitsize);
5323 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5324 for variable-length vectors and also requires direct target support
5325 for loop reductions. */
5326 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5327 int nelements = vec_size_in_bits / element_bitsize;
5328 vec_perm_builder sel;
5329 vec_perm_indices indices;
5330
5331 int elt_offset;
5332
5333 tree zero_vec = build_zero_cst (vectype1);
5334 /* Case 2: Create:
5335 for (offset = nelements/2; offset >= 1; offset/=2)
5336 {
5337 Create: va' = vec_shift <va, offset>
5338 Create: va = vop <va, va'>
5339 } */
5340
5341 tree rhs;
5342
5343 if (dump_enabled_p ())
5344 dump_printf_loc (MSG_NOTE, vect_location,
5345 "Reduce using vector shifts\n");
5346
5347 mode1 = TYPE_MODE (vectype1);
5348 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5349 for (elt_offset = nelements / 2;
5350 elt_offset >= 1;
5351 elt_offset /= 2)
5352 {
5353 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5354 indices.new_vector (sel, 2, nelements);
5355 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5356 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5357 new_temp, zero_vec, mask);
5358 new_name = make_ssa_name (vec_dest, epilog_stmt);
5359 gimple_assign_set_lhs (epilog_stmt, new_name);
5360 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5361
5362 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5363 new_temp);
5364 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5365 gimple_assign_set_lhs (epilog_stmt, new_temp);
5366 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5367 }
5368
5369 /* 2.4 Extract the final scalar result. Create:
5370 s_out3 = extract_field <v_out2, bitpos> */
5371
5372 if (dump_enabled_p ())
5373 dump_printf_loc (MSG_NOTE, vect_location,
5374 "extract scalar result\n");
5375
5376 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5377 bitsize, bitsize_zero_node);
5378 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5379 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5380 gimple_assign_set_lhs (epilog_stmt, new_temp);
5381 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5382 scalar_results.safe_push (new_temp);
5383 }
5384 else
5385 {
5386 /* Case 3: Create:
5387 s = extract_field <v_out2, 0>
5388 for (offset = element_size;
5389 offset < vector_size;
5390 offset += element_size;)
5391 {
5392 Create: s' = extract_field <v_out2, offset>
5393 Create: s = op <s, s'> // For non SLP cases
5394 } */
5395
5396 if (dump_enabled_p ())
5397 dump_printf_loc (MSG_NOTE, vect_location,
5398 "Reduce using scalar code.\n");
5399
5400 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5401 int element_bitsize = tree_to_uhwi (bitsize);
5402 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5403 {
5404 int bit_offset;
5405 if (gimple_code (new_phi) == GIMPLE_PHI)
5406 vec_temp = PHI_RESULT (new_phi);
5407 else
5408 vec_temp = gimple_assign_lhs (new_phi);
5409 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5410 bitsize_zero_node);
5411 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5412 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5413 gimple_assign_set_lhs (epilog_stmt, new_temp);
5414 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5415
5416 /* In SLP we don't need to apply reduction operation, so we just
5417 collect s' values in SCALAR_RESULTS. */
5418 if (slp_reduc)
5419 scalar_results.safe_push (new_temp);
5420
5421 for (bit_offset = element_bitsize;
5422 bit_offset < vec_size_in_bits;
5423 bit_offset += element_bitsize)
5424 {
5425 tree bitpos = bitsize_int (bit_offset);
5426 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5427 bitsize, bitpos);
5428
5429 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5430 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5431 gimple_assign_set_lhs (epilog_stmt, new_name);
5432 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5433
5434 if (slp_reduc)
5435 {
5436 /* In SLP we don't need to apply reduction operation, so
5437 we just collect s' values in SCALAR_RESULTS. */
5438 new_temp = new_name;
5439 scalar_results.safe_push (new_name);
5440 }
5441 else
5442 {
5443 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5444 new_name, new_temp);
5445 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5446 gimple_assign_set_lhs (epilog_stmt, new_temp);
5447 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5448 }
5449 }
5450 }
5451
5452 	  /* The only case where we need to reduce scalar results in SLP is
5453 	     unrolling. If the size of SCALAR_RESULTS is greater than
5454 	     REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5455 	     REDUC_GROUP_SIZE. */
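	  /* As an illustration (sizes chosen arbitrarily): with
	     REDUC_GROUP_SIZE == 2 and four scalar results s0, s1, s2, s3 from
	     an unrolled SLP instance, the loop below computes
	     scalar_results[0] = s0 CODE s2 and scalar_results[1] = s1 CODE s3.  */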
5456 if (slp_reduc)
5457 {
5458 tree res, first_res, new_res;
5459 gimple *new_stmt;
5460
5461 /* Reduce multiple scalar results in case of SLP unrolling. */
5462 for (j = group_size; scalar_results.iterate (j, &res);
5463 j++)
5464 {
5465 first_res = scalar_results[j % group_size];
5466 new_stmt = gimple_build_assign (new_scalar_dest, code,
5467 first_res, res);
5468 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5469 gimple_assign_set_lhs (new_stmt, new_res);
5470 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5471 scalar_results[j % group_size] = new_res;
5472 }
5473 }
5474 else
5475 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5476 scalar_results.safe_push (new_temp);
5477 }
5478
5479 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5480 == INTEGER_INDUC_COND_REDUCTION)
5481 && !operand_equal_p (initial_def, induc_val, 0))
5482 {
5483       /* Earlier we set the initial value to be a vector of induc_val
5484 	 values. Check the result and if it is induc_val then replace
5485 	 it with the original initial value, unless induc_val is
5486 	 the same as initial_def already. */
5487 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5488 induc_val);
5489
5490 tree tmp = make_ssa_name (new_scalar_dest);
5491 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5492 initial_def, new_temp);
5493 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5494 scalar_results[0] = tmp;
5495 }
5496 }
5497
5498 vect_finalize_reduction:
5499
5500 if (double_reduc)
5501 loop = loop->inner;
5502
5503 /* 2.5 Adjust the final result by the initial value of the reduction
5504 variable. (When such adjustment is not needed, then
5505 'adjustment_def' is zero). For example, if code is PLUS we create:
5506 new_temp = loop_exit_def + adjustment_def */
5507
5508 if (adjustment_def)
5509 {
5510 gcc_assert (!slp_reduc);
5511 if (nested_in_vect_loop)
5512 {
5513 new_phi = new_phis[0];
5514 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5515 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5516 new_dest = vect_create_destination_var (scalar_dest, vectype);
5517 }
5518 else
5519 {
5520 new_temp = scalar_results[0];
5521 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5522 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5523 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5524 }
5525
5526 epilog_stmt = gimple_build_assign (new_dest, expr);
5527 new_temp = make_ssa_name (new_dest, epilog_stmt);
5528 gimple_assign_set_lhs (epilog_stmt, new_temp);
5529 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5530 if (nested_in_vect_loop)
5531 {
5532 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5533 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5534 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5535
5536 if (!double_reduc)
5537 scalar_results.quick_push (new_temp);
5538 else
5539 scalar_results[0] = new_temp;
5540 }
5541 else
5542 scalar_results[0] = new_temp;
5543
5544 new_phis[0] = epilog_stmt;
5545 }
5546
5547 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5548 phis with new adjusted scalar results, i.e., replace use <s_out0>
5549 with use <s_out4>.
5550
5551 Transform:
5552 loop_exit:
5553 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5554 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5555 v_out2 = reduce <v_out1>
5556 s_out3 = extract_field <v_out2, 0>
5557 s_out4 = adjust_result <s_out3>
5558 use <s_out0>
5559 use <s_out0>
5560
5561 into:
5562
5563 loop_exit:
5564 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5565 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5566 v_out2 = reduce <v_out1>
5567 s_out3 = extract_field <v_out2, 0>
5568 s_out4 = adjust_result <s_out3>
5569 use <s_out4>
5570 use <s_out4> */
5571
5572
5573   /* In an SLP reduction chain we reduce vector results into one vector if
5574      necessary; hence we set REDUC_GROUP_SIZE to 1 here.  SCALAR_DEST is the
5575      LHS of the last stmt in the reduction chain, since we are looking for
5576      the loop exit phi node. */
5577 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5578 {
5579 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5580 /* Handle reduction patterns. */
5581 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5582 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5583
5584 scalar_dest = gimple_assign_lhs (dest_stmt);
5585 group_size = 1;
5586 }
5587
5588 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5589 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5590 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5591 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5592 correspond to the first vector stmt, etc.
5593 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
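  /* As an illustration (sizes chosen arbitrarily): with REDUC_GROUP_SIZE == 4
     and two vector stmts in NEW_PHIS, RATIO is 2, so scalar results 0 and 1
     are matched with the first vector stmt and reduction phi, and scalar
     results 2 and 3 with the second.  */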
5594 if (group_size > new_phis.length ())
5595 {
5596 ratio = group_size / new_phis.length ();
5597 gcc_assert (!(group_size % new_phis.length ()));
5598 }
5599 else
5600 ratio = 1;
5601
5602 for (k = 0; k < group_size; k++)
5603 {
5604 if (k % ratio == 0)
5605 {
5606 epilog_stmt = new_phis[k / ratio];
5607 reduction_phi = reduction_phis[k / ratio];
5608 if (double_reduc)
5609 inner_phi = inner_phis[k / ratio];
5610 }
5611
5612 if (slp_reduc)
5613 {
5614 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5615
5616 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5617 /* SLP statements can't participate in patterns. */
5618 gcc_assert (!orig_stmt);
5619 scalar_dest = gimple_assign_lhs (current_stmt);
5620 }
5621
5622 phis.create (3);
5623 /* Find the loop-closed-use at the loop exit of the original scalar
5624 result. (The reduction result is expected to have two immediate uses -
5625 one at the latch block, and one at the loop exit). */
5626 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5627 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5628 && !is_gimple_debug (USE_STMT (use_p)))
5629 phis.safe_push (USE_STMT (use_p));
5630
5631 /* While we expect to have found an exit_phi because of loop-closed-ssa
5632 form we can end up without one if the scalar cycle is dead. */
5633
5634 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5635 {
5636 if (outer_loop)
5637 {
5638 stmt_vec_info exit_phi_vinfo
5639 = loop_vinfo->lookup_stmt (exit_phi);
5640 gphi *vect_phi;
5641
5642 /* FORNOW. Currently not supporting the case that an inner-loop
5643 reduction is not used in the outer-loop (but only outside the
5644 outer-loop), unless it is double reduction. */
5645 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5646 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5647 || double_reduc);
5648
5649 if (double_reduc)
5650 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5651 else
5652 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5653 if (!double_reduc
5654 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5655 != vect_double_reduction_def)
5656 continue;
5657
5658 /* Handle double reduction:
5659
5660 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5661 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5662 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5663 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5664
5665 At that point the regular reduction (stmt2 and stmt3) is
5666 already vectorized, as well as the exit phi node, stmt4.
5667 Here we vectorize the phi node of double reduction, stmt1, and
5668 update all relevant statements. */
5669
5670 /* Go through all the uses of s2 to find double reduction phi
5671 node, i.e., stmt1 above. */
5672 orig_name = PHI_RESULT (exit_phi);
5673 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5674 {
5675 stmt_vec_info use_stmt_vinfo;
5676 tree vect_phi_init, preheader_arg, vect_phi_res;
5677 basic_block bb = gimple_bb (use_stmt);
5678 gimple *use;
5679
5680 /* Check that USE_STMT is really double reduction phi
5681 node. */
5682 if (gimple_code (use_stmt) != GIMPLE_PHI
5683 || gimple_phi_num_args (use_stmt) != 2
5684 || bb->loop_father != outer_loop)
5685 continue;
5686 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5687 if (!use_stmt_vinfo
5688 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5689 != vect_double_reduction_def)
5690 continue;
5691
5692 /* Create vector phi node for double reduction:
5693 vs1 = phi <vs0, vs2>
5694 vs1 was created previously in this function by a call to
5695 vect_get_vec_def_for_operand and is stored in
5696 vec_initial_def;
5697 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5698 vs0 is created here. */
5699
5700 /* Create vector phi node. */
5701 vect_phi = create_phi_node (vec_initial_def, bb);
5702 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5703
5704 /* Create vs0 - initial def of the double reduction phi. */
5705 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5706 loop_preheader_edge (outer_loop));
5707 vect_phi_init = get_initial_def_for_reduction
5708 (stmt, preheader_arg, NULL);
5709
5710 /* Update phi node arguments with vs0 and vs2. */
5711 add_phi_arg (vect_phi, vect_phi_init,
5712 loop_preheader_edge (outer_loop),
5713 UNKNOWN_LOCATION);
5714 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5715 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5716 if (dump_enabled_p ())
5717 {
5718 dump_printf_loc (MSG_NOTE, vect_location,
5719 "created double reduction phi node: ");
5720 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5721 }
5722
5723 vect_phi_res = PHI_RESULT (vect_phi);
5724
5725 /* Replace the use, i.e., set the correct vs1 in the regular
5726 reduction phi node. FORNOW, NCOPIES is always 1, so the
5727 loop is redundant. */
5728 use = reduction_phi;
5729 for (j = 0; j < ncopies; j++)
5730 {
5731 edge pr_edge = loop_preheader_edge (loop);
5732 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5733 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5734 }
5735 }
5736 }
5737 }
5738
5739 phis.release ();
5740 if (nested_in_vect_loop)
5741 {
5742 if (double_reduc)
5743 loop = outer_loop;
5744 else
5745 continue;
5746 }
5747
5748 phis.create (3);
5749 /* Find the loop-closed-use at the loop exit of the original scalar
5750 result. (The reduction result is expected to have two immediate uses,
5751 one at the latch block, and one at the loop exit). For double
5752 reductions we are looking for exit phis of the outer loop. */
5753 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5754 {
5755 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5756 {
5757 if (!is_gimple_debug (USE_STMT (use_p)))
5758 phis.safe_push (USE_STMT (use_p));
5759 }
5760 else
5761 {
5762 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5763 {
5764 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5765
5766 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5767 {
5768 if (!flow_bb_inside_loop_p (loop,
5769 gimple_bb (USE_STMT (phi_use_p)))
5770 && !is_gimple_debug (USE_STMT (phi_use_p)))
5771 phis.safe_push (USE_STMT (phi_use_p));
5772 }
5773 }
5774 }
5775 }
5776
5777 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5778 {
5779 /* Replace the uses: */
5780 orig_name = PHI_RESULT (exit_phi);
5781 scalar_result = scalar_results[k];
5782 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5783 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5784 SET_USE (use_p, scalar_result);
5785 }
5786
5787 phis.release ();
5788 }
5789 }
5790
5791 /* Return a vector of type VECTYPE that is equal to the vector select
5792 operation "MASK ? VEC : IDENTITY". Insert the select statements
5793 before GSI. */
5794
5795 static tree
5796 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5797 tree vec, tree identity)
5798 {
5799 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5800 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5801 mask, vec, identity);
5802 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5803 return cond;
5804 }
5805
5806 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5807 order, starting with LHS. Insert the extraction statements before GSI and
5808 associate the new scalar SSA names with variable SCALAR_DEST.
5809 Return the SSA name for the result. */
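/* As an illustration (element count and operation chosen arbitrarily), for a
   four-element VECTOR_RHS and a PLUS_EXPR CODE the sequence built below
   computes:

     s_0 = LHS + VECTOR_RHS[0];
     s_1 = s_0 + VECTOR_RHS[1];
     s_2 = s_1 + VECTOR_RHS[2];
     s_3 = s_2 + VECTOR_RHS[3];

   (each element being extracted with a BIT_FIELD_REF) and returns s_3.  */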
5810
5811 static tree
5812 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5813 tree_code code, tree lhs, tree vector_rhs)
5814 {
5815 tree vectype = TREE_TYPE (vector_rhs);
5816 tree scalar_type = TREE_TYPE (vectype);
5817 tree bitsize = TYPE_SIZE (scalar_type);
5818 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5819 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5820
5821 for (unsigned HOST_WIDE_INT bit_offset = 0;
5822 bit_offset < vec_size_in_bits;
5823 bit_offset += element_bitsize)
5824 {
5825 tree bitpos = bitsize_int (bit_offset);
5826 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5827 bitsize, bitpos);
5828
5829 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5830 rhs = make_ssa_name (scalar_dest, stmt);
5831 gimple_assign_set_lhs (stmt, rhs);
5832 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5833
5834 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5835 tree new_name = make_ssa_name (scalar_dest, stmt);
5836 gimple_assign_set_lhs (stmt, new_name);
5837 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5838 lhs = new_name;
5839 }
5840 return lhs;
5841 }
5842
5843 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
5844 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5845 statement. CODE is the operation performed by STMT and OPS are
5846 its scalar operands. REDUC_INDEX is the index of the operand in
5847 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5848 implements in-order reduction, or IFN_LAST if we should open-code it.
5849 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5850 that should be used to control the operation in a fully-masked loop. */
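/* As an illustration (types and operation chosen arbitrarily), for the scalar
   loop

     for (i = 0; i < n; i++)
       res += a[i];

   an in-order reduction adds the elements of each vector of A to RES strictly
   from left to right (e.g. via IFN_FOLD_LEFT_PLUS when the target supports it,
   or the open-coded expansion in vect_expand_fold_left otherwise), so the
   result matches the scalar loop even for floating-point types.  */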
5851
5852 static bool
5853 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5854 gimple **vec_stmt, slp_tree slp_node,
5855 gimple *reduc_def_stmt,
5856 tree_code code, internal_fn reduc_fn,
5857 tree ops[3], tree vectype_in,
5858 int reduc_index, vec_loop_masks *masks)
5859 {
5860 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5861 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5862 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5863 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5864 stmt_vec_info new_stmt_info = NULL;
5865
5866 int ncopies;
5867 if (slp_node)
5868 ncopies = 1;
5869 else
5870 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5871
5872 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5873 gcc_assert (ncopies == 1);
5874 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5875 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5876 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5877 == FOLD_LEFT_REDUCTION);
5878
5879 if (slp_node)
5880 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5881 TYPE_VECTOR_SUBPARTS (vectype_in)));
5882
5883 tree op0 = ops[1 - reduc_index];
5884
5885 int group_size = 1;
5886 gimple *scalar_dest_def;
5887 auto_vec<tree> vec_oprnds0;
5888 if (slp_node)
5889 {
5890 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5891 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5892 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5893 }
5894 else
5895 {
5896 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5897 vec_oprnds0.create (1);
5898 vec_oprnds0.quick_push (loop_vec_def0);
5899 scalar_dest_def = stmt;
5900 }
5901
5902 tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
5903 tree scalar_type = TREE_TYPE (scalar_dest);
5904 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5905
5906 int vec_num = vec_oprnds0.length ();
5907 gcc_assert (vec_num == 1 || slp_node);
5908 tree vec_elem_type = TREE_TYPE (vectype_out);
5909 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5910
5911 tree vector_identity = NULL_TREE;
5912 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5913 vector_identity = build_zero_cst (vectype_out);
5914
5915 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5916 int i;
5917 tree def0;
5918 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5919 {
5920 gimple *new_stmt;
5921 tree mask = NULL_TREE;
5922 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5923 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5924
5925 /* Handle MINUS by adding the negative. */
5926 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5927 {
5928 tree negated = make_ssa_name (vectype_out);
5929 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5930 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5931 def0 = negated;
5932 }
5933
5934 if (mask)
5935 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5936 vector_identity);
5937
5938 /* On the first iteration the input is simply the scalar phi
5939 result, and for subsequent iterations it is the output of
5940 the preceding operation. */
5941 if (reduc_fn != IFN_LAST)
5942 {
5943 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5944 /* For chained SLP reductions the output of the previous reduction
5945 operation serves as the input of the next. For the final statement
5946 the output cannot be a temporary - we reuse the original
5947 scalar destination of the last statement. */
5948 if (i != vec_num - 1)
5949 {
5950 gimple_set_lhs (new_stmt, scalar_dest_var);
5951 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5952 gimple_set_lhs (new_stmt, reduc_var);
5953 }
5954 }
5955 else
5956 {
5957 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5958 reduc_var, def0);
5959 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5960 /* Remove the statement, so that we can use the same code paths
5961 as for statements that we've just created. */
5962 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5963 gsi_remove (&tmp_gsi, false);
5964 }
5965
5966 if (i == vec_num - 1)
5967 {
5968 gimple_set_lhs (new_stmt, scalar_dest);
5969 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def, new_stmt);
5970 }
5971 else
5972 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def,
5973 new_stmt, gsi);
5974
5975 if (slp_node)
5976 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5977 }
5978
5979 if (!slp_node)
5980 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5981
5982 return true;
5983 }
5984
5985 /* Function is_nonwrapping_integer_induction.
5986
5987 Check if STMT (which is part of loop LOOP) both increments and
5988 does not cause overflow. */
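/* As an illustration (numbers chosen arbitrarily): for a phi with evolution
   base 0 and step 4 in a loop that executes at most 100 iterations, the
   largest value reached is 0 + 4 * 100 = 400, so the function returns true
   whenever 400 fits in the precision of the phi result type (and it returns
   true without further checks when overflow is undefined for that type).  */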
5989
5990 static bool
5991 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5992 {
5993 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5994 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5995 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5996 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5997 widest_int ni, max_loop_value, lhs_max;
5998 wi::overflow_type overflow = wi::OVF_NONE;
5999
6000 /* Make sure the loop is integer based. */
6001 if (TREE_CODE (base) != INTEGER_CST
6002 || TREE_CODE (step) != INTEGER_CST)
6003 return false;
6004
6005 /* Check that the max size of the loop will not wrap. */
6006
6007 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6008 return true;
6009
6010 if (! max_stmt_executions (loop, &ni))
6011 return false;
6012
6013 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6014 &overflow);
6015 if (overflow)
6016 return false;
6017
6018 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6019 TYPE_SIGN (lhs_type), &overflow);
6020 if (overflow)
6021 return false;
6022
6023 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6024 <= TYPE_PRECISION (lhs_type));
6025 }
6026
6027 /* Function vectorizable_reduction.
6028
6029 Check if STMT performs a reduction operation that can be vectorized.
6030 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6031 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6032 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6033
6034 This function also handles reduction idioms (patterns) that have been
6035 recognized in advance during vect_pattern_recog. In this case, STMT may be
6036 of this form:
6037 X = pattern_expr (arg0, arg1, ..., X)
6038    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6039 sequence that had been detected and replaced by the pattern-stmt (STMT).
6040
6041 This function also handles reduction of condition expressions, for example:
6042 for (int i = 0; i < N; i++)
6043 if (a[i] < value)
6044 last = a[i];
6045 This is handled by vectorising the loop and creating an additional vector
6046 containing the loop indexes for which "a[i] < value" was true. In the
6047 function epilogue this is reduced to a single max value and then used to
6048 index into the vector of results.
6049
6050 In some cases of reduction patterns, the type of the reduction variable X is
6051 different than the type of the other arguments of STMT.
6052 In such cases, the vectype that is used when transforming STMT into a vector
6053 stmt is different than the vectype that is used to determine the
6054 vectorization factor, because it consists of a different number of elements
6055 than the actual number of elements that are being operated upon in parallel.
6056
6057 For example, consider an accumulation of shorts into an int accumulator.
6058 On some targets it's possible to vectorize this pattern operating on 8
6059 shorts at a time (hence, the vectype for purposes of determining the
6060 vectorization factor should be V8HI); on the other hand, the vectype that
6061 is used to create the vector form is actually V4SI (the type of the result).
6062
6063 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6064 indicates what is the actual level of parallelism (V8HI in the example), so
6065 that the right vectorization factor would be derived. This vectype
6066 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6067 be used to create the vectorized stmt. The right vectype for the vectorized
6068 stmt is obtained from the type of the result X:
6069 get_vectype_for_scalar_type (TREE_TYPE (X))
6070
6071 This means that, contrary to "regular" reductions (or "regular" stmts in
6072 general), the following equation:
6073 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6074 does *NOT* necessarily hold for reduction patterns. */
6075
6076 bool
6077 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6078 gimple **vec_stmt, slp_tree slp_node,
6079 slp_instance slp_node_instance,
6080 stmt_vector_for_cost *cost_vec)
6081 {
6082 tree vec_dest;
6083 tree scalar_dest;
6084 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6085 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6086 tree vectype_in = NULL_TREE;
6087 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6088 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6089 enum tree_code code, orig_code;
6090 internal_fn reduc_fn;
6091 machine_mode vec_mode;
6092 int op_type;
6093 optab optab;
6094 tree new_temp = NULL_TREE;
6095 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6096 gimple *cond_reduc_def_stmt = NULL;
6097 enum tree_code cond_reduc_op_code = ERROR_MARK;
6098 tree scalar_type;
6099 bool is_simple_use;
6100 gimple *orig_stmt;
6101 stmt_vec_info orig_stmt_info = NULL;
6102 int i;
6103 int ncopies;
6104 int epilog_copies;
6105 stmt_vec_info prev_stmt_info, prev_phi_info;
6106 bool single_defuse_cycle = false;
6107 stmt_vec_info new_stmt_info = NULL;
6108 int j;
6109 tree ops[3];
6110 enum vect_def_type dts[3];
6111 bool nested_cycle = false, found_nested_cycle_def = false;
6112 bool double_reduc = false;
6113 basic_block def_bb;
6114 struct loop * def_stmt_loop;
6115 tree def_arg;
6116 auto_vec<tree> vec_oprnds0;
6117 auto_vec<tree> vec_oprnds1;
6118 auto_vec<tree> vec_oprnds2;
6119 auto_vec<tree> vect_defs;
6120 auto_vec<gimple *> phis;
6121 int vec_num;
6122 tree def0, tem;
6123 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6124 tree cond_reduc_val = NULL_TREE;
6125
6126 /* Make sure it was already recognized as a reduction computation. */
6127 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6128 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6129 return false;
6130
6131 if (nested_in_vect_loop_p (loop, stmt))
6132 {
6133 loop = loop->inner;
6134 nested_cycle = true;
6135 }
6136
6137 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6138 gcc_assert (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt);
6139
6140 if (gimple_code (stmt) == GIMPLE_PHI)
6141 {
6142 tree phi_result = gimple_phi_result (stmt);
6143 /* Analysis is fully done on the reduction stmt invocation. */
6144 if (! vec_stmt)
6145 {
6146 if (slp_node)
6147 slp_node_instance->reduc_phis = slp_node;
6148
6149 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6150 return true;
6151 }
6152
6153 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6154 /* Leave the scalar phi in place. Note that checking
6155 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6156 for reductions involving a single statement. */
6157 return true;
6158
6159 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6160 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6161 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6162
6163 stmt_vec_info reduc_stmt_info = vinfo_for_stmt (reduc_stmt);
6164 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6165 == EXTRACT_LAST_REDUCTION)
6166 /* Leave the scalar phi in place. */
6167 return true;
6168
6169 gcc_assert (is_gimple_assign (reduc_stmt));
6170 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6171 {
6172 tree op = gimple_op (reduc_stmt, k);
6173 if (op == gimple_phi_result (stmt))
6174 continue;
6175 if (k == 1
6176 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6177 continue;
6178 if (!vectype_in
6179 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6180 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6181 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6182 break;
6183 }
6184 gcc_assert (vectype_in);
6185
6186 if (slp_node)
6187 ncopies = 1;
6188 else
6189 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6190
6191 stmt_vec_info use_stmt_info;
6192 if (ncopies > 1
6193 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6194 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6195 && (use_stmt_info == reduc_stmt_info
6196 || STMT_VINFO_RELATED_STMT (use_stmt_info) == reduc_stmt))
6197 single_defuse_cycle = true;
6198
6199 /* Create the destination vector */
6200 scalar_dest = gimple_assign_lhs (reduc_stmt);
6201 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6202
6203 if (slp_node)
6204 /* The size vect_schedule_slp_instance computes is off for us. */
6205 vec_num = vect_get_num_vectors
6206 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6207 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6208 vectype_in);
6209 else
6210 vec_num = 1;
6211
6212 /* Generate the reduction PHIs upfront. */
6213 prev_phi_info = NULL;
6214 for (j = 0; j < ncopies; j++)
6215 {
6216 if (j == 0 || !single_defuse_cycle)
6217 {
6218 for (i = 0; i < vec_num; i++)
6219 {
6220 /* Create the reduction-phi that defines the reduction
6221 operand. */
6222 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6223 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6224
6225 if (slp_node)
6226 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6227 else
6228 {
6229 if (j == 0)
6230 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6231 else
6232 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6233 prev_phi_info = new_phi_info;
6234 }
6235 }
6236 }
6237 }
6238
6239 return true;
6240 }
6241
6242 /* 1. Is vectorizable reduction? */
6243 /* Not supportable if the reduction variable is used in the loop, unless
6244 it's a reduction chain. */
6245 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6246 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6247 return false;
6248
6249   /* Reductions that are not used even in an enclosing outer-loop
6250      are expected to be "live" (used out of the loop). */
6251 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6252 && !STMT_VINFO_LIVE_P (stmt_info))
6253 return false;
6254
6255 /* 2. Has this been recognized as a reduction pattern?
6256
6257 Check if STMT represents a pattern that has been recognized
6258 in earlier analysis stages. For stmts that represent a pattern,
6259 the STMT_VINFO_RELATED_STMT field records the last stmt in
6260 the original sequence that constitutes the pattern. */
6261
6262 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6263 if (orig_stmt)
6264 {
6265 orig_stmt_info = vinfo_for_stmt (orig_stmt);
6266 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6267 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6268 }
6269
6270 /* 3. Check the operands of the operation. The first operands are defined
6271 inside the loop body. The last operand is the reduction variable,
6272 which is defined by the loop-header-phi. */
6273
6274 gcc_assert (is_gimple_assign (stmt));
6275
6276 /* Flatten RHS. */
6277 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6278 {
6279 case GIMPLE_BINARY_RHS:
6280 code = gimple_assign_rhs_code (stmt);
6281 op_type = TREE_CODE_LENGTH (code);
6282 gcc_assert (op_type == binary_op);
6283 ops[0] = gimple_assign_rhs1 (stmt);
6284 ops[1] = gimple_assign_rhs2 (stmt);
6285 break;
6286
6287 case GIMPLE_TERNARY_RHS:
6288 code = gimple_assign_rhs_code (stmt);
6289 op_type = TREE_CODE_LENGTH (code);
6290 gcc_assert (op_type == ternary_op);
6291 ops[0] = gimple_assign_rhs1 (stmt);
6292 ops[1] = gimple_assign_rhs2 (stmt);
6293 ops[2] = gimple_assign_rhs3 (stmt);
6294 break;
6295
6296 case GIMPLE_UNARY_RHS:
6297 return false;
6298
6299 default:
6300 gcc_unreachable ();
6301 }
6302
6303 if (code == COND_EXPR && slp_node)
6304 return false;
6305
6306 scalar_dest = gimple_assign_lhs (stmt);
6307 scalar_type = TREE_TYPE (scalar_dest);
6308 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6309 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6310 return false;
6311
6312 /* Do not try to vectorize bit-precision reductions. */
6313 if (!type_has_mode_precision_p (scalar_type))
6314 return false;
6315
6316 /* All uses but the last are expected to be defined in the loop.
6317 The last use is the reduction variable. In case of nested cycle this
6318 assumption is not true: we use reduc_index to record the index of the
6319 reduction variable. */
6320 gimple *reduc_def_stmt = NULL;
6321 int reduc_index = -1;
6322 for (i = 0; i < op_type; i++)
6323 {
6324 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6325 if (i == 0 && code == COND_EXPR)
6326 continue;
6327
6328 stmt_vec_info def_stmt_info;
6329 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6330 &def_stmt_info);
6331 dt = dts[i];
6332 gcc_assert (is_simple_use);
6333 if (dt == vect_reduction_def)
6334 {
6335 reduc_def_stmt = def_stmt_info;
6336 reduc_index = i;
6337 continue;
6338 }
6339 else if (tem)
6340 {
6341 /* To properly compute ncopies we are interested in the widest
6342 input type in case we're looking at a widening accumulation. */
6343 if (!vectype_in
6344 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6345 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6346 vectype_in = tem;
6347 }
6348
6349 if (dt != vect_internal_def
6350 && dt != vect_external_def
6351 && dt != vect_constant_def
6352 && dt != vect_induction_def
6353 && !(dt == vect_nested_cycle && nested_cycle))
6354 return false;
6355
6356 if (dt == vect_nested_cycle)
6357 {
6358 found_nested_cycle_def = true;
6359 reduc_def_stmt = def_stmt_info;
6360 reduc_index = i;
6361 }
6362
6363 if (i == 1 && code == COND_EXPR)
6364 {
6365 /* Record how value of COND_EXPR is defined. */
6366 if (dt == vect_constant_def)
6367 {
6368 cond_reduc_dt = dt;
6369 cond_reduc_val = ops[i];
6370 }
6371 if (dt == vect_induction_def
6372 && def_stmt_info
6373 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6374 {
6375 cond_reduc_dt = dt;
6376 cond_reduc_def_stmt = def_stmt_info;
6377 }
6378 }
6379 }
6380
6381 if (!vectype_in)
6382 vectype_in = vectype_out;
6383
6384 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6385      directly used in stmt. */
6386 if (reduc_index == -1)
6387 {
6388 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6389 {
6390 if (dump_enabled_p ())
6391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6392 "in-order reduction chain without SLP.\n");
6393 return false;
6394 }
6395
6396 if (orig_stmt)
6397 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6398 else
6399 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6400 }
6401
6402 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6403 return false;
6404
6405 if (!(reduc_index == -1
6406 || dts[reduc_index] == vect_reduction_def
6407 || dts[reduc_index] == vect_nested_cycle
6408 || ((dts[reduc_index] == vect_internal_def
6409 || dts[reduc_index] == vect_external_def
6410 || dts[reduc_index] == vect_constant_def
6411 || dts[reduc_index] == vect_induction_def)
6412 && nested_cycle && found_nested_cycle_def)))
6413 {
6414 /* For pattern recognized stmts, orig_stmt might be a reduction,
6415 but some helper statements for the pattern might not, or
6416 might be COND_EXPRs with reduction uses in the condition. */
6417 gcc_assert (orig_stmt);
6418 return false;
6419 }
6420
6421 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6422 /* PHIs should not participate in patterns. */
6423 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6424 enum vect_reduction_type v_reduc_type
6425 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6426 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6427
6428 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6429 /* If we have a condition reduction, see if we can simplify it further. */
6430 if (v_reduc_type == COND_REDUCTION)
6431 {
6432 /* TODO: We can't yet handle reduction chains, since we need to treat
6433 each COND_EXPR in the chain specially, not just the last one.
6434 E.g. for:
6435
6436 x_1 = PHI <x_3, ...>
6437 x_2 = a_2 ? ... : x_1;
6438 x_3 = a_3 ? ... : x_2;
6439
6440 we're interested in the last element in x_3 for which a_2 || a_3
6441 is true, whereas the current reduction chain handling would
6442 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6443 as a reduction operation. */
6444 if (reduc_index == -1)
6445 {
6446 if (dump_enabled_p ())
6447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6448 "conditional reduction chains not supported\n");
6449 return false;
6450 }
6451
6452 /* vect_is_simple_reduction ensured that operand 2 is the
6453 loop-carried operand. */
6454 gcc_assert (reduc_index == 2);
6455
6456 	  /* Loop peeling modifies the initial value of the reduction PHI, which
6457 	     makes the reduction stmt to be transformed differ from the original
6458 	     stmt analyzed.  We need to record the reduction code for a
6459 	     CONST_COND_REDUCTION type reduction at the analysis stage, so that
6460 	     it can be used directly at the transform stage. */
6461 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6462 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6463 {
6464 /* Also set the reduction type to CONST_COND_REDUCTION. */
6465 gcc_assert (cond_reduc_dt == vect_constant_def);
6466 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6467 }
6468 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6469 vectype_in, OPTIMIZE_FOR_SPEED))
6470 {
6471 if (dump_enabled_p ())
6472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6473 "optimizing condition reduction with"
6474 " FOLD_EXTRACT_LAST.\n");
6475 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6476 }
6477 else if (cond_reduc_dt == vect_induction_def)
6478 {
6479 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6480 tree base
6481 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6482 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6483
6484 gcc_assert (TREE_CODE (base) == INTEGER_CST
6485 && TREE_CODE (step) == INTEGER_CST);
6486 cond_reduc_val = NULL_TREE;
6487 	      /* Find a suitable value: below BASE for MAX_EXPR and above BASE
6488 		 for MIN_EXPR.  For now, punt if BASE is the minimum value of the
6489 		 type for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
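	      /* As an illustration (base and step chosen arbitrarily): for an
		 induction with base 5 and step -1 we pick MIN_EXPR and a
		 cond_reduc_val of 6, a value the induction itself can never
		 produce.  */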
6490 if (tree_int_cst_sgn (step) == -1)
6491 {
6492 cond_reduc_op_code = MIN_EXPR;
6493 if (tree_int_cst_sgn (base) == -1)
6494 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6495 else if (tree_int_cst_lt (base,
6496 TYPE_MAX_VALUE (TREE_TYPE (base))))
6497 cond_reduc_val
6498 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6499 }
6500 else
6501 {
6502 cond_reduc_op_code = MAX_EXPR;
6503 if (tree_int_cst_sgn (base) == 1)
6504 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6505 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6506 base))
6507 cond_reduc_val
6508 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6509 }
6510 if (cond_reduc_val)
6511 {
6512 if (dump_enabled_p ())
6513 dump_printf_loc (MSG_NOTE, vect_location,
6514 "condition expression based on "
6515 "integer induction.\n");
6516 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6517 = INTEGER_INDUC_COND_REDUCTION;
6518 }
6519 }
6520 else if (cond_reduc_dt == vect_constant_def)
6521 {
6522 enum vect_def_type cond_initial_dt;
6523 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6524 tree cond_initial_val
6525 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6526
6527 gcc_assert (cond_reduc_val != NULL_TREE);
6528 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6529 if (cond_initial_dt == vect_constant_def
6530 && types_compatible_p (TREE_TYPE (cond_initial_val),
6531 TREE_TYPE (cond_reduc_val)))
6532 {
6533 tree e = fold_binary (LE_EXPR, boolean_type_node,
6534 cond_initial_val, cond_reduc_val);
6535 if (e && (integer_onep (e) || integer_zerop (e)))
6536 {
6537 if (dump_enabled_p ())
6538 dump_printf_loc (MSG_NOTE, vect_location,
6539 "condition expression based on "
6540 "compile time constant.\n");
6541 /* Record reduction code at analysis stage. */
6542 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6543 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6544 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6545 = CONST_COND_REDUCTION;
6546 }
6547 }
6548 }
6549 }
6550
6551 if (orig_stmt)
6552 gcc_assert (tmp == orig_stmt
6553 || (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp))
6554 == orig_stmt));
6555 else
6556 /* We changed STMT to be the first stmt in reduction chain, hence we
6557 check that in this case the first element in the chain is STMT. */
6558 gcc_assert (stmt == tmp
6559 || REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6560
6561 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6562 return false;
6563
6564 if (slp_node)
6565 ncopies = 1;
6566 else
6567 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6568
6569 gcc_assert (ncopies >= 1);
6570
6571 vec_mode = TYPE_MODE (vectype_in);
6572 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6573
6574 if (code == COND_EXPR)
6575 {
6576 /* Only call during the analysis stage, otherwise we'll lose
6577 STMT_VINFO_TYPE. */
6578 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6579 ops[reduc_index], 0, NULL,
6580 cost_vec))
6581 {
6582 if (dump_enabled_p ())
6583 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6584 "unsupported condition in reduction\n");
6585 return false;
6586 }
6587 }
6588 else
6589 {
6590 /* 4. Supportable by target? */
6591
6592 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6593 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6594 {
6595 	  /* Shifts and rotates are only supported by vectorizable_shift,
6596 not vectorizable_reduction. */
6597 if (dump_enabled_p ())
6598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6599 "unsupported shift or rotation.\n");
6600 return false;
6601 }
6602
6603 /* 4.1. check support for the operation in the loop */
6604 optab = optab_for_tree_code (code, vectype_in, optab_default);
6605 if (!optab)
6606 {
6607 if (dump_enabled_p ())
6608 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6609 "no optab.\n");
6610
6611 return false;
6612 }
6613
6614 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6615 {
6616 if (dump_enabled_p ())
6617 dump_printf (MSG_NOTE, "op not supported by target.\n");
6618
6619 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6620 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6621 return false;
6622
6623 if (dump_enabled_p ())
6624 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6625 }
6626
6627 /* Worthwhile without SIMD support? */
6628 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6629 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6630 {
6631 if (dump_enabled_p ())
6632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6633 "not worthwhile without SIMD support.\n");
6634
6635 return false;
6636 }
6637 }
6638
6639 /* 4.2. Check support for the epilog operation.
6640
6641 If STMT represents a reduction pattern, then the type of the
6642 reduction variable may be different than the type of the rest
6643 of the arguments. For example, consider the case of accumulation
6644 of shorts into an int accumulator; The original code:
6645 S1: int_a = (int) short_a;
6646 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6647
6648 was replaced with:
6649 STMT: int_acc = widen_sum <short_a, int_acc>
6650
6651 This means that:
6652 1. The tree-code that is used to create the vector operation in the
6653 epilog code (that reduces the partial results) is not the
6654 tree-code of STMT, but is rather the tree-code of the original
6655 stmt from the pattern that STMT is replacing. I.e, in the example
6656 above we want to use 'widen_sum' in the loop, but 'plus' in the
6657 epilog.
6658 2. The type (mode) we use to check available target support
6659 for the vector operation to be created in the *epilog*, is
6660 determined by the type of the reduction variable (in the example
6661 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6662 However the type (mode) we use to check available target support
6663 for the vector operation to be created *inside the loop*, is
6664 determined by the type of the other arguments to STMT (in the
6665 example we'd check this: optab_handler (widen_sum_optab,
6666 vect_short_mode)).
6667
6668 This is contrary to "regular" reductions, in which the types of all
6669 the arguments are the same as the type of the reduction variable.
6670 For "regular" reductions we can therefore use the same vector type
6671 (and also the same tree-code) when generating the epilog code and
6672 when generating the code inside the loop. */
6673
6674 vect_reduction_type reduction_type
6675 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6676 if (orig_stmt
6677 && (reduction_type == TREE_CODE_REDUCTION
6678 || reduction_type == FOLD_LEFT_REDUCTION))
6679 {
6680 /* This is a reduction pattern: get the vectype from the type of the
6681 reduction variable, and get the tree-code from orig_stmt. */
6682 orig_code = gimple_assign_rhs_code (orig_stmt);
6683 gcc_assert (vectype_out);
6684 vec_mode = TYPE_MODE (vectype_out);
6685 }
6686 else
6687 {
6688 /* Regular reduction: the same vectype and tree-code used for the
6689 vector code inside the loop can also be used for the epilog code. */
6690 orig_code = code;
6691
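/* For instance, for a reduction of the form
     acc = acc - a[i];
   the vector loop can accumulate per-lane partial results with vector
   subtractions, but those partial results must be summed in the epilog,
   which is why orig_code is switched to PLUS_EXPR below. */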
6692 if (code == MINUS_EXPR)
6693 orig_code = PLUS_EXPR;
6694
6695 /* For simple condition reductions, replace with the actual expression
6696 we want to base our reduction around. */
6697 if (reduction_type == CONST_COND_REDUCTION)
6698 {
6699 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6700 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6701 }
6702 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6703 orig_code = cond_reduc_op_code;
6704 }
6705
6706 if (nested_cycle)
6707 {
6708 def_bb = gimple_bb (reduc_def_stmt);
6709 def_stmt_loop = def_bb->loop_father;
6710 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6711 loop_preheader_edge (def_stmt_loop));
6712 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6713 if (def_arg_stmt_info
6714 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6715 == vect_double_reduction_def))
6716 double_reduc = true;
6717 }
6718
6719 reduc_fn = IFN_LAST;
6720
6721 if (reduction_type == TREE_CODE_REDUCTION
6722 || reduction_type == FOLD_LEFT_REDUCTION
6723 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6724 || reduction_type == CONST_COND_REDUCTION)
6725 {
6726 if (reduction_type == FOLD_LEFT_REDUCTION
6727 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6728 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6729 {
6730 if (reduc_fn != IFN_LAST
6731 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6732 OPTIMIZE_FOR_SPEED))
6733 {
6734 if (dump_enabled_p ())
6735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6736 "reduc op not supported by target.\n");
6737
6738 reduc_fn = IFN_LAST;
6739 }
6740 }
6741 else
6742 {
6743 if (!nested_cycle || double_reduc)
6744 {
6745 if (dump_enabled_p ())
6746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6747 "no reduc code for scalar code.\n");
6748
6749 return false;
6750 }
6751 }
6752 }
6753 else if (reduction_type == COND_REDUCTION)
6754 {
6755 int scalar_precision
6756 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6757 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6758 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6759 nunits_out);
6760
6761 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6762 OPTIMIZE_FOR_SPEED))
6763 reduc_fn = IFN_REDUC_MAX;
6764 }
6765
6766 if (reduction_type != EXTRACT_LAST_REDUCTION
6767 && reduc_fn == IFN_LAST
6768 && !nunits_out.is_constant ())
6769 {
6770 if (dump_enabled_p ())
6771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6772 "missing target support for reduction on"
6773 " variable-length vectors.\n");
6774 return false;
6775 }
6776
6777 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6778 && ncopies > 1)
6779 {
6780 if (dump_enabled_p ())
6781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6782 "multiple types in double reduction or condition "
6783 "reduction.\n");
6784 return false;
6785 }
6786
6787 /* For SLP reductions, see if there is a neutral value we can use. */
6788 tree neutral_op = NULL_TREE;
6789 if (slp_node)
6790 neutral_op = neutral_op_for_slp_reduction
6791 (slp_node_instance->reduc_phis, code,
6792 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6793
6794 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6795 {
6796 /* We can't support in-order reductions of code such as this:
6797
6798 for (int i = 0; i < n1; ++i)
6799 for (int j = 0; j < n2; ++j)
6800 l += a[j];
6801
6802 since GCC effectively transforms the loop when vectorizing:
6803
6804 for (int i = 0; i < n1 / VF; ++i)
6805 for (int j = 0; j < n2; ++j)
6806 for (int k = 0; k < VF; ++k)
6807 l += a[j];
6808
6809 which is a reassociation of the original operation. */
6810 if (dump_enabled_p ())
6811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6812 "in-order double reduction not supported.\n");
6813
6814 return false;
6815 }
6816
6817 if (reduction_type == FOLD_LEFT_REDUCTION
6818 && slp_node
6819 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6820 {
6821 /* We cannot use in-order reductions in this case because there is
6822 an implicit reassociation of the operations involved. */
6823 if (dump_enabled_p ())
6824 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6825 "in-order unchained SLP reductions not supported.\n");
6826 return false;
6827 }
6828
6829 /* For double reductions, and for SLP reductions with a neutral value,
6830 we construct a variable-length initial vector by loading a vector
6831 full of the neutral value and then shift-and-inserting the start
6832 values into the low-numbered elements. */
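/* E.g. for a single (non-SLP) sum reduction with initial value INIT and
   neutral value 0, the initial vector is conceptually { INIT, 0, 0, ... },
   built by shift-and-inserting INIT into a vector of zeros via
   IFN_VEC_SHL_INSERT. */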
6833 if ((double_reduc || neutral_op)
6834 && !nunits_out.is_constant ()
6835 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6836 vectype_out, OPTIMIZE_FOR_SPEED))
6837 {
6838 if (dump_enabled_p ())
6839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6840 "reduction on variable-length vectors requires"
6841 " target support for a vector-shift-and-insert"
6842 " operation.\n");
6843 return false;
6844 }
6845
6846 /* Check extra constraints for variable-length unchained SLP reductions. */
6847 if (STMT_SLP_TYPE (stmt_info)
6848 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6849 && !nunits_out.is_constant ())
6850 {
6851 /* We checked above that we could build the initial vector when
6852 there's a neutral element value. Check here for the case in
6853 which each SLP statement has its own initial value and in which
6854 that value needs to be repeated for every instance of the
6855 statement within the initial vector. */
6856 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6857 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6858 if (!neutral_op
6859 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6860 {
6861 if (dump_enabled_p ())
6862 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6863 "unsupported form of SLP reduction for"
6864 " variable-length vectors: cannot build"
6865 " initial vector.\n");
6866 return false;
6867 }
6868 /* The epilogue code relies on the number of elements being a multiple
6869 of the group size. The duplicate-and-interleave approach to setting
6870 up the initial vector does too. */
6871 if (!multiple_p (nunits_out, group_size))
6872 {
6873 if (dump_enabled_p ())
6874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6875 "unsupported form of SLP reduction for"
6876 " variable-length vectors: the vector size"
6877 " is not a multiple of the number of results.\n");
6878 return false;
6879 }
6880 }
6881
6882 /* In case of widening multiplication by a constant, we update the type
6883 of the constant to be the type of the other operand. We check that the
6884 constant fits the type in the pattern recognition pass. */
6885 if (code == DOT_PROD_EXPR
6886 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6887 {
6888 if (TREE_CODE (ops[0]) == INTEGER_CST)
6889 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6890 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6891 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6892 else
6893 {
6894 if (dump_enabled_p ())
6895 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6896 "invalid types in dot-prod\n");
6897
6898 return false;
6899 }
6900 }
6901
6902 if (reduction_type == COND_REDUCTION)
6903 {
6904 widest_int ni;
6905
6906 if (! max_loop_iterations (loop, &ni))
6907 {
6908 if (dump_enabled_p ())
6909 dump_printf_loc (MSG_NOTE, vect_location,
6910 "loop count not known, cannot create cond "
6911 "reduction.\n");
6912 return false;
6913 }
6914 /* Convert backedges to iterations. */
6915 ni += 1;
6916
6917 /* The additional index will have the same type as the condition. Check
6918 that the loop iteration count fits into that type less one (the zero
6919 slot is reserved for the case in which there are no matches). */
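/* E.g. with an unsigned char index type, max_index is 255, so only loops
   known to run at most 254 iterations can use this scheme. */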
6920 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6921 if (wi::geu_p (ni, wi::to_widest (max_index)))
6922 {
6923 if (dump_enabled_p ())
6924 dump_printf_loc (MSG_NOTE, vect_location,
6925 "loop size is greater than data size.\n");
6926 return false;
6927 }
6928 }
6929
6930 /* In case the vectorization factor (VF) is bigger than the number
6931 of elements that we can fit in a vectype (nunits), we have to generate
6932 more than one vector stmt - i.e - we need to "unroll" the
6933 vector stmt by a factor VF/nunits. For more details see documentation
6934 in vectorizable_operation. */
6935
6936 /* If the reduction is used in an outer loop we need to generate
6937 VF intermediate results, like so (e.g. for ncopies=2):
6938 r0 = phi (init, r0)
6939 r1 = phi (init, r1)
6940 r0 = x0 + r0;
6941 r1 = x1 + r1;
6942 (i.e. we generate VF results in 2 registers).
6943 In this case we have a separate def-use cycle for each copy, and therefore
6944 for each copy we get the vector def for the reduction variable from the
6945 respective phi node created for this copy.
6946
6947 Otherwise (the reduction is unused in the loop nest), we can combine
6948 together intermediate results, like so (e.g. for ncopies=2):
6949 r = phi (init, r)
6950 r = x0 + r;
6951 r = x1 + r;
6952 (i.e. we generate VF/2 results in a single register).
6953 In this case for each copy we get the vector def for the reduction variable
6954 from the vectorized reduction operation generated in the previous iteration.
6955
6956 This only works when we see both the reduction PHI and its only consumer
6957 in vectorizable_reduction and there are no intermediate stmts
6958 participating. */
6959 stmt_vec_info use_stmt_info;
6960 tree reduc_phi_result = gimple_phi_result (reduc_def_stmt);
6961 if (ncopies > 1
6962 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6963 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6964 && (use_stmt_info == stmt_info
6965 || STMT_VINFO_RELATED_STMT (use_stmt_info) == stmt))
6966 {
6967 single_defuse_cycle = true;
6968 epilog_copies = 1;
6969 }
6970 else
6971 epilog_copies = ncopies;
6972
6973 /* If the reduction stmt is one of the patterns that have a lane-reducing
6974 operation embedded, we cannot handle the !single_defuse_cycle case. */
6975 if ((ncopies > 1
6976 && ! single_defuse_cycle)
6977 && (code == DOT_PROD_EXPR
6978 || code == WIDEN_SUM_EXPR
6979 || code == SAD_EXPR))
6980 {
6981 if (dump_enabled_p ())
6982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6983 "multi def-use cycle not possible for lane-reducing "
6984 "reduction operation\n");
6985 return false;
6986 }
6987
6988 if (slp_node)
6989 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6990 else
6991 vec_num = 1;
6992
6993 internal_fn cond_fn = get_conditional_internal_fn (code);
6994 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6995
6996 if (!vec_stmt) /* transformation not required. */
6997 {
6998 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6999 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7000 {
7001 if (reduction_type != FOLD_LEFT_REDUCTION
7002 && (cond_fn == IFN_LAST
7003 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7004 OPTIMIZE_FOR_SPEED)))
7005 {
7006 if (dump_enabled_p ())
7007 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7008 "can't use a fully-masked loop because no"
7009 " conditional operation is available.\n");
7010 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7011 }
7012 else if (reduc_index == -1)
7013 {
7014 if (dump_enabled_p ())
7015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7016 "can't use a fully-masked loop for chained"
7017 " reductions.\n");
7018 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7019 }
7020 else
7021 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7022 vectype_in);
7023 }
7024 if (dump_enabled_p ()
7025 && reduction_type == FOLD_LEFT_REDUCTION)
7026 dump_printf_loc (MSG_NOTE, vect_location,
7027 "using an in-order (fold-left) reduction.\n");
7028 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7029 return true;
7030 }
7031
7032 /* Transform. */
7033
7034 if (dump_enabled_p ())
7035 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7036
7037 /* FORNOW: Multiple types are not supported for condition. */
7038 if (code == COND_EXPR)
7039 gcc_assert (ncopies == 1);
7040
7041 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7042
7043 if (reduction_type == FOLD_LEFT_REDUCTION)
7044 return vectorize_fold_left_reduction
7045 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7046 reduc_fn, ops, vectype_in, reduc_index, masks);
7047
7048 if (reduction_type == EXTRACT_LAST_REDUCTION)
7049 {
7050 gcc_assert (!slp_node);
7051 return vectorizable_condition (stmt, gsi, vec_stmt,
7052 NULL, reduc_index, NULL, NULL);
7053 }
7054
7055 /* Create the destination vector */
7056 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7057
7058 prev_stmt_info = NULL;
7059 prev_phi_info = NULL;
7060 if (!slp_node)
7061 {
7062 vec_oprnds0.create (1);
7063 vec_oprnds1.create (1);
7064 if (op_type == ternary_op)
7065 vec_oprnds2.create (1);
7066 }
7067
7068 phis.create (vec_num);
7069 vect_defs.create (vec_num);
7070 if (!slp_node)
7071 vect_defs.quick_push (NULL_TREE);
7072
7073 if (slp_node)
7074 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7075 else
7076 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7077
7078 for (j = 0; j < ncopies; j++)
7079 {
7080 if (code == COND_EXPR)
7081 {
7082 gcc_assert (!slp_node);
7083 vectorizable_condition (stmt, gsi, vec_stmt,
7084 PHI_RESULT (phis[0]),
7085 reduc_index, NULL, NULL);
7086 /* Multiple types are not supported for condition. */
7087 break;
7088 }
7089
7090 /* Handle uses. */
7091 if (j == 0)
7092 {
7093 if (slp_node)
7094 {
7095 /* Get vec defs for all the operands except the reduction index,
7096 ensuring the ordering of the ops in the vector is kept. */
7097 auto_vec<tree, 3> slp_ops;
7098 auto_vec<vec<tree>, 3> vec_defs;
7099
7100 slp_ops.quick_push (ops[0]);
7101 slp_ops.quick_push (ops[1]);
7102 if (op_type == ternary_op)
7103 slp_ops.quick_push (ops[2]);
7104
7105 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7106
7107 vec_oprnds0.safe_splice (vec_defs[0]);
7108 vec_defs[0].release ();
7109 vec_oprnds1.safe_splice (vec_defs[1]);
7110 vec_defs[1].release ();
7111 if (op_type == ternary_op)
7112 {
7113 vec_oprnds2.safe_splice (vec_defs[2]);
7114 vec_defs[2].release ();
7115 }
7116 }
7117 else
7118 {
7119 vec_oprnds0.quick_push
7120 (vect_get_vec_def_for_operand (ops[0], stmt));
7121 vec_oprnds1.quick_push
7122 (vect_get_vec_def_for_operand (ops[1], stmt));
7123 if (op_type == ternary_op)
7124 vec_oprnds2.quick_push
7125 (vect_get_vec_def_for_operand (ops[2], stmt));
7126 }
7127 }
7128 else
7129 {
7130 if (!slp_node)
7131 {
7132 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7133
7134 if (single_defuse_cycle && reduc_index == 0)
7135 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7136 else
7137 vec_oprnds0[0]
7138 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7139 if (single_defuse_cycle && reduc_index == 1)
7140 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7141 else
7142 vec_oprnds1[0]
7143 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7144 if (op_type == ternary_op)
7145 {
7146 if (single_defuse_cycle && reduc_index == 2)
7147 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7148 else
7149 vec_oprnds2[0]
7150 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7151 }
7152 }
7153 }
7154
7155 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7156 {
7157 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7158 if (masked_loop_p)
7159 {
7160 /* Make sure that the reduction accumulator is vop[0]. */
7161 if (reduc_index == 1)
7162 {
7163 gcc_assert (commutative_tree_code (code));
7164 std::swap (vop[0], vop[1]);
7165 }
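/* Build MASK ? VOP[0] CODE VOP[1] : VOP[0] using the conditional internal
   function, so that inactive lanes simply keep the previous accumulator
   value. */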
7166 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7167 vectype_in, i * ncopies + j);
7168 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7169 vop[0], vop[1],
7170 vop[0]);
7171 new_temp = make_ssa_name (vec_dest, call);
7172 gimple_call_set_lhs (call, new_temp);
7173 gimple_call_set_nothrow (call, true);
7174 new_stmt_info = vect_finish_stmt_generation (stmt, call, gsi);
7175 }
7176 else
7177 {
7178 if (op_type == ternary_op)
7179 vop[2] = vec_oprnds2[i];
7180
7181 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7182 vop[0], vop[1], vop[2]);
7183 new_temp = make_ssa_name (vec_dest, new_stmt);
7184 gimple_assign_set_lhs (new_stmt, new_temp);
7185 new_stmt_info
7186 = vect_finish_stmt_generation (stmt, new_stmt, gsi);
7187 }
7188
7189 if (slp_node)
7190 {
7191 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7192 vect_defs.quick_push (new_temp);
7193 }
7194 else
7195 vect_defs[0] = new_temp;
7196 }
7197
7198 if (slp_node)
7199 continue;
7200
7201 if (j == 0)
7202 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7203 else
7204 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7205
7206 prev_stmt_info = new_stmt_info;
7207 }
7208
7209 /* Finalize the reduction-phi (set its arguments) and create the
7210 epilog reduction code. */
7211 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7212 vect_defs[0] = gimple_get_lhs (*vec_stmt);
7213
7214 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7215 epilog_copies, reduc_fn, phis,
7216 double_reduc, slp_node, slp_node_instance,
7217 cond_reduc_val, cond_reduc_op_code,
7218 neutral_op);
7219
7220 return true;
7221 }
7222
7223 /* Function vect_min_worthwhile_factor.
7224
7225 For a loop where we could vectorize the operation indicated by CODE,
7226 return the minimum vectorization factor that makes it worthwhile
7227 to use generic vectors. */
7228 static unsigned int
7229 vect_min_worthwhile_factor (enum tree_code code)
7230 {
7231 switch (code)
7232 {
7233 case PLUS_EXPR:
7234 case MINUS_EXPR:
7235 case NEGATE_EXPR:
7236 return 4;
7237
7238 case BIT_AND_EXPR:
7239 case BIT_IOR_EXPR:
7240 case BIT_XOR_EXPR:
7241 case BIT_NOT_EXPR:
7242 return 2;
7243
7244 default:
7245 return INT_MAX;
7246 }
7247 }
7248
7249 /* Return true if VINFO indicates we are doing loop vectorization and if
7250 it is worth decomposing CODE operations into scalar operations for
7251 that loop's vectorization factor. */
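/* For instance, with the factors above, decomposing a PLUS_EXPR into
   word-mode arithmetic is only assumed to pay off when the loop's
   vectorization factor is at least 4, whereas the cheaper bitwise
   operations only require a factor of 2. */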
7252
7253 bool
7254 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7255 {
7256 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7257 unsigned HOST_WIDE_INT value;
7258 return (loop_vinfo
7259 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7260 && value >= vect_min_worthwhile_factor (code));
7261 }
7262
7263 /* Function vectorizable_induction
7264
7265 Check if PHI performs an induction computation that can be vectorized.
7266 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7267 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7268 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7269
7270 bool
7271 vectorizable_induction (gimple *phi,
7272 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7273 gimple **vec_stmt, slp_tree slp_node,
7274 stmt_vector_for_cost *cost_vec)
7275 {
7276 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7277 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7278 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7279 unsigned ncopies;
7280 bool nested_in_vect_loop = false;
7281 struct loop *iv_loop;
7282 tree vec_def;
7283 edge pe = loop_preheader_edge (loop);
7284 basic_block new_bb;
7285 tree new_vec, vec_init, vec_step, t;
7286 tree new_name;
7287 gimple *new_stmt;
7288 gphi *induction_phi;
7289 tree induc_def, vec_dest;
7290 tree init_expr, step_expr;
7291 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7292 unsigned i;
7293 tree expr;
7294 gimple_seq stmts;
7295 imm_use_iterator imm_iter;
7296 use_operand_p use_p;
7297 gimple *exit_phi;
7298 edge latch_e;
7299 tree loop_arg;
7300 gimple_stmt_iterator si;
7301 basic_block bb = gimple_bb (phi);
7302
7303 if (gimple_code (phi) != GIMPLE_PHI)
7304 return false;
7305
7306 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7307 return false;
7308
7309 /* Make sure it was recognized as induction computation. */
7310 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7311 return false;
7312
7313 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7314 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7315
7316 if (slp_node)
7317 ncopies = 1;
7318 else
7319 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7320 gcc_assert (ncopies >= 1);
7321
7322 /* FORNOW. These restrictions should be relaxed. */
7323 if (nested_in_vect_loop_p (loop, phi))
7324 {
7325 imm_use_iterator imm_iter;
7326 use_operand_p use_p;
7327 gimple *exit_phi;
7328 edge latch_e;
7329 tree loop_arg;
7330
7331 if (ncopies > 1)
7332 {
7333 if (dump_enabled_p ())
7334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7335 "multiple types in nested loop.\n");
7336 return false;
7337 }
7338
7339 /* FORNOW: outer loop induction with SLP not supported. */
7340 if (STMT_SLP_TYPE (stmt_info))
7341 return false;
7342
7343 exit_phi = NULL;
7344 latch_e = loop_latch_edge (loop->inner);
7345 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7346 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7347 {
7348 gimple *use_stmt = USE_STMT (use_p);
7349 if (is_gimple_debug (use_stmt))
7350 continue;
7351
7352 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7353 {
7354 exit_phi = use_stmt;
7355 break;
7356 }
7357 }
7358 if (exit_phi)
7359 {
7360 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7361 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7362 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7363 {
7364 if (dump_enabled_p ())
7365 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7366 "inner-loop induction only used outside "
7367 "of the outer vectorized loop.\n");
7368 return false;
7369 }
7370 }
7371
7372 nested_in_vect_loop = true;
7373 iv_loop = loop->inner;
7374 }
7375 else
7376 iv_loop = loop;
7377 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7378
7379 if (slp_node && !nunits.is_constant ())
7380 {
7381 /* The current SLP code creates the initial value element-by-element. */
7382 if (dump_enabled_p ())
7383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7384 "SLP induction not supported for variable-length"
7385 " vectors.\n");
7386 return false;
7387 }
7388
7389 if (!vec_stmt) /* transformation not required. */
7390 {
7391 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7392 DUMP_VECT_SCOPE ("vectorizable_induction");
7393 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7394 return true;
7395 }
7396
7397 /* Transform. */
7398
7399 /* Compute a vector variable, initialized with the first VF values of
7400 the induction variable. E.g., for an iv with IV_PHI='X' and
7401 evolution S, for a vector of 4 units, we want to compute:
7402 [X, X + S, X + 2*S, X + 3*S]. */
7403
7404 if (dump_enabled_p ())
7405 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7406
7407 latch_e = loop_latch_edge (iv_loop);
7408 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7409
7410 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7411 gcc_assert (step_expr != NULL_TREE);
7412
7413 pe = loop_preheader_edge (iv_loop);
7414 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7415 loop_preheader_edge (iv_loop));
7416
7417 stmts = NULL;
7418 if (!nested_in_vect_loop)
7419 {
7420 /* Convert the initial value to the desired type. */
7421 tree new_type = TREE_TYPE (vectype);
7422 init_expr = gimple_convert (&stmts, new_type, init_expr);
7423
7424 /* If we are using the loop mask to "peel" for alignment then we need
7425 to adjust the start value here. */
7426 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7427 if (skip_niters != NULL_TREE)
7428 {
7429 if (FLOAT_TYPE_P (vectype))
7430 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7431 skip_niters);
7432 else
7433 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7434 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7435 skip_niters, step_expr);
7436 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7437 init_expr, skip_step);
7438 }
7439 }
7440
7441 /* Convert the step to the desired type. */
7442 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7443
7444 if (stmts)
7445 {
7446 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7447 gcc_assert (!new_bb);
7448 }
7449
7450 /* Find the first insertion point in the BB. */
7451 si = gsi_after_labels (bb);
7452
7453 /* For SLP induction we have to generate several IVs; for example, with
7454 group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7455 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7456 [VF*S, VF*S, VF*S, VF*S] for all of them. */
7457 if (slp_node)
7458 {
7459 /* Enforced above. */
7460 unsigned int const_nunits = nunits.to_constant ();
7461
7462 /* Generate [VF*S, VF*S, ... ]. */
7463 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7464 {
7465 expr = build_int_cst (integer_type_node, vf);
7466 expr = fold_convert (TREE_TYPE (step_expr), expr);
7467 }
7468 else
7469 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7470 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7471 expr, step_expr);
7472 if (! CONSTANT_CLASS_P (new_name))
7473 new_name = vect_init_vector (phi, new_name,
7474 TREE_TYPE (step_expr), NULL);
7475 new_vec = build_vector_from_val (vectype, new_name);
7476 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7477
7478 /* Now generate the IVs. */
7479 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7480 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7481 unsigned elts = const_nunits * nvects;
7482 unsigned nivs = least_common_multiple (group_size,
7483 const_nunits) / const_nunits;
7484 gcc_assert (elts % group_size == 0);
7485 tree elt = init_expr;
7486 unsigned ivn;
7487 for (ivn = 0; ivn < nivs; ++ivn)
7488 {
7489 tree_vector_builder elts (vectype, const_nunits, 1);
7490 stmts = NULL;
7491 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7492 {
7493 if (ivn*const_nunits + eltn >= group_size
7494 && (ivn * const_nunits + eltn) % group_size == 0)
7495 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7496 elt, step_expr);
7497 elts.quick_push (elt);
7498 }
7499 vec_init = gimple_build_vector (&stmts, &elts);
7500 if (stmts)
7501 {
7502 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7503 gcc_assert (!new_bb);
7504 }
7505
7506 /* Create the induction-phi that defines the induction-operand. */
7507 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7508 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7509 loop_vinfo->add_stmt (induction_phi);
7510 induc_def = PHI_RESULT (induction_phi);
7511
7512 /* Create the iv update inside the loop */
7513 vec_def = make_ssa_name (vec_dest);
7514 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7515 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7516 loop_vinfo->add_stmt (new_stmt);
7517
7518 /* Set the arguments of the phi node: */
7519 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7520 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7521 UNKNOWN_LOCATION);
7522
7523 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7524 }
7525
7526 /* Re-use IVs when we can. */
7527 if (ivn < nvects)
7528 {
7529 unsigned vfp
7530 = least_common_multiple (group_size, const_nunits) / group_size;
7531 /* Generate [VF'*S, VF'*S, ... ]. */
7532 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7533 {
7534 expr = build_int_cst (integer_type_node, vfp);
7535 expr = fold_convert (TREE_TYPE (step_expr), expr);
7536 }
7537 else
7538 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7539 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7540 expr, step_expr);
7541 if (! CONSTANT_CLASS_P (new_name))
7542 new_name = vect_init_vector (phi, new_name,
7543 TREE_TYPE (step_expr), NULL);
7544 new_vec = build_vector_from_val (vectype, new_name);
7545 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7546 for (; ivn < nvects; ++ivn)
7547 {
7548 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7549 tree def;
7550 if (gimple_code (iv) == GIMPLE_PHI)
7551 def = gimple_phi_result (iv);
7552 else
7553 def = gimple_assign_lhs (iv);
7554 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7555 PLUS_EXPR,
7556 def, vec_step);
7557 if (gimple_code (iv) == GIMPLE_PHI)
7558 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7559 else
7560 {
7561 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7562 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7563 }
7564 loop_vinfo->add_stmt (new_stmt);
7565 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7566 }
7567 }
7568
7569 return true;
7570 }
7571
7572 /* Create the vector that holds the initial_value of the induction. */
7573 if (nested_in_vect_loop)
7574 {
7575 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7576 been created during vectorization of previous stmts. We obtain it
7577 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7578 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7579 /* If the initial value is not of proper type, convert it. */
7580 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7581 {
7582 new_stmt
7583 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7584 vect_simple_var,
7585 "vec_iv_"),
7586 VIEW_CONVERT_EXPR,
7587 build1 (VIEW_CONVERT_EXPR, vectype,
7588 vec_init));
7589 vec_init = gimple_assign_lhs (new_stmt);
7590 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7591 new_stmt);
7592 gcc_assert (!new_bb);
7593 loop_vinfo->add_stmt (new_stmt);
7594 }
7595 }
7596 else
7597 {
7598 /* iv_loop is the loop to be vectorized. Create:
7599 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7600 stmts = NULL;
7601 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7602
7603 unsigned HOST_WIDE_INT const_nunits;
7604 if (nunits.is_constant (&const_nunits))
7605 {
7606 tree_vector_builder elts (vectype, const_nunits, 1);
7607 elts.quick_push (new_name);
7608 for (i = 1; i < const_nunits; i++)
7609 {
7610 /* Create: new_name_i = new_name + step_expr */
7611 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7612 new_name, step_expr);
7613 elts.quick_push (new_name);
7614 }
7615 /* Create a vector from [new_name_0, new_name_1, ...,
7616 new_name_nunits-1] */
7617 vec_init = gimple_build_vector (&stmts, &elts);
7618 }
7619 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7620 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7621 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7622 new_name, step_expr);
7623 else
7624 {
7625 /* Build:
7626 [base, base, base, ...]
7627 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7628 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7629 gcc_assert (flag_associative_math);
7630 tree index = build_index_vector (vectype, 0, 1);
7631 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7632 new_name);
7633 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7634 step_expr);
7635 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7636 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7637 vec_init, step_vec);
7638 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7639 vec_init, base_vec);
7640 }
7641
7642 if (stmts)
7643 {
7644 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7645 gcc_assert (!new_bb);
7646 }
7647 }
7648
7649
7650 /* Create the vector that holds the step of the induction. */
7651 if (nested_in_vect_loop)
7652 /* iv_loop is nested in the loop to be vectorized. Generate:
7653 vec_step = [S, S, S, S] */
7654 new_name = step_expr;
7655 else
7656 {
7657 /* iv_loop is the loop to be vectorized. Generate:
7658 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7659 gimple_seq seq = NULL;
7660 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7661 {
7662 expr = build_int_cst (integer_type_node, vf);
7663 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7664 }
7665 else
7666 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7667 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7668 expr, step_expr);
7669 if (seq)
7670 {
7671 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7672 gcc_assert (!new_bb);
7673 }
7674 }
7675
7676 t = unshare_expr (new_name);
7677 gcc_assert (CONSTANT_CLASS_P (new_name)
7678 || TREE_CODE (new_name) == SSA_NAME);
7679 new_vec = build_vector_from_val (vectype, t);
7680 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7681
7682
7683 /* Create the following def-use cycle:
7684 loop prolog:
7685 vec_init = ...
7686 vec_step = ...
7687 loop:
7688 vec_iv = PHI <vec_init, vec_loop>
7689 ...
7690 STMT
7691 ...
7692 vec_loop = vec_iv + vec_step; */
7693
7694 /* Create the induction-phi that defines the induction-operand. */
7695 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7696 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7697 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7698 induc_def = PHI_RESULT (induction_phi);
7699
7700 /* Create the iv update inside the loop */
7701 vec_def = make_ssa_name (vec_dest);
7702 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7703 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7704 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7705
7706 /* Set the arguments of the phi node: */
7707 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7708 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7709 UNKNOWN_LOCATION);
7710
7711 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7712
7713 /* In case the vectorization factor (VF) is bigger than the number
7714 of elements that we can fit in a vectype (nunits), we have to generate
7715 more than one vector stmt - i.e - we need to "unroll" the
7716 vector stmt by a factor VF/nunits. For more details see documentation
7717 in vectorizable_operation. */
7718
7719 if (ncopies > 1)
7720 {
7721 gimple_seq seq = NULL;
7722 stmt_vec_info prev_stmt_vinfo;
7723 /* FORNOW. This restriction should be relaxed. */
7724 gcc_assert (!nested_in_vect_loop);
7725
7726 /* Create the vector that holds the step of the induction. */
7727 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7728 {
7729 expr = build_int_cst (integer_type_node, nunits);
7730 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7731 }
7732 else
7733 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7734 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7735 expr, step_expr);
7736 if (seq)
7737 {
7738 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7739 gcc_assert (!new_bb);
7740 }
7741
7742 t = unshare_expr (new_name);
7743 gcc_assert (CONSTANT_CLASS_P (new_name)
7744 || TREE_CODE (new_name) == SSA_NAME);
7745 new_vec = build_vector_from_val (vectype, t);
7746 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7747
7748 vec_def = induc_def;
7749 prev_stmt_vinfo = induction_phi_info;
7750 for (i = 1; i < ncopies; i++)
7751 {
7752 /* vec_i = vec_prev + vec_step */
7753 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7754 vec_def, vec_step);
7755 vec_def = make_ssa_name (vec_dest, new_stmt);
7756 gimple_assign_set_lhs (new_stmt, vec_def);
7757
7758 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7759 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7760 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7761 prev_stmt_vinfo = new_stmt_info;
7762 }
7763 }
7764
7765 if (nested_in_vect_loop)
7766 {
7767 /* Find the loop-closed exit-phi of the induction, and record
7768 the final vector of induction results: */
7769 exit_phi = NULL;
7770 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7771 {
7772 gimple *use_stmt = USE_STMT (use_p);
7773 if (is_gimple_debug (use_stmt))
7774 continue;
7775
7776 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7777 {
7778 exit_phi = use_stmt;
7779 break;
7780 }
7781 }
7782 if (exit_phi)
7783 {
7784 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7785 /* FORNOW. Currently not supporting the case that an inner-loop induction
7786 is not used in the outer-loop (i.e. only outside the outer-loop). */
7787 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7788 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7789
7790 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7791 if (dump_enabled_p ())
7792 {
7793 dump_printf_loc (MSG_NOTE, vect_location,
7794 "vector of inductions after inner-loop:");
7795 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7796 }
7797 }
7798 }
7799
7800
7801 if (dump_enabled_p ())
7802 {
7803 dump_printf_loc (MSG_NOTE, vect_location,
7804 "transform induction: created def-use cycle: ");
7805 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7806 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7807 SSA_NAME_DEF_STMT (vec_def), 0);
7808 }
7809
7810 return true;
7811 }
7812
7813 /* Function vectorizable_live_operation.
7814
7815 STMT computes a value that is used outside the loop. Check if
7816 it can be supported. */
7817
7818 bool
7819 vectorizable_live_operation (gimple *stmt,
7820 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7821 slp_tree slp_node, int slp_index,
7822 gimple **vec_stmt,
7823 stmt_vector_for_cost *)
7824 {
7825 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7826 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7827 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7828 imm_use_iterator imm_iter;
7829 tree lhs, lhs_type, bitsize, vec_bitsize;
7830 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7831 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7832 int ncopies;
7833 gimple *use_stmt;
7834 auto_vec<tree> vec_oprnds;
7835 int vec_entry = 0;
7836 poly_uint64 vec_index = 0;
7837
7838 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7839
7840 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7841 return false;
7842
7843 /* FORNOW. CHECKME. */
7844 if (nested_in_vect_loop_p (loop, stmt))
7845 return false;
7846
7847 /* If STMT is not relevant and it is a simple assignment and its inputs are
7848 invariant then it can remain in place, unvectorized. The original last
7849 scalar value that it computes will be used. */
7850 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7851 {
7852 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7853 if (dump_enabled_p ())
7854 dump_printf_loc (MSG_NOTE, vect_location,
7855 "statement is simple and uses invariant. Leaving in "
7856 "place.\n");
7857 return true;
7858 }
7859
7860 if (slp_node)
7861 ncopies = 1;
7862 else
7863 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7864
7865 if (slp_node)
7866 {
7867 gcc_assert (slp_index >= 0);
7868
7869 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7870 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7871
7872 /* Get the last occurrence of the scalar index from the concatenation of
7873 all the slp vectors. Calculate which slp vector it is and the index
7874 within. */
7875 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
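/* For instance, with num_vec == 1, nunits == 4 and a group of
   num_scalar == 2 scalars, the concatenation holds [s0, s1, s0', s1'],
   so the last value of scalar 1 is at pos == 1*4 - 2 + 1 == 3,
   i.e. vec_entry 0, vec_index 3. */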
7876
7877 /* Calculate which vector contains the result, and which lane of
7878 that vector we need. */
7879 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7880 {
7881 if (dump_enabled_p ())
7882 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7883 "Cannot determine which vector holds the"
7884 " final result.\n");
7885 return false;
7886 }
7887 }
7888
7889 if (!vec_stmt)
7890 {
7891 /* No transformation required. */
7892 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7893 {
7894 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7895 OPTIMIZE_FOR_SPEED))
7896 {
7897 if (dump_enabled_p ())
7898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7899 "can't use a fully-masked loop because "
7900 "the target doesn't support extract last "
7901 "reduction.\n");
7902 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7903 }
7904 else if (slp_node)
7905 {
7906 if (dump_enabled_p ())
7907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7908 "can't use a fully-masked loop because an "
7909 "SLP statement is live after the loop.\n");
7910 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7911 }
7912 else if (ncopies > 1)
7913 {
7914 if (dump_enabled_p ())
7915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7916 "can't use a fully-masked loop because"
7917 " ncopies is greater than 1.\n");
7918 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7919 }
7920 else
7921 {
7922 gcc_assert (ncopies == 1 && !slp_node);
7923 vect_record_loop_mask (loop_vinfo,
7924 &LOOP_VINFO_MASKS (loop_vinfo),
7925 1, vectype);
7926 }
7927 }
7928 return true;
7929 }
7930
7931 /* If stmt has a related stmt, then use that for getting the lhs. */
7932 if (is_pattern_stmt_p (stmt_info))
7933 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7934
7935 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7936 : gimple_get_lhs (stmt);
7937 lhs_type = TREE_TYPE (lhs);
7938
7939 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7940 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7941 : TYPE_SIZE (TREE_TYPE (vectype)));
7942 vec_bitsize = TYPE_SIZE (vectype);
7943
7944 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7945 tree vec_lhs, bitstart;
7946 if (slp_node)
7947 {
7948 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7949
7950 /* Get the correct slp vectorized stmt. */
7951 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
7952 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7953 vec_lhs = gimple_phi_result (phi);
7954 else
7955 vec_lhs = gimple_get_lhs (vec_stmt);
7956
7957 /* Get entry to use. */
7958 bitstart = bitsize_int (vec_index);
7959 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7960 }
7961 else
7962 {
7963 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7964 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7965 gcc_checking_assert (ncopies == 1
7966 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7967
7968 /* For multiple copies, get the last copy. */
7969 for (int i = 1; i < ncopies; ++i)
7970 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7971 vec_lhs);
7972
7973 /* Get the last lane in the vector. */
7974 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7975 }
7976
7977 gimple_seq stmts = NULL;
7978 tree new_tree;
7979 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7980 {
7981 /* Emit:
7982
7983 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7984
7985 where VEC_LHS is the vectorized live-out result and MASK is
7986 the loop mask for the final iteration. */
7987 gcc_assert (ncopies == 1 && !slp_node);
7988 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7989 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7990 1, vectype, 0);
7991 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7992 scalar_type, mask, vec_lhs);
7993
7994 /* Convert the extracted vector element to the required scalar type. */
7995 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7996 }
7997 else
7998 {
7999 tree bftype = TREE_TYPE (vectype);
8000 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8001 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8002 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8003 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8004 &stmts, true, NULL_TREE);
8005 }
8006
8007 if (stmts)
8008 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8009
8010 /* Replace the use of lhs with the newly computed result. If the use stmt is a
8011 single-argument PHI, just replace all uses of the PHI result; this is needed
8012 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
8013 use_operand_p use_p;
8014 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8015 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8016 && !is_gimple_debug (use_stmt))
8017 {
8018 if (gimple_code (use_stmt) == GIMPLE_PHI
8019 && gimple_phi_num_args (use_stmt) == 1)
8020 {
8021 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8022 }
8023 else
8024 {
8025 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8026 SET_USE (use_p, new_tree);
8027 }
8028 update_stmt (use_stmt);
8029 }
8030
8031 return true;
8032 }
8033
8034 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8035
8036 static void
8037 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8038 {
8039 ssa_op_iter op_iter;
8040 imm_use_iterator imm_iter;
8041 def_operand_p def_p;
8042 gimple *ustmt;
8043
8044 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8045 {
8046 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8047 {
8048 basic_block bb;
8049
8050 if (!is_gimple_debug (ustmt))
8051 continue;
8052
8053 bb = gimple_bb (ustmt);
8054
8055 if (!flow_bb_inside_loop_p (loop, bb))
8056 {
8057 if (gimple_debug_bind_p (ustmt))
8058 {
8059 if (dump_enabled_p ())
8060 dump_printf_loc (MSG_NOTE, vect_location,
8061 "killing debug use\n");
8062
8063 gimple_debug_bind_reset_value (ustmt);
8064 update_stmt (ustmt);
8065 }
8066 else
8067 gcc_unreachable ();
8068 }
8069 }
8070 }
8071 }
8072
8073 /* Given loop represented by LOOP_VINFO, return true if computation of
8074 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8075 otherwise. */
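/* As an illustration: if the niters type is a 32-bit unsigned int and the
   loop executes exactly 2^32 times (so NITERSM1 is 0xffffffff), then
   NITERSM1 + 1 wraps around to 0 and LOOP_VINFO_NITERS overflows. */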
8076
8077 static bool
8078 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8079 {
8080 /* Constant case. */
8081 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8082 {
8083 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8084 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8085
8086 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8087 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8088 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8089 return true;
8090 }
8091
8092 widest_int max;
8093 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8094 /* Check the upper bound of loop niters. */
8095 if (get_max_loop_iterations (loop, &max))
8096 {
8097 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8098 signop sgn = TYPE_SIGN (type);
8099 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8100 if (max < type_max)
8101 return true;
8102 }
8103 return false;
8104 }
8105
8106 /* Return a mask type with half the number of elements as TYPE. */
8107
8108 tree
8109 vect_halve_mask_nunits (tree type)
8110 {
8111 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8112 return build_truth_vector_type (nunits, current_vector_size);
8113 }
8114
8115 /* Return a mask type with twice as many elements as TYPE. */
8116
8117 tree
8118 vect_double_mask_nunits (tree type)
8119 {
8120 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8121 return build_truth_vector_type (nunits, current_vector_size);
8122 }
8123
8124 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8125 contain a sequence of NVECTORS masks that each control a vector of type
8126 VECTYPE. */
8127
8128 void
8129 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8130 unsigned int nvectors, tree vectype)
8131 {
8132 gcc_assert (nvectors != 0);
8133 if (masks->length () < nvectors)
8134 masks->safe_grow_cleared (nvectors);
8135 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8136 /* The number of scalars per iteration and the number of vectors are
8137 both compile-time constants. */
8138 unsigned int nscalars_per_iter
8139 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8140 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
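/* For example, with VF == 8 and an 8-element VECTYPE, asking for
   nvectors == 2 masks means this rgroup controls 2 scalars per iteration
   (e.g. a two-element interleaved group). */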
8141 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8142 {
8143 rgm->max_nscalars_per_iter = nscalars_per_iter;
8144 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8145 }
8146 }
8147
8148 /* Given a complete set of masks MASKS, extract mask number INDEX
8149 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8150 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8151
8152 See the comment above vec_loop_masks for more details about the mask
8153 arrangement. */
8154
8155 tree
8156 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8157 unsigned int nvectors, tree vectype, unsigned int index)
8158 {
8159 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8160 tree mask_type = rgm->mask_type;
8161
8162 /* Populate the rgroup's mask array, if this is the first time we've
8163 used it. */
8164 if (rgm->masks.is_empty ())
8165 {
8166 rgm->masks.safe_grow_cleared (nvectors);
8167 for (unsigned int i = 0; i < nvectors; ++i)
8168 {
8169 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8170 /* Provide a dummy definition until the real one is available. */
8171 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8172 rgm->masks[i] = mask;
8173 }
8174 }
8175
8176 tree mask = rgm->masks[index];
8177 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8178 TYPE_VECTOR_SUBPARTS (vectype)))
8179 {
8180 /* A loop mask for data type X can be reused for data type Y
8181 if X has N times more elements than Y and if Y's elements
8182 are N times bigger than X's. In this case each sequence
8183 of N elements in the loop mask will be all-zero or all-one.
8184 We can then view-convert the mask so that each sequence of
8185 N elements is replaced by a single element. */
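/* E.g. a mask computed for sixteen QImode elements can control eight
   HImode elements: each pair of mask elements is known to be identical,
   so view-converting it to an eight-element mask type preserves the
   per-element predicate. */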
8186 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8187 TYPE_VECTOR_SUBPARTS (vectype)));
8188 gimple_seq seq = NULL;
8189 mask_type = build_same_sized_truth_vector_type (vectype);
8190 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8191 if (seq)
8192 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8193 }
8194 return mask;
8195 }
8196
8197 /* Scale the profiling counters of LOOP, which is vectorized by factor VF,
8198 according to the estimated number of vector iterations. */
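/* As a rough illustration: for a loop estimated to iterate 8 times per
   entry and vectorized with VF == 4, niter_for_unrolled_loop gives a new
   estimate of about 2 iterations, so the body frequencies are scaled so
   that the header count becomes roughly (2 + 1) times the preheader count,
   and the exit edge probability becomes 1/(2 + 1). */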
8199
8200 static void
8201 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8202 {
8203 edge preheader = loop_preheader_edge (loop);
8204 /* Reduce loop iterations by the vectorization factor. */
8205 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8206 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8207
8208 if (freq_h.nonzero_p ())
8209 {
8210 profile_probability p;
8211
8212 /* Avoid dropping loop body profile counter to 0 because of zero count
8213 in loop's preheader. */
8214 if (!(freq_e == profile_count::zero ()))
8215 freq_e = freq_e.force_nonzero ();
8216 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8217 scale_loop_frequencies (loop, p);
8218 }
8219
8220 edge exit_e = single_exit (loop);
8221 exit_e->probability = profile_probability::always ()
8222 .apply_scale (1, new_est_niter + 1);
8223
8224 edge exit_l = single_pred_edge (loop->latch);
8225 profile_probability prob = exit_l->probability;
8226 exit_l->probability = exit_e->probability.invert ();
8227 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8228 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8229 }
8230
8231 /* Vectorize STMT if relevant, inserting any new instructions before GSI.
8232 When vectorizing STMT as a store, set *SEEN_STORE to its stmt_vec_info.
8233 *SLP_SCHEDULED is a running record of whether we have called
8234 vect_schedule_slp. */
8235
8236 static void
8237 vect_transform_loop_stmt (loop_vec_info loop_vinfo, gimple *stmt,
8238 gimple_stmt_iterator *gsi,
8239 stmt_vec_info *seen_store, bool *slp_scheduled)
8240 {
8241 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8242 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8243 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
8244 if (!stmt_info)
8245 return;
8246
8247 if (dump_enabled_p ())
8248 {
8249 dump_printf_loc (MSG_NOTE, vect_location,
8250 "------>vectorizing statement: ");
8251 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8252 }
8253
8254 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8255 vect_loop_kill_debug_uses (loop, stmt);
8256
8257 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8258 && !STMT_VINFO_LIVE_P (stmt_info))
8259 return;
8260
8261 if (STMT_VINFO_VECTYPE (stmt_info))
8262 {
8263 poly_uint64 nunits
8264 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8265 if (!STMT_SLP_TYPE (stmt_info)
8266 && maybe_ne (nunits, vf)
8267 && dump_enabled_p ())
8268 /* For SLP, VF is set according to the unrolling factor and not to
8269 the vector size, hence this diagnostic is not valid for SLP. */
8270 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8271 }
8272
8273 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8274 reached. */
8275 if (slp_vect_type slptype = STMT_SLP_TYPE (stmt_info))
8276 {
8277
8278 if (!*slp_scheduled)
8279 {
8280 *slp_scheduled = true;
8281
8282 DUMP_VECT_SCOPE ("scheduling SLP instances");
8283
8284 vect_schedule_slp (loop_vinfo);
8285 }
8286
8287 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8288 if (slptype == pure_slp)
8289 return;
8290 }
8291
8292 if (dump_enabled_p ())
8293 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8294
8295 bool grouped_store = false;
8296 if (vect_transform_stmt (stmt, gsi, &grouped_store, NULL, NULL))
8297 *seen_store = stmt_info;
8298 }
8299
8300 /* Function vect_transform_loop.
8301
8302 The analysis phase has determined that the loop is vectorizable.
8303 Vectorize the loop - create vectorized stmts to replace the scalar
8304 stmts in the loop, and update the loop exit condition.
8305 Returns the scalar epilogue loop if there is one. */
8306
8307 struct loop *
8308 vect_transform_loop (loop_vec_info loop_vinfo)
8309 {
8310 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8311 struct loop *epilogue = NULL;
8312 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8313 int nbbs = loop->num_nodes;
8314 int i;
8315 tree niters_vector = NULL_TREE;
8316 tree step_vector = NULL_TREE;
8317 tree niters_vector_mult_vf = NULL_TREE;
8318 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8319 unsigned int lowest_vf = constant_lower_bound (vf);
8320 bool slp_scheduled = false;
8321 gimple *stmt;
8322 bool check_profitability = false;
8323 unsigned int th;
8324
8325 DUMP_VECT_SCOPE ("vec_transform_loop");
8326
8327 loop_vinfo->shared->check_datarefs ();
8328
8329 /* Use the more conservative vectorization threshold. If the number
8330 of iterations is constant, assume the cost check has been performed
8331 by our caller. If the threshold makes all loops that run at least
8332 the (estimated) vectorization factor number of times profitable,
8333 checking is pointless too. */
8334 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8335 if (th >= vect_vf_for_cost (loop_vinfo)
8336 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8337 {
8338 if (dump_enabled_p ())
8339 dump_printf_loc (MSG_NOTE, vect_location,
8340 "Profitability threshold is %d loop iterations.\n",
8341 th);
8342 check_profitability = true;
8343 }
8344
8345 /* Make sure there exists a single-predecessor exit bb. Do this before
8346 versioning. */
8347 edge e = single_exit (loop);
8348 if (! single_pred_p (e->dest))
8349 {
8350 split_loop_exit_edge (e);
8351 if (dump_enabled_p ())
8352 dump_printf (MSG_NOTE, "split exit edge\n");
8353 }
8354
8355 /* Version the loop first, if required, so the profitability check
8356 comes first. */
8357
8358 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8359 {
8360 poly_uint64 versioning_threshold
8361 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8362 if (check_profitability
8363 && ordered_p (poly_uint64 (th), versioning_threshold))
8364 {
8365 versioning_threshold = ordered_max (poly_uint64 (th),
8366 versioning_threshold);
8367 check_profitability = false;
8368 }
8369 vect_loop_versioning (loop_vinfo, th, check_profitability,
8370 versioning_threshold);
8371 check_profitability = false;
8372 }
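/* A sketch of what vect_loop_versioning produces at the source level,
   assuming versioning is required for a runtime alias check:

     if (<no-alias checks> && niters >= versioning_threshold)
       <loop copy that is vectorized below>
     else
       <unmodified scalar copy>

   The actual set of guards depends on why LOOP_REQUIRES_VERSIONING holds,
   and the niters test is only present when it was folded in above. */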
8373
8374 /* Make sure there exists a single-predecessor exit bb also on the
8375 scalar loop copy. Do this after versioning but before peeling
8376 so the CFG structure is fine for both the scalar and the if-converted
8377 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8378 loop-closed PHI nodes on the exit. */
8379 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8380 {
8381 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8382 if (! single_pred_p (e->dest))
8383 {
8384 split_loop_exit_edge (e);
8385 if (dump_enabled_p ())
8386 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8387 }
8388 }
8389
8390 tree niters = vect_build_loop_niters (loop_vinfo);
8391 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8392 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8393 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8394 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8395 &step_vector, &niters_vector_mult_vf, th,
8396 check_profitability, niters_no_overflow);
8397
8398 if (niters_vector == NULL_TREE)
8399 {
8400 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8401 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8402 && known_eq (lowest_vf, vf))
8403 {
8404 niters_vector
8405 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8406 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8407 step_vector = build_one_cst (TREE_TYPE (niters));
8408 }
8409 else
8410 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8411 &step_vector, niters_no_overflow);
8412 }
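/* Worked example with illustrative numbers, assuming no peeling for
   alignment and no masking: for a compile-time NITERS of 1003 and a
   constant VF of 4, the epilogue created by vect_do_peeling handles the
   remaining 1003 % 4 = 3 iterations and the code above sets NITERS_VECTOR
   to 1003 / 4 = 250 with STEP_VECTOR 1, so the vector loop runs 250 times. */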
8413
8414 /* 1) Make sure the loop header has exactly two entries
8415 2) Make sure we have a preheader basic block. */
8416
8417 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8418
8419 split_edge (loop_preheader_edge (loop));
8420
8421 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8422 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8423 /* This will deal with any possible peeling. */
8424 vect_prepare_for_masked_peels (loop_vinfo);
8425
8426 /* FORNOW: the vectorizer supports only loops whose body consists
8427 of one basic block (header + empty latch). When the vectorizer
8428 supports more involved loop forms, the order in which the BBs are
8429 traversed needs to be reconsidered. */
8430
8431 for (i = 0; i < nbbs; i++)
8432 {
8433 basic_block bb = bbs[i];
8434 stmt_vec_info stmt_info;
8435
8436 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8437 gsi_next (&si))
8438 {
8439 gphi *phi = si.phi ();
8440 if (dump_enabled_p ())
8441 {
8442 dump_printf_loc (MSG_NOTE, vect_location,
8443 "------>vectorizing phi: ");
8444 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8445 }
8446 stmt_info = loop_vinfo->lookup_stmt (phi);
8447 if (!stmt_info)
8448 continue;
8449
8450 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8451 vect_loop_kill_debug_uses (loop, phi);
8452
8453 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8454 && !STMT_VINFO_LIVE_P (stmt_info))
8455 continue;
8456
8457 if (STMT_VINFO_VECTYPE (stmt_info)
8458 && (maybe_ne
8459 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8460 && dump_enabled_p ())
8461 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8462
8463 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8464 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8465 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8466 && ! PURE_SLP_STMT (stmt_info))
8467 {
8468 if (dump_enabled_p ())
8469 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8470 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8471 }
8472 }
8473
8474 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8475 !gsi_end_p (si);)
8476 {
8477 stmt = gsi_stmt (si);
8478 /* During vectorization remove existing clobber stmts. */
8479 if (gimple_clobber_p (stmt))
8480 {
8481 unlink_stmt_vdef (stmt);
8482 gsi_remove (&si, true);
8483 release_defs (stmt);
8484 }
8485 else
8486 {
8487 stmt_info = loop_vinfo->lookup_stmt (stmt);
8488
8489 /* vector stmts created in the outer-loop during vectorization of
8490 stmts in an inner-loop may not have a stmt_info, and do not
8491 need to be vectorized. */
8492 stmt_vec_info seen_store = NULL;
8493 if (stmt_info)
8494 {
8495 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8496 {
8497 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8498 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8499 !gsi_end_p (subsi); gsi_next (&subsi))
8500 vect_transform_loop_stmt (loop_vinfo,
8501 gsi_stmt (subsi), &si,
8502 &seen_store,
8503 &slp_scheduled);
8504 gimple *pat_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8505 vect_transform_loop_stmt (loop_vinfo, pat_stmt, &si,
8506 &seen_store, &slp_scheduled);
8507 }
8508 vect_transform_loop_stmt (loop_vinfo, stmt, &si,
8509 &seen_store, &slp_scheduled);
8510 }
8511 if (seen_store)
8512 {
8513 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8514 {
8515 /* Interleaving. The vectorization of the whole
8516 interleaving chain has been completed -
8517 free all the stores in the chain. */
8518 gsi_next (&si);
8519 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8520 }
8521 else
8522 {
8523 /* Free the attached stmt_vec_info and remove the
8524 stmt. */
8525 free_stmt_vec_info (stmt);
8526 unlink_stmt_vdef (stmt);
8527 gsi_remove (&si, true);
8528 release_defs (stmt);
8529 }
8530 }
8531 else
8532 gsi_next (&si);
8533 }
8534 }
8535
8536 /* Stub out scalar statements that must not survive vectorization.
8537 Doing this here helps with grouped statements, or statements that
8538 are involved in patterns. */
8539 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8540 !gsi_end_p (gsi); gsi_next (&gsi))
8541 {
8542 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8543 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8544 {
8545 tree lhs = gimple_get_lhs (call);
8546 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8547 {
8548 tree zero = build_zero_cst (TREE_TYPE (lhs));
8549 gimple *new_stmt = gimple_build_assign (lhs, zero);
8550 gsi_replace (&gsi, new_stmt, true);
8551 }
8552 }
8553 }
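/* For illustration, roughly what the replacement above does: a left-over
   scalar call such as
     _5 = .MASK_LOAD (_3, 4B, _4);
   whose LHS is not a vector type is replaced by
     _5 = 0;
   because such calls must not survive vectorization. */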
8554 } /* BBs in loop */
8555
8556 /* The vectorization factor is always > 1, so if we use an IV increment
8557 of 1, a zero NITERS becomes a nonzero NITERS_VECTOR. */
8558 if (integer_onep (step_vector))
8559 niters_no_overflow = true;
8560 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8561 niters_vector_mult_vf, !niters_no_overflow);
8562
8563 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8564 scale_profile_for_vect_loop (loop, assumed_vf);
8565
8566 /* True if the final iteration might not handle a full vector's
8567 worth of scalar iterations. */
8568 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8569 /* The minimum number of iterations performed by the epilogue. This
8570 is 1 when peeling for gaps because we always need a final scalar
8571 iteration. */
8572 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8573 /* +1 to convert latch counts to loop iteration counts,
8574 -min_epilogue_iters to remove iterations that cannot be performed
8575 by the vector code. */
8576 int bias_for_lowest = 1 - min_epilogue_iters;
8577 int bias_for_assumed = bias_for_lowest;
8578 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8579 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8580 {
8581 /* When the amount of peeling is known at compile time, the first
8582 iteration will have exactly alignment_npeels active elements.
8583 In the worst case it will have at least one. */
8584 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8585 bias_for_lowest += lowest_vf - min_first_active;
8586 bias_for_assumed += assumed_vf - min_first_active;
8587 }
8588 /* In these calculations the "- 1" converts loop iteration counts
8589 back to latch counts. */
8590 if (loop->any_upper_bound)
8591 loop->nb_iterations_upper_bound
8592 = (final_iter_may_be_partial
8593 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8594 lowest_vf) - 1
8595 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8596 lowest_vf) - 1);
8597 if (loop->any_likely_upper_bound)
8598 loop->nb_iterations_likely_upper_bound
8599 = (final_iter_may_be_partial
8600 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8601 + bias_for_lowest, lowest_vf) - 1
8602 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8603 + bias_for_lowest, lowest_vf) - 1);
8604 if (loop->any_estimate)
8605 loop->nb_iterations_estimate
8606 = (final_iter_may_be_partial
8607 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8608 assumed_vf) - 1
8609 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8610 assumed_vf) - 1);
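/* Worked example with illustrative numbers: for VF = lowest_vf = assumed_vf
   = 4, no peeling for gaps and no masking, min_epilogue_iters is 0 and
   bias_for_lowest is 1, so a scalar latch count of 11 (12 iterations)
   becomes floor ((11 + 1) / 4) - 1 = 2, i.e. 3 vector iterations, matching
   12 scalar iterations at VF 4. */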
8611
8612 if (dump_enabled_p ())
8613 {
8614 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8615 {
8616 dump_printf_loc (MSG_NOTE, vect_location,
8617 "LOOP VECTORIZED\n");
8618 if (loop->inner)
8619 dump_printf_loc (MSG_NOTE, vect_location,
8620 "OUTER LOOP VECTORIZED\n");
8621 dump_printf (MSG_NOTE, "\n");
8622 }
8623 else
8624 {
8625 dump_printf_loc (MSG_NOTE, vect_location,
8626 "LOOP EPILOGUE VECTORIZED (VS=");
8627 dump_dec (MSG_NOTE, current_vector_size);
8628 dump_printf (MSG_NOTE, ")\n");
8629 }
8630 }
8631
8632 /* Free SLP instances here because otherwise stmt reference counting
8633 won't work. */
8634 slp_instance instance;
8635 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8636 vect_free_slp_instance (instance, true);
8637 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8638 /* Clear the safelen field since its value is invalid after vectorization:
8639 the vectorized loop can have loop-carried dependencies. */
8640 loop->safelen = 0;
8641
8642 /* Don't vectorize epilogue for epilogue. */
8643 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8644 epilogue = NULL;
8645
8646 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8647 epilogue = NULL;
8648
8649 if (epilogue)
8650 {
8651 auto_vector_sizes vector_sizes;
8652 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8653 unsigned int next_size = 0;
8654
8655 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8656 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8657 && known_eq (vf, lowest_vf))
8658 {
8659 unsigned int eiters
8660 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8661 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8662 eiters = eiters % lowest_vf;
8663 epilogue->nb_iterations_upper_bound = eiters - 1;
8664
8665 unsigned int ratio;
8666 while (next_size < vector_sizes.length ()
8667 && !(constant_multiple_p (current_vector_size,
8668 vector_sizes[next_size], &ratio)
8669 && eiters >= lowest_vf / ratio))
8670 next_size += 1;
8671 }
8672 else
8673 while (next_size < vector_sizes.length ()
8674 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8675 next_size += 1;
8676
8677 if (next_size == vector_sizes.length ())
8678 epilogue = NULL;
8679 }
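/* Worked example with illustrative numbers: suppose current_vector_size is
   64 bytes, lowest_vf is 16, NITERS is a known 1003 and there is no
   alignment peeling. Then eiters = 1003 % 16 = 11 and, assuming the target
   advertises vector sizes {64, 32, 16}, the first acceptable candidate is
   32 bytes (ratio 2, 11 >= 16 / 2), so the epilogue is kept as a candidate
   for vectorization with a smaller vector size. */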
8680
8681 if (epilogue)
8682 {
8683 epilogue->force_vectorize = loop->force_vectorize;
8684 epilogue->safelen = loop->safelen;
8685 epilogue->dont_vectorize = false;
8686
8687 /* We may need to if-convert epilogue to vectorize it. */
8688 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8689 tree_if_conversion (epilogue);
8690 }
8691
8692 return epilogue;
8693 }
8694
8695 /* The code below performs a simple optimization - it reverts
8696 if-conversion for masked stores: if the mask of a store is all-zero,
8697 the store is not performed, and neither are the producers of the
8698 stored values, where possible. For example,
8699 for (i=0; i<n; i++)
8700 if (c[i])
8701 {
8702 p1[i] += 1;
8703 p2[i] = p3[i] +2;
8704 }
8705 this transformation will produce the following semi-hammock:
8706
8707 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
8708 {
8709 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8710 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8711 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8712 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8713 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8714 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8715 }
8716 */
8717
8718 void
8719 optimize_mask_stores (struct loop *loop)
8720 {
8721 basic_block *bbs = get_loop_body (loop);
8722 unsigned nbbs = loop->num_nodes;
8723 unsigned i;
8724 basic_block bb;
8725 struct loop *bb_loop;
8726 gimple_stmt_iterator gsi;
8727 gimple *stmt;
8728 auto_vec<gimple *> worklist;
8729
8730 vect_location = find_loop_location (loop);
8731 /* Pick up all masked stores in loop if any. */
8732 for (i = 0; i < nbbs; i++)
8733 {
8734 bb = bbs[i];
8735 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8736 gsi_next (&gsi))
8737 {
8738 stmt = gsi_stmt (gsi);
8739 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8740 worklist.safe_push (stmt);
8741 }
8742 }
8743
8744 free (bbs);
8745 if (worklist.is_empty ())
8746 return;
8747
8748 /* Loop has masked stores. */
8749 while (!worklist.is_empty ())
8750 {
8751 gimple *last, *last_store;
8752 edge e, efalse;
8753 tree mask;
8754 basic_block store_bb, join_bb;
8755 gimple_stmt_iterator gsi_to;
8756 tree vdef, new_vdef;
8757 gphi *phi;
8758 tree vectype;
8759 tree zero;
8760
8761 last = worklist.pop ();
8762 mask = gimple_call_arg (last, 2);
8763 bb = gimple_bb (last);
8764 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
8765 to the same loop as if_bb. That loop can differ from LOOP when a
8766 two-level loop nest is vectorized and the mask_store belongs to the
8767 inner loop. */
8768 e = split_block (bb, last);
8769 bb_loop = bb->loop_father;
8770 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8771 join_bb = e->dest;
8772 store_bb = create_empty_bb (bb);
8773 add_bb_to_loop (store_bb, bb_loop);
8774 e->flags = EDGE_TRUE_VALUE;
8775 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8776 /* Put STORE_BB on the likely path (the mask is usually not all-zero). */
8777 efalse->probability = profile_probability::likely ();
8778 store_bb->count = efalse->count ();
8779 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8780 if (dom_info_available_p (CDI_DOMINATORS))
8781 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8782 if (dump_enabled_p ())
8783 dump_printf_loc (MSG_NOTE, vect_location,
8784 "Create new block %d to sink mask stores.",
8785 store_bb->index);
8786 /* Create vector comparison with boolean result. */
8787 vectype = TREE_TYPE (mask);
8788 zero = build_zero_cst (vectype);
8789 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8790 gsi = gsi_last_bb (bb);
8791 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8792 /* Create new PHI node for vdef of the last masked store:
8793 .MEM_2 = VDEF <.MEM_1>
8794 will be converted to
8795 .MEM.3 = VDEF <.MEM_1>
8796 and new PHI node will be created in join bb
8797 .MEM_2 = PHI <.MEM_1, .MEM_3>
8798 */
8799 vdef = gimple_vdef (last);
8800 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8801 gimple_set_vdef (last, new_vdef);
8802 phi = create_phi_node (vdef, join_bb);
8803 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8804
8805 /* Put all masked stores with the same mask to STORE_BB if possible. */
8806 while (true)
8807 {
8808 gimple_stmt_iterator gsi_from;
8809 gimple *stmt1 = NULL;
8810
8811 /* Move masked store to STORE_BB. */
8812 last_store = last;
8813 gsi = gsi_for_stmt (last);
8814 gsi_from = gsi;
8815 /* Shift GSI to the previous stmt for further traversal. */
8816 gsi_prev (&gsi);
8817 gsi_to = gsi_start_bb (store_bb);
8818 gsi_move_before (&gsi_from, &gsi_to);
8819 /* Set GSI_TO to the start of the now non-empty block. */
8820 gsi_to = gsi_start_bb (store_bb);
8821 if (dump_enabled_p ())
8822 {
8823 dump_printf_loc (MSG_NOTE, vect_location,
8824 "Move stmt to created bb\n");
8825 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8826 }
8827 /* Move all stored value producers if possible. */
8828 while (!gsi_end_p (gsi))
8829 {
8830 tree lhs;
8831 imm_use_iterator imm_iter;
8832 use_operand_p use_p;
8833 bool res;
8834
8835 /* Skip debug statements. */
8836 if (is_gimple_debug (gsi_stmt (gsi)))
8837 {
8838 gsi_prev (&gsi);
8839 continue;
8840 }
8841 stmt1 = gsi_stmt (gsi);
8842 /* Do not consider statements writing to memory or having
8843 a volatile operand. */
8844 if (gimple_vdef (stmt1)
8845 || gimple_has_volatile_ops (stmt1))
8846 break;
8847 gsi_from = gsi;
8848 gsi_prev (&gsi);
8849 lhs = gimple_get_lhs (stmt1);
8850 if (!lhs)
8851 break;
8852
8853 /* LHS of vectorized stmt must be SSA_NAME. */
8854 if (TREE_CODE (lhs) != SSA_NAME)
8855 break;
8856
8857 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8858 {
8859 /* Remove dead scalar statement. */
8860 if (has_zero_uses (lhs))
8861 {
8862 gsi_remove (&gsi_from, true);
8863 continue;
8864 }
8865 }
8866
8867 /* Check that LHS does not have uses outside of STORE_BB. */
8868 res = true;
8869 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8870 {
8871 gimple *use_stmt;
8872 use_stmt = USE_STMT (use_p);
8873 if (is_gimple_debug (use_stmt))
8874 continue;
8875 if (gimple_bb (use_stmt) != store_bb)
8876 {
8877 res = false;
8878 break;
8879 }
8880 }
8881 if (!res)
8882 break;
8883
8884 if (gimple_vuse (stmt1)
8885 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8886 break;
8887
8888 /* Can move STMT1 to STORE_BB. */
8889 if (dump_enabled_p ())
8890 {
8891 dump_printf_loc (MSG_NOTE, vect_location,
8892 "Move stmt to created bb\n");
8893 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8894 }
8895 gsi_move_before (&gsi_from, &gsi_to);
8896 /* Shift GSI_TO for further insertion. */
8897 gsi_prev (&gsi_to);
8898 }
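/* For illustration, reusing the stmts from the comment before this
   function: after
     MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
   is moved into STORE_BB, the addition producing vect__12.22_172 is moved
   as well, provided it has no memory side effects and no uses outside
   STORE_BB. */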
8899 /* Put other masked stores with the same mask to STORE_BB. */
8900 if (worklist.is_empty ()
8901 || gimple_call_arg (worklist.last (), 2) != mask
8902 || worklist.last () != stmt1)
8903 break;
8904 last = worklist.pop ();
8905 }
8906 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8907 }
8908 }