gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57
  58 /* Loop Vectorization Pass.
  59
  60    This pass tries to vectorize loops.
  61
  62    For example, the vectorizer transforms the following simple loop:
  63
  64         short a[N]; short b[N]; short c[N]; int i;
  65
  66         for (i=0; i<N; i++){
  67           a[i] = b[i] + c[i];
  68         }
  69
  70    as if it was manually vectorized by rewriting the source code into:
  71
  72         typedef int __attribute__((mode(V8HI))) v8hi;
  73         short a[N];  short b[N]; short c[N];   int i;
  74         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  75         v8hi va, vb, vc;
  76
  77         for (i=0; i<N/8; i++){
  78           vb = pb[i];
  79           vc = pc[i];
  80           va = vb + vc;
  81           pa[i] = va;
  82         }
  83
  84         The main entry to this pass is vectorize_loops(), in which
  85    the vectorizer applies a set of analyses on a given set of loops,
  86    followed by the actual vectorization transformation for the loops that
  87    had successfully passed the analysis phase.
  88         Throughout this pass we make a distinction between two types of
  89    data: scalars (which are represented by SSA_NAMES), and memory references
  90    ("data-refs").  These two types of data require different handling both
  91    during analysis and transformation. The types of data-refs that the
  92    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  93    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  94    accesses are required to have a simple (consecutive) access pattern.
  95
  96    Analysis phase:
  97    ===============
  98         The driver for the analysis phase is vect_analyze_loop().
  99    It applies a set of analyses, some of which rely on the scalar evolution
 100    analyzer (scev) developed by Sebastian Pop.
 101
 102         During the analysis phase the vectorizer records some information
 103    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 104    loop, as well as general information about the loop as a whole, which is
 105    recorded in a "loop_vec_info" struct attached to each loop.
 106
 107    Transformation phase:
 108    =====================
 109         The loop transformation phase scans all the stmts in the loop, and
 110    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 111    the loop that needs to be vectorized.  It inserts the vector code sequence
 112    just before the scalar stmt S, and records a pointer to the vector code
 113    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 114    attached to S).  This pointer will be used for the vectorization of following
 115    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 116    otherwise, we rely on dead code elimination for removing it.
 117
 118         For example, say stmt S1 was vectorized into stmt VS1:
 119
 120    VS1: vb = px[i];
 121    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 122    S2:  a = b;
 123
 124    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 125    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 126    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 127    resulting sequence would be:
 128
 129    VS1: vb = px[i];
 130    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 131    VS2: va = vb;
 132    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 133
 134         Operands that are not SSA_NAMEs, are data-refs that appear in
 135    load/store operations (like 'x[i]' in S1), and are handled differently.
 136
 137    Target modeling:
 138    =================
 139         Currently the only target specific information that is used is the
 140    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141    Targets that can support different sizes of vectors, for now will need
 142    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 143    flexibility will be added in the future.
 144
 145         Since we only vectorize operations whose vector form can be
 146    expressed using existing tree codes, to verify that an operation is
 147    supported, the vectorizer checks the relevant optab at the relevant
 148    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 149    the value found is CODE_FOR_nothing, then there's no target support, and
 150    we can't vectorize the stmt.
 151
 152    For additional information on this project see:
 153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 154 */
 155
 156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 157
 158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 159    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 160    may already be set for general statements (not just data refs).  */
 161
 162 static bool
 163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
 164                               bool vectype_maybe_set_p,
 165                               poly_uint64 *vf,
 166                               vec<stmt_vec_info > *mask_producers)
 167 {
 168   gimple *stmt = stmt_info->stmt;
 169
 170   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 171        && !STMT_VINFO_LIVE_P (stmt_info))
 172       || gimple_clobber_p (stmt))
 173     {
 174       if (dump_enabled_p ())
 175         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 176       return true;
 177     }
 178
 179   tree stmt_vectype, nunits_vectype;
 180   if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
 181                                        &nunits_vectype))
 182     return false;
 183
 184   if (stmt_vectype)
 185     {
 186       if (STMT_VINFO_VECTYPE (stmt_info))
 187         /* The only case when a vectype had been already set is for stmts
 188            that contain a data ref, or for "pattern-stmts" (stmts generated
 189            by the vectorizer to represent/replace a certain idiom).  */
 190         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 191                      || vectype_maybe_set_p)
 192                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 193       else if (stmt_vectype == boolean_type_node)
 194         mask_producers->safe_push (stmt_info);
 195       else
 196         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 197     }
 198
 199   if (nunits_vectype)
 200     vect_update_max_nunits (vf, nunits_vectype);
 201
 202   return true;
 203 }
 204
 205 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 206    types of STMT_INFO and all attached pattern statements and update
 207    the vectorization factor VF accordingly.  If some of the statements
 208    produce a mask result whose vector type can only be calculated later,
 209    add them to MASK_PRODUCERS.  Return true on success or false if
 210    something prevented vectorization.  */
 211
 212 static bool
 213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
 214                             vec<stmt_vec_info > *mask_producers)
 215 {
 216   vec_info *vinfo = stmt_info->vinfo;
 217   if (dump_enabled_p ())
 218     {
 219       dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
 220       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
 221     }
 222   if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
 223     return false;
 224
 225   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 226       && STMT_VINFO_RELATED_STMT (stmt_info))
 227     {
 228       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 229       stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
 230
 231       /* If a pattern statement has def stmts, analyze them too.  */
 232       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 233            !gsi_end_p (si); gsi_next (&si))
 234         {
 235           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 236           if (dump_enabled_p ())
 237             {
 238               dump_printf_loc (MSG_NOTE, vect_location,
 239                                "==> examining pattern def stmt: ");
 240               dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 241                                 def_stmt_info->stmt, 0);
 242             }
 243           if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
 244                                              vf, mask_producers))
 245             return false;
 246         }
 247
 248       if (dump_enabled_p ())
 249         {
 250           dump_printf_loc (MSG_NOTE, vect_location,
 251                            "==> examining pattern statement: ");
 252           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
 253         }
 254       if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
 255         return false;
 256     }
 257
 258   return true;
 259 }
 260
 261 /* Function vect_determine_vectorization_factor
 262
 263    Determine the vectorization factor (VF).  VF is the number of data elements
 264    that are operated upon in parallel in a single iteration of the vectorized
 265    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 266    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 267    elements can fit in a single vector register.
 268
 269    We currently support vectorization of loops in which all types operated upon
 270    are of the same size.  Therefore this function currently sets VF according to
 271    the size of the types operated upon, and fails if there are multiple sizes
 272    in the loop.
 273
 274    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 275    original loop:
 276         for (i=0; i<N; i++){
 277           a[i] = b[i] + c[i];
 278         }
 279
 280    vectorized loop:
 281         for (i=0; i<N; i+=VF){
 282           a[i:VF] = b[i:VF] + c[i:VF];
 283         }
 284 */
 285
 286 static bool
 287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 288 {
 289   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 290   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 291   unsigned nbbs = loop->num_nodes;
 292   poly_uint64 vectorization_factor = 1;
 293   tree scalar_type = NULL_TREE;
 294   gphi *phi;
 295   tree vectype;
 296   stmt_vec_info stmt_info;
 297   unsigned i;
 298   auto_vec<stmt_vec_info> mask_producers;
 299
 300   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 301
 302   for (i = 0; i < nbbs; i++)
 303     {
 304       basic_block bb = bbs[i];
 305
 306       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 307            gsi_next (&si))
 308         {
 309           phi = si.phi ();
 310           stmt_info = loop_vinfo->lookup_stmt (phi);
 311           if (dump_enabled_p ())
 312             {
 313               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 314               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 315             }
 316
 317           gcc_assert (stmt_info);
 318
 319           if (STMT_VINFO_RELEVANT_P (stmt_info)
 320               || STMT_VINFO_LIVE_P (stmt_info))
 321             {
 322               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 323               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 324
 325               if (dump_enabled_p ())
 326                 {
 327                   dump_printf_loc (MSG_NOTE, vect_location,
 328                                    "get vectype for scalar type:  ");
 329                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 330                   dump_printf (MSG_NOTE, "\n");
 331                 }
 332
 333               vectype = get_vectype_for_scalar_type (scalar_type);
 334               if (!vectype)
 335                 {
 336                   if (dump_enabled_p ())
 337                     {
 338                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 339                                        "not vectorized: unsupported "
 340                                        "data-type ");
 341                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 342                                          scalar_type);
 343                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 344                     }
 345                   return false;
 346                 }
 347               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 348
 349               if (dump_enabled_p ())
 350                 {
 351                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 352                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 353                   dump_printf (MSG_NOTE, "\n");
 354                 }
 355
 356               if (dump_enabled_p ())
 357                 {
 358                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 359                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 360                   dump_printf (MSG_NOTE, "\n");
 361                 }
 362
 363               vect_update_max_nunits (&vectorization_factor, vectype);
 364             }
 365         }
 366
 367       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 368            gsi_next (&si))
 369         {
 370           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 371           if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
 372                                            &mask_producers))
 373             return false;
 374         }
 375     }
 376
 377   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 378   if (dump_enabled_p ())
 379     {
 380       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 381       dump_dec (MSG_NOTE, vectorization_factor);
 382       dump_printf (MSG_NOTE, "\n");
 383     }
 384
 385   if (known_le (vectorization_factor, 1U))
 386     {
 387       if (dump_enabled_p ())
 388         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 389                          "not vectorized: unsupported data-type\n");
 390       return false;
 391     }
 392   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 393
 394   for (i = 0; i < mask_producers.length (); i++)
 395     {
 396       stmt_info = mask_producers[i];
 397       tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
 398       if (!mask_type)
 399         return false;
 400       STMT_VINFO_VECTYPE (stmt_info) = mask_type;
 401     }
 402
 403   return true;
 404 }
 405
 406
 407 /* Function vect_is_simple_iv_evolution.
 408
 409    FORNOW: A simple evolution of an induction variables in the loop is
 410    considered a polynomial evolution.  */
 411
 412 static bool
 413 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 414                              tree * step)
 415 {
 416   tree init_expr;
 417   tree step_expr;
 418   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 419   basic_block bb;
 420
 421   /* When there is no evolution in this loop, the evolution function
 422      is not "simple".  */
 423   if (evolution_part == NULL_TREE)
 424     return false;
 425
 426   /* When the evolution is a polynomial of degree >= 2
 427      the evolution function is not "simple".  */
 428   if (tree_is_chrec (evolution_part))
 429     return false;
 430
 431   step_expr = evolution_part;
 432   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 433
 434   if (dump_enabled_p ())
 435     {
 436       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 437       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 438       dump_printf (MSG_NOTE, ",  init: ");
 439       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 440       dump_printf (MSG_NOTE, "\n");
 441     }
 442
 443   *init = init_expr;
 444   *step = step_expr;
 445
 446   if (TREE_CODE (step_expr) != INTEGER_CST
 447       && (TREE_CODE (step_expr) != SSA_NAME
 448           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 449               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 450           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 451               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 452                   || !flag_associative_math)))
 453       && (TREE_CODE (step_expr) != REAL_CST
 454           || !flag_associative_math))
 455     {
 456       if (dump_enabled_p ())
 457         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 458                          "step unknown.\n");
 459       return false;
 460     }
 461
 462   return true;
 463 }
 464
 465 /* Function vect_analyze_scalar_cycles_1.
 466
 467    Examine the cross iteration def-use cycles of scalar variables
 468    in LOOP.  LOOP_VINFO represents the loop that is now being
 469    considered for vectorization (can be LOOP, or an outer-loop
 470    enclosing LOOP).  */
 471
 472 static void
 473 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 474 {
 475   basic_block bb = loop->header;
 476   tree init, step;
 477   auto_vec<gimple *, 64> worklist;
 478   gphi_iterator gsi;
 479   bool double_reduc;
 480
 481   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 482
 483   /* First - identify all inductions.  Reduction detection assumes that all the
 484      inductions have been identified, therefore, this order must not be
 485      changed.  */
 486   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 487     {
 488       gphi *phi = gsi.phi ();
 489       tree access_fn = NULL;
 490       tree def = PHI_RESULT (phi);
 491       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 492
 493       if (dump_enabled_p ())
 494         {
 495           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 496           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 497         }
 498
 499       /* Skip virtual phi's.  The data dependences that are associated with
 500          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 501       if (virtual_operand_p (def))
 502         continue;
 503
 504       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 505
 506       /* Analyze the evolution function.  */
 507       access_fn = analyze_scalar_evolution (loop, def);
 508       if (access_fn)
 509         {
 510           STRIP_NOPS (access_fn);
 511           if (dump_enabled_p ())
 512             {
 513               dump_printf_loc (MSG_NOTE, vect_location,
 514                                "Access function of PHI: ");
 515               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 516               dump_printf (MSG_NOTE, "\n");
 517             }
 518           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 519             = initial_condition_in_loop_num (access_fn, loop->num);
 520           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 521             = evolution_part_in_loop_num (access_fn, loop->num);
 522         }
 523
 524       if (!access_fn
 525           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 526           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 527               && TREE_CODE (step) != INTEGER_CST))
 528         {
 529           worklist.safe_push (phi);
 530           continue;
 531         }
 532
 533       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 534                   != NULL_TREE);
 535       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 536
 537       if (dump_enabled_p ())
 538         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 539       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 540     }
 541
 542
 543   /* Second - identify all reductions and nested cycles.  */
 544   while (worklist.length () > 0)
 545     {
 546       gimple *phi = worklist.pop ();
 547       tree def = PHI_RESULT (phi);
 548       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 549       gimple *reduc_stmt;
 550
 551       if (dump_enabled_p ())
 552         {
 553           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 554           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 555         }
 556
 557       gcc_assert (!virtual_operand_p (def)
 558                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 559
 560       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 561                                                 &double_reduc, false);
 562       if (reduc_stmt)
 563         {
 564           if (double_reduc)
 565             {
 566               if (dump_enabled_p ())
 567                 dump_printf_loc (MSG_NOTE, vect_location,
 568                                  "Detected double reduction.\n");
 569
 570               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 571               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 572                                                     vect_double_reduction_def;
 573             }
 574           else
 575             {
 576               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 577                 {
 578                   if (dump_enabled_p ())
 579                     dump_printf_loc (MSG_NOTE, vect_location,
 580                                      "Detected vectorizable nested cycle.\n");
 581
 582                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 583                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 584                                                              vect_nested_cycle;
 585                 }
 586               else
 587                 {
 588                   if (dump_enabled_p ())
 589                     dump_printf_loc (MSG_NOTE, vect_location,
 590                                      "Detected reduction.\n");
 591
 592                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 593                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 594                                                            vect_reduction_def;
 595                   /* Store the reduction cycles for possible vectorization in
 596                      loop-aware SLP if it was not detected as reduction
 597                      chain.  */
 598                   if (! REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 599                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 600                 }
 601             }
 602         }
 603       else
 604         if (dump_enabled_p ())
 605           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 606                            "Unknown def-use cycle pattern.\n");
 607     }
 608 }
 609
 610
 611 /* Function vect_analyze_scalar_cycles.
 612
 613    Examine the cross iteration def-use cycles of scalar variables, by
 614    analyzing the loop-header PHIs of scalar variables.  Classify each
 615    cycle as one of the following: invariant, induction, reduction, unknown.
 616    We do that for the loop represented by LOOP_VINFO, and also to its
 617    inner-loop, if exists.
 618    Examples for scalar cycles:
 619
 620    Example1: reduction:
 621
 622               loop1:
 623               for (i=0; i<N; i++)
 624                  sum += a[i];
 625
 626    Example2: induction:
 627
 628               loop2:
 629               for (i=0; i<N; i++)
 630                  a[i] = i;  */
 631
 632 static void
 633 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 634 {
 635   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 636
 637   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 638
 639   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 640      Reductions in such inner-loop therefore have different properties than
 641      the reductions in the nest that gets vectorized:
 642      1. When vectorized, they are executed in the same order as in the original
 643         scalar loop, so we can't change the order of computation when
 644         vectorizing them.
 645      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 646         current checks are too strict.  */
 647
 648   if (loop->inner)
 649     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 650 }
 651
 652 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 653
 654 static void
 655 vect_fixup_reduc_chain (gimple *stmt)
 656 {
 657   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 658   gimple *stmtp;
 659   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 660               && REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 661   REDUC_GROUP_SIZE (vinfo_for_stmt (firstp))
 662     = REDUC_GROUP_SIZE (vinfo_for_stmt (stmt));
 663   do
 664     {
 665       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 666       REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 667       stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 668       if (stmt)
 669         REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 670           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 671     }
 672   while (stmt);
 673   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 674 }
 675
 676 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 677
 678 static void
 679 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 680 {
 681   gimple *first;
 682   unsigned i;
 683
 684   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 685     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 686       {
 687         gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 688         while (next)
 689           {
 690             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 691               break;
 692             next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 693           }
 694         /* If not all stmt in the chain are patterns try to handle
 695            the chain without patterns.  */
 696         if (! next)
 697           {
 698             vect_fixup_reduc_chain (first);
 699             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 700               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 701           }
 702       }
 703 }
 704
 705 /* Function vect_get_loop_niters.
 706
 707    Determine how many iterations the loop is executed and place it
 708    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 709    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 710    niter information holds in ASSUMPTIONS.
 711
 712    Return the loop exit condition.  */
 713
 714
 715 static gcond *
 716 vect_get_loop_niters (struct loop *loop, tree *assumptions,
 717                       tree *number_of_iterations, tree *number_of_iterationsm1)
 718 {
 719   edge exit = single_exit (loop);
 720   struct tree_niter_desc niter_desc;
 721   tree niter_assumptions, niter, may_be_zero;
 722   gcond *cond = get_loop_exit_condition (loop);
 723
 724   *assumptions = boolean_true_node;
 725   *number_of_iterationsm1 = chrec_dont_know;
 726   *number_of_iterations = chrec_dont_know;
 727   DUMP_VECT_SCOPE ("get_loop_niters");
 728
 729   if (!exit)
 730     return cond;
 731
 732   niter = chrec_dont_know;
 733   may_be_zero = NULL_TREE;
 734   niter_assumptions = boolean_true_node;
 735   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 736       || chrec_contains_undetermined (niter_desc.niter))
 737     return cond;
 738
 739   niter_assumptions = niter_desc.assumptions;
 740   may_be_zero = niter_desc.may_be_zero;
 741   niter = niter_desc.niter;
 742
 743   if (may_be_zero && integer_zerop (may_be_zero))
 744     may_be_zero = NULL_TREE;
 745
 746   if (may_be_zero)
 747     {
 748       if (COMPARISON_CLASS_P (may_be_zero))
 749         {
 750           /* Try to combine may_be_zero with assumptions, this can simplify
 751              computation of niter expression.  */
 752           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 753             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 754                                              niter_assumptions,
 755                                              fold_build1 (TRUTH_NOT_EXPR,
 756                                                           boolean_type_node,
 757                                                           may_be_zero));
 758           else
 759             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 760                                  build_int_cst (TREE_TYPE (niter), 0),
 761                                  rewrite_to_non_trapping_overflow (niter));
 762
 763           may_be_zero = NULL_TREE;
 764         }
 765       else if (integer_nonzerop (may_be_zero))
 766         {
 767           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 768           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 769           return cond;
 770         }
 771       else
 772         return cond;
 773     }
 774
 775   *assumptions = niter_assumptions;
 776   *number_of_iterationsm1 = niter;
 777
 778   /* We want the number of loop header executions which is the number
 779      of latch executions plus one.
 780      ???  For UINT_MAX latch executions this number overflows to zero
 781      for loops like do { n++; } while (n != 0);  */
 782   if (niter && !chrec_contains_undetermined (niter))
 783     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
 784                           build_int_cst (TREE_TYPE (niter), 1));
 785   *number_of_iterations = niter;
 786
 787   return cond;
 788 }
 789
 790 /* Function bb_in_loop_p
 791
 792    Used as predicate for dfs order traversal of the loop bbs.  */
 793
 794 static bool
 795 bb_in_loop_p (const_basic_block bb, const void *data)
 796 {
 797   const struct loop *const loop = (const struct loop *)data;
 798   if (flow_bb_inside_loop_p (loop, bb))
 799     return true;
 800   return false;
 801 }
 802
 803
 804 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
 805    stmt_vec_info structs for all the stmts in LOOP_IN.  */
 806
 807 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
 808   : vec_info (vec_info::loop, init_cost (loop_in), shared),
 809     loop (loop_in),
 810     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
 811     num_itersm1 (NULL_TREE),
 812     num_iters (NULL_TREE),
 813     num_iters_unchanged (NULL_TREE),
 814     num_iters_assumptions (NULL_TREE),
 815     th (0),
 816     versioning_threshold (0),
 817     vectorization_factor (0),
 818     max_vectorization_factor (0),
 819     mask_skip_niters (NULL_TREE),
 820     mask_compare_type (NULL_TREE),
 821     unaligned_dr (NULL),
 822     peeling_for_alignment (0),
 823     ptr_mask (0),
 824     ivexpr_map (NULL),
 825     slp_unrolling_factor (1),
 826     single_scalar_iteration_cost (0),
 827     vectorizable (false),
 828     can_fully_mask_p (true),
 829     fully_masked_p (false),
 830     peeling_for_gaps (false),
 831     peeling_for_niter (false),
 832     operands_swapped (false),
 833     no_data_dependencies (false),
 834     has_mask_store (false),
 835     scalar_loop (NULL),
 836     orig_loop_info (NULL)
 837 {
 838   /* Create/Update stmt_info for all stmts in the loop.  */
 839   basic_block *body = get_loop_body (loop);
 840   for (unsigned int i = 0; i < loop->num_nodes; i++)
 841     {
 842       basic_block bb = body[i];
 843       gimple_stmt_iterator si;
 844
 845       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 846         {
 847           gimple *phi = gsi_stmt (si);
 848           gimple_set_uid (phi, 0);
 849           add_stmt (phi);
 850         }
 851
 852       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 853         {
 854           gimple *stmt = gsi_stmt (si);
 855           gimple_set_uid (stmt, 0);
 856           add_stmt (stmt);
 857         }
 858     }
 859   free (body);
 860
 861   /* CHECKME: We want to visit all BBs before their successors (except for
 862      latch blocks, for which this assertion wouldn't hold).  In the simple
 863      case of the loop forms we allow, a dfs order of the BBs would the same
 864      as reversed postorder traversal, so we are safe.  */
 865
 866   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 867                                           bbs, loop->num_nodes, loop);
 868   gcc_assert (nbbs == loop->num_nodes);
 869 }
 870
 871 /* Free all levels of MASKS.  */
 872
 873 void
 874 release_vec_loop_masks (vec_loop_masks *masks)
 875 {
 876   rgroup_masks *rgm;
 877   unsigned int i;
 878   FOR_EACH_VEC_ELT (*masks, i, rgm)
 879     rgm->masks.release ();
 880   masks->release ();
 881 }
 882
 883 /* Free all memory used by the _loop_vec_info, as well as all the
 884    stmt_vec_info structs of all the stmts in the loop.  */
 885
 886 _loop_vec_info::~_loop_vec_info ()
 887 {
 888   int nbbs;
 889   gimple_stmt_iterator si;
 890   int j;
 891
 892   /* ???  We're releasing loop_vinfos en-block.  */
 893   set_stmt_vec_info_vec (&stmt_vec_infos);
 894   nbbs = loop->num_nodes;
 895   for (j = 0; j < nbbs; j++)
 896     {
 897       basic_block bb = bbs[j];
 898       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 899         free_stmt_vec_info (gsi_stmt (si));
 900
 901       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 902         {
 903           gimple *stmt = gsi_stmt (si);
 904
 905           /* We may have broken canonical form by moving a constant
 906              into RHS1 of a commutative op.  Fix such occurrences.  */
 907           if (operands_swapped && is_gimple_assign (stmt))
 908             {
 909               enum tree_code code = gimple_assign_rhs_code (stmt);
 910
 911               if ((code == PLUS_EXPR
 912                    || code == POINTER_PLUS_EXPR
 913                    || code == MULT_EXPR)
 914                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
 915                 swap_ssa_operands (stmt,
 916                                    gimple_assign_rhs1_ptr (stmt),
 917                                    gimple_assign_rhs2_ptr (stmt));
 918               else if (code == COND_EXPR
 919                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
 920                 {
 921                   tree cond_expr = gimple_assign_rhs1 (stmt);
 922                   enum tree_code cond_code = TREE_CODE (cond_expr);
 923
 924                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
 925                     {
 926                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
 927                                                                   0));
 928                       cond_code = invert_tree_comparison (cond_code,
 929                                                           honor_nans);
 930                       if (cond_code != ERROR_MARK)
 931                         {
 932                           TREE_SET_CODE (cond_expr, cond_code);
 933                           swap_ssa_operands (stmt,
 934                                              gimple_assign_rhs2_ptr (stmt),
 935                                              gimple_assign_rhs3_ptr (stmt));
 936                         }
 937                     }
 938                 }
 939             }
 940
 941           /* Free stmt_vec_info.  */
 942           free_stmt_vec_info (stmt);
 943           gsi_next (&si);
 944         }
 945     }
 946
 947   free (bbs);
 948
 949   release_vec_loop_masks (&masks);
 950   delete ivexpr_map;
 951
 952   loop->aux = NULL;
 953 }
 954
 955 /* Return an invariant or register for EXPR and emit necessary
 956    computations in the LOOP_VINFO loop preheader.  */
 957
 958 tree
 959 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
 960 {
 961   if (is_gimple_reg (expr)
 962       || is_gimple_min_invariant (expr))
 963     return expr;
 964
 965   if (! loop_vinfo->ivexpr_map)
 966     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
 967   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
 968   if (! cached)
 969     {
 970       gimple_seq stmts = NULL;
 971       cached = force_gimple_operand (unshare_expr (expr),
 972                                      &stmts, true, NULL_TREE);
 973       if (stmts)
 974         {
 975           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
 976           gsi_insert_seq_on_edge_immediate (e, stmts);
 977         }
 978     }
 979   return cached;
 980 }
 981
 982 /* Return true if we can use CMP_TYPE as the comparison type to produce
 983    all masks required to mask LOOP_VINFO.  */
 984
 985 static bool
 986 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
 987 {
 988   rgroup_masks *rgm;
 989   unsigned int i;
 990   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
 991     if (rgm->mask_type != NULL_TREE
 992         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
 993                                             cmp_type, rgm->mask_type,
 994                                             OPTIMIZE_FOR_SPEED))
 995       return false;
 996   return true;
 997 }
 998
 999 /* Calculate the maximum number of scalars per iteration for every
1000    rgroup in LOOP_VINFO.  */
1001
1002 static unsigned int
1003 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1004 {
1005   unsigned int res = 1;
1006   unsigned int i;
1007   rgroup_masks *rgm;
1008   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1009     res = MAX (res, rgm->max_nscalars_per_iter);
1010   return res;
1011 }
1012
1013 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1014    whether we can actually generate the masks required.  Return true if so,
1015    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
1016
1017 static bool
1018 vect_verify_full_masking (loop_vec_info loop_vinfo)
1019 {
1020   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1021   unsigned int min_ni_width;
1022
1023   /* Use a normal loop if there are no statements that need masking.
1024      This only happens in rare degenerate cases: it means that the loop
1025      has no loads, no stores, and no live-out values.  */
1026   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1027     return false;
1028
1029   /* Get the maximum number of iterations that is representable
1030      in the counter type.  */
1031   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1032   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1033
1034   /* Get a more refined estimate for the number of iterations.  */
1035   widest_int max_back_edges;
1036   if (max_loop_iterations (loop, &max_back_edges))
1037     max_ni = wi::smin (max_ni, max_back_edges + 1);
1038
1039   /* Account for rgroup masks, in which each bit is replicated N times.  */
1040   max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1041
1042   /* Work out how many bits we need to represent the limit.  */
1043   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1044
1045   /* Find a scalar mode for which WHILE_ULT is supported.  */
1046   opt_scalar_int_mode cmp_mode_iter;
1047   tree cmp_type = NULL_TREE;
1048   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1049     {
1050       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1051       if (cmp_bits >= min_ni_width
1052           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1053         {
1054           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1055           if (this_type
1056               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1057             {
1058               /* Although we could stop as soon as we find a valid mode,
1059                  it's often better to continue until we hit Pmode, since the
1060                  operands to the WHILE are more likely to be reusable in
1061                  address calculations.  */
1062               cmp_type = this_type;
1063               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1064                 break;
1065             }
1066         }
1067     }
1068
1069   if (!cmp_type)
1070     return false;
1071
1072   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1073   return true;
1074 }
1075
1076 /* Calculate the cost of one scalar iteration of the loop.  */
1077 static void
1078 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1079 {
1080   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1081   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1082   int nbbs = loop->num_nodes, factor;
1083   int innerloop_iters, i;
1084
1085   /* Gather costs for statements in the scalar loop.  */
1086
1087   /* FORNOW.  */
1088   innerloop_iters = 1;
1089   if (loop->inner)
1090     innerloop_iters = 50; /* FIXME */
1091
1092   for (i = 0; i < nbbs; i++)
1093     {
1094       gimple_stmt_iterator si;
1095       basic_block bb = bbs[i];
1096
1097       if (bb->loop_father == loop->inner)
1098         factor = innerloop_iters;
1099       else
1100         factor = 1;
1101
1102       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1103         {
1104           gimple *stmt = gsi_stmt (si);
1105           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1106
1107           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1108             continue;
1109
1110           /* Skip stmts that are not vectorized inside the loop.  */
1111           if (stmt_info
1112               && !STMT_VINFO_RELEVANT_P (stmt_info)
1113               && (!STMT_VINFO_LIVE_P (stmt_info)
1114                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1115               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1116             continue;
1117
1118           vect_cost_for_stmt kind;
1119           if (STMT_VINFO_DATA_REF (stmt_info))
1120             {
1121               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1122                kind = scalar_load;
1123              else
1124                kind = scalar_store;
1125             }
1126           else
1127             kind = scalar_stmt;
1128
1129           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1130                             factor, kind, stmt_info, 0, vect_prologue);
1131         }
1132     }
1133
1134   /* Now accumulate cost.  */
1135   void *target_cost_data = init_cost (loop);
1136   stmt_info_for_cost *si;
1137   int j;
1138   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1139                     j, si)
1140     {
1141       struct _stmt_vec_info *stmt_info
1142         = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1143       (void) add_stmt_cost (target_cost_data, si->count,
1144                             si->kind, stmt_info, si->misalign,
1145                             vect_body);
1146     }
1147   unsigned dummy, body_cost = 0;
1148   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1149   destroy_cost_data (target_cost_data);
1150   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1151 }
1152
1153
1154 /* Function vect_analyze_loop_form_1.
1155
1156    Verify that certain CFG restrictions hold, including:
1157    - the loop has a pre-header
1158    - the loop has a single entry and exit
1159    - the loop exit condition is simple enough
1160    - the number of iterations can be analyzed, i.e, a countable loop.  The
1161      niter could be analyzed under some assumptions.  */
1162
1163 bool
1164 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1165                           tree *assumptions, tree *number_of_iterationsm1,
1166                           tree *number_of_iterations, gcond **inner_loop_cond)
1167 {
1168   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1169
1170   /* Different restrictions apply when we are considering an inner-most loop,
1171      vs. an outer (nested) loop.
1172      (FORNOW. May want to relax some of these restrictions in the future).  */
1173
1174   if (!loop->inner)
1175     {
1176       /* Inner-most loop.  We currently require that the number of BBs is
1177          exactly 2 (the header and latch).  Vectorizable inner-most loops
1178          look like this:
1179
1180                         (pre-header)
1181                            |
1182                           header <--------+
1183                            | |            |
1184                            | +--> latch --+
1185                            |
1186                         (exit-bb)  */
1187
1188       if (loop->num_nodes != 2)
1189         {
1190           if (dump_enabled_p ())
1191             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1192                              "not vectorized: control flow in loop.\n");
1193           return false;
1194         }
1195
1196       if (empty_block_p (loop->header))
1197         {
1198           if (dump_enabled_p ())
1199             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1200                              "not vectorized: empty loop.\n");
1201           return false;
1202         }
1203     }
1204   else
1205     {
1206       struct loop *innerloop = loop->inner;
1207       edge entryedge;
1208
1209       /* Nested loop. We currently require that the loop is doubly-nested,
1210          contains a single inner loop, and the number of BBs is exactly 5.
1211          Vectorizable outer-loops look like this:
1212
1213                         (pre-header)
1214                            |
1215                           header <---+
1216                            |         |
1217                           inner-loop |
1218                            |         |
1219                           tail ------+
1220                            |
1221                         (exit-bb)
1222
1223          The inner-loop has the properties expected of inner-most loops
1224          as described above.  */
1225
1226       if ((loop->inner)->inner || (loop->inner)->next)
1227         {
1228           if (dump_enabled_p ())
1229             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1230                              "not vectorized: multiple nested loops.\n");
1231           return false;
1232         }
1233
1234       if (loop->num_nodes != 5)
1235         {
1236           if (dump_enabled_p ())
1237             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1238                              "not vectorized: control flow in loop.\n");
1239           return false;
1240         }
1241
1242       entryedge = loop_preheader_edge (innerloop);
1243       if (entryedge->src != loop->header
1244           || !single_exit (innerloop)
1245           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1246         {
1247           if (dump_enabled_p ())
1248             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249                              "not vectorized: unsupported outerloop form.\n");
1250           return false;
1251         }
1252
1253       /* Analyze the inner-loop.  */
1254       tree inner_niterm1, inner_niter, inner_assumptions;
1255       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1256                                       &inner_assumptions, &inner_niterm1,
1257                                       &inner_niter, NULL)
1258           /* Don't support analyzing niter under assumptions for inner
1259              loop.  */
1260           || !integer_onep (inner_assumptions))
1261         {
1262           if (dump_enabled_p ())
1263             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1264                              "not vectorized: Bad inner loop.\n");
1265           return false;
1266         }
1267
1268       if (!expr_invariant_in_loop_p (loop, inner_niter))
1269         {
1270           if (dump_enabled_p ())
1271             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1272                              "not vectorized: inner-loop count not"
1273                              " invariant.\n");
1274           return false;
1275         }
1276
1277       if (dump_enabled_p ())
1278         dump_printf_loc (MSG_NOTE, vect_location,
1279                          "Considering outer-loop vectorization.\n");
1280     }
1281
1282   if (!single_exit (loop)
1283       || EDGE_COUNT (loop->header->preds) != 2)
1284     {
1285       if (dump_enabled_p ())
1286         {
1287           if (!single_exit (loop))
1288             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1289                              "not vectorized: multiple exits.\n");
1290           else if (EDGE_COUNT (loop->header->preds) != 2)
1291             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1292                              "not vectorized: too many incoming edges.\n");
1293         }
1294       return false;
1295     }
1296
1297   /* We assume that the loop exit condition is at the end of the loop. i.e,
1298      that the loop is represented as a do-while (with a proper if-guard
1299      before the loop if needed), where the loop header contains all the
1300      executable statements, and the latch is empty.  */
1301   if (!empty_block_p (loop->latch)
1302       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1303     {
1304       if (dump_enabled_p ())
1305         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1306                          "not vectorized: latch block not empty.\n");
1307       return false;
1308     }
1309
1310   /* Make sure the exit is not abnormal.  */
1311   edge e = single_exit (loop);
1312   if (e->flags & EDGE_ABNORMAL)
1313     {
1314       if (dump_enabled_p ())
1315         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1316                          "not vectorized: abnormal loop exit edge.\n");
1317       return false;
1318     }
1319
1320   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1321                                      number_of_iterationsm1);
1322   if (!*loop_cond)
1323     {
1324       if (dump_enabled_p ())
1325         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1326                          "not vectorized: complicated exit condition.\n");
1327       return false;
1328     }
1329
1330   if (integer_zerop (*assumptions)
1331       || !*number_of_iterations
1332       || chrec_contains_undetermined (*number_of_iterations))
1333     {
1334       if (dump_enabled_p ())
1335         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1336                          "not vectorized: number of iterations cannot be "
1337                          "computed.\n");
1338       return false;
1339     }
1340
1341   if (integer_zerop (*number_of_iterations))
1342     {
1343       if (dump_enabled_p ())
1344         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1345                          "not vectorized: number of iterations = 0.\n");
1346       return false;
1347     }
1348
1349   return true;
1350 }
1351
1352 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1353
1354 loop_vec_info
1355 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1356 {
1357   tree assumptions, number_of_iterations, number_of_iterationsm1;
1358   gcond *loop_cond, *inner_loop_cond = NULL;
1359
1360   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1361                                   &assumptions, &number_of_iterationsm1,
1362                                   &number_of_iterations, &inner_loop_cond))
1363     return NULL;
1364
1365   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1366   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1367   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1368   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1369   if (!integer_onep (assumptions))
1370     {
1371       /* We consider to vectorize this loop by versioning it under
1372          some assumptions.  In order to do this, we need to clear
1373          existing information computed by scev and niter analyzer.  */
1374       scev_reset_htab ();
1375       free_numbers_of_iterations_estimates (loop);
1376       /* Also set flag for this loop so that following scev and niter
1377          analysis are done under the assumptions.  */
1378       loop_constraint_set (loop, LOOP_C_FINITE);
1379       /* Also record the assumptions for versioning.  */
1380       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1381     }
1382
1383   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1384     {
1385       if (dump_enabled_p ())
1386         {
1387           dump_printf_loc (MSG_NOTE, vect_location,
1388                            "Symbolic number of iterations is ");
1389           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1390           dump_printf (MSG_NOTE, "\n");
1391         }
1392     }
1393
1394   stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1395   STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1396   if (inner_loop_cond)
1397     {
1398       stmt_vec_info inner_loop_cond_info
1399         = loop_vinfo->lookup_stmt (inner_loop_cond);
1400       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1401     }
1402
1403   gcc_assert (!loop->aux);
1404   loop->aux = loop_vinfo;
1405   return loop_vinfo;
1406 }
1407
1408
1409
1410 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1411    statements update the vectorization factor.  */
1412
1413 static void
1414 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1415 {
1416   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1417   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1418   int nbbs = loop->num_nodes;
1419   poly_uint64 vectorization_factor;
1420   int i;
1421
1422   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1423
1424   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1425   gcc_assert (known_ne (vectorization_factor, 0U));
1426
1427   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1428      vectorization factor of the loop is the unrolling factor required by
1429      the SLP instances.  If that unrolling factor is 1, we say, that we
1430      perform pure SLP on loop - cross iteration parallelism is not
1431      exploited.  */
1432   bool only_slp_in_loop = true;
1433   for (i = 0; i < nbbs; i++)
1434     {
1435       basic_block bb = bbs[i];
1436       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1437            gsi_next (&si))
1438         {
1439           gimple *stmt = gsi_stmt (si);
1440           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1441           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1442               && STMT_VINFO_RELATED_STMT (stmt_info))
1443             {
1444               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1445               stmt_info = vinfo_for_stmt (stmt);
1446             }
1447           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1448                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1449               && !PURE_SLP_STMT (stmt_info))
1450             /* STMT needs both SLP and loop-based vectorization.  */
1451             only_slp_in_loop = false;
1452         }
1453     }
1454
1455   if (only_slp_in_loop)
1456     {
1457       dump_printf_loc (MSG_NOTE, vect_location,
1458                        "Loop contains only SLP stmts\n");
1459       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1460     }
1461   else
1462     {
1463       dump_printf_loc (MSG_NOTE, vect_location,
1464                        "Loop contains SLP and non-SLP stmts\n");
1465       /* Both the vectorization factor and unroll factor have the form
1466          current_vector_size * X for some rational X, so they must have
1467          a common multiple.  */
1468       vectorization_factor
1469         = force_common_multiple (vectorization_factor,
1470                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1471     }
1472
1473   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1474   if (dump_enabled_p ())
1475     {
1476       dump_printf_loc (MSG_NOTE, vect_location,
1477                        "Updating vectorization factor to ");
1478       dump_dec (MSG_NOTE, vectorization_factor);
1479       dump_printf (MSG_NOTE, ".\n");
1480     }
1481 }
1482
1483 /* Return true if STMT_INFO describes a double reduction phi and if
1484    the other phi in the reduction is also relevant for vectorization.
1485    This rejects cases such as:
1486
1487       outer1:
1488         x_1 = PHI <x_3(outer2), ...>;
1489         ...
1490
1491       inner:
1492         x_2 = ...;
1493         ...
1494
1495       outer2:
1496         x_3 = PHI <x_2(inner)>;
1497
1498    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1499
1500 static bool
1501 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1502 {
1503   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1504     return false;
1505
1506   gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1507   return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1508 }
1509
1510 /* Function vect_analyze_loop_operations.
1511
1512    Scan the loop stmts and make sure they are all vectorizable.  */
1513
1514 static bool
1515 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1516 {
1517   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1518   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1519   int nbbs = loop->num_nodes;
1520   int i;
1521   stmt_vec_info stmt_info;
1522   bool need_to_vectorize = false;
1523   bool ok;
1524
1525   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1526
1527   stmt_vector_for_cost cost_vec;
1528   cost_vec.create (2);
1529
1530   for (i = 0; i < nbbs; i++)
1531     {
1532       basic_block bb = bbs[i];
1533
1534       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1535            gsi_next (&si))
1536         {
1537           gphi *phi = si.phi ();
1538           ok = true;
1539
1540           stmt_info = loop_vinfo->lookup_stmt (phi);
1541           if (dump_enabled_p ())
1542             {
1543               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1544               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1545             }
1546           if (virtual_operand_p (gimple_phi_result (phi)))
1547             continue;
1548
1549           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1550              (i.e., a phi in the tail of the outer-loop).  */
1551           if (! is_loop_header_bb_p (bb))
1552             {
1553               /* FORNOW: we currently don't support the case that these phis
1554                  are not used in the outerloop (unless it is double reduction,
1555                  i.e., this phi is vect_reduction_def), cause this case
1556                  requires to actually do something here.  */
1557               if (STMT_VINFO_LIVE_P (stmt_info)
1558                   && !vect_active_double_reduction_p (stmt_info))
1559                 {
1560                   if (dump_enabled_p ())
1561                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1562                                      "Unsupported loop-closed phi in "
1563                                      "outer-loop.\n");
1564                   return false;
1565                 }
1566
1567               /* If PHI is used in the outer loop, we check that its operand
1568                  is defined in the inner loop.  */
1569               if (STMT_VINFO_RELEVANT_P (stmt_info))
1570                 {
1571                   tree phi_op;
1572
1573                   if (gimple_phi_num_args (phi) != 1)
1574                     return false;
1575
1576                   phi_op = PHI_ARG_DEF (phi, 0);
1577                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1578                   if (!op_def_info)
1579                     return false;
1580
1581                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1582                       && (STMT_VINFO_RELEVANT (op_def_info)
1583                           != vect_used_in_outer_by_reduction))
1584                     return false;
1585                 }
1586
1587               continue;
1588             }
1589
1590           gcc_assert (stmt_info);
1591
1592           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1593                || STMT_VINFO_LIVE_P (stmt_info))
1594               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1595             {
1596               /* A scalar-dependence cycle that we don't support.  */
1597               if (dump_enabled_p ())
1598                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1599                                  "not vectorized: scalar dependence cycle.\n");
1600               return false;
1601             }
1602
1603           if (STMT_VINFO_RELEVANT_P (stmt_info))
1604             {
1605               need_to_vectorize = true;
1606               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1607                   && ! PURE_SLP_STMT (stmt_info))
1608                 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1609               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1610                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1611                        && ! PURE_SLP_STMT (stmt_info))
1612                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1613                                              &cost_vec);
1614             }
1615
1616           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1617           if (ok
1618               && STMT_VINFO_LIVE_P (stmt_info)
1619               && !PURE_SLP_STMT (stmt_info))
1620             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1621                                               &cost_vec);
1622
1623           if (!ok)
1624             {
1625               if (dump_enabled_p ())
1626                 {
1627                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1628                                    "not vectorized: relevant phi not "
1629                                    "supported: ");
1630                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1631                 }
1632               return false;
1633             }
1634         }
1635
1636       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1637            gsi_next (&si))
1638         {
1639           gimple *stmt = gsi_stmt (si);
1640           if (!gimple_clobber_p (stmt)
1641               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1642                                      &cost_vec))
1643             return false;
1644         }
1645     } /* bbs */
1646
1647   add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1648   cost_vec.release ();
1649
1650   /* All operations in the loop are either irrelevant (deal with loop
1651      control, or dead), or only used outside the loop and can be moved
1652      out of the loop (e.g. invariants, inductions).  The loop can be
1653      optimized away by scalar optimizations.  We're better off not
1654      touching this loop.  */
1655   if (!need_to_vectorize)
1656     {
1657       if (dump_enabled_p ())
1658         dump_printf_loc (MSG_NOTE, vect_location,
1659                          "All the computation can be taken out of the loop.\n");
1660       if (dump_enabled_p ())
1661         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1662                          "not vectorized: redundant loop. no profit to "
1663                          "vectorize.\n");
1664       return false;
1665     }
1666
1667   return true;
1668 }
1669
1670 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1671    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1672    definitely no, or -1 if it's worth retrying.  */
1673
1674 static int
1675 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1676 {
1677   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1678   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1679
1680   /* Only fully-masked loops can have iteration counts less than the
1681      vectorization factor.  */
1682   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1683     {
1684       HOST_WIDE_INT max_niter;
1685
1686       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1687         max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1688       else
1689         max_niter = max_stmt_executions_int (loop);
1690
1691       if (max_niter != -1
1692           && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1693         {
1694           if (dump_enabled_p ())
1695             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1696                              "not vectorized: iteration count smaller than "
1697                              "vectorization factor.\n");
1698           return 0;
1699         }
1700     }
1701
1702   int min_profitable_iters, min_profitable_estimate;
1703   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1704                                       &min_profitable_estimate);
1705
1706   if (min_profitable_iters < 0)
1707     {
1708       if (dump_enabled_p ())
1709         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1710                          "not vectorized: vectorization not profitable.\n");
1711       if (dump_enabled_p ())
1712         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1713                          "not vectorized: vector version will never be "
1714                          "profitable.\n");
1715       return -1;
1716     }
1717
1718   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1719                                * assumed_vf);
1720
1721   /* Use the cost model only if it is more conservative than user specified
1722      threshold.  */
1723   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1724                                     min_profitable_iters);
1725
1726   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1727
1728   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1729       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1730     {
1731       if (dump_enabled_p ())
1732         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1733                          "not vectorized: vectorization not profitable.\n");
1734       if (dump_enabled_p ())
1735         dump_printf_loc (MSG_NOTE, vect_location,
1736                          "not vectorized: iteration count smaller than user "
1737                          "specified loop bound parameter or minimum profitable "
1738                          "iterations (whichever is more conservative).\n");
1739       return 0;
1740     }
1741
1742   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1743   if (estimated_niter == -1)
1744     estimated_niter = likely_max_stmt_executions_int (loop);
1745   if (estimated_niter != -1
1746       && ((unsigned HOST_WIDE_INT) estimated_niter
1747           < MAX (th, (unsigned) min_profitable_estimate)))
1748     {
1749       if (dump_enabled_p ())
1750         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1751                          "not vectorized: estimated iteration count too "
1752                          "small.\n");
1753       if (dump_enabled_p ())
1754         dump_printf_loc (MSG_NOTE, vect_location,
1755                          "not vectorized: estimated iteration count smaller "
1756                          "than specified loop bound parameter or minimum "
1757                          "profitable iterations (whichever is more "
1758                          "conservative).\n");
1759       return -1;
1760     }
1761
1762   return 1;
1763 }
1764
1765 static bool
1766 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1767                            vec<data_reference_p> *datarefs,
1768                            unsigned int *n_stmts)
1769 {
1770   *n_stmts = 0;
1771   for (unsigned i = 0; i < loop->num_nodes; i++)
1772     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1773          !gsi_end_p (gsi); gsi_next (&gsi))
1774       {
1775         gimple *stmt = gsi_stmt (gsi);
1776         if (is_gimple_debug (stmt))
1777           continue;
1778         ++(*n_stmts);
1779         if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1780           {
1781             if (is_gimple_call (stmt) && loop->safelen)
1782               {
1783                 tree fndecl = gimple_call_fndecl (stmt), op;
1784                 if (fndecl != NULL_TREE)
1785                   {
1786                     cgraph_node *node = cgraph_node::get (fndecl);
1787                     if (node != NULL && node->simd_clones != NULL)
1788                       {
1789                         unsigned int j, n = gimple_call_num_args (stmt);
1790                         for (j = 0; j < n; j++)
1791                           {
1792                             op = gimple_call_arg (stmt, j);
1793                             if (DECL_P (op)
1794                                 || (REFERENCE_CLASS_P (op)
1795                                     && get_base_address (op)))
1796                               break;
1797                           }
1798                         op = gimple_call_lhs (stmt);
1799                         /* Ignore #pragma omp declare simd functions
1800                            if they don't have data references in the
1801                            call stmt itself.  */
1802                         if (j == n
1803                             && !(op
1804                                  && (DECL_P (op)
1805                                      || (REFERENCE_CLASS_P (op)
1806                                          && get_base_address (op)))))
1807                           continue;
1808                       }
1809                   }
1810               }
1811             return false;
1812           }
1813         /* If dependence analysis will give up due to the limit on the
1814            number of datarefs stop here and fail fatally.  */
1815         if (datarefs->length ()
1816             > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1817           return false;
1818       }
1819   return true;
1820 }
1821
1822 /* Function vect_analyze_loop_2.
1823
1824    Apply a set of analyses on the loop described by LOOP_VINFO.  The
1825    different analyses will record information in the loop_vec_info
1826    struct.

        FATAL is set to true while the first group of checks (those that
        are independent of the vector size) is running and to false once
        that group has passed, so after a false return it tells the caller
        whether the failure happened in that first group.

        N_STMTS is handed to vect_get_datarefs_in_loop, which stores a stmt
        count in it, when the data references have not been gathered yet;
        its value is later passed on to vect_analyze_slp.

        Return true if the loop can be vectorized and false otherwise.  For
        some failures that occur after SLP has been chosen, the state is
        rolled back and the analysis is re-tried once with SLP disabled
        (see the "again" label).  */
1827 static bool
1828 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1829 {
1830   bool ok;
1831   int res;
1832   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1833   poly_uint64 min_vf = 2;
1834
1835   /* The first group of checks is independent of the vector size.  */
1836   fatal = true;
1837
1838   /* Find all data references in the loop (which correspond to vdefs/vuses)
1839      and analyze their evolution in the loop.  */
1840
1841   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1842
1843   /* Gather the data references and count stmts in the loop.  */
1844   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1845     {
1846       if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1847                                       &LOOP_VINFO_DATAREFS (loop_vinfo),
1848                                       n_stmts))
1849         {
1850           if (dump_enabled_p ())
1851             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1852                              "not vectorized: loop contains function "
1853                              "calls or data references that cannot "
1854                              "be analyzed\n");
1855           return false;
1856         }
1857       loop_vinfo->shared->save_datarefs ();
1858     }
1859   else
1860     loop_vinfo->shared->check_datarefs ();
1861
1862   /* Analyze the data references and also adjust the minimal
1863      vectorization factor according to the loads and stores.  */
1864
1865   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1866   if (!ok)
1867     {
1868       if (dump_enabled_p ())
1869         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1870                          "bad data references.\n");
1871       return false;
1872     }
1873
1874   /* Classify all cross-iteration scalar data-flow cycles.
1875      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1876   vect_analyze_scalar_cycles (loop_vinfo);
1877
1878   vect_pattern_recog (loop_vinfo);
1879
1880   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1881
1882   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1883      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1884
1885   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1886   if (!ok)
1887     {
1888       if (dump_enabled_p ())
1889         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1890                          "bad data access.\n");
1891       return false;
1892     }
1893
1894   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1895
1896   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1897   if (!ok)
1898     {
1899       if (dump_enabled_p ())
1900         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1901                          "unexpected pattern.\n");
1902       return false;
1903     }
1904
1905   /* The rest of the analysis below depends on the vector size in some
          way, so a failure from here on is not reported as fatal.  */
1906   fatal = false;
1907
1908   /* Analyze data dependences between the data-refs in the loop
1909      and adjust the maximum vectorization factor according to
1910      the dependences.
1911      FORNOW: fail at the first data dependence that we encounter.  */
1912
1913   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1914   if (!ok
1915       || (max_vf != MAX_VECTORIZATION_FACTOR
1916           && maybe_lt (max_vf, min_vf)))
1917     {
1918       if (dump_enabled_p ())
1919             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1920                              "bad data dependence.\n");
1921       return false;
1922     }
1923   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1924
1925   ok = vect_determine_vectorization_factor (loop_vinfo);
1926   if (!ok)
1927     {
1928       if (dump_enabled_p ())
1929         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1930                          "can't determine vectorization factor.\n");
1931       return false;
1932     }
1933   if (max_vf != MAX_VECTORIZATION_FACTOR
1934       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1935     {
1936       if (dump_enabled_p ())
1937         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1938                          "bad data dependence.\n");
1939       return false;
1940     }
1941
1942   /* Compute the scalar iteration cost.  */
1943   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1944
       /* Remember the vectorization factor so that it can be restored if
          the analysis is re-tried with SLP disabled.  */
1945   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1946   unsigned th;
1947
1948   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1949   ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1950   if (!ok)
1951     return false;
1952
1953   /* If there are any SLP instances mark them as pure_slp.  */
1954   bool slp = vect_make_slp_decision (loop_vinfo);
1955   if (slp)
1956     {
1957       /* Find stmts that need to be both vectorized and SLPed.  */
1958       vect_detect_hybrid_slp (loop_vinfo);
1959
1960       /* Update the vectorization factor based on the SLP decision.  */
1961       vect_update_vf_for_slp (loop_vinfo);
1962     }
1963
       /* Remember the CAN_FULLY_MASK_P flag so that it can be restored when
          the analysis is re-tried with SLP disabled.  */
1964   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1965
1966   /* We don't expect to have to roll back to anything other than an empty
1967      set of rgroups.  */
1968   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1969
1970   /* This is the point where we can re-start analysis with SLP forced off.  */
1971 start_over:
1972
1973   /* Now the vectorization factor is final.  */
1974   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1975   gcc_assert (known_ne (vectorization_factor, 0U));
1976
1977   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1978     {
1979       dump_printf_loc (MSG_NOTE, vect_location,
1980                        "vectorization_factor = ");
1981       dump_dec (MSG_NOTE, vectorization_factor);
1982       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1983                    LOOP_VINFO_INT_NITERS (loop_vinfo));
1984     }
1985
       /* Used further down when deciding whether an epilogue loop is
          needed.  */
1986   HOST_WIDE_INT max_niter
1987     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1988
1989   /* Analyze the alignment of the data-refs in the loop.
1990      Fail if a data reference is found that cannot be vectorized.  */
1991
1992   ok = vect_analyze_data_refs_alignment (loop_vinfo);
1993   if (!ok)
1994     {
1995       if (dump_enabled_p ())
1996         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1997                          "bad data alignment.\n");
1998       return false;
1999     }
2000
2001   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2002      It is important to call pruning after vect_analyze_data_ref_accesses,
2003      since we use grouping information gathered by interleaving analysis.  */
2004   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2005   if (!ok)
2006     return false;
2007
2008   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2009      vectorization.  */
2010   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2011     {
2012     /* This pass will decide on using loop versioning and/or loop peeling in
2013        order to enhance the alignment of data references in the loop.  */
2014     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2015     if (!ok)
2016       {
2017         if (dump_enabled_p ())
2018           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2019                            "bad data alignment.\n");
2020         return false;
2021       }
2022     }
2023
2024   if (slp)
2025     {
2026       /* Analyze operations in the SLP instances.  Note this may
2027          remove unsupported SLP instances which makes the above
2028          SLP kind detection invalid.  */
2029       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2030       vect_slp_analyze_operations (loop_vinfo);
2031       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2032         goto again;
2033     }
2034
2035   /* Scan all the remaining operations in the loop that are not subject
2036      to SLP and make sure they are vectorizable.  */
2037   ok = vect_analyze_loop_operations (loop_vinfo);
2038   if (!ok)
2039     {
2040       if (dump_enabled_p ())
2041         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2042                          "bad operation or unsupported loop bound.\n");
2043       return false;
2044     }
2045
2046   /* Decide whether to use a fully-masked loop for this vectorization
2047      factor.  */
2048   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2049     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2050        && vect_verify_full_masking (loop_vinfo));
2051   if (dump_enabled_p ())
2052     {
2053       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2054         dump_printf_loc (MSG_NOTE, vect_location,
2055                          "using a fully-masked loop.\n");
2056       else
2057         dump_printf_loc (MSG_NOTE, vect_location,
2058                          "not using a fully-masked loop.\n");
2059     }
2060
2061   /* If epilog loop is required because of data accesses with gaps,
2062      one additional iteration needs to be peeled.  Check if there is
2063      enough iterations for vectorization.  */
2064   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2065       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2066       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2067     {
2068       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2069       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2070
2071       if (known_lt (wi::to_widest (scalar_niters), vf))
2072         {
2073           if (dump_enabled_p ())
2074             dump_printf_loc (MSG_NOTE, vect_location,
2075                              "loop has no enough iterations to support"
2076                              " peeling for gaps.\n");
2077           return false;
2078         }
2079     }
2080
2081   /* Check the costings of the loop make vectorizing worthwhile.  */
2082   res = vect_analyze_loop_costing (loop_vinfo);
       /* A negative result means it is worth retrying, a zero result is a
          definite rejection.  */
2083   if (res < 0)
2084     goto again;
2085   if (!res)
2086     {
2087       if (dump_enabled_p ())
2088         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2089                          "Loop costings not worthwhile.\n");
2090       return false;
2091     }
2092
2093   /* Decide whether we need to create an epilogue loop to handle
2094      remaining scalar iterations.  */
2095   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2096
2097   unsigned HOST_WIDE_INT const_vf;
2098   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2099     /* The main loop handles all iterations.  */
2100     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2101   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2102            && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2103     {
2104       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2105                        - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2106                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2107         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2108     }
2109   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2110            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2111            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2112                 < (unsigned) exact_log2 (const_vf))
2113                /* In case of versioning, check if the maximum number of
2114                   iterations is greater than th.  If they are identical,
2115                   the epilogue is unnecessary.  */
2116                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2117                    || ((unsigned HOST_WIDE_INT) max_niter
2118                        > (th / const_vf) * const_vf))))
2119     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2120
2121   /* If an epilogue loop is required make sure we can create one.  */
2122   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2123       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2124     {
2125       if (dump_enabled_p ())
2126         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2127       if (!vect_can_advance_ivs_p (loop_vinfo)
2128           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2129                                            single_exit (LOOP_VINFO_LOOP
2130                                                          (loop_vinfo))))
2131         {
2132           if (dump_enabled_p ())
2133             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2134                              "not vectorized: can't create required "
2135                              "epilog loop\n");
2136           goto again;
2137         }
2138     }
2139
2140   /* During peeling, we need to check if number of loop iterations is
2141      enough for both peeled prolog loop and vector loop.  This check
2142      can be merged along with threshold check of loop versioning, so
2143      increase threshold for this case if necessary.  */
2144   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2145     {
2146       poly_uint64 niters_th = 0;
2147
2148       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2149         {
2150           /* Niters for peeled prolog loop.  */
2151           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2152             {
2153               struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2154               tree vectype
2155                 = STMT_VINFO_VECTYPE (vinfo_for_stmt (vect_dr_stmt (dr)));
2156               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2157             }
2158           else
2159             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2160         }
2161
2162       /* Niters for at least one iteration of vectorized loop.  */
2163       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2164         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2165       /* One additional iteration because of peeling for gap.  */
2166       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2167         niters_th += 1;
2168       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2169     }
2170
       /* The factor read at start_over must still be the one recorded in
          LOOP_VINFO.  */
2171   gcc_assert (known_eq (vectorization_factor,
2172                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2173
2174   /* Ok to vectorize!  */
2175   return true;
2176
2177 again:
2178   /* Try again with SLP forced off but if we didn't do any SLP there is
2179      no point in re-trying.  */
2180   if (!slp)
2181     return false;
2182
2183   /* If there are reduction chains re-trying will fail anyway.  */
2184   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2185     return false;
2186
2187   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2188      via interleaving or lane instructions.  */
2189   slp_instance instance;
2190   slp_tree node;
2191   unsigned i, j;
2192   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2193     {
2194       stmt_vec_info vinfo;
2195       vinfo = vinfo_for_stmt
2196           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
           /* Only instances rooted at a grouped access need checking.  */
2197       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2198         continue;
2199       vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2200       unsigned int size = DR_GROUP_SIZE (vinfo);
2201       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2202       if (! vect_store_lanes_supported (vectype, size, false)
2203          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2204          && ! vect_grouped_store_supported (vectype, size))
2205        return false;
2206       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2207         {
2208           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2209           vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2210           bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2211           size = DR_GROUP_SIZE (vinfo);
2212           vectype = STMT_VINFO_VECTYPE (vinfo);
2213           if (! vect_load_lanes_supported (vectype, size, false)
2214               && ! vect_grouped_load_supported (vectype, single_element_p,
2215                                                 size))
2216             return false;
2217         }
2218     }
2219
2220   if (dump_enabled_p ())
2221     dump_printf_loc (MSG_NOTE, vect_location,
2222                      "re-trying with SLP disabled\n");
2223
2224   /* Roll back state appropriately.  No SLP this time.  */
2225   slp = false;
2226   /* Restore vectorization factor as it were without SLP.  */
2227   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2228   /* Free the SLP instances.  */
2229   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2230     vect_free_slp_instance (instance, false);
2231   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2232   /* Reset SLP type to loop_vect on all stmts.  */
2233   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2234     {
2235       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2236       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2237            !gsi_end_p (si); gsi_next (&si))
2238         {
2239           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2240           STMT_SLP_TYPE (stmt_info) = loop_vect;
2241         }
2242       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2243            !gsi_end_p (si); gsi_next (&si))
2244         {
2245           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2246           STMT_SLP_TYPE (stmt_info) = loop_vect;
               /* Also reset the related pattern stmt and the stmts of the
                  pattern definition sequence.  */
2247           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2248             {
2249               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2250               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2251               STMT_SLP_TYPE (stmt_info) = loop_vect;
2252               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2253                    !gsi_end_p (pi); gsi_next (&pi))
2254                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2255                   = loop_vect;
2256             }
2257         }
2258     }
2259   /* Free optimized alias test DDRS.  */
2260   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2261   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2262   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2263   /* Reset target cost data.  */
2264   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2265   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2266     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2267   /* Reset accumulated rgroup information.  */
2268   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2269   /* Reset assorted flags.  */
2270   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2271   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2272   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2273   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2274   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2275
2276   goto start_over;
2277 }
2278
2279 /* Function vect_analyze_loop.
2280
2281    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2282    for it.  The different analyses will record information in the
2283    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2284    be vectorized.  */
2285 loop_vec_info
2286 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2287                    vec_info_shared *shared)
2288 {
2289   loop_vec_info loop_vinfo;
2290   auto_vector_sizes vector_sizes;
2291
2292   /* Autodetect first vector size we try.  */
2293   current_vector_size = 0;
2294   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2295   unsigned int next_size = 0;
2296
2297   DUMP_VECT_SCOPE ("analyze_loop_nest");
2298
2299   if (loop_outer (loop)
2300       && loop_vec_info_for_loop (loop_outer (loop))
2301       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2302     {
2303       if (dump_enabled_p ())
2304         dump_printf_loc (MSG_NOTE, vect_location,
2305                          "outer-loop already vectorized.\n");
2306       return NULL;
2307     }
2308
2309   if (!find_loop_nest (loop, &shared->loop_nest))
2310     {
2311       if (dump_enabled_p ())
2312         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2313                          "not vectorized: loop nest containing two "
2314                          "or more consecutive inner loops cannot be "
2315                          "vectorized\n");
2316       return NULL;
2317     }
2318
2319   unsigned n_stmts = 0;
2320   poly_uint64 autodetected_vector_size = 0;
2321   while (1)
2322     {
2323       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2324       loop_vinfo = vect_analyze_loop_form (loop, shared);
2325       if (!loop_vinfo)
2326         {
2327           if (dump_enabled_p ())
2328             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2329                              "bad loop form.\n");
2330           return NULL;
2331         }
2332
2333       bool fatal = false;
2334
2335       if (orig_loop_vinfo)
2336         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2337
2338       if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2339         {
2340           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2341
2342           return loop_vinfo;
2343         }
2344
2345       delete loop_vinfo;
2346
2347       if (next_size == 0)
2348         autodetected_vector_size = current_vector_size;
2349
2350       if (next_size < vector_sizes.length ()
2351           && known_eq (vector_sizes[next_size], autodetected_vector_size))
2352         next_size += 1;
2353
2354       if (fatal
2355           || next_size == vector_sizes.length ()
2356           || known_eq (current_vector_size, 0U))
2357         return NULL;
2358
2359       /* Try the next biggest vector size.  */
2360       current_vector_size = vector_sizes[next_size++];
2361       if (dump_enabled_p ())
2362         {
2363           dump_printf_loc (MSG_NOTE, vect_location,
2364                            "***** Re-trying analysis with "
2365                            "vector size ");
2366           dump_dec (MSG_NOTE, current_vector_size);
2367           dump_printf (MSG_NOTE, "\n");
2368         }
2369     }
2370 }
2371
2372 /* Look up the in-order (fold-left) reduction function for CODE.  If one
2373    exists, store it in *REDUC_FN and return true; otherwise return false
2374    and leave *REDUC_FN unwritten.  */
2375
2376 static bool
2377 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2378 {
2379   /* PLUS_EXPR is the only code handled here.  */
2380   if (code != PLUS_EXPR)
2381     return false;
2382
2383   *reduc_fn = IFN_FOLD_LEFT_PLUS;
2384   return true;
2385 }
2388
2389 /* Function reduction_fn_for_scalar_code
2390
2391    Input:
2392    CODE - tree_code of the scalar reduction operation.
2393
2394    Output:
2395    REDUC_FN - set to the internal function that reduces a vector of
2396       partial results to a single scalar, or to IFN_LAST when CODE is a
2397       supported reduction that has no such internal function.  Not
2398       written when the function returns false.
2399
2400    Return false if CODE currently cannot be vectorized as a reduction,
2401    true otherwise.  */
2402
2403 static bool
2404 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2405 {
2406   internal_fn fn;
2407
2408   switch (code)
2409     {
2410     case MAX_EXPR:
2411       fn = IFN_REDUC_MAX;
2412       break;
2413     case MIN_EXPR:
2414       fn = IFN_REDUC_MIN;
2415       break;
2416     case PLUS_EXPR:
2417       fn = IFN_REDUC_PLUS;
2418       break;
2419     case BIT_AND_EXPR:
2420       fn = IFN_REDUC_AND;
2421       break;
2422     case BIT_IOR_EXPR:
2423       fn = IFN_REDUC_IOR;
2424       break;
2425     case BIT_XOR_EXPR:
2426       fn = IFN_REDUC_XOR;
2427       break;
2428     case MULT_EXPR:
2429     case MINUS_EXPR:
2430       /* Supported, but without a dedicated reduction function.  */
2431       fn = IFN_LAST;
2432       break;
2433     default:
2434       return false;
2435     }
2436
2437   *reduc_fn = fn;
2438   return true;
2439 }
2440
2441 /* Return a neutral value for SLP reduction SLP_NODE: a value X such that
2442    padding the reduction with extra X elements does not change its
2443    result.  Return null if there is no such value.  CODE is the code of
2444    the reduction.  REDUC_CHAIN is true if the SLP statements together
2445    perform a single reduction and false if each statement performs an
2446    independent reduction.  */
2447
2448 static tree
2449 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2450                               bool reduc_chain)
2451 {
2452   vec<gimple *> scalar_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2453   gimple *first_stmt = scalar_stmts[0];
2454   tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (first_stmt));
2455   tree elt_type = TREE_TYPE (vectype);
2456   struct loop *father = gimple_bb (first_stmt)->loop_father;
2457   gcc_assert (father);
2458
2459   switch (code)
2460     {
2461     case MULT_EXPR:
2462       return build_one_cst (elt_type);
2463
2464     case BIT_AND_EXPR:
2465       return build_all_ones_cst (elt_type);
2466
2467     case WIDEN_SUM_EXPR:
2468     case DOT_PROD_EXPR:
2469     case SAD_EXPR:
2470     case PLUS_EXPR:
2471     case MINUS_EXPR:
2472     case BIT_IOR_EXPR:
2473     case BIT_XOR_EXPR:
2474       return build_zero_cst (elt_type);
2475
2476     case MAX_EXPR:
2477     case MIN_EXPR:
2478       /* The initial value is neutral for MIN/MAX.  Only a reduction chain
2479          has a single initial value that is shared by all statements.  */
2480       return (reduc_chain
2481               ? PHI_ARG_DEF_FROM_EDGE (first_stmt,
2482                                        loop_preheader_edge (father))
2483               : NULL_TREE);
2484
2485     default:
2486       return NULL_TREE;
2487     }
2488 }
2489
2490 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2491    STMT is printed with a message MSG.
        MSG is emitted first, at vect_location, through a %s conversion (so it
        is never interpreted as a format string), immediately followed by a
        TDF_SLIM dump of STMT.  Both parts use dump kind MSG_TYPE.  This
        helper does not test dump_enabled_p itself; the callers in
        vect_is_simple_reduction do that before calling it.  */
2492
2493 static void
2494 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2495 {
2496   dump_printf_loc (msg_type, vect_location, "%s", msg);
2497   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2498 }
2499
2500 /* Return true if the value defined by DEF_STMT_INFO, a statement in a loop
2501    containing a potential reduction, may be accumulated by that reduction.  */
2502
2503 static bool
2504 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2505 {
2506   gimple *def = def_stmt_info->stmt;
2507   if (is_gimple_assign (def) || is_gimple_call (def)
2508       || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def)
2509     return true;
2510   return (gimple_code (def) == GIMPLE_PHI
2511           && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2512           && !is_loop_header_bb_p (gimple_bb (def)));
2513 }
2514
2515 /* Detect SLP reduction of the form:
2516
2517    #a1 = phi <a5, a0>
2518    a2 = operation (a1)
2519    a3 = operation (a2)
2520    a4 = operation (a3)
2521    a5 = operation (a4)
2522
2523    #a = phi <a5>
2524
2525    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2526    FIRST_STMT is the first reduction stmt in the chain
2527    (a2 = operation (a1)).
2528
2529    Return TRUE if a reduction chain was detected.

        LOOP_INFO describes the loop being vectorized; the function returns
        false at once unless PHI belongs to LOOP_VINFO_LOOP (LOOP_INFO) itself.
        Only the rhs code of FIRST_STMT is read here: every statement of the
        chain must be an assignment with that same code.  A chain must contain
        at least two statements.

        Side effects: while walking the chain the statements are linked through
        REDUC_GROUP_FIRST_ELEMENT and REDUC_GROUP_NEXT_ELEMENT, and operands may
        be swapped in place so that the reduction operand becomes the second
        operand.  On success the first statement of the chain is pushed onto
        LOOP_VINFO_REDUCTION_CHAINS and REDUC_GROUP_SIZE is recorded on it.  On
        failure the links and swaps made so far are not undone by this
        function.  */
2530
2531 static bool
2532 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2533                        gimple *first_stmt)
2534 {
2535   struct loop *loop = (gimple_bb (phi))->loop_father;
2536   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2537   enum tree_code code;
2538   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2539   stmt_vec_info use_stmt_info, current_stmt_info;
2540   tree lhs;
2541   imm_use_iterator imm_iter;
2542   use_operand_p use_p;
2543   int nloop_uses, size = 0, n_out_of_loop_uses;
2544   bool found = false;
2545
       /* Only a PHI of the vectorized loop itself is considered.  */
2546   if (loop != vect_loop)
2547     return false;
2548
       /* Walk forward from the PHI result, following the single in-loop use
          of each value, until the walk arrives back at PHI.  */
2549   lhs = PHI_RESULT (phi);
2550   code = gimple_assign_rhs_code (first_stmt);
2551   while (1)
2552     {
2553       nloop_uses = 0;
2554       n_out_of_loop_uses = 0;
2555       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2556         {
2557           gimple *use_stmt = USE_STMT (use_p);
2558           if (is_gimple_debug (use_stmt))
2559             continue;
2560
2561           /* Check if we got back to the reduction phi.  */
2562           if (use_stmt == phi)
2563             {
2564               loop_use_stmt = use_stmt;
2565               found = true;
2566               break;
2567             }
2568
2569           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2570             {
2571               loop_use_stmt = use_stmt;
2572               nloop_uses++;
2573             }
2574            else
2575              n_out_of_loop_uses++;
2576
2577            /* There can be either a single use in the loop or two uses in
2578               phi nodes.  */
2579            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2580              return false;
2581         }
2582
2583       if (found)
2584         break;
2585
2586       /* We reached a statement with no loop uses.  */
2587       if (nloop_uses == 0)
2588         return false;
2589
2590       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2591       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2592         return false;
2593
           /* Every link of the chain must be an assignment in LOOP that uses
              the same operation as FIRST_STMT.  */
2594       if (!is_gimple_assign (loop_use_stmt)
2595           || code != gimple_assign_rhs_code (loop_use_stmt)
2596           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2597         return false;
2598
2599       /* Insert USE_STMT into reduction chain.  */
2600       use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2601       if (current_stmt)
2602         {
2603           current_stmt_info = vinfo_for_stmt (current_stmt);
2604           REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2605           REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2606             = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2607         }
2608       else
2609         REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2610
2611       lhs = gimple_assign_lhs (loop_use_stmt);
2612       current_stmt = loop_use_stmt;
2613       size++;
2614    }
2615
       /* A chain needs at least two statements.  */
2616   if (!found || loop_use_stmt != phi || size < 2)
2617     return false;
2618
2619   /* Swap the operands, if needed, to make the reduction operand be the second
2620      operand.  */
2621   lhs = PHI_RESULT (phi);
2622   next_stmt = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2623   while (next_stmt)
2624     {
           /* The reduction operand is already second: only the other operand
              (rhs1) has to be validated.  */
2625       if (gimple_assign_rhs2 (next_stmt) == lhs)
2626         {
2627           tree op = gimple_assign_rhs1 (next_stmt);
2628           stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2629
2630           /* Check that the other def is either defined in the loop
2631              ("vect_internal_def"), or it's an induction (defined by a
2632              loop-header phi-node).  */
2633           if (def_stmt_info
2634               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2635               && vect_valid_reduction_input_p (def_stmt_info))
2636             {
2637               lhs = gimple_assign_lhs (next_stmt);
2638               next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2639               continue;
2640             }
2641
2642           return false;
2643         }
           /* Otherwise the reduction operand is taken to be rhs1: validate
              rhs2 and, if it is acceptable, exchange the two operands.  */
2644       else
2645         {
2646           tree op = gimple_assign_rhs2 (next_stmt);
2647           stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2648
2649           /* Check that the other def is either defined in the loop
2650             ("vect_internal_def"), or it's an induction (defined by a
2651             loop-header phi-node).  */
2652           if (def_stmt_info
2653               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2654               && vect_valid_reduction_input_p (def_stmt_info))
2655             {
2656               if (dump_enabled_p ())
2657                 {
2658                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2659                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2660                 }
2661
2662               swap_ssa_operands (next_stmt,
2663                                  gimple_assign_rhs1_ptr (next_stmt),
2664                                  gimple_assign_rhs2_ptr (next_stmt));
2665               update_stmt (next_stmt);
2666
                   /* Record that the swap left a constant as first operand.  */
2667               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2668                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2669             }
2670           else
2671             return false;
2672         }
2673
2674       lhs = gimple_assign_lhs (next_stmt);
2675       next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2676     }
2677
2678   /* Save the chain for further analysis in SLP detection.  */
2679   first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2680   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2681   REDUC_GROUP_SIZE (vinfo_for_stmt (first)) = size;
2682
2683   return true;
2684 }
2685
2686 /* Return true if operation CODE on type TYPE must be reduced in the
2687    original (in-order) sequence.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true
2688    if integer overflow must wrap.  */
2689
2690 static bool
2691 needs_fold_left_reduction_p (tree type, tree_code code,
2692                              bool need_wrapping_integral_overflow)
2693 {
2694   /* CHECKME: check for !flag_finite_math_only too?  */
2695   if (SCALAR_FLOAT_TYPE_P (type))
2696     {
2697       /* MIN and MAX are exempt; any other floating-point operation
2698          keeps its order unless flag_associative_math is set.  */
2699       if (code == MIN_EXPR || code == MAX_EXPR)
2700         return false;
2701       return !flag_associative_math;
2702     }
2703
2704   /* Integers: trapping overflow, or possible overflow of a non-wrapping
2705      type when wrapping is required, forces the original order.  */
2706   if (INTEGRAL_TYPE_P (type))
2707     return (!operation_no_trapping_overflow (type, code)
2708             || (need_wrapping_integral_overflow
2709                 && !TYPE_OVERFLOW_WRAPS (type)
2710                 && operation_can_overflow (code)));
2711
2712   /* Of the remaining types only saturating fixed-point ones need it.  */
2713   return SAT_FIXED_POINT_TYPE_P (type);
2714 }
2722
2723 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2724    reduction operation CODE has a handled computation expression.

        The function searches, depth first over SSA operands, for a chain of
        uses that leads from LOOP_ARG back to the result of PHI without leaving
        LOOP.  The chain found is then accepted only if every value on it has a
        single use and is consumed by an assignment whose rhs code is CODE.
        When CODE is PLUS_EXPR a MINUS_EXPR is tolerated as well, provided the
        reduction value is not negated an odd number of times along the chain.

        LOC is only used as the location of the dump output, which is produced
        when a dump file is open and TDF_DETAILS is set.  */
2725
2726 bool
2727 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2728                       tree loop_arg, enum tree_code code)
2729 {
       /* PATH is the chain of uses currently being explored; each entry keeps
          the operand iterator so that the search can resume at the next
          operand after backtracking.  */
2730   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2731   auto_bitmap visited;
2732   tree lookfor = PHI_RESULT (phi);
2733   ssa_op_iter curri;
2734   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
       /* Position CURR on the PHI argument that equals LOOP_ARG.  */
2735   while (USE_FROM_PTR (curr) != loop_arg)
2736     curr = op_iter_next_use (&curri);
       /* NOTE(review): presumably this marks the iterator over the PHI
          arguments as exhausted, so that backtracking to this first entry
          yields no further argument -- confirm against ssa_op_iter.  */
2737   curri.i = curri.numops;
2738   do
2739     {
2740       path.safe_push (std::make_pair (curri, curr));
2741       tree use = USE_FROM_PTR (curr);
           /* Reached the PHI result: PATH is complete.  */
2742       if (use == lookfor)
2743         break;
2744       gimple *def = SSA_NAME_DEF_STMT (use);
           /* A value without a defining statement in LOOP is a dead end:
              backtrack to the next unexplored operand.  */
2745       if (gimple_nop_p (def)
2746           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2747         {
2748 pop:
2749           do
2750             {
2751               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2752               curri = x.first;
2753               curr = x.second;
2754               do
2755                 curr = op_iter_next_use (&curri);
2756               /* Skip already visited or non-SSA operands (from iterating
2757                  over PHI args).  */
2758               while (curr != NULL_USE_OPERAND_P
2759                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2760                          || ! bitmap_set_bit (visited,
2761                                               SSA_NAME_VERSION
2762                                                 (USE_FROM_PTR (curr)))));
2763             }
2764           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
               /* Nothing left to explore: give up with an empty PATH.  */
2765           if (curr == NULL_USE_OPERAND_P)
2766             break;
2767         }
2768       else
2769         {
               /* Descend into the operands of DEF, starting with the first
                  SSA name that has not been visited yet.  */
2770           if (gimple_code (def) == GIMPLE_PHI)
2771             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2772           else
2773             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2774           while (curr != NULL_USE_OPERAND_P
2775                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2776                      || ! bitmap_set_bit (visited,
2777                                           SSA_NAME_VERSION
2778                                             (USE_FROM_PTR (curr)))))
2779             curr = op_iter_next_use (&curri);
2780           if (curr == NULL_USE_OPERAND_P)
2781             goto pop;
2782         }
2783     }
2784   while (1);
2785   if (dump_file && (dump_flags & TDF_DETAILS))
2786     {
2787       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2788       unsigned i;
2789       std::pair<ssa_op_iter, use_operand_p> *x;
2790       FOR_EACH_VEC_ELT (path, i, x)
2791         {
2792           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2793           dump_printf (MSG_NOTE, " ");
2794         }
2795       dump_printf (MSG_NOTE, "\n");
2796     }
2797
2798   /* Check whether the reduction path detected is valid.  */
       /* An empty PATH means the search failed.  Entry 0 is the latch
          argument of PHI itself, so the per-statement checks start at
          entry 1.  */
2799   bool fail = path.length () == 0;
2800   bool neg = false;
2801   for (unsigned i = 1; i < path.length (); ++i)
2802     {
2803       gimple *use_stmt = USE_STMT (path[i].second);
2804       tree op = USE_FROM_PTR (path[i].second);
2805       if (! has_single_use (op)
2806           || ! is_gimple_assign (use_stmt))
2807         {
2808           fail = true;
2809           break;
2810         }
2811       if (gimple_assign_rhs_code (use_stmt) != code)
2812         {
2813           if (code == PLUS_EXPR
2814               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2815             {
2816               /* Track whether we negate the reduction value each iteration.  */
2817               if (gimple_assign_rhs2 (use_stmt) == op)
2818                 neg = ! neg;
2819             }
2820           else
2821             {
2822               fail = true;
2823               break;
2824             }
2825         }
2826     }
2827   return ! fail && ! neg;
2828 }
2829
2830
2831 /* Function vect_is_simple_reduction
2832
2833    (1) Detect a cross-iteration def-use cycle that represents a simple
2834    reduction computation.  We look for the following pattern:
2835
2836    loop_header:
2837      a1 = phi < a0, a2 >
2838      a3 = ...
2839      a2 = operation (a3, a1)
2840
2841    or
2842
2843    a3 = ...
2844    loop_header:
2845      a1 = phi < a0, a2 >
2846      a2 = operation (a3, a1)
2847
2848    such that:
2849    1. operation is commutative and associative and it is safe to
2850       change the order of the computation
2851    2. no uses for a2 in the loop (a2 is used out of the loop)
2852    3. no uses of a1 in the loop besides the reduction operation
2853    4. no uses of a1 outside the loop.
2854
2855    Conditions 1,4 are tested here.
2856    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2857
2858    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2859    nested cycles.
2860
2861    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2862    reductions:
2863
2864      a1 = phi < a0, a2 >
2865      inner loop (def of a3)
2866      a2 = phi < a3 >
2867
2868    (4) Detect condition expressions, ie:
2869      for (int i = 0; i < N; i++)
2870        if (a[i] < val)
2871         ret_val = a[i];
2872

        LOOP_INFO describes the loop being vectorized and PHI is the candidate
        reduction phi; the loop containing PHI may be nested inside the
        vectorized loop.

        *DOUBLE_REDUC is first set to false and *V_REDUC_TYPE to
        TREE_CODE_REDUCTION.  *DOUBLE_REDUC becomes true when pattern (3) is
        found.  *V_REDUC_TYPE becomes COND_REDUCTION for a COND_EXPR whose loop
        is not nested in the vectorized loop, or FOLD_LEFT_REDUCTION when
        needs_fold_left_reduction_p reports that the order of the computation
        must be kept.  NEED_WRAPPING_INTEGRAL_OVERFLOW is forwarded to that
        predicate.

        Return the statement that defines the latch argument of PHI if a
        supported pattern was found, otherwise NULL.  Statements may be
        modified in place: operands can be swapped (and a comparison
        inverted) so that the reduction variable comes last, in which case
        LOOP_VINFO_OPERANDS_SWAPPED may be set.

2873 */
2874
2875 static gimple *
2876 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2877                           bool *double_reduc,
2878                           bool need_wrapping_integral_overflow,
2879                           enum vect_reduction_type *v_reduc_type)
2880 {
2881   struct loop *loop = (gimple_bb (phi))->loop_father;
2882   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2883   gimple *def_stmt, *phi_use_stmt = NULL;
2884   enum tree_code orig_code, code;
2885   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2886   tree type;
2887   int nloop_uses;
2888   tree name;
2889   imm_use_iterator imm_iter;
2890   use_operand_p use_p;
2891   bool phi_def;
2892
2893   *double_reduc = false;
2894   *v_reduc_type = TREE_CODE_REDUCTION;
2895
2896   tree phi_name = PHI_RESULT (phi);
2897   /* ???  If there are no uses of the PHI result the inner loop reduction
2898      won't be detected as possibly double-reduction by vectorizable_reduction
2899      because that tries to walk the PHI arg from the preheader edge which
2900      can be constant.  See PR60382.  */
2901   if (has_zero_uses (phi_name))
2902     return NULL;
       /* The PHI result may have at most one non-debug use, and that use
          must be inside LOOP.  */
2903   nloop_uses = 0;
2904   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2905     {
2906       gimple *use_stmt = USE_STMT (use_p);
2907       if (is_gimple_debug (use_stmt))
2908         continue;
2909
2910       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2911         {
2912           if (dump_enabled_p ())
2913             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2914                              "intermediate value used outside loop.\n");
2915
2916           return NULL;
2917         }
2918
2919       nloop_uses++;
2920       if (nloop_uses > 1)
2921         {
2922           if (dump_enabled_p ())
2923             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2924                              "reduction value used in loop.\n");
2925           return NULL;
2926         }
2927
2928       phi_use_stmt = use_stmt;
2929     }
2930
       /* The value carried around the latch edge must be an SSA name that is
          defined by an assignment or by a PHI located inside LOOP.  */
2931   edge latch_e = loop_latch_edge (loop);
2932   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2933   if (TREE_CODE (loop_arg) != SSA_NAME)
2934     {
2935       if (dump_enabled_p ())
2936         {
2937           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2938                            "reduction: not ssa_name: ");
2939           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2940           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2941         }
2942       return NULL;
2943     }
2944
2945   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2946   if (is_gimple_assign (def_stmt))
2947     {
2948       name = gimple_assign_lhs (def_stmt);
2949       phi_def = false;
2950     }
2951   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2952     {
2953       name = PHI_RESULT (def_stmt);
2954       phi_def = true;
2955     }
2956   else
2957     {
2958       if (dump_enabled_p ())
2959         {
2960           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2961                            "reduction: unhandled reduction operation: ");
2962           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2963         }
2964       return NULL;
2965     }
2966
2967   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2968     return NULL;
2969
       /* NAME may be used at most once inside LOOP.  Its uses outside LOOP
          are collected in LCPHIS; each of them is cast to a gphi.  */
2970   nloop_uses = 0;
2971   auto_vec<gphi *, 3> lcphis;
2972   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2973     {
2974       gimple *use_stmt = USE_STMT (use_p);
2975       if (is_gimple_debug (use_stmt))
2976         continue;
2977       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2978         nloop_uses++;
2979       else
2980         /* We can have more than one loop-closed PHI.  */
2981         lcphis.safe_push (as_a <gphi *> (use_stmt));
2982       if (nloop_uses > 1)
2983         {
2984           if (dump_enabled_p ())
2985             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2986                              "reduction used in loop.\n");
2987           return NULL;
2988         }
2989     }
2990
2991   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2992      defined in the inner loop.  */
2993   if (phi_def)
2994     {
2995       op1 = PHI_ARG_DEF (def_stmt, 0);
2996
2997       if (gimple_phi_num_args (def_stmt) != 1
2998           || TREE_CODE (op1) != SSA_NAME)
2999         {
3000           if (dump_enabled_p ())
3001             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3002                              "unsupported phi node definition.\n");
3003
3004           return NULL;
3005         }
3006
           /* A double reduction requires that the single PHI argument is
              defined by an assignment in the inner loop and that the use of
              the outer PHI result is in the inner loop too.  */
3007       gimple *def1 = SSA_NAME_DEF_STMT (op1);
3008       if (gimple_bb (def1)
3009           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3010           && loop->inner
3011           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3012           && is_gimple_assign (def1)
3013           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3014         {
3015           if (dump_enabled_p ())
3016             report_vect_op (MSG_NOTE, def_stmt,
3017                             "detected double reduction: ");
3018
3019           *double_reduc = true;
3020           return def_stmt;
3021         }
3022
3023       return NULL;
3024     }
3025
3026   /* If we are vectorizing an inner reduction we are executing that
3027      in the original order only in case we are not dealing with a
3028      double reduction.  */
3029   bool check_reduction = true;
3030   if (flow_loop_nested_p (vect_loop, loop))
3031     {
3032       gphi *lcphi;
3033       unsigned i;
3034       check_reduction = false;
           /* The order check is re-enabled as soon as a loop-closed PHI
              result has a non-debug use outside the vectorized loop.  */
3035       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3036         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3037           {
3038             gimple *use_stmt = USE_STMT (use_p);
3039             if (is_gimple_debug (use_stmt))
3040               continue;
3041             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3042               check_reduction = true;
3043           }
3044     }
3045
3046   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3047   code = orig_code = gimple_assign_rhs_code (def_stmt);
3048
3049   /* We can handle "res -= x[i]", which is non-associative by
3050      simply rewriting this into "res += -x[i]".  Avoid changing
3051      gimple instruction for the first simple tests and only do this
3052      if we're allowed to change code at all.  */
3053   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3054     code = PLUS_EXPR;
3055
       /* Pick the operands to analyze.  For a COND_EXPR, OP1 and OP2 are the
          two value arms while OP3 and OP4 hold the condition (or its two
          comparison operands), which must not involve the PHI result.  */
3056   if (code == COND_EXPR)
3057     {
3058       if (! nested_in_vect_loop)
3059         *v_reduc_type = COND_REDUCTION;
3060
3061       op3 = gimple_assign_rhs1 (def_stmt);
3062       if (COMPARISON_CLASS_P (op3))
3063         {
3064           op4 = TREE_OPERAND (op3, 1);
3065           op3 = TREE_OPERAND (op3, 0);
3066         }
3067       if (op3 == phi_name || op4 == phi_name)
3068         {
3069           if (dump_enabled_p ())
3070             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3071                             "reduction: condition depends on previous"
3072                             " iteration: ");
3073           return NULL;
3074         }
3075
3076       op1 = gimple_assign_rhs2 (def_stmt);
3077       op2 = gimple_assign_rhs3 (def_stmt);
3078     }
3079   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3080     {
3081       if (dump_enabled_p ())
3082         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3083                         "reduction: not commutative/associative: ");
3084       return NULL;
3085     }
3086   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3087     {
3088       op1 = gimple_assign_rhs1 (def_stmt);
3089       op2 = gimple_assign_rhs2 (def_stmt);
3090     }
3091   else
3092     {
3093       if (dump_enabled_p ())
3094         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3095                         "reduction: not handled operation: ");
3096       return NULL;
3097     }
3098
3099   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3100     {
3101       if (dump_enabled_p ())
3102         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3103                         "reduction: both uses not ssa_names: ");
3104
3105       return NULL;
3106     }
3107
       /* Every SSA-name operand must have a type compatible with the type
          of the result.  */
3108   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3109   if ((TREE_CODE (op1) == SSA_NAME
3110        && !types_compatible_p (type,TREE_TYPE (op1)))
3111       || (TREE_CODE (op2) == SSA_NAME
3112           && !types_compatible_p (type, TREE_TYPE (op2)))
3113       || (op3 && TREE_CODE (op3) == SSA_NAME
3114           && !types_compatible_p (type, TREE_TYPE (op3)))
3115       || (op4 && TREE_CODE (op4) == SSA_NAME
3116           && !types_compatible_p (type, TREE_TYPE (op4))))
3117     {
3118       if (dump_enabled_p ())
3119         {
3120           dump_printf_loc (MSG_NOTE, vect_location,
3121                            "reduction: multiple types: operation type: ");
3122           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3123           dump_printf (MSG_NOTE, ", operands types: ");
3124           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3125                              TREE_TYPE (op1));
3126           dump_printf (MSG_NOTE, ",");
3127           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3128                              TREE_TYPE (op2));
3129           if (op3)
3130             {
3131               dump_printf (MSG_NOTE, ",");
3132               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3133                                  TREE_TYPE (op3));
3134             }
3135
3136           if (op4)
3137             {
3138               dump_printf (MSG_NOTE, ",");
3139               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3140                                  TREE_TYPE (op4));
3141             }
3142           dump_printf (MSG_NOTE, "\n");
3143         }
3144
3145       return NULL;
3146     }
3147
3148   /* Check whether it's ok to change the order of the computation.
3149      Generally, when vectorizing a reduction we change the order of the
3150      computation.  This may change the behavior of the program in some
3151      cases, so we need to check that this is ok.  One exception is when
3152      vectorizing an outer-loop: the inner-loop is executed sequentially,
3153      and therefore vectorizing reductions in the inner-loop during
3154      outer-loop vectorization is safe.  */
3155   if (check_reduction
3156       && *v_reduc_type == TREE_CODE_REDUCTION
3157       && needs_fold_left_reduction_p (type, code,
3158                                       need_wrapping_integral_overflow))
3159     *v_reduc_type = FOLD_LEFT_REDUCTION;
3160
3161   /* Reduction is safe. We're dealing with one of the following:
3162      1) integer arithmetic and no trapv
3163      2) floating point arithmetic, and special flags permit this optimization
3164      3) nested cycle (i.e., outer loop vectorization).  */
3165   stmt_vec_info def1_info = loop_info->lookup_def (op1);
3166   stmt_vec_info def2_info = loop_info->lookup_def (op2);
3167   if (code != COND_EXPR && !def1_info && !def2_info)
3168     {
3169       if (dump_enabled_p ())
3170         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3171       return NULL;
3172     }
3173
3174   /* Check that one def is the reduction def, defined by PHI,
3175      the other def is either defined in the loop ("vect_internal_def"),
3176      or it's an induction (defined by a loop-header phi-node).  */
3177
       /* Case 1: the reduction variable is already the last operand.  */
3178   if (def2_info
3179       && def2_info->stmt == phi
3180       && (code == COND_EXPR
3181           || !def1_info
3182           || vect_valid_reduction_input_p (def1_info)))
3183     {
3184       if (dump_enabled_p ())
3185         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3186       return def_stmt;
3187     }
3188
       /* Case 2: the reduction variable is the first operand.  Unless the
          loop is nested in the vectorized loop or the original operation is
          a MINUS_EXPR, the operands are swapped.  */
3189   if (def1_info
3190       && def1_info->stmt == phi
3191       && (code == COND_EXPR
3192           || !def2_info
3193           || vect_valid_reduction_input_p (def2_info)))
3194     {
3195       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3196         {
3197           /* Check if we can swap operands (just for simplicity - so that
3198              the rest of the code can assume that the reduction variable
3199              is always the last (second) argument).  */
3200           if (code == COND_EXPR)
3201             {
3202               /* Swap cond_expr by inverting the condition.  */
3203               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3204               enum tree_code invert_code = ERROR_MARK;
3205               enum tree_code cond_code = TREE_CODE (cond_expr);
3206
3207               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3208                 {
3209                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3210                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3211                 }
3212               if (invert_code != ERROR_MARK)
3213                 {
3214                   TREE_SET_CODE (cond_expr, invert_code);
3215                   swap_ssa_operands (def_stmt,
3216                                      gimple_assign_rhs2_ptr (def_stmt),
3217                                      gimple_assign_rhs3_ptr (def_stmt));
3218                 }
3219               else
3220                 {
3221                   if (dump_enabled_p ())
3222                     report_vect_op (MSG_NOTE, def_stmt,
3223                                     "detected reduction: cannot swap operands "
3224                                     "for cond_expr");
3225                   return NULL;
3226                 }
3227             }
3228           else
3229             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3230                                gimple_assign_rhs2_ptr (def_stmt));
3231
3232           if (dump_enabled_p ())
3233             report_vect_op (MSG_NOTE, def_stmt,
3234                             "detected reduction: need to swap operands: ");
3235
3236           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3237             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3238         }
3239       else
3240         {
3241           if (dump_enabled_p ())
3242             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3243         }
3244
3245       return def_stmt;
3246     }
3247
3248   /* Try to find SLP reduction chain.  */
3249   if (! nested_in_vect_loop
3250       && code != COND_EXPR
3251       && orig_code != MINUS_EXPR
3252       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3253     {
3254       if (dump_enabled_p ())
3255         report_vect_op (MSG_NOTE, def_stmt,
3256                         "reduction: detected reduction chain: ");
3257
3258       return def_stmt;
3259     }
3260
3261   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3262   gimple *first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3263   while (first)
3264     {
3265       gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3266       REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3267       REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3268       first = next;
3269     }
3270
3271   /* Look for the expression computing loop_arg from loop PHI result.  */
3272   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3273                             code))
3274     return def_stmt;
3275
3276   if (dump_enabled_p ())
3277     {
3278       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3279                       "reduction: unknown pattern: ");
3280     }
3281
3282   return NULL;
3283 }
3284
3285 /* Wrapper around vect_is_simple_reduction, which will modify code
3286    in-place if it enables detection of more reductions.  Arguments
3287    as there.  */
3288
3289 gimple *
3290 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3291                              bool *double_reduc,
3292                              bool need_wrapping_integral_overflow)
3293 {
3294   enum vect_reduction_type v_reduc_type;
3295   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3296                                           need_wrapping_integral_overflow,
3297                                           &v_reduc_type);
3298   if (def)
3299     {
3300       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3301       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3302       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3303       reduc_def_info = vinfo_for_stmt (def);
3304       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3305       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3306     }
3307   return def;
3308 }
3309
3310 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3311 int
3312 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3313                              int *peel_iters_epilogue,
3314                              stmt_vector_for_cost *scalar_cost_vec,
3315                              stmt_vector_for_cost *prologue_cost_vec,
3316                              stmt_vector_for_cost *epilogue_cost_vec)
3317 {
3318   int retval = 0;
3319   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3320
3321   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3322     {
3323       *peel_iters_epilogue = assumed_vf / 2;
3324       if (dump_enabled_p ())
3325         dump_printf_loc (MSG_NOTE, vect_location,
3326                          "cost model: epilogue peel iters set to vf/2 "
3327                          "because loop iterations are unknown .\n");
3328
3329       /* If peeled iterations are known but number of scalar loop
3330          iterations are unknown, count a taken branch per peeled loop.  */
3331       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3332                                  NULL, 0, vect_prologue);
3333       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3334                                  NULL, 0, vect_epilogue);
3335     }
3336   else
3337     {
3338       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3339       peel_iters_prologue = niters < peel_iters_prologue ?
3340                             niters : peel_iters_prologue;
3341       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3342       /* If we need to peel for gaps, but no peeling is required, we have to
3343          peel VF iterations.  */
3344       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3345         *peel_iters_epilogue = assumed_vf;
3346     }
3347
3348   stmt_info_for_cost *si;
3349   int j;
3350   if (peel_iters_prologue)
3351     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3352         {
3353           stmt_vec_info stmt_info
3354             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3355           retval += record_stmt_cost (prologue_cost_vec,
3356                                       si->count * peel_iters_prologue,
3357                                       si->kind, stmt_info, si->misalign,
3358                                       vect_prologue);
3359         }
3360   if (*peel_iters_epilogue)
3361     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3362         {
3363           stmt_vec_info stmt_info
3364             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3365           retval += record_stmt_cost (epilogue_cost_vec,
3366                                       si->count * *peel_iters_epilogue,
3367                                       si->kind, stmt_info, si->misalign,
3368                                       vect_epilogue);
3369         }
3370
3371   return retval;
3372 }
3373
3374 /* Function vect_estimate_min_profitable_iters
3375
3376    Return the number of iterations required for the vector version of the
3377    loop to be profitable relative to the cost of the scalar version of the
3378    loop.
3379
3380    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3381    of iterations for vectorization.  -1 value means loop vectorization
3382    is not profitable.  This returned value may be used for dynamic
3383    profitability check.
3384
3385    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3386    for static check against estimated number of iterations.  */
3387
3388 static void
3389 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3390                                     int *ret_min_profitable_niters,
3391                                     int *ret_min_profitable_estimate)
3392 {
3393   int min_profitable_iters;
3394   int min_profitable_estimate;
3395   int peel_iters_prologue;
3396   int peel_iters_epilogue;
3397   unsigned vec_inside_cost = 0;
3398   int vec_outside_cost = 0;
3399   unsigned vec_prologue_cost = 0;
3400   unsigned vec_epilogue_cost = 0;
3401   int scalar_single_iter_cost = 0;
3402   int scalar_outside_cost = 0;
3403   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3404   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3405   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3406
3407   /* Cost model disabled.  */
3408   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3409     {
3410       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3411       *ret_min_profitable_niters = 0;
3412       *ret_min_profitable_estimate = 0;
3413       return;
3414     }
3415
3416   /* Requires loop versioning tests to handle misalignment.  */
3417   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3418     {
3419       /*  FIXME: Make cost depend on complexity of individual check.  */
3420       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3421       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3422                             vect_prologue);
3423       dump_printf (MSG_NOTE,
3424                    "cost model: Adding cost of checks for loop "
3425                    "versioning to treat misalignment.\n");
3426     }
3427
3428   /* Requires loop versioning with alias checks.  */
3429   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3430     {
3431       /*  FIXME: Make cost depend on complexity of individual check.  */
3432       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3433       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3434                             vect_prologue);
3435       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3436       if (len)
3437         /* Count LEN - 1 ANDs and LEN comparisons.  */
3438         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3439                               NULL, 0, vect_prologue);
3440       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3441       if (len)
3442         {
3443           /* Count LEN - 1 ANDs and LEN comparisons.  */
3444           unsigned int nstmts = len * 2 - 1;
3445           /* +1 for each bias that needs adding.  */
3446           for (unsigned int i = 0; i < len; ++i)
3447             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3448               nstmts += 1;
3449           (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3450                                 NULL, 0, vect_prologue);
3451         }
3452       dump_printf (MSG_NOTE,
3453                    "cost model: Adding cost of checks for loop "
3454                    "versioning aliasing.\n");
3455     }
3456
3457   /* Requires loop versioning with niter checks.  */
3458   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3459     {
3460       /*  FIXME: Make cost depend on complexity of individual check.  */
3461       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3462                             vect_prologue);
3463       dump_printf (MSG_NOTE,
3464                    "cost model: Adding cost of checks for loop "
3465                    "versioning niters.\n");
3466     }
3467
3468   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3469     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3470                           vect_prologue);
3471
3472   /* Count statements in scalar loop.  Using this as scalar cost for a single
3473      iteration for now.
3474
3475      TODO: Add outer loop support.
3476
3477      TODO: Consider assigning different costs to different scalar
3478      statements.  */
3479
3480   scalar_single_iter_cost
3481     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3482
3483   /* Add additional cost for the peeled instructions in prologue and epilogue
3484      loop.  (For fully-masked loops there will be no peeling.)
3485
3486      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3487      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3488
3489      TODO: Build an expression that represents peel_iters for prologue and
3490      epilogue to be used in a run-time test.  */
3491
3492   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3493     {
3494       peel_iters_prologue = 0;
3495       peel_iters_epilogue = 0;
3496
3497       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3498         {
3499           /* We need to peel exactly one iteration.  */
3500           peel_iters_epilogue += 1;
3501           stmt_info_for_cost *si;
3502           int j;
3503           FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3504                             j, si)
3505             {
3506               struct _stmt_vec_info *stmt_info
3507                 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3508               (void) add_stmt_cost (target_cost_data, si->count,
3509                                     si->kind, stmt_info, si->misalign,
3510                                     vect_epilogue);
3511             }
3512         }
3513     }
3514   else if (npeel < 0)
3515     {
3516       peel_iters_prologue = assumed_vf / 2;
3517       dump_printf (MSG_NOTE, "cost model: "
3518                    "prologue peel iters set to vf/2.\n");
3519
3520       /* If peeling for alignment is unknown, loop bound of main loop becomes
3521          unknown.  */
3522       peel_iters_epilogue = assumed_vf / 2;
3523       dump_printf (MSG_NOTE, "cost model: "
3524                    "epilogue peel iters set to vf/2 because "
3525                    "peeling for alignment is unknown.\n");
3526
3527       /* If peeled iterations are unknown, count a taken branch and a not taken
3528          branch per peeled loop. Even if scalar loop iterations are known,
3529          vector iterations are not known since peeled prologue iterations are
3530          not known. Hence guards remain the same.  */
3531       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3532                             NULL, 0, vect_prologue);
3533       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3534                             NULL, 0, vect_prologue);
3535       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3536                             NULL, 0, vect_epilogue);
3537       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3538                             NULL, 0, vect_epilogue);
3539       stmt_info_for_cost *si;
3540       int j;
3541       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3542         {
3543           struct _stmt_vec_info *stmt_info
3544             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3545           (void) add_stmt_cost (target_cost_data,
3546                                 si->count * peel_iters_prologue,
3547                                 si->kind, stmt_info, si->misalign,
3548                                 vect_prologue);
3549           (void) add_stmt_cost (target_cost_data,
3550                                 si->count * peel_iters_epilogue,
3551                                 si->kind, stmt_info, si->misalign,
3552                                 vect_epilogue);
3553         }
3554     }
3555   else
3556     {
3557       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3558       stmt_info_for_cost *si;
3559       int j;
3560       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3561
3562       prologue_cost_vec.create (2);
3563       epilogue_cost_vec.create (2);
3564       peel_iters_prologue = npeel;
3565
3566       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3567                                           &peel_iters_epilogue,
3568                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3569                                             (loop_vinfo),
3570                                           &prologue_cost_vec,
3571                                           &epilogue_cost_vec);
3572
3573       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3574         {
3575           struct _stmt_vec_info *stmt_info
3576             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3577           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3578                                 si->misalign, vect_prologue);
3579         }
3580
3581       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3582         {
3583           struct _stmt_vec_info *stmt_info
3584             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3585           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3586                                 si->misalign, vect_epilogue);
3587         }
3588
3589       prologue_cost_vec.release ();
3590       epilogue_cost_vec.release ();
3591     }
3592
3593   /* FORNOW: The scalar outside cost is incremented in one of the
3594      following ways:
3595
3596      1. The vectorizer checks for alignment and aliasing and generates
3597      a condition that allows dynamic vectorization.  A cost model
3598      check is ANDED with the versioning condition.  Hence scalar code
3599      path now has the added cost of the versioning check.
3600
3601        if (cost > th & versioning_check)
3602          jmp to vector code
3603
3604      Hence run-time scalar is incremented by not-taken branch cost.
3605
3606      2. The vectorizer then checks if a prologue is required.  If the
3607      cost model check was not done before during versioning, it has to
3608      be done before the prologue check.
3609
3610        if (cost <= th)
3611          prologue = scalar_iters
3612        if (prologue == 0)
3613          jmp to vector code
3614        else
3615          execute prologue
3616        if (prologue == num_iters)
3617          go to exit
3618
3619      Hence the run-time scalar cost is incremented by a taken branch,
3620      plus a not-taken branch, plus a taken branch cost.
3621
3622      3. The vectorizer then checks if an epilogue is required.  If the
3623      cost model check was not done before during prologue check, it
3624      has to be done with the epilogue check.
3625
3626        if (prologue == 0)
3627          jmp to vector code
3628        else
3629          execute prologue
3630        if (prologue == num_iters)
3631          go to exit
3632        vector code:
3633          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3634            jmp to epilogue
3635
3636      Hence the run-time scalar cost should be incremented by 2 taken
3637      branches.
3638
3639      TODO: The back end may reorder the BBS's differently and reverse
3640      conditions/branch directions.  Change the estimates below to
3641      something more reasonable.  */
3642
3643   /* If the number of iterations is known and we do not do versioning, we can
3644      decide whether to vectorize at compile time.  Hence the scalar version
3645      do not carry cost model guard costs.  */
3646   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3647       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3648     {
3649       /* Cost model check occurs at versioning.  */
3650       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3651         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3652       else
3653         {
3654           /* Cost model check occurs at prologue generation.  */
3655           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3656             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3657               + vect_get_stmt_cost (cond_branch_not_taken);
3658           /* Cost model check occurs at epilogue generation.  */
3659           else
3660             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3661         }
3662     }
3663
3664   /* Complete the target-specific cost calculations.  */
3665   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3666                &vec_inside_cost, &vec_epilogue_cost);
3667
3668   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3669
3670   if (dump_enabled_p ())
3671     {
3672       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3673       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3674                    vec_inside_cost);
3675       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3676                    vec_prologue_cost);
3677       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3678                    vec_epilogue_cost);
3679       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3680                    scalar_single_iter_cost);
3681       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3682                    scalar_outside_cost);
3683       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3684                    vec_outside_cost);
3685       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3686                    peel_iters_prologue);
3687       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3688                    peel_iters_epilogue);
3689     }
3690
3691   /* Calculate number of iterations required to make the vector version
3692      profitable, relative to the loop bodies only.  The following condition
3693      must hold true:
3694      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3695      where
3696      SIC = scalar iteration cost, VIC = vector iteration cost,
3697      VOC = vector outside cost, VF = vectorization factor,
3698      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3699      SOC = scalar outside cost for run time cost model check.  */
3700
3701   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3702     {
3703       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3704                               * assumed_vf
3705                               - vec_inside_cost * peel_iters_prologue
3706                               - vec_inside_cost * peel_iters_epilogue);
3707       if (min_profitable_iters <= 0)
3708         min_profitable_iters = 0;
3709       else
3710         {
3711           min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3712                                    - vec_inside_cost);
3713
3714           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3715               <= (((int) vec_inside_cost * min_profitable_iters)
3716                   + (((int) vec_outside_cost - scalar_outside_cost)
3717                      * assumed_vf)))
3718             min_profitable_iters++;
3719         }
3720     }
3721   /* vector version will never be profitable.  */
3722   else
3723     {
3724       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3725         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3726                     "vectorization did not happen for a simd loop");
3727
3728       if (dump_enabled_p ())
3729         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3730                          "cost model: the vector iteration cost = %d "
3731                          "divided by the scalar iteration cost = %d "
3732                          "is greater or equal to the vectorization factor = %d"
3733                          ".\n",
3734                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3735       *ret_min_profitable_niters = -1;
3736       *ret_min_profitable_estimate = -1;
3737       return;
3738     }
3739
3740   dump_printf (MSG_NOTE,
3741                "  Calculated minimum iters for profitability: %d\n",
3742                min_profitable_iters);
3743
3744   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3745       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3746     /* We want the vectorized loop to execute at least once.  */
3747     min_profitable_iters = assumed_vf + peel_iters_prologue;
3748
3749   if (dump_enabled_p ())
3750     dump_printf_loc (MSG_NOTE, vect_location,
3751                      "  Runtime profitability threshold = %d\n",
3752                      min_profitable_iters);
3753
3754   *ret_min_profitable_niters = min_profitable_iters;
3755
3756   /* Calculate number of iterations required to make the vector version
3757      profitable, relative to the loop bodies only.
3758
3759      Non-vectorized variant is SIC * niters and it must win over vector
3760      variant on the expected loop trip count.  The following condition must hold true:
3761      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3762
3763   if (vec_outside_cost <= 0)
3764     min_profitable_estimate = 0;
3765   else
3766     {
3767       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3768                                  * assumed_vf
3769                                  - vec_inside_cost * peel_iters_prologue
3770                                  - vec_inside_cost * peel_iters_epilogue)
3771                                  / ((scalar_single_iter_cost * assumed_vf)
3772                                    - vec_inside_cost);
3773     }
3774   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3775   if (dump_enabled_p ())
3776     dump_printf_loc (MSG_NOTE, vect_location,
3777                      "  Static estimate profitability threshold = %d\n",
3778                      min_profitable_estimate);
3779
3780   *ret_min_profitable_estimate = min_profitable_estimate;
3781 }
3782
3783 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3784    vector elements (not bits) for a vector with NELT elements.  */
3785 static void
3786 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3787                               vec_perm_builder *sel)
3788 {
3789   /* The encoding is a single stepped pattern.  Any wrap-around is handled
3790      by vec_perm_indices.  */
3791   sel->new_vector (nelt, 1, 3);
3792   for (unsigned int i = 0; i < 3; i++)
3793     sel->quick_push (i + offset);
3794 }
3795
3796 /* Checks whether the target supports whole-vector shifts for vectors of mode
3797    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3798    it supports vec_perm_const with masks for all necessary shift amounts.  */
3799 static bool
3800 have_whole_vector_shift (machine_mode mode)
3801 {
3802   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3803     return true;
3804
3805   /* Variable-length vectors should be handled via the optab.  */
3806   unsigned int nelt;
3807   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3808     return false;
3809
3810   vec_perm_builder sel;
3811   vec_perm_indices indices;
3812   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3813     {
3814       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3815       indices.new_vector (sel, 2, nelt);
3816       if (!can_vec_perm_const_p (mode, indices, false))
3817         return false;
3818     }
3819   return true;
3820 }
3821
/* TODO: There is a close dependency between the vect_model_*_cost and
   vectorizable_* functions.  Find a better design to avoid maintenance
   issues.  */
3824
3825 /* Function vect_model_reduction_cost.
3826
3827    Models cost for a reduction operation, including the vector ops
3828    generated within the strip-mine loop, the initial definition before
3829    the loop, and the epilogue code that must be generated.  */
3830
3831 static void
3832 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3833                            int ncopies, stmt_vector_for_cost *cost_vec)
3834 {
3835   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3836   enum tree_code code;
3837   optab optab;
3838   tree vectype;
3839   gimple *orig_stmt;
3840   machine_mode mode;
3841   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3842   struct loop *loop = NULL;
3843
3844   if (loop_vinfo)
3845     loop = LOOP_VINFO_LOOP (loop_vinfo);
3846
3847   /* Condition reductions generate two reductions in the loop.  */
3848   vect_reduction_type reduction_type
3849     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3850   if (reduction_type == COND_REDUCTION)
3851     ncopies *= 2;
3852
3853   vectype = STMT_VINFO_VECTYPE (stmt_info);
3854   mode = TYPE_MODE (vectype);
3855   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3856
3857   if (!orig_stmt)
3858     orig_stmt = STMT_VINFO_STMT (stmt_info);
3859
3860   code = gimple_assign_rhs_code (orig_stmt);
3861
3862   if (reduction_type == EXTRACT_LAST_REDUCTION
3863       || reduction_type == FOLD_LEFT_REDUCTION)
3864     {
3865       /* No extra instructions needed in the prologue.  */
3866       prologue_cost = 0;
3867
3868       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3869         /* Count one reduction-like operation per vector.  */
3870         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3871                                         stmt_info, 0, vect_body);
3872       else
3873         {
3874           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
3875           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3876           inside_cost = record_stmt_cost (cost_vec, nelements,
3877                                           vec_to_scalar, stmt_info, 0,
3878                                           vect_body);
3879           inside_cost += record_stmt_cost (cost_vec, nelements,
3880                                            scalar_stmt, stmt_info, 0,
3881                                            vect_body);
3882         }
3883     }
3884   else
3885     {
3886       /* Add in cost for initial definition.
3887          For cond reduction we have four vectors: initial index, step,
3888          initial result of the data reduction, initial value of the index
3889          reduction.  */
3890       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3891       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3892                                          scalar_to_vec, stmt_info, 0,
3893                                          vect_prologue);
3894
3895       /* Cost of reduction op inside loop.  */
3896       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3897                                       stmt_info, 0, vect_body);
3898     }
3899
3900   /* Determine cost of epilogue code.
3901
3902      We have a reduction operator that will reduce the vector in one statement.
3903      Also requires scalar extract.  */
3904
3905   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3906     {
3907       if (reduc_fn != IFN_LAST)
3908         {
3909           if (reduction_type == COND_REDUCTION)
3910             {
3911               /* An EQ stmt and an COND_EXPR stmt.  */
3912               epilogue_cost += record_stmt_cost (cost_vec, 2,
3913                                                  vector_stmt, stmt_info, 0,
3914                                                  vect_epilogue);
3915               /* Reduction of the max index and a reduction of the found
3916                  values.  */
3917               epilogue_cost += record_stmt_cost (cost_vec, 2,
3918                                                  vec_to_scalar, stmt_info, 0,
3919                                                  vect_epilogue);
3920               /* A broadcast of the max value.  */
3921               epilogue_cost += record_stmt_cost (cost_vec, 1,
3922                                                  scalar_to_vec, stmt_info, 0,
3923                                                  vect_epilogue);
3924             }
3925           else
3926             {
3927               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3928                                                  stmt_info, 0, vect_epilogue);
3929               epilogue_cost += record_stmt_cost (cost_vec, 1,
3930                                                  vec_to_scalar, stmt_info, 0,
3931                                                  vect_epilogue);
3932             }
3933         }
3934       else if (reduction_type == COND_REDUCTION)
3935         {
3936           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3937           /* Extraction of scalar elements.  */
3938           epilogue_cost += record_stmt_cost (cost_vec,
3939                                              2 * estimated_nunits,
3940                                              vec_to_scalar, stmt_info, 0,
3941                                              vect_epilogue);
3942           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3943           epilogue_cost += record_stmt_cost (cost_vec,
3944                                              2 * estimated_nunits - 3,
3945                                              scalar_stmt, stmt_info, 0,
3946                                              vect_epilogue);
3947         }
3948       else if (reduction_type == EXTRACT_LAST_REDUCTION
3949                || reduction_type == FOLD_LEFT_REDUCTION)
3950         /* No extra instructions need in the epilogue.  */
3951         ;
3952       else
3953         {
3954           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3955           tree bitsize =
3956             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3957           int element_bitsize = tree_to_uhwi (bitsize);
3958           int nelements = vec_size_in_bits / element_bitsize;
3959
3960           if (code == COND_EXPR)
3961             code = MAX_EXPR;
3962
3963           optab = optab_for_tree_code (code, vectype, optab_default);
3964
3965           /* We have a whole vector shift available.  */
3966           if (optab != unknown_optab
3967               && VECTOR_MODE_P (mode)
3968               && optab_handler (optab, mode) != CODE_FOR_nothing
3969               && have_whole_vector_shift (mode))
3970             {
3971               /* Final reduction via vector shifts and the reduction operator.
3972                  Also requires scalar extract.  */
3973               epilogue_cost += record_stmt_cost (cost_vec,
3974                                                  exact_log2 (nelements) * 2,
3975                                                  vector_stmt, stmt_info, 0,
3976                                                  vect_epilogue);
3977               epilogue_cost += record_stmt_cost (cost_vec, 1,
3978                                                  vec_to_scalar, stmt_info, 0,
3979                                                  vect_epilogue);
3980             }
3981           else
3982             /* Use extracts and reduction op for final reduction.  For N
3983                elements, we have N extracts and N-1 reduction ops.  */
3984             epilogue_cost += record_stmt_cost (cost_vec,
3985                                                nelements + nelements - 1,
3986                                                vector_stmt, stmt_info, 0,
3987                                                vect_epilogue);
3988         }
3989     }
3990
3991   if (dump_enabled_p ())
3992     dump_printf (MSG_NOTE,
3993                  "vect_model_reduction_cost: inside_cost = %d, "
3994                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3995                  prologue_cost, epilogue_cost);
3996 }
3997
3998
3999 /* Function vect_model_induction_cost.
4000
4001    Models cost for induction operations; the costs go into COST_VEC.  */
4002
4003 static void
4004 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4005                            stmt_vector_for_cost *cost_vec)
4006 {
4007   if (PURE_SLP_STMT (stmt_info))
4008     return;
4009
4010   /* Loop body: one vector statement for each of the NCOPIES copies.  */
4011   unsigned body_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4012                                          stmt_info, 0, vect_body);
4013
4014   /* Prologue: two scalar-to-vector operations (vec_init and vec_step).  */
4015   unsigned setup_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4016                                           stmt_info, 0, vect_prologue);
4017
4018   if (!dump_enabled_p ())
4019     return;
4020
4021   dump_printf_loc (MSG_NOTE, vect_location,
4022                    "vect_model_induction_cost: inside_cost = %d, "
4023                    "prologue_cost = %d .\n", body_cost, setup_cost);
4024 }
4025
4026
4027
4028 /* Function get_initial_def_for_reduction
4029
4030    Input:
4031    STMT - a stmt that performs a reduction operation in the loop.
4032    INIT_VAL - the initial value of the reduction variable.
4033
4034    Output:
4035    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4036         of the reduction (used for adjusting the epilog - see below).
             May be NULL, in which case Option2 is used.  Otherwise it is set
             to INIT_VAL for the add/mult-like codes and to NULL_TREE for
             min/max/cond_expr.
4037    Return a vector variable, initialized according to the operation that STMT
4038         performs. This vector will be used as the initial value of the
4039         vector of partial results.  Statements built here to compute it
             are inserted on the preheader edge of LOOP_VINFO_LOOP.
4040
4041    Option1 (adjust in epilog): Initialize the vector as follows:
4042      add/sub/bit or/xor: [0,0,...,0,0]  (also widen_sum/dot_prod/sad)
4043      mult: [1,1,...,1,1]   bit and: [-1,-1,...,-1,-1]
4044      min/max/cond_expr:  [init_val,init_val,..,init_val,init_val]
4045    and when necessary (e.g. add/mult case) let the caller know
4046    that it needs to adjust the result by init_val.
4047
4048    Option2: Initialize the vector as follows:
4049      add/sub/bit or/xor: [init_val,0,0,...,0]
4050      mult: [init_val,1,1,...,1]   bit and: [init_val,-1,-1,...,-1]
4051      min/max/cond_expr:  [init_val,init_val,...,init_val]
4052    and no adjustments are needed.
4053
4054    For example, for the following code:
4055
4056    s = init_val;
4057    for (i=0;i<n;i++)
4058      s = s + a[i];
4059
4060    STMT is 's = s + a[i]', and the reduction variable is 's'.
4061    For a vector of 4 units, we want to return either [init_val,0,0,0],
4062    or [0,0,0,0] and let the caller know that it needs to adjust
4063    the result at the end by 'init_val'.
4064
4065    FORNOW, we are using the 'adjust in epilog' scheme (Option1), because
4066    this way the initialization vector is simpler (same element in all
4067    entries), if ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4068
4069    A cost model should help decide between these two schemes.  */
4070
4071 tree
4072 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4073                                tree *adjustment_def)
4074 {
4075   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4076   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4077   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4078   tree scalar_type = TREE_TYPE (init_val);
4079   tree vectype = get_vectype_for_scalar_type (scalar_type);
4080   enum tree_code code = gimple_assign_rhs_code (stmt);
4081   tree def_for_init;
4082   tree init_def;
       /* Filler value for the lanes that do not hold INIT_VAL; zero unless
          the switch below overrides it for MULT_EXPR or BIT_AND_EXPR.  */
4083   REAL_VALUE_TYPE real_init_val = dconst0;
4084   int int_init_val = 0;
       /* Statements generated here; inserted on the preheader edge at the
          end of the function.  */
4085   gimple_seq stmts = NULL;
4086
4087   gcc_assert (vectype);
4088
4089   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4090               || SCALAR_FLOAT_TYPE_P (scalar_type));
4091
       /* STMT must either be nested inside LOOP or sit directly in it.  */
4092   gcc_assert (nested_in_vect_loop_p (loop, stmt)
4093               || loop == (gimple_bb (stmt))->loop_father);
4094
4095   vect_reduction_type reduction_type
4096     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4097
4098   switch (code)
4099     {
4100     case WIDEN_SUM_EXPR:
4101     case DOT_PROD_EXPR:
4102     case SAD_EXPR:
4103     case PLUS_EXPR:
4104     case MINUS_EXPR:
4105     case BIT_IOR_EXPR:
4106     case BIT_XOR_EXPR:
4107     case MULT_EXPR:
4108     case BIT_AND_EXPR:
4109       {
4110         /* ADJUSTMENT_DEF is NULL when called from
4111            vect_create_epilog_for_reduction to vectorize double reduction.  */
4112         if (adjustment_def)
4113           *adjustment_def = init_val;
4114
             /* Pick the filler value: 1 for multiplication, -1 for bitwise
                and, otherwise the default of 0.  */
4115         if (code == MULT_EXPR)
4116           {
4117             real_init_val = dconst1;
4118             int_init_val = 1;
4119           }
4120
4121         if (code == BIT_AND_EXPR)
4122           int_init_val = -1;
4123
4124         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4125           def_for_init = build_real (scalar_type, real_init_val);
4126         else
4127           def_for_init = build_int_cst (scalar_type, int_init_val);
4128
4129         if (adjustment_def)
4130           /* Option1: every element, including the first, is DEF_FOR_INIT.  */
4131           init_def = gimple_build_vector_from_val (&stmts, vectype,
4132                                                    def_for_init);
4133         else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4134           {
4135             /* Option2 (variable length): the first element is INIT_VAL.
                    Start from a splat of DEF_FOR_INIT; presumably
                    VEC_SHL_INSERT then shifts it up one element and puts
                    INIT_VAL in element 0 - confirm against the internal
                    function's documentation.  */
4136             init_def = gimple_build_vector_from_val (&stmts, vectype,
4137                                                      def_for_init);
4138             init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4139                                      vectype, init_def, init_val);
4140           }
4141         else
4142           {
4143             /* Option2: the first element is INIT_VAL.  The builder is
                    given INIT_VAL followed by DEF_FOR_INIT; presumably the
                    (1, 2) encoding repeats the second value for all remaining
                    elements - confirm against tree_vector_builder.  */
4144             tree_vector_builder elts (vectype, 1, 2);
4145             elts.quick_push (init_val);
4146             elts.quick_push (def_for_init);
4147             init_def = gimple_build_vector (&stmts, &elts);
4148           }
4149       }
4150       break;
4151
4152     case MIN_EXPR:
4153     case MAX_EXPR:
4154     case COND_EXPR:
4155       {
             /* No epilog adjustment is ever requested for these codes.  When
                the caller passed ADJUSTMENT_DEF and this is neither a
                COND_REDUCTION nor an EXTRACT_LAST_REDUCTION, the vector def
                comes straight from vect_get_vec_def_for_operand.  In every
                other case INIT_VAL is converted to the vector element type
                and broadcast to all elements.  */
4156         if (adjustment_def)
4157           {
4158             *adjustment_def = NULL_TREE;
4159             if (reduction_type != COND_REDUCTION
4160                 && reduction_type != EXTRACT_LAST_REDUCTION)
4161               {
4162                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4163                 break;
4164               }
4165           }
4166         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4167         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4168       }
4169       break;
4170
4171     default:
4172       gcc_unreachable ();
4173     }
4174
       /* Emit whatever was built above on the loop preheader edge.  */
4175   if (stmts)
4176     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4177   return init_def;
4178 }
4179
4180 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4181    NUMBER_OF_VECTORS is the number of vector defs to create.
4182    If NEUTRAL_OP is nonnull, introducing extra elements of that
4183    value will not change the result.
        The vector defs are appended to VEC_OPRNDS with quick_push, so the
        caller must already have reserved room for NUMBER_OF_VECTORS
        entries; VEC_OPRNDS is only filled after all vectors have been built.
        REDUC_CHAIN is true for a reduction chain, which has only one
        initial value (taken from the first statement's PHI); NEUTRAL_OP
        must be nonnull in that case.
        New statements are inserted on the preheader edge of the loop that
        contains the first statement of SLP_NODE.  */
4184
4185 static void
4186 get_initial_defs_for_reduction (slp_tree slp_node,
4187                                 vec<tree> *vec_oprnds,
4188                                 unsigned int number_of_vectors,
4189                                 bool reduc_chain, tree neutral_op)
4190 {
4191   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4192   gimple *stmt = stmts[0];
4193   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4194   unsigned HOST_WIDE_INT nunits;
4195   unsigned j, number_of_places_left_in_vector;
4196   tree vector_type;
4197   tree vop;
4198   int group_size = stmts.length ();
4199   unsigned int vec_num, i;
4200   unsigned number_of_copies = 1;
       /* Vectors are collected here in creation order (last vector first)
          and copied to VEC_OPRNDS in reverse once the loop is done.  */
4201   vec<tree> voprnds;
4202   voprnds.create (number_of_vectors);
4203   struct loop *loop;
       /* Results of duplicate_and_interleave, computed at most once.  */
4204   auto_vec<tree, 16> permute_results;
4205
4206   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4207
4208   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4209
4210   loop = (gimple_bb (stmt))->loop_father;
4211   gcc_assert (loop);
4212   edge pe = loop_preheader_edge (loop);
4213
       /* A reduction chain fills every lane but one with NEUTRAL_OP, so it
          cannot be handled without one.  */
4214   gcc_assert (!reduc_chain || neutral_op);
4215
4216   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4217      created vectors. It is greater than 1 if unrolling is performed.
4218
4219      For example, we have two scalar operands, s1 and s2 (e.g., group of
4220      strided accesses of size two), while NUNITS is four (i.e., four scalars
4221      of this type can be packed in a vector).  The output vector will contain
4222      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4223      will be 2).
4224
4225      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4226      vectors containing the operands.
4227
4228      For example, NUNITS is four as before, and the group size is 8
4229      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4230      {s5, s6, s7, s8}.  */
4231
       /* For variable-length vectors, work in units of one group.  */
4232   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4233     nunits = group_size;
4234
4235   number_of_copies = nunits * number_of_vectors / group_size;
4236
4237   number_of_places_left_in_vector = nunits;
4238   bool constant_p = true;
4239   tree_vector_builder elts (vector_type, nunits, 1);
4240   elts.quick_grow (nunits);
4241   for (j = 0; j < number_of_copies; j++)
4242     {
           /* Walk the group backwards, filling ELTS from its last place to
              its first.  I is unsigned, so decrementing past zero presumably
              makes iterate () fail and ends the loop.  */
4243       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4244         {
4245           tree op;
4246           /* Get the def before the loop.  In reduction chain we have only
4247              one initial value.  */
4248           if ((j != (number_of_copies - 1)
4249                || (reduc_chain && i != 0))
4250               && neutral_op)
4251             op = neutral_op;
4252           else
4253             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4254
4255           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4256           number_of_places_left_in_vector--;
4257           elts[number_of_places_left_in_vector] = op;
4258           if (!CONSTANT_CLASS_P (op))
4259             constant_p = false;
4260
               /* ELTS is full: turn it into a vector def.  */
4261           if (number_of_places_left_in_vector == 0)
4262             {
4263               gimple_seq ctor_seq = NULL;
4264               tree init;
4265               if (constant_p && !neutral_op
4266                   ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4267                   : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4268                 /* Build the vector directly from ELTS.  */
4269                 init = gimple_build_vector (&ctor_seq, &elts);
4270               else if (neutral_op)
4271                 {
4272                   /* Build a vector of the neutral value and shift the
4273                      other elements into place.  Trailing elements that
                          are NEUTRAL_OP itself are skipped, since the splat
                          already provides them.  */
4274                   init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4275                                                        neutral_op);
4276                   int k = nunits;
4277                   while (k > 0 && elts[k - 1] == neutral_op)
4278                     k -= 1;
4279                   while (k > 0)
4280                     {
4281                       k -= 1;
4282                       init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4283                                            vector_type, init, elts[k]);
4284                     }
4285                 }
4286               else
4287                 {
4288                   /* First time round, duplicate ELTS to fill the
4289                      required number of vectors, then cherry pick the
4290                      appropriate result for each iteration.  The guard
                          tests PERMUTE_RESULTS because VEC_OPRNDS is not
                          filled until after this loop, so testing it would
                          repeat the duplication for every vector.  */
4291                   if (permute_results.is_empty ())
4292                     duplicate_and_interleave (&ctor_seq, vector_type, elts,
4293                                               number_of_vectors,
4294                                               permute_results);
4295                   init = permute_results[number_of_vectors - j - 1];
4296                 }
4297               if (ctor_seq != NULL)
4298                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4299               voprnds.quick_push (init);
4300
                   /* Start a fresh vector for the next group of scalars.  */
4301               number_of_places_left_in_vector = nunits;
4302               elts.new_vector (vector_type, nunits, 1);
4303               elts.quick_grow (nunits);
4304               constant_p = true;
4305             }
4306         }
4307     }
4308
4309   /* Since the vectors are created in the reverse order, we should invert
4310      them.  */
4311   vec_num = voprnds.length ();
4312   for (j = vec_num; j != 0; j--)
4313     {
4314       vop = voprnds[j - 1];
4315       vec_oprnds->quick_push (vop);
4316     }
4317
4318   voprnds.release ();
4319
4320   /* In case that VF is greater than the unrolling factor needed for the SLP
4321      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4322      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4323      to replicate the vectors.  With a neutral value the padding is a
          single shared splat of NEUTRAL_OP, built on first use; otherwise the
          first VEC_NUM vectors are repeated.  */
4324   tree neutral_vec = NULL;
4325   while (number_of_vectors > vec_oprnds->length ())
4326     {
4327       if (neutral_op)
4328         {
4329           if (!neutral_vec)
4330             {
4331               gimple_seq ctor_seq = NULL;
4332               neutral_vec = gimple_build_vector_from_val
4333                 (&ctor_seq, vector_type, neutral_op);
4334               if (ctor_seq != NULL)
4335                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4336             }
4337           vec_oprnds->quick_push (neutral_vec);
4338         }
4339       else
4340         {
4341           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4342             vec_oprnds->quick_push (vop);
4343         }
4344     }
4345 }
4346
4347
4348 /* Function vect_create_epilog_for_reduction
4349
4350    Create code at the loop-epilog to finalize the result of a reduction
4351    computation.
4352
4353    VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of
4354      the vector reduction statements.
4355    STMT is the scalar reduction stmt that is being vectorized.
4356    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4357      number of elements that we can fit in a vectype (nunits).  In this case
4358      we have to generate more than one vector stmt - i.e - we need to "unroll"
4359      the vector stmt by a factor VF/nunits.  For more details see documentation
4360      in vectorizable_operation.
4361    REDUC_FN is the internal function for the epilog reduction.
4362    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4363      computation.
4364    REDUC_INDEX is the index of the operand in the right hand side of the
4365      statement that is defined by REDUCTION_PHI.
4366    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4367    SLP_NODE is an SLP node containing a group of reduction statements. The
4368      first one in this group is STMT.
4369    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4370      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4371      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4372      any value of the IV in the loop.
4373    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4374    NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4375      null if this is not an SLP reduction.
4376
4377    This function:
4378    1. Creates the reduction def-use cycles: sets the arguments for
4379       REDUCTION_PHIS:
4380       The loop-entry argument is the vectorized initial-value of the reduction.
4381       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4382       sums.
4383    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4384       by calling the function specified by REDUC_FN if available, or by
4385       other means (whole-vector shifts or a scalar loop).
4386       The function also creates a new phi node at the loop exit to preserve
4387       loop-closed form, as illustrated below.
4388
4389      The flow at the entry to this function:
4390
4391         loop:
4392           vec_def = phi <null, null>            # REDUCTION_PHI
4393           VECT_DEF = vector_stmt                # vectorized form of STMT
4394           s_loop = scalar_stmt                  # (scalar) STMT
4395         loop_exit:
4396           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4397           use <s_out0>
4398           use <s_out0>
4399
4400      The above is transformed by this function into:
4401
4402         loop:
4403           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4404           VECT_DEF = vector_stmt                # vectorized form of STMT
4405           s_loop = scalar_stmt                  # (scalar) STMT
4406         loop_exit:
4407           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4408           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4409           v_out2 = reduce <v_out1>
4410           s_out3 = extract_field <v_out2, 0>
4411           s_out4 = adjust_result <s_out3>
4412           use <s_out4>
4413           use <s_out4>
4414 */
4415
4416 static void
4417 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4418                                   gimple *reduc_def_stmt,
4419                                   int ncopies, internal_fn reduc_fn,
4420                                   vec<gimple *> reduction_phis,
4421                                   bool double_reduc,
4422                                   slp_tree slp_node,
4423                                   slp_instance slp_node_instance,
4424                                   tree induc_val, enum tree_code induc_code,
4425                                   tree neutral_op)
4426 {
4427   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4428   stmt_vec_info prev_phi_info;
4429   tree vectype;
4430   machine_mode mode;
4431   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4432   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4433   basic_block exit_bb;
4434   tree scalar_dest;
4435   tree scalar_type;
4436   gimple *new_phi = NULL, *phi;
4437   gimple_stmt_iterator exit_gsi;
4438   tree vec_dest;
4439   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4440   gimple *epilog_stmt = NULL;
4441   enum tree_code code = gimple_assign_rhs_code (stmt);
4442   gimple *exit_phi;
4443   tree bitsize;
4444   tree adjustment_def = NULL;
4445   tree vec_initial_def = NULL;
4446   tree expr, def, initial_def = NULL;
4447   tree orig_name, scalar_result;
4448   imm_use_iterator imm_iter, phi_imm_iter;
4449   use_operand_p use_p, phi_use_p;
4450   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4451   bool nested_in_vect_loop = false;
4452   auto_vec<gimple *> new_phis;
4453   auto_vec<gimple *> inner_phis;
4454   enum vect_def_type dt = vect_unknown_def_type;
4455   int j, i;
4456   auto_vec<tree> scalar_results;
4457   unsigned int group_size = 1, k, ratio;
4458   auto_vec<tree> vec_initial_defs;
4459   auto_vec<gimple *> phis;
4460   bool slp_reduc = false;
4461   bool direct_slp_reduc;
4462   tree new_phi_result;
4463   gimple *inner_phi = NULL;
4464   tree induction_index = NULL_TREE;
4465
4466   if (slp_node)
4467     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4468
4469   if (nested_in_vect_loop_p (loop, stmt))
4470     {
4471       outer_loop = loop;
4472       loop = loop->inner;
4473       nested_in_vect_loop = true;
4474       gcc_assert (!slp_node);
4475     }
4476
4477   vectype = STMT_VINFO_VECTYPE (stmt_info);
4478   gcc_assert (vectype);
4479   mode = TYPE_MODE (vectype);
4480
4481   /* 1. Create the reduction def-use cycle:
4482      Set the arguments of REDUCTION_PHIS, i.e., transform
4483
4484         loop:
4485           vec_def = phi <null, null>            # REDUCTION_PHI
4486           VECT_DEF = vector_stmt                # vectorized form of STMT
4487           ...
4488
4489      into:
4490
4491         loop:
4492           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4493           VECT_DEF = vector_stmt                # vectorized form of STMT
4494           ...
4495
4496      (in case of SLP, do it for all the phis). */
4497
4498   /* Get the loop-entry arguments.  */
4499   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4500   if (slp_node)
4501     {
4502       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4503       vec_initial_defs.reserve (vec_num);
4504       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4505                                       &vec_initial_defs, vec_num,
4506                                       REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4507                                       neutral_op);
4508     }
4509   else
4510     {
4511       /* Get at the scalar def before the loop, that defines the initial value
4512          of the reduction variable.  */
4513       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4514                                            loop_preheader_edge (loop));
4515       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4516          and we can't use zero for induc_val, use initial_def.  Similarly
4517          for REDUC_MIN and initial_def larger than the base.  */
4518       if (TREE_CODE (initial_def) == INTEGER_CST
4519           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4520               == INTEGER_INDUC_COND_REDUCTION)
4521           && !integer_zerop (induc_val)
4522           && ((induc_code == MAX_EXPR
4523                && tree_int_cst_lt (initial_def, induc_val))
4524               || (induc_code == MIN_EXPR
4525                   && tree_int_cst_lt (induc_val, initial_def))))
4526         induc_val = initial_def;
4527
4528       if (double_reduc)
4529         /* In case of double reduction we only create a vector variable
4530            to be put in the reduction phi node.  The actual statement
4531            creation is done later in this function.  */
4532         vec_initial_def = vect_create_destination_var (initial_def, vectype);
4533       else if (nested_in_vect_loop)
4534         {
4535           /* Do not use an adjustment def as that case is not supported
4536              correctly if ncopies is not one.  */
4537           vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4538           vec_initial_def = vect_get_vec_def_for_operand (initial_def, stmt);
4539         }
4540       else
4541         vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4542                                                          &adjustment_def);
4543       vec_initial_defs.create (1);
4544       vec_initial_defs.quick_push (vec_initial_def);
4545     }
4546
4547   /* Set phi nodes arguments.  */
4548   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4549     {
4550       tree vec_init_def = vec_initial_defs[i];
4551       tree def = vect_defs[i];
4552       for (j = 0; j < ncopies; j++)
4553         {
4554           if (j != 0)
4555             {
4556               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4557               if (nested_in_vect_loop)
4558                 vec_init_def
4559                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4560                                                     vec_init_def);
4561             }
4562
4563           /* Set the loop-entry arg of the reduction-phi.  */
4564
4565           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4566               == INTEGER_INDUC_COND_REDUCTION)
4567             {
4568               /* Initialise the reduction phi to INDUC_VAL.  This prevents
4569                  non-zero initial values interfering with the reduction op.  */
4570               gcc_assert (ncopies == 1);
4571               gcc_assert (i == 0);
4572
4573               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4574               tree induc_val_vec
4575                 = build_vector_from_val (vec_init_def_type, induc_val);
4576
4577               add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4578                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4579             }
4580           else
4581             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4582                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4583
4584           /* Set the loop-latch arg for the reduction-phi.  */
4585           if (j > 0)
4586             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4587
4588           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4589                        UNKNOWN_LOCATION);
4590
4591           if (dump_enabled_p ())
4592             {
4593               dump_printf_loc (MSG_NOTE, vect_location,
4594                                "transform reduction: created def-use cycle: ");
4595               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4596               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4597             }
4598         }
4599     }
4600
4601   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4602      which is updated with the current index of the loop for every match of
4603      the original loop's cond_expr (VEC_STMT).  This results in a vector
4604      containing the last time the condition passed for that vector lane.
4605      The first match will be a 1 to allow 0 to be used for non-matching
4606      indexes.  If there are no matches at all then the vector will be all
4607      zeroes.  */
4608   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4609     {
4610       tree indx_before_incr, indx_after_incr;
4611       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4612
4613       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4614       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4615
4616       int scalar_precision
4617         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4618       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4619       tree cr_index_vector_type = build_vector_type
4620         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4621
4622       /* First we create a simple vector induction variable which starts
4623          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4624          vector size (STEP).  */
4625
4626       /* Create a {1,2,3,...} vector.  */
4627       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4628
4629       /* Create a vector of the step value.  */
4630       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4631       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4632
4633       /* Create an induction variable.  */
4634       gimple_stmt_iterator incr_gsi;
4635       bool insert_after;
4636       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4637       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4638                  insert_after, &indx_before_incr, &indx_after_incr);
4639
4640       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4641          filled with zeros (VEC_ZERO).  */
4642
4643       /* Create a vector of 0s.  */
4644       tree zero = build_zero_cst (cr_index_scalar_type);
4645       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4646
4647       /* Create a vector phi node.  */
4648       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4649       new_phi = create_phi_node (new_phi_tree, loop->header);
4650       loop_vinfo->add_stmt (new_phi);
4651       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4652                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4653
4654       /* Now take the condition from the loops original cond_expr
4655          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4656          every match uses values from the induction variable
4657          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4658          (NEW_PHI_TREE).
4659          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4660          the new cond_expr (INDEX_COND_EXPR).  */
4661
4662       /* Duplicate the condition from vec_stmt.  */
4663       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4664
4665       /* Create a conditional, where the condition is taken from vec_stmt
4666          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4667          else is the phi (NEW_PHI_TREE).  */
4668       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4669                                      ccompare, indx_before_incr,
4670                                      new_phi_tree);
4671       induction_index = make_ssa_name (cr_index_vector_type);
4672       gimple *index_condition = gimple_build_assign (induction_index,
4673                                                      index_cond_expr);
4674       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4675       stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4676       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4677
4678       /* Update the phi with the vec cond.  */
4679       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4680                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4681     }
4682
4683   /* 2. Create epilog code.
4684         The reduction epilog code operates across the elements of the vector
4685         of partial results computed by the vectorized loop.
4686         The reduction epilog code consists of:
4687
4688         step 1: compute the scalar result in a vector (v_out2)
4689         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4690         step 3: adjust the scalar result (s_out3) if needed.
4691
4692         Step 1 can be accomplished using one of the following three schemes:
4693           (scheme 1) using reduc_fn, if available.
4694           (scheme 2) using whole-vector shifts, if available.
4695           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4696                      combined.
4697
4698           The overall epilog code looks like this:
4699
4700           s_out0 = phi <s_loop>         # original EXIT_PHI
4701           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4702           v_out2 = reduce <v_out1>              # step 1
4703           s_out3 = extract_field <v_out2, 0>    # step 2
4704           s_out4 = adjust_result <s_out3>       # step 3
4705
4706           (step 3 is optional, and steps 1 and 2 may be combined).
4707           Lastly, the uses of s_out0 are replaced by s_out4.  */
4708
4709
4710   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4711          v_out1 = phi <VECT_DEF>
4712          Store them in NEW_PHIS.  */
4713
4714   exit_bb = single_exit (loop)->dest;
4715   prev_phi_info = NULL;
4716   new_phis.create (vect_defs.length ());
4717   FOR_EACH_VEC_ELT (vect_defs, i, def)
4718     {
4719       for (j = 0; j < ncopies; j++)
4720         {
4721           tree new_def = copy_ssa_name (def);
4722           phi = create_phi_node (new_def, exit_bb);
4723           stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4724           if (j == 0)
4725             new_phis.quick_push (phi);
4726           else
4727             {
4728               def = vect_get_vec_def_for_stmt_copy (dt, def);
4729               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4730             }
4731
4732           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4733           prev_phi_info = phi_info;
4734         }
4735     }
4736
4737   /* The epilogue is created for the outer-loop, i.e., for the loop being
4738      vectorized.  Create exit phis for the outer loop.  */
4739   if (double_reduc)
4740     {
4741       loop = outer_loop;
4742       exit_bb = single_exit (loop)->dest;
4743       inner_phis.create (vect_defs.length ());
4744       FOR_EACH_VEC_ELT (new_phis, i, phi)
4745         {
4746           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4747           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4748           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4749                            PHI_RESULT (phi));
4750           prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4751           inner_phis.quick_push (phi);
4752           new_phis[i] = outer_phi;
4753           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4754             {
4755               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4756               new_result = copy_ssa_name (PHI_RESULT (phi));
4757               outer_phi = create_phi_node (new_result, exit_bb);
4758               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4759                                PHI_RESULT (phi));
4760               stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4761               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4762               prev_phi_info = outer_phi_info;
4763             }
4764         }
4765     }
4766
4767   exit_gsi = gsi_after_labels (exit_bb);
4768
4769   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4770          (i.e. when reduc_fn is not available) and in the final adjustment
4771          code (if needed).  Also get the original scalar reduction variable as
4772          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4773          represents a reduction pattern), the tree-code and scalar-def are
4774          taken from the original stmt that the pattern-stmt (STMT) replaces.
4775          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4776          are taken from STMT.  */
4777
4778   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4779   if (!orig_stmt)
4780     {
4781       /* Regular reduction  */
4782       orig_stmt = stmt;
4783     }
4784   else
4785     {
4786       /* Reduction pattern  */
4787       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4788       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4789       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4790     }
4791
4792   code = gimple_assign_rhs_code (orig_stmt);
4793   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4794      partial results are added and not subtracted.  */
4795   if (code == MINUS_EXPR)
4796     code = PLUS_EXPR;
4797
4798   scalar_dest = gimple_assign_lhs (orig_stmt);
4799   scalar_type = TREE_TYPE (scalar_dest);
4800   scalar_results.create (group_size);
4801   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4802   bitsize = TYPE_SIZE (scalar_type);
4803
4804   /* In case this is a reduction in an inner-loop while vectorizing an outer
4805      loop - we don't need to extract a single scalar result at the end of the
4806      inner-loop (unless it is double reduction, i.e., the use of reduction is
4807      outside the outer-loop).  The final vector of partial results will be used
4808      in the vectorized outer-loop, or reduced to a scalar result at the end of
4809      the outer-loop.  */
4810   if (nested_in_vect_loop && !double_reduc)
4811     goto vect_finalize_reduction;
4812
4813   /* SLP reduction without reduction chain, e.g.,
4814      # a1 = phi <a2, a0>
4815      # b1 = phi <b2, b0>
4816      a2 = operation (a1)
4817      b2 = operation (b1)  */
4818   slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4819
4820   /* True if we should implement SLP_REDUC using native reduction operations
4821      instead of scalar operations.  */
4822   direct_slp_reduc = (reduc_fn != IFN_LAST
4823                       && slp_reduc
4824                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4825
4826   /* In case of reduction chain, e.g.,
4827      # a1 = phi <a3, a0>
4828      a2 = operation (a1)
4829      a3 = operation (a2),
4830
4831      we may end up with more than one vector result.  Here we reduce them to
4832      one vector.  */
4833   if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4834     {
4835       tree first_vect = PHI_RESULT (new_phis[0]);
4836       gassign *new_vec_stmt = NULL;
4837       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4838       for (k = 1; k < new_phis.length (); k++)
4839         {
4840           gimple *next_phi = new_phis[k];
4841           tree second_vect = PHI_RESULT (next_phi);
4842           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4843           new_vec_stmt = gimple_build_assign (tem, code,
4844                                               first_vect, second_vect);
4845           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4846           first_vect = tem;
4847         }
4848
4849       new_phi_result = first_vect;
4850       if (new_vec_stmt)
4851         {
4852           new_phis.truncate (0);
4853           new_phis.safe_push (new_vec_stmt);
4854         }
4855     }
4856   /* Likewise if we couldn't use a single defuse cycle.  */
4857   else if (ncopies > 1)
4858     {
4859       gcc_assert (new_phis.length () == 1);
4860       tree first_vect = PHI_RESULT (new_phis[0]);
4861       gassign *new_vec_stmt = NULL;
4862       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4863       gimple *next_phi = new_phis[0];
4864       for (int k = 1; k < ncopies; ++k)
4865         {
4866           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4867           tree second_vect = PHI_RESULT (next_phi);
4868           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4869           new_vec_stmt = gimple_build_assign (tem, code,
4870                                               first_vect, second_vect);
4871           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4872           first_vect = tem;
4873         }
4874       new_phi_result = first_vect;
4875       new_phis.truncate (0);
4876       new_phis.safe_push (new_vec_stmt);
4877     }
4878   else
4879     new_phi_result = PHI_RESULT (new_phis[0]);
4880
4881   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4882       && reduc_fn != IFN_LAST)
4883     {
4884       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4885          various data values where the condition matched and another vector
4886          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4887          need to extract the last matching index (which will be the index with
4888          highest value) and use this to index into the data vector.
4889          For the case where there were no matches, the data vector will contain
4890          all default values and the index vector will be all zeros.  */
4891
4892       /* Get various versions of the type of the vector of indexes.  */
4893       tree index_vec_type = TREE_TYPE (induction_index);
4894       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4895       tree index_scalar_type = TREE_TYPE (index_vec_type);
4896       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4897         (index_vec_type);
4898
4899       /* Get an unsigned integer version of the type of the data vector.  */
4900       int scalar_precision
4901         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4902       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4903       tree vectype_unsigned = build_vector_type
4904         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4905
4906       /* First we need to create a vector (ZERO_VEC) of zeros and another
4907          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4908          can create using a MAX reduction and then expanding.
4909          In the case where the loop never made any matches, the max index will
4910          be zero.  */
4911
4912       /* Vector of {0, 0, 0,...}.  */
4913       tree zero_vec = make_ssa_name (vectype);
4914       tree zero_vec_rhs = build_zero_cst (vectype);
4915       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4916       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4917
4918       /* Find maximum value from the vector of found indexes.  */
4919       tree max_index = make_ssa_name (index_scalar_type);
4920       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4921                                                           1, induction_index);
4922       gimple_call_set_lhs (max_index_stmt, max_index);
4923       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4924
4925       /* Vector of {max_index, max_index, max_index,...}.  */
4926       tree max_index_vec = make_ssa_name (index_vec_type);
4927       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4928                                                       max_index);
4929       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4930                                                         max_index_vec_rhs);
4931       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4932
4933       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4934          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4935          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4936          otherwise.  Only one value should match, resulting in a vector
4937          (VEC_COND) with one data value and the rest zeros.
4938          In the case where the loop never made any matches, every index will
4939          match, resulting in a vector with all data values (which will all be
4940          the default value).  */
4941
4942       /* Compare the max index vector to the vector of found indexes to find
4943          the position of the max value.  */
4944       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4945       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4946                                                       induction_index,
4947                                                       max_index_vec);
4948       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4949
4950       /* Use the compare to choose either values from the data vector or
4951          zero.  */
4952       tree vec_cond = make_ssa_name (vectype);
4953       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4954                                                    vec_compare, new_phi_result,
4955                                                    zero_vec);
4956       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4957
4958       /* Finally we need to extract the data value from the vector (VEC_COND)
4959          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4960          reduction, but because this doesn't exist, we can use a MAX reduction
4961          instead.  The data value might be signed or a float so we need to cast
4962          it first.
4963          In the case where the loop never made any matches, the data values are
4964          all identical, and so will reduce down correctly.  */
4965
4966       /* Make the matched data values unsigned.  */
4967       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4968       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4969                                        vec_cond);
4970       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4971                                                         VIEW_CONVERT_EXPR,
4972                                                         vec_cond_cast_rhs);
4973       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4974
4975       /* Reduce down to a scalar value.  */
4976       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4977       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4978                                                            1, vec_cond_cast);
4979       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4980       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4981
4982       /* Convert the reduced value back to the result type and set as the
4983          result.  */
4984       gimple_seq stmts = NULL;
4985       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4986                                data_reduc);
4987       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4988       scalar_results.safe_push (new_temp);
4989     }
4990   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4991            && reduc_fn == IFN_LAST)
4992     {
4993       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
4994          idx = 0;
4995          idx_val = induction_index[0];
4996          val = data_reduc[0];
4997          for (idx = 0, val = init, i = 0; i < nelts; ++i)
4998            if (induction_index[i] > idx_val)
4999              val = data_reduc[i], idx_val = induction_index[i];
5000          return val;  */
5001
5002       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5003       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5004       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5005       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5006       /* Enforced by vectorizable_reduction, which ensures we have target
5007          support before allowing a conditional reduction on variable-length
5008          vectors.  */
5009       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5010       tree idx_val = NULL_TREE, val = NULL_TREE;
5011       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5012         {
5013           tree old_idx_val = idx_val;
5014           tree old_val = val;
5015           idx_val = make_ssa_name (idx_eltype);
5016           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5017                                              build3 (BIT_FIELD_REF, idx_eltype,
5018                                                      induction_index,
5019                                                      bitsize_int (el_size),
5020                                                      bitsize_int (off)));
5021           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5022           val = make_ssa_name (data_eltype);
5023           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5024                                              build3 (BIT_FIELD_REF,
5025                                                      data_eltype,
5026                                                      new_phi_result,
5027                                                      bitsize_int (el_size),
5028                                                      bitsize_int (off)));
5029           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5030           if (off != 0)
5031             {
5032               tree new_idx_val = idx_val;
5033               tree new_val = val;
5034               if (off != v_size - el_size)
5035                 {
5036                   new_idx_val = make_ssa_name (idx_eltype);
5037                   epilog_stmt = gimple_build_assign (new_idx_val,
5038                                                      MAX_EXPR, idx_val,
5039                                                      old_idx_val);
5040                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5041                 }
5042               new_val = make_ssa_name (data_eltype);
5043               epilog_stmt = gimple_build_assign (new_val,
5044                                                  COND_EXPR,
5045                                                  build2 (GT_EXPR,
5046                                                          boolean_type_node,
5047                                                          idx_val,
5048                                                          old_idx_val),
5049                                                  val, old_val);
5050               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5051               idx_val = new_idx_val;
5052               val = new_val;
5053             }
5054         }
5055       /* Convert the reduced value back to the result type and set as the
5056          result.  */
5057       gimple_seq stmts = NULL;
5058       val = gimple_convert (&stmts, scalar_type, val);
5059       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5060       scalar_results.safe_push (val);
5061     }
5062
5063   /* 2.3 Create the reduction code, using one of the three schemes described
5064          above. In SLP we simply need to extract all the elements from the
5065          vector (without reducing them), so we use scalar shifts.  */
5066   else if (reduc_fn != IFN_LAST && !slp_reduc)
5067     {
5068       tree tmp;
5069       tree vec_elem_type;
5070
5071       /* Case 1:  Create:
5072          v_out2 = reduc_expr <v_out1>  */
5073
5074       if (dump_enabled_p ())
5075         dump_printf_loc (MSG_NOTE, vect_location,
5076                          "Reduce using direct vector reduction.\n");
5077
5078       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5079       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5080         {
5081           tree tmp_dest
5082             = vect_create_destination_var (scalar_dest, vec_elem_type);
5083           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5084                                                     new_phi_result);
5085           gimple_set_lhs (epilog_stmt, tmp_dest);
5086           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5087           gimple_set_lhs (epilog_stmt, new_temp);
5088           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5089
5090           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5091                                              new_temp);
5092         }
5093       else
5094         {
5095           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5096                                                     new_phi_result);
5097           gimple_set_lhs (epilog_stmt, new_scalar_dest);
5098         }
5099
5100       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5101       gimple_set_lhs (epilog_stmt, new_temp);
5102       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5103
5104       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5105            == INTEGER_INDUC_COND_REDUCTION)
5106           && !operand_equal_p (initial_def, induc_val, 0))
5107         {
5108           /* Earlier we set the initial value to be a vector of induc_val
5109              values.  Check the result and if it is induc_val then replace
5110              with the original initial value, unless induc_val is
5111              the same as initial_def already.  */
5112           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5113                                   induc_val);
5114
5115           tmp = make_ssa_name (new_scalar_dest);
5116           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5117                                              initial_def, new_temp);
5118           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5119           new_temp = tmp;
5120         }
5121
5122       scalar_results.safe_push (new_temp);
5123     }
5124   else if (direct_slp_reduc)
5125     {
5126       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5127          with the elements for other SLP statements replaced with the
5128          neutral value.  We can then do a normal reduction on each vector.  */
5129
5130       /* Enforced by vectorizable_reduction.  */
5131       gcc_assert (new_phis.length () == 1);
5132       gcc_assert (pow2p_hwi (group_size));
5133
5134       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5135       vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5136       gimple_seq seq = NULL;
5137
5138       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5139          and the same element size as VECTYPE.  */
5140       tree index = build_index_vector (vectype, 0, 1);
5141       tree index_type = TREE_TYPE (index);
5142       tree index_elt_type = TREE_TYPE (index_type);
5143       tree mask_type = build_same_sized_truth_vector_type (index_type);
5144
5145       /* Create a vector that, for each element, identifies which of
5146          the REDUC_GROUP_SIZE results should use it.  */
5147       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5148       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5149                             build_vector_from_val (index_type, index_mask));
5150
5151       /* Get a neutral vector value.  This is simply a splat of the neutral
5152          scalar value if we have one, otherwise the initial scalar value
5153          is itself a neutral value.  */
5154       tree vector_identity = NULL_TREE;
5155       if (neutral_op)
5156         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5157                                                         neutral_op);
5158       for (unsigned int i = 0; i < group_size; ++i)
5159         {
5160           /* If there's no universal neutral value, we can use the
5161              initial scalar value from the original PHI.  This is used
5162              for MIN and MAX reduction, for example.  */
5163           if (!neutral_op)
5164             {
5165               tree scalar_value
5166                 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5167                                          loop_preheader_edge (loop));
5168               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5169                                                               scalar_value);
5170             }
5171
5172           /* Calculate the equivalent of:
5173
5174              sel[j] = (index[j] == i);
5175
5176              which selects the elements of NEW_PHI_RESULT that should
5177              be included in the result.  */
5178           tree compare_val = build_int_cst (index_elt_type, i);
5179           compare_val = build_vector_from_val (index_type, compare_val);
5180           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5181                                    index, compare_val);
5182
5183           /* Calculate the equivalent of:
5184
5185              vec = sel ? new_phi_result : vector_identity;
5186
5187              VEC is now suitable for a full vector reduction.  */
5188           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5189                                    sel, new_phi_result, vector_identity);
5190
5191           /* Do the reduction and convert it to the appropriate type.  */
5192           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5193                                       TREE_TYPE (vectype), vec);
5194           scalar = gimple_convert (&seq, scalar_type, scalar);
5195           scalar_results.safe_push (scalar);
5196         }
5197       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5198     }
5199   else
5200     {
5201       bool reduce_with_shift;
5202       tree vec_temp;
5203
5204       /* COND reductions all do the final reduction with MAX_EXPR
5205          or MIN_EXPR.  */
5206       if (code == COND_EXPR)
5207         {
5208           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5209               == INTEGER_INDUC_COND_REDUCTION)
5210             code = induc_code;
5211           else
5212             code = MAX_EXPR;
5213         }
5214
5215       /* See if the target wants to do the final (shift) reduction
5216          in a vector mode of smaller size and first reduce upper/lower
5217          halves against each other.  */
5218       enum machine_mode mode1 = mode;
5219       tree vectype1 = vectype;
5220       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5221       unsigned sz1 = sz;
5222       if (!slp_reduc
5223           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5224         sz1 = GET_MODE_SIZE (mode1).to_constant ();
5225
5226       vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5227       reduce_with_shift = have_whole_vector_shift (mode1);
5228       if (!VECTOR_MODE_P (mode1))
5229         reduce_with_shift = false;
5230       else
5231         {
5232           optab optab = optab_for_tree_code (code, vectype1, optab_default);
5233           if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5234             reduce_with_shift = false;
5235         }
5236
5237       /* First reduce the vector to the desired vector size we should
5238          do shift reduction on by combining upper and lower halves.  */
5239       new_temp = new_phi_result;
5240       while (sz > sz1)
5241         {
5242           gcc_assert (!slp_reduc);
5243           sz /= 2;
5244           vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5245
5246           /* The target has to make sure we support lowpart/highpart
5247              extraction, either via direct vector extract or through
5248              an integer mode punning.  */
5249           tree dst1, dst2;
5250           if (convert_optab_handler (vec_extract_optab,
5251                                      TYPE_MODE (TREE_TYPE (new_temp)),
5252                                      TYPE_MODE (vectype1))
5253               != CODE_FOR_nothing)
5254             {
5255               /* Extract sub-vectors directly once vec_extract becomes
5256                  a conversion optab.  */
5257               dst1 = make_ssa_name (vectype1);
5258               epilog_stmt
5259                   = gimple_build_assign (dst1, BIT_FIELD_REF,
5260                                          build3 (BIT_FIELD_REF, vectype1,
5261                                                  new_temp, TYPE_SIZE (vectype1),
5262                                                  bitsize_int (0)));
5263               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5264               dst2 =  make_ssa_name (vectype1);
5265               epilog_stmt
5266                   = gimple_build_assign (dst2, BIT_FIELD_REF,
5267                                          build3 (BIT_FIELD_REF, vectype1,
5268                                                  new_temp, TYPE_SIZE (vectype1),
5269                                                  bitsize_int (sz * BITS_PER_UNIT)));
5270               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5271             }
5272           else
5273             {
5274               /* Extract via punning to appropriately sized integer mode
5275                  vector.  */
5276               tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5277                                                             1);
5278               tree etype = build_vector_type (eltype, 2);
5279               gcc_assert (convert_optab_handler (vec_extract_optab,
5280                                                  TYPE_MODE (etype),
5281                                                  TYPE_MODE (eltype))
5282                           != CODE_FOR_nothing);
5283               tree tem = make_ssa_name (etype);
5284               epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5285                                                  build1 (VIEW_CONVERT_EXPR,
5286                                                          etype, new_temp));
5287               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5288               new_temp = tem;
5289               tem = make_ssa_name (eltype);
5290               epilog_stmt
5291                   = gimple_build_assign (tem, BIT_FIELD_REF,
5292                                          build3 (BIT_FIELD_REF, eltype,
5293                                                  new_temp, TYPE_SIZE (eltype),
5294                                                  bitsize_int (0)));
5295               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5296               dst1 = make_ssa_name (vectype1);
5297               epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5298                                                  build1 (VIEW_CONVERT_EXPR,
5299                                                          vectype1, tem));
5300               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5301               tem = make_ssa_name (eltype);
5302               epilog_stmt
5303                   = gimple_build_assign (tem, BIT_FIELD_REF,
5304                                          build3 (BIT_FIELD_REF, eltype,
5305                                                  new_temp, TYPE_SIZE (eltype),
5306                                                  bitsize_int (sz * BITS_PER_UNIT)));
5307               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5308               dst2 =  make_ssa_name (vectype1);
5309               epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5310                                                  build1 (VIEW_CONVERT_EXPR,
5311                                                          vectype1, tem));
5312               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5313             }
5314
5315           new_temp = make_ssa_name (vectype1);
5316           epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5317           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5318         }
5319
5320       if (reduce_with_shift && !slp_reduc)
5321         {
5322           int element_bitsize = tree_to_uhwi (bitsize);
5323           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5324              for variable-length vectors and also requires direct target support
5325              for loop reductions.  */
5326           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5327           int nelements = vec_size_in_bits / element_bitsize;
5328           vec_perm_builder sel;
5329           vec_perm_indices indices;
5330
5331           int elt_offset;
5332
5333           tree zero_vec = build_zero_cst (vectype1);
5334           /* Case 2: Create:
5335              for (offset = nelements/2; offset >= 1; offset/=2)
5336                 {
5337                   Create:  va' = vec_shift <va, offset>
5338                   Create:  va = vop <va, va'>
5339                 }  */
5340
5341           tree rhs;
5342
5343           if (dump_enabled_p ())
5344             dump_printf_loc (MSG_NOTE, vect_location,
5345                              "Reduce using vector shifts\n");
5346
5347           mode1 = TYPE_MODE (vectype1);
5348           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5349           for (elt_offset = nelements / 2;
5350                elt_offset >= 1;
5351                elt_offset /= 2)
5352             {
5353               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5354               indices.new_vector (sel, 2, nelements);
5355               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5356               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5357                                                  new_temp, zero_vec, mask);
5358               new_name = make_ssa_name (vec_dest, epilog_stmt);
5359               gimple_assign_set_lhs (epilog_stmt, new_name);
5360               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5361
5362               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5363                                                  new_temp);
5364               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5365               gimple_assign_set_lhs (epilog_stmt, new_temp);
5366               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5367             }
5368
5369           /* 2.4  Extract the final scalar result.  Create:
5370              s_out3 = extract_field <v_out2, bitpos>  */
5371
5372           if (dump_enabled_p ())
5373             dump_printf_loc (MSG_NOTE, vect_location,
5374                              "extract scalar result\n");
5375
5376           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5377                         bitsize, bitsize_zero_node);
5378           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5379           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5380           gimple_assign_set_lhs (epilog_stmt, new_temp);
5381           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5382           scalar_results.safe_push (new_temp);
5383         }
5384       else
5385         {
5386           /* Case 3: Create:
5387              s = extract_field <v_out2, 0>
5388              for (offset = element_size;
5389                   offset < vector_size;
5390                   offset += element_size;)
5391                {
5392                  Create:  s' = extract_field <v_out2, offset>
5393                  Create:  s = op <s, s'>  // For non SLP cases
5394                }  */
5395
5396           if (dump_enabled_p ())
5397             dump_printf_loc (MSG_NOTE, vect_location,
5398                              "Reduce using scalar code.\n");
5399
5400           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5401           int element_bitsize = tree_to_uhwi (bitsize);
5402           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5403             {
5404               int bit_offset;
5405               if (gimple_code (new_phi) == GIMPLE_PHI)
5406                 vec_temp = PHI_RESULT (new_phi);
5407               else
5408                 vec_temp = gimple_assign_lhs (new_phi);
5409               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5410                                  bitsize_zero_node);
5411               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5412               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5413               gimple_assign_set_lhs (epilog_stmt, new_temp);
5414               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5415
5416               /* In SLP we don't need to apply reduction operation, so we just
5417                  collect s' values in SCALAR_RESULTS.  */
5418               if (slp_reduc)
5419                 scalar_results.safe_push (new_temp);
5420
5421               for (bit_offset = element_bitsize;
5422                    bit_offset < vec_size_in_bits;
5423                    bit_offset += element_bitsize)
5424                 {
5425                   tree bitpos = bitsize_int (bit_offset);
5426                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5427                                      bitsize, bitpos);
5428
5429                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5430                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5431                   gimple_assign_set_lhs (epilog_stmt, new_name);
5432                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5433
5434                   if (slp_reduc)
5435                     {
5436                       /* In SLP we don't need to apply reduction operation, so
5437                          we just collect s' values in SCALAR_RESULTS.  */
5438                       new_temp = new_name;
5439                       scalar_results.safe_push (new_name);
5440                     }
5441                   else
5442                     {
5443                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5444                                                          new_name, new_temp);
5445                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5446                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5447                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5448                     }
5449                 }
5450             }
5451
5452           /* The only case where we need to reduce scalar results in SLP, is
5453              unrolling.  If the size of SCALAR_RESULTS is greater than
5454              REDUC_GROUP_SIZE, we reduce them combining elements modulo
5455              REDUC_GROUP_SIZE.  */
5456           if (slp_reduc)
5457             {
5458               tree res, first_res, new_res;
5459               gimple *new_stmt;
5460
5461               /* Reduce multiple scalar results in case of SLP unrolling.  */
5462               for (j = group_size; scalar_results.iterate (j, &res);
5463                    j++)
5464                 {
5465                   first_res = scalar_results[j % group_size];
5466                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5467                                                   first_res, res);
5468                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5469                   gimple_assign_set_lhs (new_stmt, new_res);
5470                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5471                   scalar_results[j % group_size] = new_res;
5472                 }
5473             }
5474           else
5475             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5476             scalar_results.safe_push (new_temp);
5477         }
5478
5479       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5480            == INTEGER_INDUC_COND_REDUCTION)
5481           && !operand_equal_p (initial_def, induc_val, 0))
5482         {
5483           /* Earlier we set the initial value to be a vector of induc_val
5484              values.  Check the result and if it is induc_val then replace
5485              with the original initial value, unless induc_val is
5486              the same as initial_def already.  */
5487           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5488                                   induc_val);
5489
5490           tree tmp = make_ssa_name (new_scalar_dest);
5491           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5492                                              initial_def, new_temp);
5493           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5494           scalar_results[0] = tmp;
5495         }
5496     }
5497
5498 vect_finalize_reduction:
5499
5500   if (double_reduc)
5501     loop = loop->inner;
5502
5503   /* 2.5 Adjust the final result by the initial value of the reduction
5504          variable. (When such adjustment is not needed, then
5505          'adjustment_def' is zero).  For example, if code is PLUS we create:
5506          new_temp = loop_exit_def + adjustment_def  */
5507
5508   if (adjustment_def)
5509     {
5510       gcc_assert (!slp_reduc);
5511       if (nested_in_vect_loop)
5512         {
5513           new_phi = new_phis[0];
5514           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5515           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5516           new_dest = vect_create_destination_var (scalar_dest, vectype);
5517         }
5518       else
5519         {
5520           new_temp = scalar_results[0];
5521           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5522           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5523           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5524         }
5525
5526       epilog_stmt = gimple_build_assign (new_dest, expr);
5527       new_temp = make_ssa_name (new_dest, epilog_stmt);
5528       gimple_assign_set_lhs (epilog_stmt, new_temp);
5529       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5530       if (nested_in_vect_loop)
5531         {
5532           stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5533           STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5534             = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5535
5536           if (!double_reduc)
5537             scalar_results.quick_push (new_temp);
5538           else
5539             scalar_results[0] = new_temp;
5540         }
5541       else
5542         scalar_results[0] = new_temp;
5543
5544       new_phis[0] = epilog_stmt;
5545     }
5546
5547   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5548           phis with new adjusted scalar results, i.e., replace use <s_out0>
5549           with use <s_out4>.
5550
5551      Transform:
5552         loop_exit:
5553           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5554           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5555           v_out2 = reduce <v_out1>
5556           s_out3 = extract_field <v_out2, 0>
5557           s_out4 = adjust_result <s_out3>
5558           use <s_out0>
5559           use <s_out0>
5560
5561      into:
5562
5563         loop_exit:
5564           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5565           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5566           v_out2 = reduce <v_out1>
5567           s_out3 = extract_field <v_out2, 0>
5568           s_out4 = adjust_result <s_out3>
5569           use <s_out4>
5570           use <s_out4> */
5571
5572
5573   /* In SLP reduction chain we reduce vector results into one vector if
5574      necessary, hence we set here REDUC_GROUP_SIZE to 1.  SCALAR_DEST is the
5575      LHS of the last stmt in the reduction chain, since we are looking for
5576      the loop exit phi node.  */
5577   if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5578     {
5579       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5580       /* Handle reduction patterns.  */
5581       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5582         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5583
5584       scalar_dest = gimple_assign_lhs (dest_stmt);
5585       group_size = 1;
5586     }
5587
5588   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5589      case that REDUC_GROUP_SIZE is greater than vectorization factor).
5590      Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5591      The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5592      correspond to the first vector stmt, etc.
5593      (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)).  */
5594   if (group_size > new_phis.length ())
5595     {
5596       ratio = group_size / new_phis.length ();
5597       gcc_assert (!(group_size % new_phis.length ()));
5598     }
5599   else
5600     ratio = 1;
5601
5602   for (k = 0; k < group_size; k++)
5603     {
5604       if (k % ratio == 0)
5605         {
5606           epilog_stmt = new_phis[k / ratio];
5607           reduction_phi = reduction_phis[k / ratio];
5608           if (double_reduc)
5609             inner_phi = inner_phis[k / ratio];
5610         }
5611
5612       if (slp_reduc)
5613         {
5614           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5615
5616           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5617           /* SLP statements can't participate in patterns.  */
5618           gcc_assert (!orig_stmt);
5619           scalar_dest = gimple_assign_lhs (current_stmt);
5620         }
5621
5622       phis.create (3);
5623       /* Find the loop-closed-use at the loop exit of the original scalar
5624          result.  (The reduction result is expected to have two immediate uses -
5625          one at the latch block, and one at the loop exit).  */
5626       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5627         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5628             && !is_gimple_debug (USE_STMT (use_p)))
5629           phis.safe_push (USE_STMT (use_p));
5630
5631       /* While we expect to have found an exit_phi because of loop-closed-ssa
5632          form we can end up without one if the scalar cycle is dead.  */
5633
5634       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5635         {
5636           if (outer_loop)
5637             {
5638               stmt_vec_info exit_phi_vinfo
5639                 = loop_vinfo->lookup_stmt (exit_phi);
5640               gphi *vect_phi;
5641
5642               /* FORNOW. Currently not supporting the case that an inner-loop
5643                  reduction is not used in the outer-loop (but only outside the
5644                  outer-loop), unless it is double reduction.  */
5645               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5646                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5647                           || double_reduc);
5648
5649               if (double_reduc)
5650                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5651               else
5652                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5653               if (!double_reduc
5654                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5655                       != vect_double_reduction_def)
5656                 continue;
5657
5658               /* Handle double reduction:
5659
5660                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5661                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5662                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5663                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5664
5665                  At that point the regular reduction (stmt2 and stmt3) is
5666                  already vectorized, as well as the exit phi node, stmt4.
5667                  Here we vectorize the phi node of double reduction, stmt1, and
5668                  update all relevant statements.  */
5669
5670               /* Go through all the uses of s2 to find double reduction phi
5671                  node, i.e., stmt1 above.  */
5672               orig_name = PHI_RESULT (exit_phi);
5673               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5674                 {
5675                   stmt_vec_info use_stmt_vinfo;
5676                   tree vect_phi_init, preheader_arg, vect_phi_res;
5677                   basic_block bb = gimple_bb (use_stmt);
5678                   gimple *use;
5679
5680                   /* Check that USE_STMT is really double reduction phi
5681                      node.  */
5682                   if (gimple_code (use_stmt) != GIMPLE_PHI
5683                       || gimple_phi_num_args (use_stmt) != 2
5684                       || bb->loop_father != outer_loop)
5685                     continue;
5686                   use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5687                   if (!use_stmt_vinfo
5688                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5689                           != vect_double_reduction_def)
5690                     continue;
5691
5692                   /* Create vector phi node for double reduction:
5693                      vs1 = phi <vs0, vs2>
5694                      vs1 was created previously in this function by a call to
5695                        vect_get_vec_def_for_operand and is stored in
5696                        vec_initial_def;
5697                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5698                      vs0 is created here.  */
5699
5700                   /* Create vector phi node.  */
5701                   vect_phi = create_phi_node (vec_initial_def, bb);
5702                   loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5703
5704                   /* Create vs0 - initial def of the double reduction phi.  */
5705                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5706                                              loop_preheader_edge (outer_loop));
5707                   vect_phi_init = get_initial_def_for_reduction
5708                     (stmt, preheader_arg, NULL);
5709
5710                   /* Update phi node arguments with vs0 and vs2.  */
5711                   add_phi_arg (vect_phi, vect_phi_init,
5712                                loop_preheader_edge (outer_loop),
5713                                UNKNOWN_LOCATION);
5714                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5715                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5716                   if (dump_enabled_p ())
5717                     {
5718                       dump_printf_loc (MSG_NOTE, vect_location,
5719                                        "created double reduction phi node: ");
5720                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5721                     }
5722
5723                   vect_phi_res = PHI_RESULT (vect_phi);
5724
5725                   /* Replace the use, i.e., set the correct vs1 in the regular
5726                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5727                      loop is redundant.  */
5728                   use = reduction_phi;
5729                   for (j = 0; j < ncopies; j++)
5730                     {
5731                       edge pr_edge = loop_preheader_edge (loop);
5732                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5733                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5734                     }
5735                 }
5736             }
5737         }
5738
5739       phis.release ();
5740       if (nested_in_vect_loop)
5741         {
5742           if (double_reduc)
5743             loop = outer_loop;
5744           else
5745             continue;
5746         }
5747
5748       phis.create (3);
5749       /* Find the loop-closed-use at the loop exit of the original scalar
5750          result.  (The reduction result is expected to have two immediate uses,
5751          one at the latch block, and one at the loop exit).  For double
5752          reductions we are looking for exit phis of the outer loop.  */
5753       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5754         {
5755           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5756             {
5757               if (!is_gimple_debug (USE_STMT (use_p)))
5758                 phis.safe_push (USE_STMT (use_p));
5759             }
5760           else
5761             {
5762               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5763                 {
5764                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5765
5766                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5767                     {
5768                       if (!flow_bb_inside_loop_p (loop,
5769                                              gimple_bb (USE_STMT (phi_use_p)))
5770                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5771                         phis.safe_push (USE_STMT (phi_use_p));
5772                     }
5773                 }
5774             }
5775         }
5776
5777       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5778         {
5779           /* Replace the uses:  */
5780           orig_name = PHI_RESULT (exit_phi);
5781           scalar_result = scalar_results[k];
5782           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5783             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5784               SET_USE (use_p, scalar_result);
5785         }
5786
5787       phis.release ();
5788     }
5789 }
5790
5791 /* Return a vector of type VECTYPE that is equal to the vector select
5792    operation "MASK ? VEC : IDENTITY".  Insert the select statements
5793    before GSI.  */
5794
5795 static tree
5796 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5797                      tree vec, tree identity)
5798 {
5799   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5800   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5801                                           mask, vec, identity);
5802   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5803   return cond;
5804 }
5805
5806 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5807    order, starting with LHS.  Insert the extraction statements before GSI and
5808    associate the new scalar SSA names with variable SCALAR_DEST.
5809    Return the SSA name for the result.  */
5810
5811 static tree
5812 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5813                        tree_code code, tree lhs, tree vector_rhs)
5814 {
5815   tree vectype = TREE_TYPE (vector_rhs);
5816   tree scalar_type = TREE_TYPE (vectype);
5817   tree bitsize = TYPE_SIZE (scalar_type);
5818   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5819   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5820
5821   for (unsigned HOST_WIDE_INT bit_offset = 0;
5822        bit_offset < vec_size_in_bits;
5823        bit_offset += element_bitsize)
5824     {
5825       tree bitpos = bitsize_int (bit_offset);
5826       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5827                          bitsize, bitpos);
5828
5829       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5830       rhs = make_ssa_name (scalar_dest, stmt);
5831       gimple_assign_set_lhs (stmt, rhs);
5832       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5833
5834       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5835       tree new_name = make_ssa_name (scalar_dest, stmt);
5836       gimple_assign_set_lhs (stmt, new_name);
5837       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5838       lhs = new_name;
5839     }
5840   return lhs;
5841 }
5842
5843 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
5844    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
5845    statement.  CODE is the operation performed by STMT and OPS are
5846    its scalar operands.  REDUC_INDEX is the index of the operand in
5847    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
5848    implements in-order reduction, or IFN_LAST if we should open-code it.
5849    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
5850    that should be used to control the operation in a fully-masked loop.  */
5851
5852 static bool
5853 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5854                                gimple **vec_stmt, slp_tree slp_node,
5855                                gimple *reduc_def_stmt,
5856                                tree_code code, internal_fn reduc_fn,
5857                                tree ops[3], tree vectype_in,
5858                                int reduc_index, vec_loop_masks *masks)
5859 {
5860   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5861   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5862   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5863   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5864   gimple *new_stmt = NULL;
5865
5866   int ncopies;
5867   if (slp_node)
5868     ncopies = 1;
5869   else
5870     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5871
5872   gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5873   gcc_assert (ncopies == 1);
5874   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5875   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5876   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5877               == FOLD_LEFT_REDUCTION);
5878
5879   if (slp_node)
5880     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5881                           TYPE_VECTOR_SUBPARTS (vectype_in)));
5882
5883   tree op0 = ops[1 - reduc_index];
5884
5885   int group_size = 1;
5886   gimple *scalar_dest_def;
5887   auto_vec<tree> vec_oprnds0;
5888   if (slp_node)
5889     {
5890       vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5891       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5892       scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5893     }
5894   else
5895     {
5896       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5897       vec_oprnds0.create (1);
5898       vec_oprnds0.quick_push (loop_vec_def0);
5899       scalar_dest_def = stmt;
5900     }
5901
5902   tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
5903   tree scalar_type = TREE_TYPE (scalar_dest);
5904   tree reduc_var = gimple_phi_result (reduc_def_stmt);
5905
5906   int vec_num = vec_oprnds0.length ();
5907   gcc_assert (vec_num == 1 || slp_node);
5908   tree vec_elem_type = TREE_TYPE (vectype_out);
5909   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5910
5911   tree vector_identity = NULL_TREE;
5912   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5913     vector_identity = build_zero_cst (vectype_out);
5914
5915   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5916   int i;
5917   tree def0;
5918   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5919     {
5920       tree mask = NULL_TREE;
5921       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5922         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5923
5924       /* Handle MINUS by adding the negative.  */
5925       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5926         {
5927           tree negated = make_ssa_name (vectype_out);
5928           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5929           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5930           def0 = negated;
5931         }
5932
5933       if (mask)
5934         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5935                                     vector_identity);
5936
5937       /* On the first iteration the input is simply the scalar phi
5938          result, and for subsequent iterations it is the output of
5939          the preceding operation.  */
5940       if (reduc_fn != IFN_LAST)
5941         {
5942           new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5943           /* For chained SLP reductions the output of the previous reduction
5944              operation serves as the input of the next. For the final statement
5945              the output cannot be a temporary - we reuse the original
5946              scalar destination of the last statement.  */
5947           if (i != vec_num - 1)
5948             {
5949               gimple_set_lhs (new_stmt, scalar_dest_var);
5950               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5951               gimple_set_lhs (new_stmt, reduc_var);
5952             }
5953         }
5954       else
5955         {
5956           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5957                                              reduc_var, def0);
5958           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5959           /* Remove the statement, so that we can use the same code paths
5960              as for statements that we've just created.  */
5961           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5962           gsi_remove (&tmp_gsi, false);
5963         }
5964
5965       if (i == vec_num - 1)
5966         {
5967           gimple_set_lhs (new_stmt, scalar_dest);
5968           vect_finish_replace_stmt (scalar_dest_def, new_stmt);
5969         }
5970       else
5971         vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
5972
5973       if (slp_node)
5974         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5975     }
5976
5977   if (!slp_node)
5978     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5979
5980   return true;
5981 }
5982
5983 /* Function is_nonwrapping_integer_induction.
5984
5985    Check if STMT (which is part of loop LOOP) both increments and
5986    does not cause overflow.  */
5987
5988 static bool
5989 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5990 {
5991   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5992   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5993   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5994   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5995   widest_int ni, max_loop_value, lhs_max;
5996   wi::overflow_type overflow = wi::OVF_NONE;
5997
5998   /* Make sure the loop is integer based.  */
5999   if (TREE_CODE (base) != INTEGER_CST
6000       || TREE_CODE (step) != INTEGER_CST)
6001     return false;
6002
6003   /* Check that the max size of the loop will not wrap.  */
6004
6005   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6006     return true;
6007
6008   if (! max_stmt_executions (loop, &ni))
6009     return false;
6010
6011   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6012                             &overflow);
6013   if (overflow)
6014     return false;
6015
6016   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6017                             TYPE_SIGN (lhs_type), &overflow);
6018   if (overflow)
6019     return false;
6020
6021   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6022           <= TYPE_PRECISION (lhs_type));
6023 }
6024
6025 /* Function vectorizable_reduction.
6026
6027    Check if STMT performs a reduction operation that can be vectorized.
6028    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6029    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6030    Return FALSE if not a vectorizable STMT, TRUE otherwise.
6031
6032    This function also handles reduction idioms (patterns) that have been
6033    recognized in advance during vect_pattern_recog.  In this case, STMT may be
6034    of this form:
6035      X = pattern_expr (arg0, arg1, ..., X)
6036    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6037    sequence that had been detected and replaced by the pattern-stmt (STMT).
6038
6039    This function also handles reduction of condition expressions, for example:
6040      for (int i = 0; i < N; i++)
6041        if (a[i] < value)
6042          last = a[i];
6043    This is handled by vectorising the loop and creating an additional vector
6044    containing the loop indexes for which "a[i] < value" was true.  In the
6045    function epilogue this is reduced to a single max value and then used to
6046    index into the vector of results.
6047
6048    In some cases of reduction patterns, the type of the reduction variable X is
6049    different than the type of the other arguments of STMT.
6050    In such cases, the vectype that is used when transforming STMT into a vector
6051    stmt is different than the vectype that is used to determine the
6052    vectorization factor, because it consists of a different number of elements
6053    than the actual number of elements that are being operated upon in parallel.
6054
6055    For example, consider an accumulation of shorts into an int accumulator.
6056    On some targets it's possible to vectorize this pattern operating on 8
6057    shorts at a time (hence, the vectype for purposes of determining the
6058    vectorization factor should be V8HI); on the other hand, the vectype that
6059    is used to create the vector form is actually V4SI (the type of the result).
6060
6061    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6062    indicates what is the actual level of parallelism (V8HI in the example), so
6063    that the right vectorization factor would be derived.  This vectype
6064    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6065    be used to create the vectorized stmt.  The right vectype for the vectorized
6066    stmt is obtained from the type of the result X:
6067         get_vectype_for_scalar_type (TREE_TYPE (X))
6068
6069    This means that, contrary to "regular" reductions (or "regular" stmts in
6070    general), the following equation:
6071       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6072    does *NOT* necessarily hold for reduction patterns.  */
6073
6074 bool
6075 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6076                         gimple **vec_stmt, slp_tree slp_node,
6077                         slp_instance slp_node_instance,
6078                         stmt_vector_for_cost *cost_vec)
6079 {
6080   tree vec_dest;
6081   tree scalar_dest;
6082   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6083   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6084   tree vectype_in = NULL_TREE;
6085   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6086   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6087   enum tree_code code, orig_code;
6088   internal_fn reduc_fn;
6089   machine_mode vec_mode;
6090   int op_type;
6091   optab optab;
6092   tree new_temp = NULL_TREE;
6093   gimple *def_stmt;
6094   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6095   gimple *cond_reduc_def_stmt = NULL;
6096   enum tree_code cond_reduc_op_code = ERROR_MARK;
6097   tree scalar_type;
6098   bool is_simple_use;
6099   gimple *orig_stmt;
6100   stmt_vec_info orig_stmt_info = NULL;
6101   int i;
6102   int ncopies;
6103   int epilog_copies;
6104   stmt_vec_info prev_stmt_info, prev_phi_info;
6105   bool single_defuse_cycle = false;
6106   gimple *new_stmt = NULL;
6107   int j;
6108   tree ops[3];
6109   enum vect_def_type dts[3];
6110   bool nested_cycle = false, found_nested_cycle_def = false;
6111   bool double_reduc = false;
6112   basic_block def_bb;
6113   struct loop * def_stmt_loop;
6114   tree def_arg;
6115   auto_vec<tree> vec_oprnds0;
6116   auto_vec<tree> vec_oprnds1;
6117   auto_vec<tree> vec_oprnds2;
6118   auto_vec<tree> vect_defs;
6119   auto_vec<gimple *> phis;
6120   int vec_num;
6121   tree def0, tem;
6122   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6123   tree cond_reduc_val = NULL_TREE;
6124
6125   /* Make sure it was already recognized as a reduction computation.  */
6126   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6127       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6128     return false;
6129
6130   if (nested_in_vect_loop_p (loop, stmt))
6131     {
6132       loop = loop->inner;
6133       nested_cycle = true;
6134     }
6135
6136   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6137     gcc_assert (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt);
6138
6139   if (gimple_code (stmt) == GIMPLE_PHI)
6140     {
6141       tree phi_result = gimple_phi_result (stmt);
6142       /* Analysis is fully done on the reduction stmt invocation.  */
6143       if (! vec_stmt)
6144         {
6145           if (slp_node)
6146             slp_node_instance->reduc_phis = slp_node;
6147
6148           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6149           return true;
6150         }
6151
6152       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6153         /* Leave the scalar phi in place.  Note that checking
6154            STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6155            for reductions involving a single statement.  */
6156         return true;
6157
6158       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6159       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6160         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6161
6162       stmt_vec_info reduc_stmt_info = vinfo_for_stmt (reduc_stmt);
6163       if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6164           == EXTRACT_LAST_REDUCTION)
6165         /* Leave the scalar phi in place.  */
6166         return true;
6167
6168       gcc_assert (is_gimple_assign (reduc_stmt));
6169       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6170         {
6171           tree op = gimple_op (reduc_stmt, k);
6172           if (op == gimple_phi_result (stmt))
6173             continue;
6174           if (k == 1
6175               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6176             continue;
6177           if (!vectype_in
6178               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6179                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6180             vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6181           break;
6182         }
6183       gcc_assert (vectype_in);
6184
6185       if (slp_node)
6186         ncopies = 1;
6187       else
6188         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6189
6190       stmt_vec_info use_stmt_info;
6191       if (ncopies > 1
6192           && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6193           && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6194           && (use_stmt_info == reduc_stmt_info
6195               || STMT_VINFO_RELATED_STMT (use_stmt_info) == reduc_stmt))
6196         single_defuse_cycle = true;
6197
6198       /* Create the destination vector  */
6199       scalar_dest = gimple_assign_lhs (reduc_stmt);
6200       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6201
6202       if (slp_node)
6203         /* The size vect_schedule_slp_instance computes is off for us.  */
6204         vec_num = vect_get_num_vectors
6205           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6206            * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6207            vectype_in);
6208       else
6209         vec_num = 1;
6210
6211       /* Generate the reduction PHIs upfront.  */
6212       prev_phi_info = NULL;
6213       for (j = 0; j < ncopies; j++)
6214         {
6215           if (j == 0 || !single_defuse_cycle)
6216             {
6217               for (i = 0; i < vec_num; i++)
6218                 {
6219                   /* Create the reduction-phi that defines the reduction
6220                      operand.  */
6221                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
6222                   stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6223
6224                   if (slp_node)
6225                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6226                   else
6227                     {
6228                       if (j == 0)
6229                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6230                       else
6231                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6232                       prev_phi_info = new_phi_info;
6233                     }
6234                 }
6235             }
6236         }
6237
6238       return true;
6239     }
6240
6241   /* 1. Is vectorizable reduction?  */
6242   /* Not supportable if the reduction variable is used in the loop, unless
6243      it's a reduction chain.  */
6244   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6245       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6246     return false;
6247
6248   /* Reductions that are not used even in an enclosing outer-loop,
6249      are expected to be "live" (used out of the loop).  */
6250   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6251       && !STMT_VINFO_LIVE_P (stmt_info))
6252     return false;
6253
6254   /* 2. Has this been recognized as a reduction pattern?
6255
6256      Check if STMT represents a pattern that has been recognized
6257      in earlier analysis stages.  For stmts that represent a pattern,
6258      the STMT_VINFO_RELATED_STMT field records the last stmt in
6259      the original sequence that constitutes the pattern.  */
6260
6261   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6262   if (orig_stmt)
6263     {
6264       orig_stmt_info = vinfo_for_stmt (orig_stmt);
6265       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6266       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6267     }
6268
6269   /* 3. Check the operands of the operation.  The first operands are defined
6270         inside the loop body. The last operand is the reduction variable,
6271         which is defined by the loop-header-phi.  */
6272
6273   gcc_assert (is_gimple_assign (stmt));
6274
6275   /* Flatten RHS.  */
6276   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6277     {
6278     case GIMPLE_BINARY_RHS:
6279       code = gimple_assign_rhs_code (stmt);
6280       op_type = TREE_CODE_LENGTH (code);
6281       gcc_assert (op_type == binary_op);
6282       ops[0] = gimple_assign_rhs1 (stmt);
6283       ops[1] = gimple_assign_rhs2 (stmt);
6284       break;
6285
6286     case GIMPLE_TERNARY_RHS:
6287       code = gimple_assign_rhs_code (stmt);
6288       op_type = TREE_CODE_LENGTH (code);
6289       gcc_assert (op_type == ternary_op);
6290       ops[0] = gimple_assign_rhs1 (stmt);
6291       ops[1] = gimple_assign_rhs2 (stmt);
6292       ops[2] = gimple_assign_rhs3 (stmt);
6293       break;
6294
6295     case GIMPLE_UNARY_RHS:
6296       return false;
6297
6298     default:
6299       gcc_unreachable ();
6300     }
6301
6302   if (code == COND_EXPR && slp_node)
6303     return false;
6304
6305   scalar_dest = gimple_assign_lhs (stmt);
6306   scalar_type = TREE_TYPE (scalar_dest);
6307   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6308       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6309     return false;
6310
6311   /* Do not try to vectorize bit-precision reductions.  */
6312   if (!type_has_mode_precision_p (scalar_type))
6313     return false;
6314
6315   /* All uses but the last are expected to be defined in the loop.
6316      The last use is the reduction variable.  In case of nested cycle this
6317      assumption is not true: we use reduc_index to record the index of the
6318      reduction variable.  */
6319   gimple *reduc_def_stmt = NULL;
6320   int reduc_index = -1;
6321   for (i = 0; i < op_type; i++)
6322     {
6323       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6324       if (i == 0 && code == COND_EXPR)
6325         continue;
6326
6327       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6328                                           &dts[i], &tem, &def_stmt);
6329       dt = dts[i];
6330       gcc_assert (is_simple_use);
6331       if (dt == vect_reduction_def)
6332         {
6333           reduc_def_stmt = def_stmt;
6334           reduc_index = i;
6335           continue;
6336         }
6337       else if (tem)
6338         {
6339           /* To properly compute ncopies we are interested in the widest
6340              input type in case we're looking at a widening accumulation.  */
6341           if (!vectype_in
6342               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6343                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6344             vectype_in = tem;
6345         }
6346
6347       if (dt != vect_internal_def
6348           && dt != vect_external_def
6349           && dt != vect_constant_def
6350           && dt != vect_induction_def
6351           && !(dt == vect_nested_cycle && nested_cycle))
6352         return false;
6353
6354       if (dt == vect_nested_cycle)
6355         {
6356           found_nested_cycle_def = true;
6357           reduc_def_stmt = def_stmt;
6358           reduc_index = i;
6359         }
6360
6361       if (i == 1 && code == COND_EXPR)
6362         {
6363           /* Record how value of COND_EXPR is defined.  */
6364           if (dt == vect_constant_def)
6365             {
6366               cond_reduc_dt = dt;
6367               cond_reduc_val = ops[i];
6368             }
6369           if (dt == vect_induction_def
6370               && def_stmt != NULL
6371               && is_nonwrapping_integer_induction (def_stmt, loop))
6372             {
6373               cond_reduc_dt = dt;
6374               cond_reduc_def_stmt = def_stmt;
6375             }
6376         }
6377     }
6378
6379   if (!vectype_in)
6380     vectype_in = vectype_out;
6381
6382   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6383      directly used in stmt.  */
6384   if (reduc_index == -1)
6385     {
6386       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6387         {
6388           if (dump_enabled_p ())
6389             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6390                              "in-order reduction chain without SLP.\n");
6391           return false;
6392         }
6393
6394       if (orig_stmt)
6395         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6396       else
6397         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6398     }
6399
6400   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6401     return false;
6402
6403   if (!(reduc_index == -1
6404         || dts[reduc_index] == vect_reduction_def
6405         || dts[reduc_index] == vect_nested_cycle
6406         || ((dts[reduc_index] == vect_internal_def
6407              || dts[reduc_index] == vect_external_def
6408              || dts[reduc_index] == vect_constant_def
6409              || dts[reduc_index] == vect_induction_def)
6410             && nested_cycle && found_nested_cycle_def)))
6411     {
6412       /* For pattern recognized stmts, orig_stmt might be a reduction,
6413          but some helper statements for the pattern might not, or
6414          might be COND_EXPRs with reduction uses in the condition.  */
6415       gcc_assert (orig_stmt);
6416       return false;
6417     }
6418
6419   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6420   /* PHIs should not participate in patterns.  */
6421   gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6422   enum vect_reduction_type v_reduc_type
6423     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6424   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6425
6426   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6427   /* If we have a condition reduction, see if we can simplify it further.  */
6428   if (v_reduc_type == COND_REDUCTION)
6429     {
6430       /* TODO: We can't yet handle reduction chains, since we need to treat
6431          each COND_EXPR in the chain specially, not just the last one.
6432          E.g. for:
6433
6434             x_1 = PHI <x_3, ...>
6435             x_2 = a_2 ? ... : x_1;
6436             x_3 = a_3 ? ... : x_2;
6437
6438          we're interested in the last element in x_3 for which a_2 || a_3
6439          is true, whereas the current reduction chain handling would
6440          vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6441          as a reduction operation.  */
6442       if (reduc_index == -1)
6443         {
6444           if (dump_enabled_p ())
6445             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6446                              "conditional reduction chains not supported\n");
6447           return false;
6448         }
6449
6450       /* vect_is_simple_reduction ensured that operand 2 is the
6451          loop-carried operand.  */
6452       gcc_assert (reduc_index == 2);
6453
6454       /* Loop peeling modifies initial value of reduction PHI, which
6455          makes the reduction stmt to be transformed different to the
6456          original stmt analyzed.  We need to record reduction code for
6457          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6458          it can be used directly at transform stage.  */
6459       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6460           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6461         {
6462           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6463           gcc_assert (cond_reduc_dt == vect_constant_def);
6464           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6465         }
6466       else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6467                                                vectype_in, OPTIMIZE_FOR_SPEED))
6468         {
6469           if (dump_enabled_p ())
6470             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6471                              "optimizing condition reduction with"
6472                              " FOLD_EXTRACT_LAST.\n");
6473           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6474         }
6475       else if (cond_reduc_dt == vect_induction_def)
6476         {
6477           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6478           tree base
6479             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6480           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6481
6482           gcc_assert (TREE_CODE (base) == INTEGER_CST
6483                       && TREE_CODE (step) == INTEGER_CST);
6484           cond_reduc_val = NULL_TREE;
6485           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6486              above base; punt if base is the minimum value of the type for
6487              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6488           if (tree_int_cst_sgn (step) == -1)
6489             {
6490               cond_reduc_op_code = MIN_EXPR;
6491               if (tree_int_cst_sgn (base) == -1)
6492                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6493               else if (tree_int_cst_lt (base,
6494                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6495                 cond_reduc_val
6496                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6497             }
6498           else
6499             {
6500               cond_reduc_op_code = MAX_EXPR;
6501               if (tree_int_cst_sgn (base) == 1)
6502                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6503               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6504                                         base))
6505                 cond_reduc_val
6506                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6507             }
6508           if (cond_reduc_val)
6509             {
6510               if (dump_enabled_p ())
6511                 dump_printf_loc (MSG_NOTE, vect_location,
6512                                  "condition expression based on "
6513                                  "integer induction.\n");
6514               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6515                 = INTEGER_INDUC_COND_REDUCTION;
6516             }
6517         }
6518       else if (cond_reduc_dt == vect_constant_def)
6519         {
6520           enum vect_def_type cond_initial_dt;
6521           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6522           tree cond_initial_val
6523             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6524
6525           gcc_assert (cond_reduc_val != NULL_TREE);
6526           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6527           if (cond_initial_dt == vect_constant_def
6528               && types_compatible_p (TREE_TYPE (cond_initial_val),
6529                                      TREE_TYPE (cond_reduc_val)))
6530             {
6531               tree e = fold_binary (LE_EXPR, boolean_type_node,
6532                                     cond_initial_val, cond_reduc_val);
6533               if (e && (integer_onep (e) || integer_zerop (e)))
6534                 {
6535                   if (dump_enabled_p ())
6536                     dump_printf_loc (MSG_NOTE, vect_location,
6537                                      "condition expression based on "
6538                                      "compile time constant.\n");
6539                   /* Record reduction code at analysis stage.  */
6540                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6541                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6542                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6543                     = CONST_COND_REDUCTION;
6544                 }
6545             }
6546         }
6547     }
6548
6549   if (orig_stmt)
6550     gcc_assert (tmp == orig_stmt
6551                 || (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp))
6552                     == orig_stmt));
6553   else
6554     /* We changed STMT to be the first stmt in reduction chain, hence we
6555        check that in this case the first element in the chain is STMT.  */
6556     gcc_assert (stmt == tmp
6557                 || REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6558
6559   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6560     return false;
6561
6562   if (slp_node)
6563     ncopies = 1;
6564   else
6565     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6566
6567   gcc_assert (ncopies >= 1);
6568
6569   vec_mode = TYPE_MODE (vectype_in);
6570   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6571
6572   if (code == COND_EXPR)
6573     {
6574       /* Only call during the analysis stage, otherwise we'll lose
6575          STMT_VINFO_TYPE.  */
6576       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6577                                                 ops[reduc_index], 0, NULL,
6578                                                 cost_vec))
6579         {
6580           if (dump_enabled_p ())
6581             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6582                              "unsupported condition in reduction\n");
6583           return false;
6584         }
6585     }
6586   else
6587     {
6588       /* 4. Supportable by target?  */
6589
6590       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6591           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6592         {
6593           /* Shifts and rotates are only supported by vectorizable_shifts,
6594              not vectorizable_reduction.  */
6595           if (dump_enabled_p ())
6596             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6597                              "unsupported shift or rotation.\n");
6598           return false;
6599         }
6600
6601       /* 4.1. check support for the operation in the loop  */
6602       optab = optab_for_tree_code (code, vectype_in, optab_default);
6603       if (!optab)
6604         {
6605           if (dump_enabled_p ())
6606             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6607                              "no optab.\n");
6608
6609           return false;
6610         }
6611
6612       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6613         {
6614           if (dump_enabled_p ())
6615             dump_printf (MSG_NOTE, "op not supported by target.\n");
6616
6617           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6618               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6619             return false;
6620
6621           if (dump_enabled_p ())
6622             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6623         }
6624
6625       /* Worthwhile without SIMD support?  */
6626       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6627           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6628         {
6629           if (dump_enabled_p ())
6630             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6631                              "not worthwhile without SIMD support.\n");
6632
6633           return false;
6634         }
6635     }
6636
6637   /* 4.2. Check support for the epilog operation.
6638
6639           If STMT represents a reduction pattern, then the type of the
6640           reduction variable may be different than the type of the rest
6641           of the arguments.  For example, consider the case of accumulation
6642           of shorts into an int accumulator; The original code:
6643                         S1: int_a = (int) short_a;
6644           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6645
6646           was replaced with:
6647                         STMT: int_acc = widen_sum <short_a, int_acc>
6648
6649           This means that:
6650           1. The tree-code that is used to create the vector operation in the
6651              epilog code (that reduces the partial results) is not the
6652              tree-code of STMT, but is rather the tree-code of the original
6653              stmt from the pattern that STMT is replacing.  I.e, in the example
6654              above we want to use 'widen_sum' in the loop, but 'plus' in the
6655              epilog.
6656           2. The type (mode) we use to check available target support
6657              for the vector operation to be created in the *epilog*, is
6658              determined by the type of the reduction variable (in the example
6659              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6660              However the type (mode) we use to check available target support
6661              for the vector operation to be created *inside the loop*, is
6662              determined by the type of the other arguments to STMT (in the
6663              example we'd check this: optab_handler (widen_sum_optab,
6664              vect_short_mode)).
6665
6666           This is contrary to "regular" reductions, in which the types of all
6667           the arguments are the same as the type of the reduction variable.
6668           For "regular" reductions we can therefore use the same vector type
6669           (and also the same tree-code) when generating the epilog code and
6670           when generating the code inside the loop.  */
6671
6672   vect_reduction_type reduction_type
6673     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6674   if (orig_stmt
6675       && (reduction_type == TREE_CODE_REDUCTION
6676           || reduction_type == FOLD_LEFT_REDUCTION))
6677     {
6678       /* This is a reduction pattern: get the vectype from the type of the
6679          reduction variable, and get the tree-code from orig_stmt.  */
6680       orig_code = gimple_assign_rhs_code (orig_stmt);
6681       gcc_assert (vectype_out);
6682       vec_mode = TYPE_MODE (vectype_out);
6683     }
6684   else
6685     {
6686       /* Regular reduction: the same vectype and tree-code as used for
6687          the vector code inside the loop can be used for the epilog code. */
6688       orig_code = code;
6689
6690       if (code == MINUS_EXPR)
6691         orig_code = PLUS_EXPR;
6692
6693       /* For simple condition reductions, replace with the actual expression
6694          we want to base our reduction around.  */
6695       if (reduction_type == CONST_COND_REDUCTION)
6696         {
6697           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6698           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6699         }
6700       else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6701         orig_code = cond_reduc_op_code;
6702     }
6703
6704   if (nested_cycle)
6705     {
6706       def_bb = gimple_bb (reduc_def_stmt);
6707       def_stmt_loop = def_bb->loop_father;
6708       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6709                                        loop_preheader_edge (def_stmt_loop));
6710       stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6711       if (def_arg_stmt_info
6712           && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6713               == vect_double_reduction_def))
6714         double_reduc = true;
6715     }
6716
6717   reduc_fn = IFN_LAST;
6718
6719   if (reduction_type == TREE_CODE_REDUCTION
6720       || reduction_type == FOLD_LEFT_REDUCTION
6721       || reduction_type == INTEGER_INDUC_COND_REDUCTION
6722       || reduction_type == CONST_COND_REDUCTION)
6723     {
6724       if (reduction_type == FOLD_LEFT_REDUCTION
6725           ? fold_left_reduction_fn (orig_code, &reduc_fn)
6726           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6727         {
6728           if (reduc_fn != IFN_LAST
6729               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6730                                                   OPTIMIZE_FOR_SPEED))
6731             {
6732               if (dump_enabled_p ())
6733                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6734                                  "reduc op not supported by target.\n");
6735
6736               reduc_fn = IFN_LAST;
6737             }
6738         }
6739       else
6740         {
6741           if (!nested_cycle || double_reduc)
6742             {
6743               if (dump_enabled_p ())
6744                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6745                                  "no reduc code for scalar code.\n");
6746
6747               return false;
6748             }
6749         }
6750     }
6751   else if (reduction_type == COND_REDUCTION)
6752     {
6753       int scalar_precision
6754         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6755       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6756       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6757                                                 nunits_out);
6758
6759       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6760                                           OPTIMIZE_FOR_SPEED))
6761         reduc_fn = IFN_REDUC_MAX;
6762     }
6763
6764   if (reduction_type != EXTRACT_LAST_REDUCTION
6765       && reduc_fn == IFN_LAST
6766       && !nunits_out.is_constant ())
6767     {
6768       if (dump_enabled_p ())
6769         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6770                          "missing target support for reduction on"
6771                          " variable-length vectors.\n");
6772       return false;
6773     }
6774
6775   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6776       && ncopies > 1)
6777     {
6778       if (dump_enabled_p ())
6779         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6780                          "multiple types in double reduction or condition "
6781                          "reduction.\n");
6782       return false;
6783     }
6784
6785   /* For SLP reductions, see if there is a neutral value we can use.  */
6786   tree neutral_op = NULL_TREE;
6787   if (slp_node)
6788     neutral_op = neutral_op_for_slp_reduction
6789                    (slp_node_instance->reduc_phis, code,
6790                     REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6791
6792   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6793     {
6794       /* We can't support in-order reductions of code such as this:
6795
6796            for (int i = 0; i < n1; ++i)
6797              for (int j = 0; j < n2; ++j)
6798                l += a[j];
6799
6800          since GCC effectively transforms the loop when vectorizing:
6801
6802            for (int i = 0; i < n1 / VF; ++i)
6803              for (int j = 0; j < n2; ++j)
6804                for (int k = 0; k < VF; ++k)
6805                  l += a[j];
6806
6807          which is a reassociation of the original operation.  */
6808       if (dump_enabled_p ())
6809         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6810                          "in-order double reduction not supported.\n");
6811
6812       return false;
6813     }
6814
6815   if (reduction_type == FOLD_LEFT_REDUCTION
6816       && slp_node
6817       && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6818     {
6819       /* We cannot use in-order reductions in this case because there is
6820          an implicit reassociation of the operations involved.  */
6821       if (dump_enabled_p ())
6822         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6823                          "in-order unchained SLP reductions not supported.\n");
6824       return false;
6825     }
6826
6827   /* For double reductions, and for SLP reductions with a neutral value,
6828      we construct a variable-length initial vector by loading a vector
6829      full of the neutral value and then shift-and-inserting the start
6830      values into the low-numbered elements.  */
6831   if ((double_reduc || neutral_op)
6832       && !nunits_out.is_constant ()
6833       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6834                                           vectype_out, OPTIMIZE_FOR_SPEED))
6835     {
6836       if (dump_enabled_p ())
6837         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6838                          "reduction on variable-length vectors requires"
6839                          " target support for a vector-shift-and-insert"
6840                          " operation.\n");
6841       return false;
6842     }
6843
6844   /* Check extra constraints for variable-length unchained SLP reductions.  */
6845   if (STMT_SLP_TYPE (stmt_info)
6846       && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6847       && !nunits_out.is_constant ())
6848     {
6849       /* We checked above that we could build the initial vector when
6850          there's a neutral element value.  Check here for the case in
6851          which each SLP statement has its own initial value and in which
6852          that value needs to be repeated for every instance of the
6853          statement within the initial vector.  */
6854       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6855       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6856       if (!neutral_op
6857           && !can_duplicate_and_interleave_p (group_size, elt_mode))
6858         {
6859           if (dump_enabled_p ())
6860             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6861                              "unsupported form of SLP reduction for"
6862                              " variable-length vectors: cannot build"
6863                              " initial vector.\n");
6864           return false;
6865         }
6866       /* The epilogue code relies on the number of elements being a multiple
6867          of the group size.  The duplicate-and-interleave approach to setting
6868          up the the initial vector does too.  */
6869       if (!multiple_p (nunits_out, group_size))
6870         {
6871           if (dump_enabled_p ())
6872             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6873                              "unsupported form of SLP reduction for"
6874                              " variable-length vectors: the vector size"
6875                              " is not a multiple of the number of results.\n");
6876           return false;
6877         }
6878     }
6879
6880   /* In case of widenning multiplication by a constant, we update the type
6881      of the constant to be the type of the other operand.  We check that the
6882      constant fits the type in the pattern recognition pass.  */
6883   if (code == DOT_PROD_EXPR
6884       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6885     {
6886       if (TREE_CODE (ops[0]) == INTEGER_CST)
6887         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6888       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6889         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6890       else
6891         {
6892           if (dump_enabled_p ())
6893             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6894                              "invalid types in dot-prod\n");
6895
6896           return false;
6897         }
6898     }
6899
6900   if (reduction_type == COND_REDUCTION)
6901     {
6902       widest_int ni;
6903
6904       if (! max_loop_iterations (loop, &ni))
6905         {
6906           if (dump_enabled_p ())
6907             dump_printf_loc (MSG_NOTE, vect_location,
6908                              "loop count not known, cannot create cond "
6909                              "reduction.\n");
6910           return false;
6911         }
6912       /* Convert backedges to iterations.  */
6913       ni += 1;
6914
6915       /* The additional index will be the same type as the condition.  Check
6916          that the loop can fit into this less one (because we'll use up the
6917          zero slot for when there are no matches).  */
6918       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6919       if (wi::geu_p (ni, wi::to_widest (max_index)))
6920         {
6921           if (dump_enabled_p ())
6922             dump_printf_loc (MSG_NOTE, vect_location,
6923                              "loop size is greater than data size.\n");
6924           return false;
6925         }
6926     }
6927
6928   /* In case the vectorization factor (VF) is bigger than the number
6929      of elements that we can fit in a vectype (nunits), we have to generate
6930      more than one vector stmt - i.e - we need to "unroll" the
6931      vector stmt by a factor VF/nunits.  For more details see documentation
6932      in vectorizable_operation.  */
6933
6934   /* If the reduction is used in an outer loop we need to generate
6935      VF intermediate results, like so (e.g. for ncopies=2):
6936         r0 = phi (init, r0)
6937         r1 = phi (init, r1)
6938         r0 = x0 + r0;
6939         r1 = x1 + r1;
6940     (i.e. we generate VF results in 2 registers).
6941     In this case we have a separate def-use cycle for each copy, and therefore
6942     for each copy we get the vector def for the reduction variable from the
6943     respective phi node created for this copy.
6944
6945     Otherwise (the reduction is unused in the loop nest), we can combine
6946     together intermediate results, like so (e.g. for ncopies=2):
6947         r = phi (init, r)
6948         r = x0 + r;
6949         r = x1 + r;
6950    (i.e. we generate VF/2 results in a single register).
6951    In this case for each copy we get the vector def for the reduction variable
6952    from the vectorized reduction operation generated in the previous iteration.
6953
6954    This only works when we see both the reduction PHI and its only consumer
6955    in vectorizable_reduction and there are no intermediate stmts
6956    participating.  */
6957   stmt_vec_info use_stmt_info;
6958   tree reduc_phi_result = gimple_phi_result (reduc_def_stmt);
6959   if (ncopies > 1
6960       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6961       && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6962       && (use_stmt_info == stmt_info
6963           || STMT_VINFO_RELATED_STMT (use_stmt_info) == stmt))
6964     {
6965       single_defuse_cycle = true;
6966       epilog_copies = 1;
6967     }
6968   else
6969     epilog_copies = ncopies;
6970
6971   /* If the reduction stmt is one of the patterns that have lane
6972      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6973   if ((ncopies > 1
6974        && ! single_defuse_cycle)
6975       && (code == DOT_PROD_EXPR
6976           || code == WIDEN_SUM_EXPR
6977           || code == SAD_EXPR))
6978     {
6979       if (dump_enabled_p ())
6980         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6981                          "multi def-use cycle not possible for lane-reducing "
6982                          "reduction operation\n");
6983       return false;
6984     }
6985
6986   if (slp_node)
6987     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6988   else
6989     vec_num = 1;
6990
6991   internal_fn cond_fn = get_conditional_internal_fn (code);
6992   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6993
6994   if (!vec_stmt) /* transformation not required.  */
6995     {
6996       vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6997       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6998         {
6999           if (reduction_type != FOLD_LEFT_REDUCTION
7000               && (cond_fn == IFN_LAST
7001                   || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7002                                                       OPTIMIZE_FOR_SPEED)))
7003             {
7004               if (dump_enabled_p ())
7005                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7006                                  "can't use a fully-masked loop because no"
7007                                  " conditional operation is available.\n");
7008               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7009             }
7010           else if (reduc_index == -1)
7011             {
7012               if (dump_enabled_p ())
7013                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7014                                  "can't use a fully-masked loop for chained"
7015                                  " reductions.\n");
7016               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7017             }
7018           else
7019             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7020                                    vectype_in);
7021         }
7022       if (dump_enabled_p ()
7023           && reduction_type == FOLD_LEFT_REDUCTION)
7024         dump_printf_loc (MSG_NOTE, vect_location,
7025                          "using an in-order (fold-left) reduction.\n");
7026       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7027       return true;
7028     }
7029
7030   /* Transform.  */
7031
7032   if (dump_enabled_p ())
7033     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7034
7035   /* FORNOW: Multiple types are not supported for condition.  */
7036   if (code == COND_EXPR)
7037     gcc_assert (ncopies == 1);
7038
7039   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7040
7041   if (reduction_type == FOLD_LEFT_REDUCTION)
7042     return vectorize_fold_left_reduction
7043       (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7044        reduc_fn, ops, vectype_in, reduc_index, masks);
7045
7046   if (reduction_type == EXTRACT_LAST_REDUCTION)
7047     {
7048       gcc_assert (!slp_node);
7049       return vectorizable_condition (stmt, gsi, vec_stmt,
7050                                      NULL, reduc_index, NULL, NULL);
7051     }
7052
7053   /* Create the destination vector  */
7054   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7055
7056   prev_stmt_info = NULL;
7057   prev_phi_info = NULL;
7058   if (!slp_node)
7059     {
7060       vec_oprnds0.create (1);
7061       vec_oprnds1.create (1);
7062       if (op_type == ternary_op)
7063         vec_oprnds2.create (1);
7064     }
7065
7066   phis.create (vec_num);
7067   vect_defs.create (vec_num);
7068   if (!slp_node)
7069     vect_defs.quick_push (NULL_TREE);
7070
7071   if (slp_node)
7072     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7073   else
7074     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7075
7076   for (j = 0; j < ncopies; j++)
7077     {
7078       if (code == COND_EXPR)
7079         {
7080           gcc_assert (!slp_node);
7081           vectorizable_condition (stmt, gsi, vec_stmt,
7082                                   PHI_RESULT (phis[0]),
7083                                   reduc_index, NULL, NULL);
7084           /* Multiple types are not supported for condition.  */
7085           break;
7086         }
7087
7088       /* Handle uses.  */
7089       if (j == 0)
7090         {
7091           if (slp_node)
7092             {
7093               /* Get vec defs for all the operands except the reduction index,
7094                  ensuring the ordering of the ops in the vector is kept.  */
7095               auto_vec<tree, 3> slp_ops;
7096               auto_vec<vec<tree>, 3> vec_defs;
7097
7098               slp_ops.quick_push (ops[0]);
7099               slp_ops.quick_push (ops[1]);
7100               if (op_type == ternary_op)
7101                 slp_ops.quick_push (ops[2]);
7102
7103               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7104
7105               vec_oprnds0.safe_splice (vec_defs[0]);
7106               vec_defs[0].release ();
7107               vec_oprnds1.safe_splice (vec_defs[1]);
7108               vec_defs[1].release ();
7109               if (op_type == ternary_op)
7110                 {
7111                   vec_oprnds2.safe_splice (vec_defs[2]);
7112                   vec_defs[2].release ();
7113                 }
7114             }
7115           else
7116             {
7117               vec_oprnds0.quick_push
7118                 (vect_get_vec_def_for_operand (ops[0], stmt));
7119               vec_oprnds1.quick_push
7120                 (vect_get_vec_def_for_operand (ops[1], stmt));
7121               if (op_type == ternary_op)
7122                 vec_oprnds2.quick_push
7123                   (vect_get_vec_def_for_operand (ops[2], stmt));
7124             }
7125         }
7126       else
7127         {
7128           if (!slp_node)
7129             {
7130               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7131
7132               if (single_defuse_cycle && reduc_index == 0)
7133                 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7134               else
7135                 vec_oprnds0[0]
7136                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7137               if (single_defuse_cycle && reduc_index == 1)
7138                 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7139               else
7140                 vec_oprnds1[0]
7141                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7142               if (op_type == ternary_op)
7143                 {
7144                   if (single_defuse_cycle && reduc_index == 2)
7145                     vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7146                   else
7147                     vec_oprnds2[0]
7148                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7149                 }
7150             }
7151         }
7152
7153       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7154         {
7155           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7156           if (masked_loop_p)
7157             {
7158               /* Make sure that the reduction accumulator is vop[0].  */
7159               if (reduc_index == 1)
7160                 {
7161                   gcc_assert (commutative_tree_code (code));
7162                   std::swap (vop[0], vop[1]);
7163                 }
7164               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7165                                               vectype_in, i * ncopies + j);
7166               gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7167                                                         vop[0], vop[1],
7168                                                         vop[0]);
7169               new_temp = make_ssa_name (vec_dest, call);
7170               gimple_call_set_lhs (call, new_temp);
7171               gimple_call_set_nothrow (call, true);
7172               new_stmt = call;
7173             }
7174           else
7175             {
7176               if (op_type == ternary_op)
7177                 vop[2] = vec_oprnds2[i];
7178
7179               new_stmt = gimple_build_assign (vec_dest, code,
7180                                               vop[0], vop[1], vop[2]);
7181               new_temp = make_ssa_name (vec_dest, new_stmt);
7182               gimple_assign_set_lhs (new_stmt, new_temp);
7183             }
7184           vect_finish_stmt_generation (stmt, new_stmt, gsi);
7185
7186           if (slp_node)
7187             {
7188               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7189               vect_defs.quick_push (new_temp);
7190             }
7191           else
7192             vect_defs[0] = new_temp;
7193         }
7194
7195       if (slp_node)
7196         continue;
7197
7198       if (j == 0)
7199         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7200       else
7201         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7202
7203       prev_stmt_info = vinfo_for_stmt (new_stmt);
7204     }
7205
7206   /* Finalize the reduction-phi (set its arguments) and create the
7207      epilog reduction code.  */
7208   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7209     vect_defs[0] = gimple_get_lhs (*vec_stmt);
7210
7211   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7212                                     epilog_copies, reduc_fn, phis,
7213                                     double_reduc, slp_node, slp_node_instance,
7214                                     cond_reduc_val, cond_reduc_op_code,
7215                                     neutral_op);
7216
7217   return true;
7218 }
7219
7220 /* Function vect_min_worthwhile_factor.
7221
7222    For a loop where we could vectorize the operation indicated by CODE,
7223    return the minimum vectorization factor that makes it worthwhile
7224    to use generic vectors.  */
7225 static unsigned int
7226 vect_min_worthwhile_factor (enum tree_code code)
7227 {
7228   switch (code)
7229     {
7230     case PLUS_EXPR:
7231     case MINUS_EXPR:
7232     case NEGATE_EXPR:
7233       return 4;
7234
7235     case BIT_AND_EXPR:
7236     case BIT_IOR_EXPR:
7237     case BIT_XOR_EXPR:
7238     case BIT_NOT_EXPR:
7239       return 2;
7240
7241     default:
7242       return INT_MAX;
7243     }
7244 }
7245
7246 /* Return true if VINFO indicates we are doing loop vectorization and if
7247    it is worth decomposing CODE operations into scalar operations for
7248    that loop's vectorization factor.  */
7249
7250 bool
7251 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7252 {
7253   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7254   unsigned HOST_WIDE_INT value;
7255   return (loop_vinfo
7256           && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7257           && value >= vect_min_worthwhile_factor (code));
7258 }
7259
7260 /* Function vectorizable_induction
7261
7262    Check if PHI performs an induction computation that can be vectorized.
7263    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7264    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7265    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
7266
7267 bool
7268 vectorizable_induction (gimple *phi,
7269                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7270                         gimple **vec_stmt, slp_tree slp_node,
7271                         stmt_vector_for_cost *cost_vec)
7272 {
7273   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7274   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7275   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7276   unsigned ncopies;
7277   bool nested_in_vect_loop = false;
7278   struct loop *iv_loop;
7279   tree vec_def;
7280   edge pe = loop_preheader_edge (loop);
7281   basic_block new_bb;
7282   tree new_vec, vec_init, vec_step, t;
7283   tree new_name;
7284   gimple *new_stmt;
7285   gphi *induction_phi;
7286   tree induc_def, vec_dest;
7287   tree init_expr, step_expr;
7288   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7289   unsigned i;
7290   tree expr;
7291   gimple_seq stmts;
7292   imm_use_iterator imm_iter;
7293   use_operand_p use_p;
7294   gimple *exit_phi;
7295   edge latch_e;
7296   tree loop_arg;
7297   gimple_stmt_iterator si;
7298   basic_block bb = gimple_bb (phi);
7299
7300   if (gimple_code (phi) != GIMPLE_PHI)
7301     return false;
7302
7303   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7304     return false;
7305
7306   /* Make sure it was recognized as induction computation.  */
7307   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7308     return false;
7309
7310   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7311   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7312
7313   if (slp_node)
7314     ncopies = 1;
7315   else
7316     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7317   gcc_assert (ncopies >= 1);
7318
7319   /* FORNOW. These restrictions should be relaxed.  */
7320   if (nested_in_vect_loop_p (loop, phi))
7321     {
7322       imm_use_iterator imm_iter;
7323       use_operand_p use_p;
7324       gimple *exit_phi;
7325       edge latch_e;
7326       tree loop_arg;
7327
7328       if (ncopies > 1)
7329         {
7330           if (dump_enabled_p ())
7331             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7332                              "multiple types in nested loop.\n");
7333           return false;
7334         }
7335
7336       /* FORNOW: outer loop induction with SLP not supported.  */
7337       if (STMT_SLP_TYPE (stmt_info))
7338         return false;
7339
7340       exit_phi = NULL;
7341       latch_e = loop_latch_edge (loop->inner);
7342       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7343       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7344         {
7345           gimple *use_stmt = USE_STMT (use_p);
7346           if (is_gimple_debug (use_stmt))
7347             continue;
7348
7349           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7350             {
7351               exit_phi = use_stmt;
7352               break;
7353             }
7354         }
7355       if (exit_phi)
7356         {
7357           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7358           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7359                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7360             {
7361               if (dump_enabled_p ())
7362                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7363                                  "inner-loop induction only used outside "
7364                                  "of the outer vectorized loop.\n");
7365               return false;
7366             }
7367         }
7368
7369       nested_in_vect_loop = true;
7370       iv_loop = loop->inner;
7371     }
7372   else
7373     iv_loop = loop;
7374   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7375
7376   if (slp_node && !nunits.is_constant ())
7377     {
7378       /* The current SLP code creates the initial value element-by-element.  */
7379       if (dump_enabled_p ())
7380         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7381                          "SLP induction not supported for variable-length"
7382                          " vectors.\n");
7383       return false;
7384     }
7385
7386   if (!vec_stmt) /* transformation not required.  */
7387     {
7388       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7389       DUMP_VECT_SCOPE ("vectorizable_induction");
7390       vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7391       return true;
7392     }
7393
7394   /* Transform.  */
7395
7396   /* Compute a vector variable, initialized with the first VF values of
7397      the induction variable.  E.g., for an iv with IV_PHI='X' and
7398      evolution S, for a vector of 4 units, we want to compute:
7399      [X, X + S, X + 2*S, X + 3*S].  */
7400
7401   if (dump_enabled_p ())
7402     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7403
7404   latch_e = loop_latch_edge (iv_loop);
7405   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7406
7407   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7408   gcc_assert (step_expr != NULL_TREE);
7409
7410   pe = loop_preheader_edge (iv_loop);
7411   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7412                                      loop_preheader_edge (iv_loop));
7413
7414   stmts = NULL;
7415   if (!nested_in_vect_loop)
7416     {
7417       /* Convert the initial value to the desired type.  */
7418       tree new_type = TREE_TYPE (vectype);
7419       init_expr = gimple_convert (&stmts, new_type, init_expr);
7420
7421       /* If we are using the loop mask to "peel" for alignment then we need
7422          to adjust the start value here.  */
7423       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7424       if (skip_niters != NULL_TREE)
7425         {
7426           if (FLOAT_TYPE_P (vectype))
7427             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7428                                         skip_niters);
7429           else
7430             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7431           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7432                                          skip_niters, step_expr);
7433           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7434                                     init_expr, skip_step);
7435         }
7436     }
7437
7438   /* Convert the step to the desired type.  */
7439   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7440
7441   if (stmts)
7442     {
7443       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7444       gcc_assert (!new_bb);
7445     }
7446
7447   /* Find the first insertion point in the BB.  */
7448   si = gsi_after_labels (bb);
7449
7450   /* For SLP induction we have to generate several IVs as for example
7451      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7452      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7453      [VF*S, VF*S, VF*S, VF*S] for all.  */
7454   if (slp_node)
7455     {
7456       /* Enforced above.  */
7457       unsigned int const_nunits = nunits.to_constant ();
7458
7459       /* Generate [VF*S, VF*S, ... ].  */
7460       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7461         {
7462           expr = build_int_cst (integer_type_node, vf);
7463           expr = fold_convert (TREE_TYPE (step_expr), expr);
7464         }
7465       else
7466         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7467       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7468                               expr, step_expr);
7469       if (! CONSTANT_CLASS_P (new_name))
7470         new_name = vect_init_vector (phi, new_name,
7471                                      TREE_TYPE (step_expr), NULL);
7472       new_vec = build_vector_from_val (vectype, new_name);
7473       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7474
7475       /* Now generate the IVs.  */
7476       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7477       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7478       unsigned elts = const_nunits * nvects;
7479       unsigned nivs = least_common_multiple (group_size,
7480                                              const_nunits) / const_nunits;
7481       gcc_assert (elts % group_size == 0);
7482       tree elt = init_expr;
7483       unsigned ivn;
7484       for (ivn = 0; ivn < nivs; ++ivn)
7485         {
7486           tree_vector_builder elts (vectype, const_nunits, 1);
7487           stmts = NULL;
7488           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7489             {
7490               if (ivn*const_nunits + eltn >= group_size
7491                   && (ivn * const_nunits + eltn) % group_size == 0)
7492                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7493                                     elt, step_expr);
7494               elts.quick_push (elt);
7495             }
7496           vec_init = gimple_build_vector (&stmts, &elts);
7497           if (stmts)
7498             {
7499               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7500               gcc_assert (!new_bb);
7501             }
7502
7503           /* Create the induction-phi that defines the induction-operand.  */
7504           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7505           induction_phi = create_phi_node (vec_dest, iv_loop->header);
7506           loop_vinfo->add_stmt (induction_phi);
7507           induc_def = PHI_RESULT (induction_phi);
7508
7509           /* Create the iv update inside the loop  */
7510           vec_def = make_ssa_name (vec_dest);
7511           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7512           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7513           loop_vinfo->add_stmt (new_stmt);
7514
7515           /* Set the arguments of the phi node:  */
7516           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7517           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7518                        UNKNOWN_LOCATION);
7519
7520           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7521         }
7522
7523       /* Re-use IVs when we can.  */
7524       if (ivn < nvects)
7525         {
7526           unsigned vfp
7527             = least_common_multiple (group_size, const_nunits) / group_size;
7528           /* Generate [VF'*S, VF'*S, ... ].  */
7529           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7530             {
7531               expr = build_int_cst (integer_type_node, vfp);
7532               expr = fold_convert (TREE_TYPE (step_expr), expr);
7533             }
7534           else
7535             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7536           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7537                                   expr, step_expr);
7538           if (! CONSTANT_CLASS_P (new_name))
7539             new_name = vect_init_vector (phi, new_name,
7540                                          TREE_TYPE (step_expr), NULL);
7541           new_vec = build_vector_from_val (vectype, new_name);
7542           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7543           for (; ivn < nvects; ++ivn)
7544             {
7545               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7546               tree def;
7547               if (gimple_code (iv) == GIMPLE_PHI)
7548                 def = gimple_phi_result (iv);
7549               else
7550                 def = gimple_assign_lhs (iv);
7551               new_stmt = gimple_build_assign (make_ssa_name (vectype),
7552                                               PLUS_EXPR,
7553                                               def, vec_step);
7554               if (gimple_code (iv) == GIMPLE_PHI)
7555                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7556               else
7557                 {
7558                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7559                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7560                 }
7561               loop_vinfo->add_stmt (new_stmt);
7562               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7563             }
7564         }
7565
7566       return true;
7567     }
7568
7569   /* Create the vector that holds the initial_value of the induction.  */
7570   if (nested_in_vect_loop)
7571     {
7572       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7573          been created during vectorization of previous stmts.  We obtain it
7574          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7575       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7576       /* If the initial value is not of proper type, convert it.  */
7577       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7578         {
7579           new_stmt
7580             = gimple_build_assign (vect_get_new_ssa_name (vectype,
7581                                                           vect_simple_var,
7582                                                           "vec_iv_"),
7583                                    VIEW_CONVERT_EXPR,
7584                                    build1 (VIEW_CONVERT_EXPR, vectype,
7585                                            vec_init));
7586           vec_init = gimple_assign_lhs (new_stmt);
7587           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7588                                                  new_stmt);
7589           gcc_assert (!new_bb);
7590           loop_vinfo->add_stmt (new_stmt);
7591         }
7592     }
7593   else
7594     {
7595       /* iv_loop is the loop to be vectorized. Create:
7596          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7597       stmts = NULL;
7598       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7599
7600       unsigned HOST_WIDE_INT const_nunits;
7601       if (nunits.is_constant (&const_nunits))
7602         {
7603           tree_vector_builder elts (vectype, const_nunits, 1);
7604           elts.quick_push (new_name);
7605           for (i = 1; i < const_nunits; i++)
7606             {
7607               /* Create: new_name_i = new_name + step_expr  */
7608               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7609                                        new_name, step_expr);
7610               elts.quick_push (new_name);
7611             }
7612           /* Create a vector from [new_name_0, new_name_1, ...,
7613              new_name_nunits-1]  */
7614           vec_init = gimple_build_vector (&stmts, &elts);
7615         }
7616       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7617         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
7618         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7619                                  new_name, step_expr);
7620       else
7621         {
7622           /* Build:
7623                 [base, base, base, ...]
7624                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7625           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7626           gcc_assert (flag_associative_math);
7627           tree index = build_index_vector (vectype, 0, 1);
7628           tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7629                                                         new_name);
7630           tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7631                                                         step_expr);
7632           vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7633           vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7634                                    vec_init, step_vec);
7635           vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7636                                    vec_init, base_vec);
7637         }
7638
7639       if (stmts)
7640         {
7641           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7642           gcc_assert (!new_bb);
7643         }
7644     }
7645
7646
7647   /* Create the vector that holds the step of the induction.  */
7648   if (nested_in_vect_loop)
7649     /* iv_loop is nested in the loop to be vectorized. Generate:
7650        vec_step = [S, S, S, S]  */
7651     new_name = step_expr;
7652   else
7653     {
7654       /* iv_loop is the loop to be vectorized. Generate:
7655           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7656       gimple_seq seq = NULL;
7657       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7658         {
7659           expr = build_int_cst (integer_type_node, vf);
7660           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7661         }
7662       else
7663         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7664       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7665                                expr, step_expr);
7666       if (seq)
7667         {
7668           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7669           gcc_assert (!new_bb);
7670         }
7671     }
7672
7673   t = unshare_expr (new_name);
7674   gcc_assert (CONSTANT_CLASS_P (new_name)
7675               || TREE_CODE (new_name) == SSA_NAME);
7676   new_vec = build_vector_from_val (vectype, t);
7677   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7678
7679
7680   /* Create the following def-use cycle:
7681      loop prolog:
7682          vec_init = ...
7683          vec_step = ...
7684      loop:
7685          vec_iv = PHI <vec_init, vec_loop>
7686          ...
7687          STMT
7688          ...
7689          vec_loop = vec_iv + vec_step;  */
7690
7691   /* Create the induction-phi that defines the induction-operand.  */
7692   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7693   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7694   stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7695   induc_def = PHI_RESULT (induction_phi);
7696
7697   /* Create the iv update inside the loop  */
7698   vec_def = make_ssa_name (vec_dest);
7699   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7700   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7701   stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7702
7703   /* Set the arguments of the phi node:  */
7704   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7705   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7706                UNKNOWN_LOCATION);
7707
7708   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7709
7710   /* In case that vectorization factor (VF) is bigger than the number
7711      of elements that we can fit in a vectype (nunits), we have to generate
7712      more than one vector stmt - i.e - we need to "unroll" the
7713      vector stmt by a factor VF/nunits.  For more details see documentation
7714      in vectorizable_operation.  */
7715
7716   if (ncopies > 1)
7717     {
7718       gimple_seq seq = NULL;
7719       stmt_vec_info prev_stmt_vinfo;
7720       /* FORNOW. This restriction should be relaxed.  */
7721       gcc_assert (!nested_in_vect_loop);
7722
7723       /* Create the vector that holds the step of the induction.  */
7724       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7725         {
7726           expr = build_int_cst (integer_type_node, nunits);
7727           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7728         }
7729       else
7730         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7731       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7732                                expr, step_expr);
7733       if (seq)
7734         {
7735           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7736           gcc_assert (!new_bb);
7737         }
7738
7739       t = unshare_expr (new_name);
7740       gcc_assert (CONSTANT_CLASS_P (new_name)
7741                   || TREE_CODE (new_name) == SSA_NAME);
7742       new_vec = build_vector_from_val (vectype, t);
7743       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7744
7745       vec_def = induc_def;
7746       prev_stmt_vinfo = induction_phi_info;
7747       for (i = 1; i < ncopies; i++)
7748         {
7749           /* vec_i = vec_prev + vec_step  */
7750           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7751                                           vec_def, vec_step);
7752           vec_def = make_ssa_name (vec_dest, new_stmt);
7753           gimple_assign_set_lhs (new_stmt, vec_def);
7754
7755           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7756           new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7757           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7758           prev_stmt_vinfo = new_stmt_info;
7759         }
7760     }
7761
7762   if (nested_in_vect_loop)
7763     {
7764       /* Find the loop-closed exit-phi of the induction, and record
7765          the final vector of induction results:  */
7766       exit_phi = NULL;
7767       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7768         {
7769           gimple *use_stmt = USE_STMT (use_p);
7770           if (is_gimple_debug (use_stmt))
7771             continue;
7772
7773           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7774             {
7775               exit_phi = use_stmt;
7776               break;
7777             }
7778         }
7779       if (exit_phi)
7780         {
7781           stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7782           /* FORNOW. Currently not supporting the case that an inner-loop induction
7783              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7784           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7785                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7786
7787           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7788           if (dump_enabled_p ())
7789             {
7790               dump_printf_loc (MSG_NOTE, vect_location,
7791                                "vector of inductions after inner-loop:");
7792               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7793             }
7794         }
7795     }
7796
7797
7798   if (dump_enabled_p ())
7799     {
7800       dump_printf_loc (MSG_NOTE, vect_location,
7801                        "transform induction: created def-use cycle: ");
7802       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7803       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7804                         SSA_NAME_DEF_STMT (vec_def), 0);
7805     }
7806
7807   return true;
7808 }
7809
7810 /* Function vectorizable_live_operation.
7811
7812    STMT computes a value that is used outside the loop.  Check if
7813    it can be supported.  */
7814
7815 bool
7816 vectorizable_live_operation (gimple *stmt,
7817                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7818                              slp_tree slp_node, int slp_index,
7819                              gimple **vec_stmt,
7820                              stmt_vector_for_cost *)
7821 {
7822   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7823   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7824   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7825   imm_use_iterator imm_iter;
7826   tree lhs, lhs_type, bitsize, vec_bitsize;
7827   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7828   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7829   int ncopies;
7830   gimple *use_stmt;
7831   auto_vec<tree> vec_oprnds;
7832   int vec_entry = 0;
7833   poly_uint64 vec_index = 0;
7834
7835   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7836
7837   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7838     return false;
7839
7840   /* FORNOW.  CHECKME.  */
7841   if (nested_in_vect_loop_p (loop, stmt))
7842     return false;
7843
7844   /* If STMT is not relevant and it is a simple assignment and its inputs are
7845      invariant then it can remain in place, unvectorized.  The original last
7846      scalar value that it computes will be used.  */
7847   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7848     {
7849       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7850       if (dump_enabled_p ())
7851         dump_printf_loc (MSG_NOTE, vect_location,
7852                          "statement is simple and uses invariant.  Leaving in "
7853                          "place.\n");
7854       return true;
7855     }
7856
7857   if (slp_node)
7858     ncopies = 1;
7859   else
7860     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7861
7862   if (slp_node)
7863     {
7864       gcc_assert (slp_index >= 0);
7865
7866       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7867       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7868
7869       /* Get the last occurrence of the scalar index from the concatenation of
7870          all the slp vectors. Calculate which slp vector it is and the index
7871          within.  */
7872       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7873
7874       /* Calculate which vector contains the result, and which lane of
7875          that vector we need.  */
7876       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7877         {
7878           if (dump_enabled_p ())
7879             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7880                              "Cannot determine which vector holds the"
7881                              " final result.\n");
7882           return false;
7883         }
7884     }
7885
7886   if (!vec_stmt)
7887     {
7888       /* No transformation required.  */
7889       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7890         {
7891           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7892                                                OPTIMIZE_FOR_SPEED))
7893             {
7894               if (dump_enabled_p ())
7895                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7896                                  "can't use a fully-masked loop because "
7897                                  "the target doesn't support extract last "
7898                                  "reduction.\n");
7899               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7900             }
7901           else if (slp_node)
7902             {
7903               if (dump_enabled_p ())
7904                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7905                                  "can't use a fully-masked loop because an "
7906                                  "SLP statement is live after the loop.\n");
7907               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7908             }
7909           else if (ncopies > 1)
7910             {
7911               if (dump_enabled_p ())
7912                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7913                                  "can't use a fully-masked loop because"
7914                                  " ncopies is greater than 1.\n");
7915               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7916             }
7917           else
7918             {
7919               gcc_assert (ncopies == 1 && !slp_node);
7920               vect_record_loop_mask (loop_vinfo,
7921                                      &LOOP_VINFO_MASKS (loop_vinfo),
7922                                      1, vectype);
7923             }
7924         }
7925       return true;
7926     }
7927
7928   /* If stmt has a related stmt, then use that for getting the lhs.  */
7929   if (is_pattern_stmt_p (stmt_info))
7930     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7931
7932   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7933         : gimple_get_lhs (stmt);
7934   lhs_type = TREE_TYPE (lhs);
7935
7936   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7937              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7938              : TYPE_SIZE (TREE_TYPE (vectype)));
7939   vec_bitsize = TYPE_SIZE (vectype);
7940
7941   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7942   tree vec_lhs, bitstart;
7943   if (slp_node)
7944     {
7945       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7946
7947       /* Get the correct slp vectorized stmt.  */
7948       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
7949       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7950         vec_lhs = gimple_phi_result (phi);
7951       else
7952         vec_lhs = gimple_get_lhs (vec_stmt);
7953
7954       /* Get entry to use.  */
7955       bitstart = bitsize_int (vec_index);
7956       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7957     }
7958   else
7959     {
7960       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7961       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7962       gcc_checking_assert (ncopies == 1
7963                            || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7964
7965       /* For multiple copies, get the last copy.  */
7966       for (int i = 1; i < ncopies; ++i)
7967         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7968                                                   vec_lhs);
7969
7970       /* Get the last lane in the vector.  */
7971       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7972     }
7973
7974   gimple_seq stmts = NULL;
7975   tree new_tree;
7976   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7977     {
7978       /* Emit:
7979
7980            SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7981
7982          where VEC_LHS is the vectorized live-out result and MASK is
7983          the loop mask for the final iteration.  */
7984       gcc_assert (ncopies == 1 && !slp_node);
7985       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7986       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7987                                       1, vectype, 0);
7988       tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7989                                       scalar_type, mask, vec_lhs);
7990
7991       /* Convert the extracted vector element to the required scalar type.  */
7992       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7993     }
7994   else
7995     {
7996       tree bftype = TREE_TYPE (vectype);
7997       if (VECTOR_BOOLEAN_TYPE_P (vectype))
7998         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7999       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8000       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8001                                        &stmts, true, NULL_TREE);
8002     }
8003
8004   if (stmts)
8005     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8006
8007   /* Replace use of lhs with newly computed result.  If the use stmt is a
8008      single arg PHI, just replace all uses of PHI result.  It's necessary
8009      because lcssa PHI defining lhs may be before newly inserted stmt.  */
8010   use_operand_p use_p;
8011   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8012     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8013         && !is_gimple_debug (use_stmt))
8014     {
8015       if (gimple_code (use_stmt) == GIMPLE_PHI
8016           && gimple_phi_num_args (use_stmt) == 1)
8017         {
8018           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8019         }
8020       else
8021         {
8022           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8023             SET_USE (use_p, new_tree);
8024         }
8025       update_stmt (use_stmt);
8026     }
8027
8028   return true;
8029 }
8030
8031 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
8032
8033 static void
8034 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8035 {
8036   ssa_op_iter op_iter;
8037   imm_use_iterator imm_iter;
8038   def_operand_p def_p;
8039   gimple *ustmt;
8040
8041   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8042     {
8043       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8044         {
8045           basic_block bb;
8046
8047           if (!is_gimple_debug (ustmt))
8048             continue;
8049
8050           bb = gimple_bb (ustmt);
8051
8052           if (!flow_bb_inside_loop_p (loop, bb))
8053             {
8054               if (gimple_debug_bind_p (ustmt))
8055                 {
8056                   if (dump_enabled_p ())
8057                     dump_printf_loc (MSG_NOTE, vect_location,
8058                                      "killing debug use\n");
8059
8060                   gimple_debug_bind_reset_value (ustmt);
8061                   update_stmt (ustmt);
8062                 }
8063               else
8064                 gcc_unreachable ();
8065             }
8066         }
8067     }
8068 }
8069
8070 /* Given loop represented by LOOP_VINFO, return true if computation of
8071    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8072    otherwise.  */
8073
8074 static bool
8075 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8076 {
8077   /* Constant case.  */
8078   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8079     {
8080       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8081       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8082
8083       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8084       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8085       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8086         return true;
8087     }
8088
8089   widest_int max;
8090   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8091   /* Check the upper bound of loop niters.  */
8092   if (get_max_loop_iterations (loop, &max))
8093     {
8094       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8095       signop sgn = TYPE_SIGN (type);
8096       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8097       if (max < type_max)
8098         return true;
8099     }
8100   return false;
8101 }
8102
8103 /* Return a mask type with half the number of elements as TYPE.  */
8104
8105 tree
8106 vect_halve_mask_nunits (tree type)
8107 {
8108   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8109   return build_truth_vector_type (nunits, current_vector_size);
8110 }
8111
8112 /* Return a mask type with twice as many elements as TYPE.  */
8113
8114 tree
8115 vect_double_mask_nunits (tree type)
8116 {
8117   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8118   return build_truth_vector_type (nunits, current_vector_size);
8119 }
8120
8121 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8122    contain a sequence of NVECTORS masks that each control a vector of type
8123    VECTYPE.  */
8124
8125 void
8126 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8127                        unsigned int nvectors, tree vectype)
8128 {
8129   gcc_assert (nvectors != 0);
8130   if (masks->length () < nvectors)
8131     masks->safe_grow_cleared (nvectors);
8132   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8133   /* The number of scalars per iteration and the number of vectors are
8134      both compile-time constants.  */
8135   unsigned int nscalars_per_iter
8136     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8137                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8138   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8139     {
8140       rgm->max_nscalars_per_iter = nscalars_per_iter;
8141       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8142     }
8143 }
8144
8145 /* Given a complete set of masks MASKS, extract mask number INDEX
8146    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8147    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8148
8149    See the comment above vec_loop_masks for more details about the mask
8150    arrangement.  */
8151
8152 tree
8153 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8154                     unsigned int nvectors, tree vectype, unsigned int index)
8155 {
8156   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8157   tree mask_type = rgm->mask_type;
8158
8159   /* Populate the rgroup's mask array, if this is the first time we've
8160      used it.  */
8161   if (rgm->masks.is_empty ())
8162     {
8163       rgm->masks.safe_grow_cleared (nvectors);
8164       for (unsigned int i = 0; i < nvectors; ++i)
8165         {
8166           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8167           /* Provide a dummy definition until the real one is available.  */
8168           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8169           rgm->masks[i] = mask;
8170         }
8171     }
8172
8173   tree mask = rgm->masks[index];
8174   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8175                 TYPE_VECTOR_SUBPARTS (vectype)))
8176     {
8177       /* A loop mask for data type X can be reused for data type Y
8178          if X has N times more elements than Y and if Y's elements
8179          are N times bigger than X's.  In this case each sequence
8180          of N elements in the loop mask will be all-zero or all-one.
8181          We can then view-convert the mask so that each sequence of
8182          N elements is replaced by a single element.  */
8183       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8184                               TYPE_VECTOR_SUBPARTS (vectype)));
8185       gimple_seq seq = NULL;
8186       mask_type = build_same_sized_truth_vector_type (vectype);
8187       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8188       if (seq)
8189         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8190     }
8191   return mask;
8192 }
8193
8194 /* Scale profiling counters by estimation for LOOP which is vectorized
8195    by factor VF.  */
8196
8197 static void
8198 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8199 {
8200   edge preheader = loop_preheader_edge (loop);
8201   /* Reduce loop iterations by the vectorization factor.  */
8202   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8203   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8204
8205   if (freq_h.nonzero_p ())
8206     {
8207       profile_probability p;
8208
8209       /* Avoid dropping loop body profile counter to 0 because of zero count
8210          in loop's preheader.  */
8211       if (!(freq_e == profile_count::zero ()))
8212         freq_e = freq_e.force_nonzero ();
8213       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8214       scale_loop_frequencies (loop, p);
8215     }
8216
8217   edge exit_e = single_exit (loop);
8218   exit_e->probability = profile_probability::always ()
8219                                  .apply_scale (1, new_est_niter + 1);
8220
8221   edge exit_l = single_pred_edge (loop->latch);
8222   profile_probability prob = exit_l->probability;
8223   exit_l->probability = exit_e->probability.invert ();
8224   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8225     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8226 }
8227
8228 /* Vectorize STMT if relevant, inserting any new instructions before GSI.
8229    When vectorizing STMT as a store, set *SEEN_STORE to its stmt_vec_info.
8230    *SLP_SCHEDULE is a running record of whether we have called
8231    vect_schedule_slp.  */
8232
8233 static void
8234 vect_transform_loop_stmt (loop_vec_info loop_vinfo, gimple *stmt,
8235                           gimple_stmt_iterator *gsi,
8236                           stmt_vec_info *seen_store, bool *slp_scheduled)
8237 {
8238   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8239   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8240   stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
8241   if (!stmt_info)
8242     return;
8243
8244   if (dump_enabled_p ())
8245     {
8246       dump_printf_loc (MSG_NOTE, vect_location,
8247                        "------>vectorizing statement: ");
8248       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8249     }
8250
8251   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8252     vect_loop_kill_debug_uses (loop, stmt);
8253
8254   if (!STMT_VINFO_RELEVANT_P (stmt_info)
8255       && !STMT_VINFO_LIVE_P (stmt_info))
8256     return;
8257
8258   if (STMT_VINFO_VECTYPE (stmt_info))
8259     {
8260       poly_uint64 nunits
8261         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8262       if (!STMT_SLP_TYPE (stmt_info)
8263           && maybe_ne (nunits, vf)
8264           && dump_enabled_p ())
8265         /* For SLP VF is set according to unrolling factor, and not
8266            to vector size, hence for SLP this print is not valid.  */
8267         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8268     }
8269
8270   /* SLP.  Schedule all the SLP instances when the first SLP stmt is
8271      reached.  */
8272   if (slp_vect_type slptype = STMT_SLP_TYPE (stmt_info))
8273     {
8274
8275       if (!*slp_scheduled)
8276         {
8277           *slp_scheduled = true;
8278
8279           DUMP_VECT_SCOPE ("scheduling SLP instances");
8280
8281           vect_schedule_slp (loop_vinfo);
8282         }
8283
8284       /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
8285       if (slptype == pure_slp)
8286         return;
8287     }
8288
8289   if (dump_enabled_p ())
8290     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8291
8292   bool grouped_store = false;
8293   if (vect_transform_stmt (stmt, gsi, &grouped_store, NULL, NULL))
8294     *seen_store = stmt_info;
8295 }
8296
8297 /* Function vect_transform_loop.
8298
8299    The analysis phase has determined that the loop is vectorizable.
8300    Vectorize the loop - created vectorized stmts to replace the scalar
8301    stmts in the loop, and update the loop exit condition.
8302    Returns scalar epilogue loop if any.  */
8303
8304 struct loop *
8305 vect_transform_loop (loop_vec_info loop_vinfo)
8306 {
8307   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8308   struct loop *epilogue = NULL;
8309   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8310   int nbbs = loop->num_nodes;
8311   int i;
8312   tree niters_vector = NULL_TREE;
8313   tree step_vector = NULL_TREE;
8314   tree niters_vector_mult_vf = NULL_TREE;
8315   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8316   unsigned int lowest_vf = constant_lower_bound (vf);
8317   bool slp_scheduled = false;
8318   gimple *stmt;
8319   bool check_profitability = false;
8320   unsigned int th;
8321
8322   DUMP_VECT_SCOPE ("vec_transform_loop");
8323
8324   loop_vinfo->shared->check_datarefs ();
8325
8326   /* Use the more conservative vectorization threshold.  If the number
8327      of iterations is constant assume the cost check has been performed
8328      by our caller.  If the threshold makes all loops profitable that
8329      run at least the (estimated) vectorization factor number of times
8330      checking is pointless, too.  */
8331   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8332   if (th >= vect_vf_for_cost (loop_vinfo)
8333       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8334     {
8335       if (dump_enabled_p ())
8336         dump_printf_loc (MSG_NOTE, vect_location,
8337                          "Profitability threshold is %d loop iterations.\n",
8338                          th);
8339       check_profitability = true;
8340     }
8341
8342   /* Make sure there exists a single-predecessor exit bb.  Do this before
8343      versioning.   */
8344   edge e = single_exit (loop);
8345   if (! single_pred_p (e->dest))
8346     {
8347       split_loop_exit_edge (e);
8348       if (dump_enabled_p ())
8349         dump_printf (MSG_NOTE, "split exit edge\n");
8350     }
8351
8352   /* Version the loop first, if required, so the profitability check
8353      comes first.  */
8354
8355   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8356     {
8357       poly_uint64 versioning_threshold
8358         = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8359       if (check_profitability
8360           && ordered_p (poly_uint64 (th), versioning_threshold))
8361         {
8362           versioning_threshold = ordered_max (poly_uint64 (th),
8363                                               versioning_threshold);
8364           check_profitability = false;
8365         }
8366       vect_loop_versioning (loop_vinfo, th, check_profitability,
8367                             versioning_threshold);
8368       check_profitability = false;
8369     }
8370
8371   /* Make sure there exists a single-predecessor exit bb also on the
8372      scalar loop copy.  Do this after versioning but before peeling
8373      so CFG structure is fine for both scalar and if-converted loop
8374      to make slpeel_duplicate_current_defs_from_edges face matched
8375      loop closed PHI nodes on the exit.  */
8376   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8377     {
8378       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8379       if (! single_pred_p (e->dest))
8380         {
8381           split_loop_exit_edge (e);
8382           if (dump_enabled_p ())
8383             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8384         }
8385     }
8386
8387   tree niters = vect_build_loop_niters (loop_vinfo);
8388   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8389   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8390   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8391   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8392                               &step_vector, &niters_vector_mult_vf, th,
8393                               check_profitability, niters_no_overflow);
8394
8395   if (niters_vector == NULL_TREE)
8396     {
8397       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8398           && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8399           && known_eq (lowest_vf, vf))
8400         {
8401           niters_vector
8402             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8403                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8404           step_vector = build_one_cst (TREE_TYPE (niters));
8405         }
8406       else
8407         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8408                                      &step_vector, niters_no_overflow);
8409     }
8410
8411   /* 1) Make sure the loop header has exactly two entries
8412      2) Make sure we have a preheader basic block.  */
8413
8414   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8415
8416   split_edge (loop_preheader_edge (loop));
8417
8418   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8419       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8420     /* This will deal with any possible peeling.  */
8421     vect_prepare_for_masked_peels (loop_vinfo);
8422
8423   /* FORNOW: the vectorizer supports only loops which body consist
8424      of one basic block (header + empty latch). When the vectorizer will
8425      support more involved loop forms, the order by which the BBs are
8426      traversed need to be reconsidered.  */
8427
8428   for (i = 0; i < nbbs; i++)
8429     {
8430       basic_block bb = bbs[i];
8431       stmt_vec_info stmt_info;
8432
8433       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8434            gsi_next (&si))
8435         {
8436           gphi *phi = si.phi ();
8437           if (dump_enabled_p ())
8438             {
8439               dump_printf_loc (MSG_NOTE, vect_location,
8440                                "------>vectorizing phi: ");
8441               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8442             }
8443           stmt_info = loop_vinfo->lookup_stmt (phi);
8444           if (!stmt_info)
8445             continue;
8446
8447           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8448             vect_loop_kill_debug_uses (loop, phi);
8449
8450           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8451               && !STMT_VINFO_LIVE_P (stmt_info))
8452             continue;
8453
8454           if (STMT_VINFO_VECTYPE (stmt_info)
8455               && (maybe_ne
8456                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8457               && dump_enabled_p ())
8458             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8459
8460           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8461                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8462                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8463               && ! PURE_SLP_STMT (stmt_info))
8464             {
8465               if (dump_enabled_p ())
8466                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8467               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8468             }
8469         }
8470
8471       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8472            !gsi_end_p (si);)
8473         {
8474           stmt = gsi_stmt (si);
8475           /* During vectorization remove existing clobber stmts.  */
8476           if (gimple_clobber_p (stmt))
8477             {
8478               unlink_stmt_vdef (stmt);
8479               gsi_remove (&si, true);
8480               release_defs (stmt);
8481             }
8482           else
8483             {
8484               stmt_info = loop_vinfo->lookup_stmt (stmt);
8485
8486               /* vector stmts created in the outer-loop during vectorization of
8487                  stmts in an inner-loop may not have a stmt_info, and do not
8488                  need to be vectorized.  */
8489               stmt_vec_info seen_store = NULL;
8490               if (stmt_info)
8491                 {
8492                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8493                     {
8494                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8495                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8496                            !gsi_end_p (subsi); gsi_next (&subsi))
8497                         vect_transform_loop_stmt (loop_vinfo,
8498                                                   gsi_stmt (subsi), &si,
8499                                                   &seen_store,
8500                                                   &slp_scheduled);
8501                       gimple *pat_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8502                       vect_transform_loop_stmt (loop_vinfo, pat_stmt, &si,
8503                                                 &seen_store, &slp_scheduled);
8504                     }
8505                   vect_transform_loop_stmt (loop_vinfo, stmt, &si,
8506                                             &seen_store, &slp_scheduled);
8507                 }
8508               if (seen_store)
8509                 {
8510                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8511                     {
8512                       /* Interleaving.  If IS_STORE is TRUE, the
8513                          vectorization of the interleaving chain was
8514                          completed - free all the stores in the chain.  */
8515                       gsi_next (&si);
8516                       vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8517                     }
8518                   else
8519                     {
8520                       /* Free the attached stmt_vec_info and remove the
8521                          stmt.  */
8522                       free_stmt_vec_info (stmt);
8523                       unlink_stmt_vdef (stmt);
8524                       gsi_remove (&si, true);
8525                       release_defs (stmt);
8526                     }
8527                 }
8528               else
8529                 gsi_next (&si);
8530             }
8531         }
8532
8533       /* Stub out scalar statements that must not survive vectorization.
8534          Doing this here helps with grouped statements, or statements that
8535          are involved in patterns.  */
8536       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8537            !gsi_end_p (gsi); gsi_next (&gsi))
8538         {
8539           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8540           if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8541             {
8542               tree lhs = gimple_get_lhs (call);
8543               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8544                 {
8545                   tree zero = build_zero_cst (TREE_TYPE (lhs));
8546                   gimple *new_stmt = gimple_build_assign (lhs, zero);
8547                   gsi_replace (&gsi, new_stmt, true);
8548                 }
8549             }
8550         }
8551     }                           /* BBs in loop */
8552
8553   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
8554      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8555   if (integer_onep (step_vector))
8556     niters_no_overflow = true;
8557   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8558                            niters_vector_mult_vf, !niters_no_overflow);
8559
8560   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8561   scale_profile_for_vect_loop (loop, assumed_vf);
8562
8563   /* True if the final iteration might not handle a full vector's
8564      worth of scalar iterations.  */
8565   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8566   /* The minimum number of iterations performed by the epilogue.  This
8567      is 1 when peeling for gaps because we always need a final scalar
8568      iteration.  */
8569   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8570   /* +1 to convert latch counts to loop iteration counts,
8571      -min_epilogue_iters to remove iterations that cannot be performed
8572        by the vector code.  */
8573   int bias_for_lowest = 1 - min_epilogue_iters;
8574   int bias_for_assumed = bias_for_lowest;
8575   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8576   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8577     {
8578       /* When the amount of peeling is known at compile time, the first
8579          iteration will have exactly alignment_npeels active elements.
8580          In the worst case it will have at least one.  */
8581       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8582       bias_for_lowest += lowest_vf - min_first_active;
8583       bias_for_assumed += assumed_vf - min_first_active;
8584     }
8585   /* In these calculations the "- 1" converts loop iteration counts
8586      back to latch counts.  */
8587   if (loop->any_upper_bound)
8588     loop->nb_iterations_upper_bound
8589       = (final_iter_may_be_partial
8590          ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8591                           lowest_vf) - 1
8592          : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8593                            lowest_vf) - 1);
8594   if (loop->any_likely_upper_bound)
8595     loop->nb_iterations_likely_upper_bound
8596       = (final_iter_may_be_partial
8597          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8598                           + bias_for_lowest, lowest_vf) - 1
8599          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8600                            + bias_for_lowest, lowest_vf) - 1);
8601   if (loop->any_estimate)
8602     loop->nb_iterations_estimate
8603       = (final_iter_may_be_partial
8604          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8605                           assumed_vf) - 1
8606          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8607                            assumed_vf) - 1);
8608
8609   if (dump_enabled_p ())
8610     {
8611       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8612         {
8613           dump_printf_loc (MSG_NOTE, vect_location,
8614                            "LOOP VECTORIZED\n");
8615           if (loop->inner)
8616             dump_printf_loc (MSG_NOTE, vect_location,
8617                              "OUTER LOOP VECTORIZED\n");
8618           dump_printf (MSG_NOTE, "\n");
8619         }
8620       else
8621         {
8622           dump_printf_loc (MSG_NOTE, vect_location,
8623                            "LOOP EPILOGUE VECTORIZED (VS=");
8624           dump_dec (MSG_NOTE, current_vector_size);
8625           dump_printf (MSG_NOTE, ")\n");
8626         }
8627     }
8628
8629   /* Free SLP instances here because otherwise stmt reference counting
8630      won't work.  */
8631   slp_instance instance;
8632   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8633     vect_free_slp_instance (instance, true);
8634   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8635   /* Clear-up safelen field since its value is invalid after vectorization
8636      since vectorized loop can have loop-carried dependencies.  */
8637   loop->safelen = 0;
8638
8639   /* Don't vectorize epilogue for epilogue.  */
8640   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8641     epilogue = NULL;
8642
8643   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8644     epilogue = NULL;
8645
8646   if (epilogue)
8647     {
8648       auto_vector_sizes vector_sizes;
8649       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8650       unsigned int next_size = 0;
8651
8652       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8653           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8654           && known_eq (vf, lowest_vf))
8655         {
8656           unsigned int eiters
8657             = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8658                - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8659           eiters = eiters % lowest_vf;
8660           epilogue->nb_iterations_upper_bound = eiters - 1;
8661
8662           unsigned int ratio;
8663           while (next_size < vector_sizes.length ()
8664                  && !(constant_multiple_p (current_vector_size,
8665                                            vector_sizes[next_size], &ratio)
8666                       && eiters >= lowest_vf / ratio))
8667             next_size += 1;
8668         }
8669       else
8670         while (next_size < vector_sizes.length ()
8671                && maybe_lt (current_vector_size, vector_sizes[next_size]))
8672           next_size += 1;
8673
8674       if (next_size == vector_sizes.length ())
8675         epilogue = NULL;
8676     }
8677
8678   if (epilogue)
8679     {
8680       epilogue->force_vectorize = loop->force_vectorize;
8681       epilogue->safelen = loop->safelen;
8682       epilogue->dont_vectorize = false;
8683
8684       /* We may need to if-convert epilogue to vectorize it.  */
8685       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8686         tree_if_conversion (epilogue);
8687     }
8688
8689   return epilogue;
8690 }
8691
8692 /* The code below is trying to perform simple optimization - revert
8693    if-conversion for masked stores, i.e. if the mask of a store is zero
8694    do not perform it and all stored value producers also if possible.
8695    For example,
8696      for (i=0; i<n; i++)
8697        if (c[i])
8698         {
8699           p1[i] += 1;
8700           p2[i] = p3[i] +2;
8701         }
8702    this transformation will produce the following semi-hammock:
8703
8704    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8705      {
8706        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8707        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8708        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8709        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8710        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8711        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8712      }
8713 */
8714
8715 void
8716 optimize_mask_stores (struct loop *loop)
8717 {
8718   basic_block *bbs = get_loop_body (loop);
8719   unsigned nbbs = loop->num_nodes;
8720   unsigned i;
8721   basic_block bb;
8722   struct loop *bb_loop;
8723   gimple_stmt_iterator gsi;
8724   gimple *stmt;
8725   auto_vec<gimple *> worklist;
8726
8727   vect_location = find_loop_location (loop);
8728   /* Pick up all masked stores in loop if any.  */
8729   for (i = 0; i < nbbs; i++)
8730     {
8731       bb = bbs[i];
8732       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8733            gsi_next (&gsi))
8734         {
8735           stmt = gsi_stmt (gsi);
8736           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8737             worklist.safe_push (stmt);
8738         }
8739     }
8740
8741   free (bbs);
8742   if (worklist.is_empty ())
8743     return;
8744
8745   /* Loop has masked stores.  */
8746   while (!worklist.is_empty ())
8747     {
8748       gimple *last, *last_store;
8749       edge e, efalse;
8750       tree mask;
8751       basic_block store_bb, join_bb;
8752       gimple_stmt_iterator gsi_to;
8753       tree vdef, new_vdef;
8754       gphi *phi;
8755       tree vectype;
8756       tree zero;
8757
8758       last = worklist.pop ();
8759       mask = gimple_call_arg (last, 2);
8760       bb = gimple_bb (last);
8761       /* Create then_bb and if-then structure in CFG, then_bb belongs to
8762          the same loop as if_bb.  It could be different to LOOP when two
8763          level loop-nest is vectorized and mask_store belongs to the inner
8764          one.  */
8765       e = split_block (bb, last);
8766       bb_loop = bb->loop_father;
8767       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8768       join_bb = e->dest;
8769       store_bb = create_empty_bb (bb);
8770       add_bb_to_loop (store_bb, bb_loop);
8771       e->flags = EDGE_TRUE_VALUE;
8772       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8773       /* Put STORE_BB to likely part.  */
8774       efalse->probability = profile_probability::unlikely ();
8775       store_bb->count = efalse->count ();
8776       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8777       if (dom_info_available_p (CDI_DOMINATORS))
8778         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8779       if (dump_enabled_p ())
8780         dump_printf_loc (MSG_NOTE, vect_location,
8781                          "Create new block %d to sink mask stores.",
8782                          store_bb->index);
8783       /* Create vector comparison with boolean result.  */
8784       vectype = TREE_TYPE (mask);
8785       zero = build_zero_cst (vectype);
8786       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8787       gsi = gsi_last_bb (bb);
8788       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8789       /* Create new PHI node for vdef of the last masked store:
8790          .MEM_2 = VDEF <.MEM_1>
8791          will be converted to
8792          .MEM.3 = VDEF <.MEM_1>
8793          and new PHI node will be created in join bb
8794          .MEM_2 = PHI <.MEM_1, .MEM_3>
8795       */
8796       vdef = gimple_vdef (last);
8797       new_vdef = make_ssa_name (gimple_vop (cfun), last);
8798       gimple_set_vdef (last, new_vdef);
8799       phi = create_phi_node (vdef, join_bb);
8800       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8801
8802       /* Put all masked stores with the same mask to STORE_BB if possible.  */
8803       while (true)
8804         {
8805           gimple_stmt_iterator gsi_from;
8806           gimple *stmt1 = NULL;
8807
8808           /* Move masked store to STORE_BB.  */
8809           last_store = last;
8810           gsi = gsi_for_stmt (last);
8811           gsi_from = gsi;
8812           /* Shift GSI to the previous stmt for further traversal.  */
8813           gsi_prev (&gsi);
8814           gsi_to = gsi_start_bb (store_bb);
8815           gsi_move_before (&gsi_from, &gsi_to);
8816           /* Setup GSI_TO to the non-empty block start.  */
8817           gsi_to = gsi_start_bb (store_bb);
8818           if (dump_enabled_p ())
8819             {
8820               dump_printf_loc (MSG_NOTE, vect_location,
8821                                "Move stmt to created bb\n");
8822               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8823             }
8824           /* Move all stored value producers if possible.  */
8825           while (!gsi_end_p (gsi))
8826             {
8827               tree lhs;
8828               imm_use_iterator imm_iter;
8829               use_operand_p use_p;
8830               bool res;
8831
8832               /* Skip debug statements.  */
8833               if (is_gimple_debug (gsi_stmt (gsi)))
8834                 {
8835                   gsi_prev (&gsi);
8836                   continue;
8837                 }
8838               stmt1 = gsi_stmt (gsi);
8839               /* Do not consider statements writing to memory or having
8840                  volatile operand.  */
8841               if (gimple_vdef (stmt1)
8842                   || gimple_has_volatile_ops (stmt1))
8843                 break;
8844               gsi_from = gsi;
8845               gsi_prev (&gsi);
8846               lhs = gimple_get_lhs (stmt1);
8847               if (!lhs)
8848                 break;
8849
8850               /* LHS of vectorized stmt must be SSA_NAME.  */
8851               if (TREE_CODE (lhs) != SSA_NAME)
8852                 break;
8853
8854               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8855                 {
8856                   /* Remove dead scalar statement.  */
8857                   if (has_zero_uses (lhs))
8858                     {
8859                       gsi_remove (&gsi_from, true);
8860                       continue;
8861                     }
8862                 }
8863
8864               /* Check that LHS does not have uses outside of STORE_BB.  */
8865               res = true;
8866               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8867                 {
8868                   gimple *use_stmt;
8869                   use_stmt = USE_STMT (use_p);
8870                   if (is_gimple_debug (use_stmt))
8871                     continue;
8872                   if (gimple_bb (use_stmt) != store_bb)
8873                     {
8874                       res = false;
8875                       break;
8876                     }
8877                 }
8878               if (!res)
8879                 break;
8880
8881               if (gimple_vuse (stmt1)
8882                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
8883                 break;
8884
8885               /* Can move STMT1 to STORE_BB.  */
8886               if (dump_enabled_p ())
8887                 {
8888                   dump_printf_loc (MSG_NOTE, vect_location,
8889                                    "Move stmt to created bb\n");
8890                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8891                 }
8892               gsi_move_before (&gsi_from, &gsi_to);
8893               /* Shift GSI_TO for further insertion.  */
8894               gsi_prev (&gsi_to);
8895             }
8896           /* Put other masked stores with the same mask to STORE_BB.  */
8897           if (worklist.is_empty ()
8898               || gimple_call_arg (worklist.last (), 2) != mask
8899               || worklist.last () != stmt1)
8900             break;
8901           last = worklist.pop ();
8902         }
8903       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8904     }
8905 }