1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it were manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
155
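/* For reference, the hand-vectorized form shown in the comment above is
   itself valid GNU C.  A minimal, self-contained sketch (illustrative only;
   it assumes N is a multiple of 8, that the arrays are suitably aligned,
   and it uses the vector_size attribute rather than the mode attribute):

     #define N 1024
     typedef short v8hi __attribute__ ((vector_size (16)));   // 8 x short
     short a[N], b[N], c[N];

     void
     add_arrays (void)
     {
       v8hi *pa = (v8hi *) a, *pb = (v8hi *) b, *pc = (v8hi *) c;
       for (int i = 0; i < N / 8; i++)
         pa[i] = pb[i] + pc[i];                // one vector add per iteration
     }

   The vectorizer performs the equivalent transformation on GIMPLE and, in
   addition, generates prologue/epilogue code when N is not known to be a
   multiple of the vectorization factor.  */
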
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
183
184 if (stmt_vectype)
185 {
186 if (STMT_VINFO_VECTYPE (stmt_info))
187 /* The only case when a vectype had already been set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return true;
203 }
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
211
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info > *mask_producers)
215 {
216 vec_info *vinfo = stmt_info->vinfo;
217 if (dump_enabled_p ())
218 {
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
220 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
221 }
222 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
223 return false;
224
225 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
226 && STMT_VINFO_RELATED_STMT (stmt_info))
227 {
228 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
229 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
230
231 /* If a pattern statement has def stmts, analyze them too. */
232 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
233 !gsi_end_p (si); gsi_next (&si))
234 {
235 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
236 if (dump_enabled_p ())
237 {
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: ");
240 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
241 def_stmt_info->stmt, 0);
242 }
243 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
244 vf, mask_producers))
245 return false;
246 }
247
248 if (dump_enabled_p ())
249 {
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: ");
252 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
253 }
254 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
255 return false;
256 }
257
258 return true;
259 }
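
/* Example of a statement whose vector type is deferred (a sketch, not taken
   from any particular testcase): in

     _1 = a[i] < b[i];
     c[i] = _1 ? x[i] : y[i];

   the comparison defines a boolean result whose vector (mask) type depends
   on the vector types of the statements that use it, so such statements are
   pushed onto MASK_PRODUCERS above and their mask type is only resolved once
   the vectorization factor is known (see
   vect_determine_vectorization_factor below).  */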
260
261 /* Function vect_determine_vectorization_factor
262
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4-byte elements,
266 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
267 elements can fit in a single vector register.
268
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
273
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
278 }
279
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
283 }
284 */
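
/* As a worked example (illustrative numbers): with 4-byte ints and 16-byte
   vectors the VF is 4, and if N is not known to be a multiple of VF the
   strip-mined loop is followed by a scalar epilogue, roughly:

     for (i = 0; i + 4 <= N; i += 4)      <-- vector iterations
       a[i:4] = b[i:4] + c[i:4];
     for (; i < N; i++)                   <-- scalar epilogue for the remainder
       a[i] = b[i] + c[i];

   (Scalar-level sketch only; the actual transformation is done on GIMPLE,
   and fully-masked loops can avoid the epilogue altogether.)  */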
285
286 static bool
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 {
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
299
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301
302 for (i = 0; i < nbbs; i++)
303 {
304 basic_block bb = bbs[i];
305
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
308 {
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 {
313 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
314 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
315 }
316
317 gcc_assert (stmt_info);
318
319 if (STMT_VINFO_RELEVANT_P (stmt_info)
320 || STMT_VINFO_LIVE_P (stmt_info))
321 {
322 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
323 scalar_type = TREE_TYPE (PHI_RESULT (phi));
324
325 if (dump_enabled_p ())
326 {
327 dump_printf_loc (MSG_NOTE, vect_location,
328 "get vectype for scalar type: ");
329 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
330 dump_printf (MSG_NOTE, "\n");
331 }
332
333 vectype = get_vectype_for_scalar_type (scalar_type);
334 if (!vectype)
335 {
336 if (dump_enabled_p ())
337 {
338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
339 "not vectorized: unsupported "
340 "data-type ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
342 scalar_type);
343 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
344 }
345 return false;
346 }
347 STMT_VINFO_VECTYPE (stmt_info) = vectype;
348
349 if (dump_enabled_p ())
350 {
351 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
353 dump_printf (MSG_NOTE, "\n");
354 }
355
356 if (dump_enabled_p ())
357 {
358 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
359 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
360 dump_printf (MSG_NOTE, "\n");
361 }
362
363 vect_update_max_nunits (&vectorization_factor, vectype);
364 }
365 }
366
367 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
368 gsi_next (&si))
369 {
370 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
371 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
372 &mask_producers))
373 return false;
374 }
375 }
376
377 /* TODO: Analyze cost. Decide if worth while to vectorize. */
378 if (dump_enabled_p ())
379 {
380 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
381 dump_dec (MSG_NOTE, vectorization_factor);
382 dump_printf (MSG_NOTE, "\n");
383 }
384
385 if (known_le (vectorization_factor, 1U))
386 {
387 if (dump_enabled_p ())
388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
389 "not vectorized: unsupported data-type\n");
390 return false;
391 }
392 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
393
394 for (i = 0; i < mask_producers.length (); i++)
395 {
396 stmt_info = mask_producers[i];
397 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
398 if (!mask_type)
399 return false;
400 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
401 }
402
403 return true;
404 }
405
406
407 /* Function vect_is_simple_iv_evolution.
408
409 FORNOW: A simple evolution of an induction variable in the loop is
410 considered a polynomial evolution. */
411
412 static bool
413 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
414 tree * step)
415 {
416 tree init_expr;
417 tree step_expr;
418 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
419 basic_block bb;
420
421 /* When there is no evolution in this loop, the evolution function
422 is not "simple". */
423 if (evolution_part == NULL_TREE)
424 return false;
425
426 /* When the evolution is a polynomial of degree >= 2
427 the evolution function is not "simple". */
428 if (tree_is_chrec (evolution_part))
429 return false;
430
431 step_expr = evolution_part;
432 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
433
434 if (dump_enabled_p ())
435 {
436 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
437 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
438 dump_printf (MSG_NOTE, ", init: ");
439 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
440 dump_printf (MSG_NOTE, "\n");
441 }
442
443 *init = init_expr;
444 *step = step_expr;
445
446 if (TREE_CODE (step_expr) != INTEGER_CST
447 && (TREE_CODE (step_expr) != SSA_NAME
448 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
449 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
450 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
451 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
452 || !flag_associative_math)))
453 && (TREE_CODE (step_expr) != REAL_CST
454 || !flag_associative_math))
455 {
456 if (dump_enabled_p ())
457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
458 "step unknown.\n");
459 return false;
460 }
461
462 return true;
463 }
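
/* Some examples of what the function above classifies (a sketch; access
   functions are shown in the scev chrec notation {init, +, step}):

     for (i = 0; i < n; i++)        i has evolution {0, +, 1}     simple
     for (p = a; p != q; p += s)    p has evolution {a, +, s}     simple,
                                    provided s is defined outside the loop
     for (x = 1; x < n; x *= 2)     x has no affine evolution     rejected

   Steps that are REAL_CSTs are only accepted under -fassociative-math.  */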
464
465 /* Function vect_analyze_scalar_cycles_1.
466
467 Examine the cross iteration def-use cycles of scalar variables
468 in LOOP. LOOP_VINFO represents the loop that is now being
469 considered for vectorization (can be LOOP, or an outer-loop
470 enclosing LOOP). */
471
472 static void
473 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
474 {
475 basic_block bb = loop->header;
476 tree init, step;
477 auto_vec<gimple *, 64> worklist;
478 gphi_iterator gsi;
479 bool double_reduc;
480
481 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
482
483 /* First - identify all inductions. Reduction detection assumes that all the
484 inductions have been identified; therefore, this order must not be
485 changed. */
486 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
487 {
488 gphi *phi = gsi.phi ();
489 tree access_fn = NULL;
490 tree def = PHI_RESULT (phi);
491 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
492
493 if (dump_enabled_p ())
494 {
495 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
496 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
497 }
498
499 /* Skip virtual phi's. The data dependences that are associated with
500 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
501 if (virtual_operand_p (def))
502 continue;
503
504 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
505
506 /* Analyze the evolution function. */
507 access_fn = analyze_scalar_evolution (loop, def);
508 if (access_fn)
509 {
510 STRIP_NOPS (access_fn);
511 if (dump_enabled_p ())
512 {
513 dump_printf_loc (MSG_NOTE, vect_location,
514 "Access function of PHI: ");
515 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
516 dump_printf (MSG_NOTE, "\n");
517 }
518 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
519 = initial_condition_in_loop_num (access_fn, loop->num);
520 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
521 = evolution_part_in_loop_num (access_fn, loop->num);
522 }
523
524 if (!access_fn
525 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
526 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
527 && TREE_CODE (step) != INTEGER_CST))
528 {
529 worklist.safe_push (phi);
530 continue;
531 }
532
533 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
534 != NULL_TREE);
535 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
536
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
539 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
540 }
541
542
543 /* Second - identify all reductions and nested cycles. */
544 while (worklist.length () > 0)
545 {
546 gimple *phi = worklist.pop ();
547 tree def = PHI_RESULT (phi);
548 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
549 gimple *reduc_stmt;
550
551 if (dump_enabled_p ())
552 {
553 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
554 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
555 }
556
557 gcc_assert (!virtual_operand_p (def)
558 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
559
560 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
561 &double_reduc, false);
562 if (reduc_stmt)
563 {
564 if (double_reduc)
565 {
566 if (dump_enabled_p ())
567 dump_printf_loc (MSG_NOTE, vect_location,
568 "Detected double reduction.\n");
569
570 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
571 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
572 vect_double_reduction_def;
573 }
574 else
575 {
576 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
577 {
578 if (dump_enabled_p ())
579 dump_printf_loc (MSG_NOTE, vect_location,
580 "Detected vectorizable nested cycle.\n");
581
582 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
583 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
584 vect_nested_cycle;
585 }
586 else
587 {
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_NOTE, vect_location,
590 "Detected reduction.\n");
591
592 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
593 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
594 vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
600 }
601 }
602 }
603 else
604 if (dump_enabled_p ())
605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
606 "Unknown def-use cycle pattern.\n");
607 }
608 }
609
610
611 /* Function vect_analyze_scalar_cycles.
612
613 Examine the cross iteration def-use cycles of scalar variables, by
614 analyzing the loop-header PHIs of scalar variables. Classify each
615 cycle as one of the following: invariant, induction, reduction, unknown.
616 We do that for the loop represented by LOOP_VINFO, and also for its
617 inner-loop, if one exists.
618 Examples for scalar cycles:
619
620 Example1: reduction:
621
622 loop1:
623 for (i=0; i<N; i++)
624 sum += a[i];
625
626 Example2: induction:
627
628 loop2:
629 for (i=0; i<N; i++)
630 a[i] = i; */
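
/* For the two examples above the loop-header PHIs would be classified
   roughly as follows (illustrative GIMPLE, SSA names invented):

     Example1:  sum_1 = PHI <sum_0 (preheader), sum_2 (latch)>
                  -> STMT_VINFO_DEF_TYPE = vect_reduction_def
     Example2:  i_1 = PHI <0 (preheader), i_2 (latch)>
                  -> STMT_VINFO_DEF_TYPE = vect_induction_def  */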
631
632 static void
633 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
634 {
635 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
636
637 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
638
639 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
640 Reductions in such an inner-loop therefore have different properties than
641 the reductions in the nest that gets vectorized:
642 1. When vectorized, they are executed in the same order as in the original
643 scalar loop, so we can't change the order of computation when
644 vectorizing them.
645 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
646 current checks are too strict. */
647
648 if (loop->inner)
649 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
650 }
651
652 /* Transfer group and reduction information from STMT to its pattern stmt. */
653
654 static void
655 vect_fixup_reduc_chain (gimple *stmt)
656 {
657 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
658 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
659 stmt_vec_info stmtp;
660 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
661 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
662 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
663 do
664 {
665 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
666 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
667 stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
668 if (stmt)
669 REDUC_GROUP_NEXT_ELEMENT (stmtp)
670 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
671 }
672 while (stmt);
673 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
674 }
675
676 /* Fixup scalar cycles that now have their stmts detected as patterns. */
677
678 static void
679 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
680 {
681 gimple *first;
682 unsigned i;
683
684 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
685 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
686 {
687 gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
688 while (next)
689 {
690 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
691 break;
692 next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
693 }
694 /* If not all stmts in the chain are patterns, try to handle
695 the chain without patterns. */
696 if (! next)
697 {
698 vect_fixup_reduc_chain (first);
699 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
700 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
701 }
702 }
703 }
704
705 /* Function vect_get_loop_niters.
706
707 Determine how many times the loop is executed and place that count
708 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
709 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
710 niter information holds in ASSUMPTIONS.
711
712 Return the loop exit condition. */
713
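/* A worked example (sketch): for a loop whose body executes n times, with n
   known to be positive, the latch executes n - 1 times, so
   NUMBER_OF_ITERATIONSM1 is n - 1 and NUMBER_OF_ITERATIONS is n.  See the
   comment near the end of this function about the "+ 1" overflowing for
   do { ... } while (n != 0) style loops.  */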
714
715 static gcond *
716 vect_get_loop_niters (struct loop *loop, tree *assumptions,
717 tree *number_of_iterations, tree *number_of_iterationsm1)
718 {
719 edge exit = single_exit (loop);
720 struct tree_niter_desc niter_desc;
721 tree niter_assumptions, niter, may_be_zero;
722 gcond *cond = get_loop_exit_condition (loop);
723
724 *assumptions = boolean_true_node;
725 *number_of_iterationsm1 = chrec_dont_know;
726 *number_of_iterations = chrec_dont_know;
727 DUMP_VECT_SCOPE ("get_loop_niters");
728
729 if (!exit)
730 return cond;
731
732 niter = chrec_dont_know;
733 may_be_zero = NULL_TREE;
734 niter_assumptions = boolean_true_node;
735 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
736 || chrec_contains_undetermined (niter_desc.niter))
737 return cond;
738
739 niter_assumptions = niter_desc.assumptions;
740 may_be_zero = niter_desc.may_be_zero;
741 niter = niter_desc.niter;
742
743 if (may_be_zero && integer_zerop (may_be_zero))
744 may_be_zero = NULL_TREE;
745
746 if (may_be_zero)
747 {
748 if (COMPARISON_CLASS_P (may_be_zero))
749 {
750 /* Try to combine may_be_zero with assumptions, this can simplify
751 computation of niter expression. */
752 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
753 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
754 niter_assumptions,
755 fold_build1 (TRUTH_NOT_EXPR,
756 boolean_type_node,
757 may_be_zero));
758 else
759 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
760 build_int_cst (TREE_TYPE (niter), 0),
761 rewrite_to_non_trapping_overflow (niter));
762
763 may_be_zero = NULL_TREE;
764 }
765 else if (integer_nonzerop (may_be_zero))
766 {
767 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
768 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
769 return cond;
770 }
771 else
772 return cond;
773 }
774
775 *assumptions = niter_assumptions;
776 *number_of_iterationsm1 = niter;
777
778 /* We want the number of loop header executions, which is the number
779 of latch executions plus one.
780 ??? For UINT_MAX latch executions this number overflows to zero
781 for loops like do { n++; } while (n != 0); */
782 if (niter && !chrec_contains_undetermined (niter))
783 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
784 build_int_cst (TREE_TYPE (niter), 1));
785 *number_of_iterations = niter;
786
787 return cond;
788 }
789
790 /* Function bb_in_loop_p
791
792 Used as predicate for dfs order traversal of the loop bbs. */
793
794 static bool
795 bb_in_loop_p (const_basic_block bb, const void *data)
796 {
797 const struct loop *const loop = (const struct loop *)data;
798 if (flow_bb_inside_loop_p (loop, bb))
799 return true;
800 return false;
801 }
802
803
804 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
805 stmt_vec_info structs for all the stmts in LOOP_IN. */
806
807 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
808 : vec_info (vec_info::loop, init_cost (loop_in), shared),
809 loop (loop_in),
810 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
811 num_itersm1 (NULL_TREE),
812 num_iters (NULL_TREE),
813 num_iters_unchanged (NULL_TREE),
814 num_iters_assumptions (NULL_TREE),
815 th (0),
816 versioning_threshold (0),
817 vectorization_factor (0),
818 max_vectorization_factor (0),
819 mask_skip_niters (NULL_TREE),
820 mask_compare_type (NULL_TREE),
821 unaligned_dr (NULL),
822 peeling_for_alignment (0),
823 ptr_mask (0),
824 ivexpr_map (NULL),
825 slp_unrolling_factor (1),
826 single_scalar_iteration_cost (0),
827 vectorizable (false),
828 can_fully_mask_p (true),
829 fully_masked_p (false),
830 peeling_for_gaps (false),
831 peeling_for_niter (false),
832 operands_swapped (false),
833 no_data_dependencies (false),
834 has_mask_store (false),
835 scalar_loop (NULL),
836 orig_loop_info (NULL)
837 {
838 /* Create/Update stmt_info for all stmts in the loop. */
839 basic_block *body = get_loop_body (loop);
840 for (unsigned int i = 0; i < loop->num_nodes; i++)
841 {
842 basic_block bb = body[i];
843 gimple_stmt_iterator si;
844
845 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
846 {
847 gimple *phi = gsi_stmt (si);
848 gimple_set_uid (phi, 0);
849 add_stmt (phi);
850 }
851
852 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
853 {
854 gimple *stmt = gsi_stmt (si);
855 gimple_set_uid (stmt, 0);
856 add_stmt (stmt);
857 }
858 }
859 free (body);
860
861 /* CHECKME: We want to visit all BBs before their successors (except for
862 latch blocks, for which this assertion wouldn't hold). In the simple
863 case of the loop forms we allow, a dfs order of the BBs would be the same
864 as reversed postorder traversal, so we are safe. */
865
866 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
867 bbs, loop->num_nodes, loop);
868 gcc_assert (nbbs == loop->num_nodes);
869 }
870
871 /* Free all levels of MASKS. */
872
873 void
874 release_vec_loop_masks (vec_loop_masks *masks)
875 {
876 rgroup_masks *rgm;
877 unsigned int i;
878 FOR_EACH_VEC_ELT (*masks, i, rgm)
879 rgm->masks.release ();
880 masks->release ();
881 }
882
883 /* Free all memory used by the _loop_vec_info, as well as all the
884 stmt_vec_info structs of all the stmts in the loop. */
885
886 _loop_vec_info::~_loop_vec_info ()
887 {
888 int nbbs;
889 gimple_stmt_iterator si;
890 int j;
891
892 /* ??? We're releasing loop_vinfos en bloc. */
893 set_stmt_vec_info_vec (&stmt_vec_infos);
894 nbbs = loop->num_nodes;
895 for (j = 0; j < nbbs; j++)
896 {
897 basic_block bb = bbs[j];
898 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
899 free_stmt_vec_info (gsi_stmt (si));
900
901 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
902 {
903 gimple *stmt = gsi_stmt (si);
904
905 /* We may have broken canonical form by moving a constant
906 into RHS1 of a commutative op. Fix such occurrences. */
907 if (operands_swapped && is_gimple_assign (stmt))
908 {
909 enum tree_code code = gimple_assign_rhs_code (stmt);
910
911 if ((code == PLUS_EXPR
912 || code == POINTER_PLUS_EXPR
913 || code == MULT_EXPR)
914 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
915 swap_ssa_operands (stmt,
916 gimple_assign_rhs1_ptr (stmt),
917 gimple_assign_rhs2_ptr (stmt));
918 else if (code == COND_EXPR
919 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
920 {
921 tree cond_expr = gimple_assign_rhs1 (stmt);
922 enum tree_code cond_code = TREE_CODE (cond_expr);
923
924 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
925 {
926 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
927 0));
928 cond_code = invert_tree_comparison (cond_code,
929 honor_nans);
930 if (cond_code != ERROR_MARK)
931 {
932 TREE_SET_CODE (cond_expr, cond_code);
933 swap_ssa_operands (stmt,
934 gimple_assign_rhs2_ptr (stmt),
935 gimple_assign_rhs3_ptr (stmt));
936 }
937 }
938 }
939 }
940
941 /* Free stmt_vec_info. */
942 free_stmt_vec_info (stmt);
943 gsi_next (&si);
944 }
945 }
946
947 free (bbs);
948
949 release_vec_loop_masks (&masks);
950 delete ivexpr_map;
951
952 loop->aux = NULL;
953 }
954
955 /* Return an invariant or register for EXPR and emit necessary
956 computations in the LOOP_VINFO loop preheader. */
957
958 tree
959 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
960 {
961 if (is_gimple_reg (expr)
962 || is_gimple_min_invariant (expr))
963 return expr;
964
965 if (! loop_vinfo->ivexpr_map)
966 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
967 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
968 if (! cached)
969 {
970 gimple_seq stmts = NULL;
971 cached = force_gimple_operand (unshare_expr (expr),
972 &stmts, true, NULL_TREE);
973 if (stmts)
974 {
975 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
976 gsi_insert_seq_on_edge_immediate (e, stmts);
977 }
978 }
979 return cached;
980 }
981
982 /* Return true if we can use CMP_TYPE as the comparison type to produce
983 all masks required to mask LOOP_VINFO. */
984
985 static bool
986 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
987 {
988 rgroup_masks *rgm;
989 unsigned int i;
990 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
991 if (rgm->mask_type != NULL_TREE
992 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
993 cmp_type, rgm->mask_type,
994 OPTIMIZE_FOR_SPEED))
995 return false;
996 return true;
997 }
998
999 /* Calculate the maximum number of scalars per iteration for every
1000 rgroup in LOOP_VINFO. */
1001
1002 static unsigned int
1003 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1004 {
1005 unsigned int res = 1;
1006 unsigned int i;
1007 rgroup_masks *rgm;
1008 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1009 res = MAX (res, rgm->max_nscalars_per_iter);
1010 return res;
1011 }
1012
1013 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1014 whether we can actually generate the masks required. Return true if so,
1015 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1016
1017 static bool
1018 vect_verify_full_masking (loop_vec_info loop_vinfo)
1019 {
1020 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1021 unsigned int min_ni_width;
1022
1023 /* Use a normal loop if there are no statements that need masking.
1024 This only happens in rare degenerate cases: it means that the loop
1025 has no loads, no stores, and no live-out values. */
1026 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1027 return false;
1028
1029 /* Get the maximum number of iterations that is representable
1030 in the counter type. */
1031 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1032 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1033
1034 /* Get a more refined estimate for the number of iterations. */
1035 widest_int max_back_edges;
1036 if (max_loop_iterations (loop, &max_back_edges))
1037 max_ni = wi::smin (max_ni, max_back_edges + 1);
1038
1039 /* Account for rgroup masks, in which each bit is replicated N times. */
1040 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1041
1042 /* Work out how many bits we need to represent the limit. */
1043 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1044
1045 /* Find a scalar mode for which WHILE_ULT is supported. */
1046 opt_scalar_int_mode cmp_mode_iter;
1047 tree cmp_type = NULL_TREE;
1048 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1049 {
1050 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1051 if (cmp_bits >= min_ni_width
1052 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1053 {
1054 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1055 if (this_type
1056 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1057 {
1058 /* Although we could stop as soon as we find a valid mode,
1059 it's often better to continue until we hit Pmode, since the
1060 operands to the WHILE are more likely to be reusable in
1061 address calculations. */
1062 cmp_type = this_type;
1063 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1064 break;
1065 }
1066 }
1067 }
1068
1069 if (!cmp_type)
1070 return false;
1071
1072 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1073 return true;
1074 }
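
/* Conceptually, a fully-masked loop executes every vector iteration under a
   loop mask instead of using a scalar epilogue.  A rough sketch (the real
   code uses internal functions such as IFN_WHILE_ULT, IFN_MASK_LOAD and
   IFN_MASK_STORE, with more arguments than shown here):

     for (i = 0; i < n; i += VF)
       {
         mask = WHILE_ULT (i, n);     lane j is active while i + j < n
         tmp  = MASK_LOAD (&b[i], mask) + MASK_LOAD (&c[i], mask);
         MASK_STORE (&a[i], mask, tmp);
       }

   vect_verify_full_masking above only checks that a comparison type wide
   enough for the iteration count exists for which WHILE_ULT is supported
   for every rgroup mask type.  */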
1075
1076 /* Calculate the cost of one scalar iteration of the loop. */
1077 static void
1078 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1079 {
1080 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1081 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1082 int nbbs = loop->num_nodes, factor;
1083 int innerloop_iters, i;
1084
1085 /* Gather costs for statements in the scalar loop. */
1086
1087 /* FORNOW. */
1088 innerloop_iters = 1;
1089 if (loop->inner)
1090 innerloop_iters = 50; /* FIXME */
1091
1092 for (i = 0; i < nbbs; i++)
1093 {
1094 gimple_stmt_iterator si;
1095 basic_block bb = bbs[i];
1096
1097 if (bb->loop_father == loop->inner)
1098 factor = innerloop_iters;
1099 else
1100 factor = 1;
1101
1102 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1103 {
1104 gimple *stmt = gsi_stmt (si);
1105 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1106
1107 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1108 continue;
1109
1110 /* Skip stmts that are not vectorized inside the loop. */
1111 if (stmt_info
1112 && !STMT_VINFO_RELEVANT_P (stmt_info)
1113 && (!STMT_VINFO_LIVE_P (stmt_info)
1114 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1115 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1116 continue;
1117
1118 vect_cost_for_stmt kind;
1119 if (STMT_VINFO_DATA_REF (stmt_info))
1120 {
1121 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1122 kind = scalar_load;
1123 else
1124 kind = scalar_store;
1125 }
1126 else
1127 kind = scalar_stmt;
1128
1129 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1130 factor, kind, stmt_info, 0, vect_prologue);
1131 }
1132 }
1133
1134 /* Now accumulate cost. */
1135 void *target_cost_data = init_cost (loop);
1136 stmt_info_for_cost *si;
1137 int j;
1138 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1139 j, si)
1140 {
1141 struct _stmt_vec_info *stmt_info
1142 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
1143 (void) add_stmt_cost (target_cost_data, si->count,
1144 si->kind, stmt_info, si->misalign,
1145 vect_body);
1146 }
1147 unsigned dummy, body_cost = 0;
1148 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1149 destroy_cost_data (target_cost_data);
1150 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1151 }
1152
1153
1154 /* Function vect_analyze_loop_form_1.
1155
1156 Verify that certain CFG restrictions hold, including:
1157 - the loop has a pre-header
1158 - the loop has a single entry and exit
1159 - the loop exit condition is simple enough
1160 - the number of iterations can be analyzed, i.e., a countable loop. The
1161 niter could be analyzed under some assumptions. */
1162
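/* For example (sketch), an inner-most loop of the required form is simply

     i = 0;
     do { a[i] = b[i] + c[i]; i++; } while (i < n);

   i.e. a two-block loop (header + latch) with a single exit, whereas a loop
   whose body still contains un-if-converted control flow, such as

     for (i = 0; i < n; i++)
       if (b[i] > 0)
         a[i] = b[i];

   has additional basic blocks and is rejected below with
   "control flow in loop".  */
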
1163 bool
1164 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1165 tree *assumptions, tree *number_of_iterationsm1,
1166 tree *number_of_iterations, gcond **inner_loop_cond)
1167 {
1168 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1169
1170 /* Different restrictions apply when we are considering an inner-most loop,
1171 vs. an outer (nested) loop.
1172 (FORNOW. May want to relax some of these restrictions in the future). */
1173
1174 if (!loop->inner)
1175 {
1176 /* Inner-most loop. We currently require that the number of BBs is
1177 exactly 2 (the header and latch). Vectorizable inner-most loops
1178 look like this:
1179
1180 (pre-header)
1181 |
1182 header <--------+
1183 | | |
1184 | +--> latch --+
1185 |
1186 (exit-bb) */
1187
1188 if (loop->num_nodes != 2)
1189 {
1190 if (dump_enabled_p ())
1191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1192 "not vectorized: control flow in loop.\n");
1193 return false;
1194 }
1195
1196 if (empty_block_p (loop->header))
1197 {
1198 if (dump_enabled_p ())
1199 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1200 "not vectorized: empty loop.\n");
1201 return false;
1202 }
1203 }
1204 else
1205 {
1206 struct loop *innerloop = loop->inner;
1207 edge entryedge;
1208
1209 /* Nested loop. We currently require that the loop is doubly-nested,
1210 contains a single inner loop, and the number of BBs is exactly 5.
1211 Vectorizable outer-loops look like this:
1212
1213 (pre-header)
1214 |
1215 header <---+
1216 | |
1217 inner-loop |
1218 | |
1219 tail ------+
1220 |
1221 (exit-bb)
1222
1223 The inner-loop has the properties expected of inner-most loops
1224 as described above. */
1225
1226 if ((loop->inner)->inner || (loop->inner)->next)
1227 {
1228 if (dump_enabled_p ())
1229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1230 "not vectorized: multiple nested loops.\n");
1231 return false;
1232 }
1233
1234 if (loop->num_nodes != 5)
1235 {
1236 if (dump_enabled_p ())
1237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1238 "not vectorized: control flow in loop.\n");
1239 return false;
1240 }
1241
1242 entryedge = loop_preheader_edge (innerloop);
1243 if (entryedge->src != loop->header
1244 || !single_exit (innerloop)
1245 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1246 {
1247 if (dump_enabled_p ())
1248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249 "not vectorized: unsupported outerloop form.\n");
1250 return false;
1251 }
1252
1253 /* Analyze the inner-loop. */
1254 tree inner_niterm1, inner_niter, inner_assumptions;
1255 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1256 &inner_assumptions, &inner_niterm1,
1257 &inner_niter, NULL)
1258 /* Don't support analyzing niter under assumptions for inner
1259 loop. */
1260 || !integer_onep (inner_assumptions))
1261 {
1262 if (dump_enabled_p ())
1263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1264 "not vectorized: Bad inner loop.\n");
1265 return false;
1266 }
1267
1268 if (!expr_invariant_in_loop_p (loop, inner_niter))
1269 {
1270 if (dump_enabled_p ())
1271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1272 "not vectorized: inner-loop count not"
1273 " invariant.\n");
1274 return false;
1275 }
1276
1277 if (dump_enabled_p ())
1278 dump_printf_loc (MSG_NOTE, vect_location,
1279 "Considering outer-loop vectorization.\n");
1280 }
1281
1282 if (!single_exit (loop)
1283 || EDGE_COUNT (loop->header->preds) != 2)
1284 {
1285 if (dump_enabled_p ())
1286 {
1287 if (!single_exit (loop))
1288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1289 "not vectorized: multiple exits.\n");
1290 else if (EDGE_COUNT (loop->header->preds) != 2)
1291 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1292 "not vectorized: too many incoming edges.\n");
1293 }
1294 return false;
1295 }
1296
1297 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1298 that the loop is represented as a do-while (with a proper if-guard
1299 before the loop if needed), where the loop header contains all the
1300 executable statements, and the latch is empty. */
1301 if (!empty_block_p (loop->latch)
1302 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1303 {
1304 if (dump_enabled_p ())
1305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1306 "not vectorized: latch block not empty.\n");
1307 return false;
1308 }
1309
1310 /* Make sure the exit is not abnormal. */
1311 edge e = single_exit (loop);
1312 if (e->flags & EDGE_ABNORMAL)
1313 {
1314 if (dump_enabled_p ())
1315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1316 "not vectorized: abnormal loop exit edge.\n");
1317 return false;
1318 }
1319
1320 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1321 number_of_iterationsm1);
1322 if (!*loop_cond)
1323 {
1324 if (dump_enabled_p ())
1325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1326 "not vectorized: complicated exit condition.\n");
1327 return false;
1328 }
1329
1330 if (integer_zerop (*assumptions)
1331 || !*number_of_iterations
1332 || chrec_contains_undetermined (*number_of_iterations))
1333 {
1334 if (dump_enabled_p ())
1335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1336 "not vectorized: number of iterations cannot be "
1337 "computed.\n");
1338 return false;
1339 }
1340
1341 if (integer_zerop (*number_of_iterations))
1342 {
1343 if (dump_enabled_p ())
1344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1345 "not vectorized: number of iterations = 0.\n");
1346 return false;
1347 }
1348
1349 return true;
1350 }
1351
1352 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1353
1354 loop_vec_info
1355 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1356 {
1357 tree assumptions, number_of_iterations, number_of_iterationsm1;
1358 gcond *loop_cond, *inner_loop_cond = NULL;
1359
1360 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1361 &assumptions, &number_of_iterationsm1,
1362 &number_of_iterations, &inner_loop_cond))
1363 return NULL;
1364
1365 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1366 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1367 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1368 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1369 if (!integer_onep (assumptions))
1370 {
1371 /* We consider to vectorize this loop by versioning it under
1372 some assumptions. In order to do this, we need to clear
1373 existing information computed by scev and niter analyzer. */
1374 scev_reset_htab ();
1375 free_numbers_of_iterations_estimates (loop);
1376 /* Also set flag for this loop so that following scev and niter
1377 analysis are done under the assumptions. */
1378 loop_constraint_set (loop, LOOP_C_FINITE);
1379 /* Also record the assumptions for versioning. */
1380 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1381 }
1382
1383 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1384 {
1385 if (dump_enabled_p ())
1386 {
1387 dump_printf_loc (MSG_NOTE, vect_location,
1388 "Symbolic number of iterations is ");
1389 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1390 dump_printf (MSG_NOTE, "\n");
1391 }
1392 }
1393
1394 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1395 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1396 if (inner_loop_cond)
1397 {
1398 stmt_vec_info inner_loop_cond_info
1399 = loop_vinfo->lookup_stmt (inner_loop_cond);
1400 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1401 }
1402
1403 gcc_assert (!loop->aux);
1404 loop->aux = loop_vinfo;
1405 return loop_vinfo;
1406 }
1407
1408
1409
1410 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1411 statements, update the vectorization factor. */
1412
1413 static void
1414 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1415 {
1416 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1417 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1418 int nbbs = loop->num_nodes;
1419 poly_uint64 vectorization_factor;
1420 int i;
1421
1422 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1423
1424 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1425 gcc_assert (known_ne (vectorization_factor, 0U));
1426
1427 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1428 the vectorization factor of the loop is the unrolling factor required by
1429 the SLP instances. If that unrolling factor is 1, we say that we
1430 perform pure SLP on the loop - cross-iteration parallelism is not
1431 exploited. */
1432 bool only_slp_in_loop = true;
1433 for (i = 0; i < nbbs; i++)
1434 {
1435 basic_block bb = bbs[i];
1436 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1437 gsi_next (&si))
1438 {
1439 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1440 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1441 && STMT_VINFO_RELATED_STMT (stmt_info))
1442 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
1443 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1444 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1445 && !PURE_SLP_STMT (stmt_info))
1446 /* STMT needs both SLP and loop-based vectorization. */
1447 only_slp_in_loop = false;
1448 }
1449 }
1450
1451 if (only_slp_in_loop)
1452 {
1453 dump_printf_loc (MSG_NOTE, vect_location,
1454 "Loop contains only SLP stmts\n");
1455 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1456 }
1457 else
1458 {
1459 dump_printf_loc (MSG_NOTE, vect_location,
1460 "Loop contains SLP and non-SLP stmts\n");
1461 /* Both the vectorization factor and unroll factor have the form
1462 current_vector_size * X for some rational X, so they must have
1463 a common multiple. */
1464 vectorization_factor
1465 = force_common_multiple (vectorization_factor,
1466 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1467 }
1468
1469 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1470 if (dump_enabled_p ())
1471 {
1472 dump_printf_loc (MSG_NOTE, vect_location,
1473 "Updating vectorization factor to ");
1474 dump_dec (MSG_NOTE, vectorization_factor);
1475 dump_printf (MSG_NOTE, ".\n");
1476 }
1477 }
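
/* A worked example (illustrative numbers): if the loop-based analysis chose
   a vectorization factor of 4 but the SLP instances require an unrolling
   factor of 8, force_common_multiple (4, 8) yields 8 and the vectorization
   factor is updated to 8.  If instead every relevant stmt is covered by SLP
   and the SLP unrolling factor is 1, the loop is treated as pure SLP and no
   cross-iteration unrolling is added.  */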
1478
1479 /* Return true if STMT_INFO describes a double reduction phi and if
1480 the other phi in the reduction is also relevant for vectorization.
1481 This rejects cases such as:
1482
1483 outer1:
1484 x_1 = PHI <x_3(outer2), ...>;
1485 ...
1486
1487 inner:
1488 x_2 = ...;
1489 ...
1490
1491 outer2:
1492 x_3 = PHI <x_2(inner)>;
1493
1494 if nothing in x_2 or elsewhere makes x_1 relevant. */
1495
1496 static bool
1497 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1498 {
1499 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1500 return false;
1501
1502 gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1503 return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1504 }
1505
1506 /* Function vect_analyze_loop_operations.
1507
1508 Scan the loop stmts and make sure they are all vectorizable. */
1509
1510 static bool
1511 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1512 {
1513 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1514 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1515 int nbbs = loop->num_nodes;
1516 int i;
1517 stmt_vec_info stmt_info;
1518 bool need_to_vectorize = false;
1519 bool ok;
1520
1521 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1522
1523 stmt_vector_for_cost cost_vec;
1524 cost_vec.create (2);
1525
1526 for (i = 0; i < nbbs; i++)
1527 {
1528 basic_block bb = bbs[i];
1529
1530 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1531 gsi_next (&si))
1532 {
1533 gphi *phi = si.phi ();
1534 ok = true;
1535
1536 stmt_info = loop_vinfo->lookup_stmt (phi);
1537 if (dump_enabled_p ())
1538 {
1539 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1540 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1541 }
1542 if (virtual_operand_p (gimple_phi_result (phi)))
1543 continue;
1544
1545 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1546 (i.e., a phi in the tail of the outer-loop). */
1547 if (! is_loop_header_bb_p (bb))
1548 {
1549 /* FORNOW: we currently don't support the case that these phis
1550 are not used in the outer loop (unless it is a double reduction,
1551 i.e., this phi is vect_reduction_def), because this case
1552 requires us to actually do something here. */
1553 if (STMT_VINFO_LIVE_P (stmt_info)
1554 && !vect_active_double_reduction_p (stmt_info))
1555 {
1556 if (dump_enabled_p ())
1557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1558 "Unsupported loop-closed phi in "
1559 "outer-loop.\n");
1560 return false;
1561 }
1562
1563 /* If PHI is used in the outer loop, we check that its operand
1564 is defined in the inner loop. */
1565 if (STMT_VINFO_RELEVANT_P (stmt_info))
1566 {
1567 tree phi_op;
1568
1569 if (gimple_phi_num_args (phi) != 1)
1570 return false;
1571
1572 phi_op = PHI_ARG_DEF (phi, 0);
1573 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1574 if (!op_def_info)
1575 return false;
1576
1577 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1578 && (STMT_VINFO_RELEVANT (op_def_info)
1579 != vect_used_in_outer_by_reduction))
1580 return false;
1581 }
1582
1583 continue;
1584 }
1585
1586 gcc_assert (stmt_info);
1587
1588 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1589 || STMT_VINFO_LIVE_P (stmt_info))
1590 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1591 {
1592 /* A scalar-dependence cycle that we don't support. */
1593 if (dump_enabled_p ())
1594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1595 "not vectorized: scalar dependence cycle.\n");
1596 return false;
1597 }
1598
1599 if (STMT_VINFO_RELEVANT_P (stmt_info))
1600 {
1601 need_to_vectorize = true;
1602 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1603 && ! PURE_SLP_STMT (stmt_info))
1604 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1605 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1606 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1607 && ! PURE_SLP_STMT (stmt_info))
1608 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1609 &cost_vec);
1610 }
1611
1612 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1613 if (ok
1614 && STMT_VINFO_LIVE_P (stmt_info)
1615 && !PURE_SLP_STMT (stmt_info))
1616 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1617 &cost_vec);
1618
1619 if (!ok)
1620 {
1621 if (dump_enabled_p ())
1622 {
1623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1624 "not vectorized: relevant phi not "
1625 "supported: ");
1626 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1627 }
1628 return false;
1629 }
1630 }
1631
1632 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1633 gsi_next (&si))
1634 {
1635 gimple *stmt = gsi_stmt (si);
1636 if (!gimple_clobber_p (stmt)
1637 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1638 &cost_vec))
1639 return false;
1640 }
1641 } /* bbs */
1642
1643 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1644 cost_vec.release ();
1645
1646 /* All operations in the loop are either irrelevant (they deal with loop
1647 control, or are dead), or are only used outside the loop and can be moved
1648 out of the loop (e.g. invariants, inductions). The loop can be
1649 optimized away by scalar optimizations. We're better off not
1650 touching this loop. */
1651 if (!need_to_vectorize)
1652 {
1653 if (dump_enabled_p ())
1654 dump_printf_loc (MSG_NOTE, vect_location,
1655 "All the computation can be taken out of the loop.\n");
1656 if (dump_enabled_p ())
1657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1658 "not vectorized: redundant loop. no profit to "
1659 "vectorize.\n");
1660 return false;
1661 }
1662
1663 return true;
1664 }
1665
1666 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1667 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1668 definitely no, or -1 if it's worth retrying. */
1669
1670 static int
1671 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1672 {
1673 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1674 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1675
1676 /* Only fully-masked loops can have iteration counts less than the
1677 vectorization factor. */
1678 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1679 {
1680 HOST_WIDE_INT max_niter;
1681
1682 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1683 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1684 else
1685 max_niter = max_stmt_executions_int (loop);
1686
1687 if (max_niter != -1
1688 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1689 {
1690 if (dump_enabled_p ())
1691 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1692 "not vectorized: iteration count smaller than "
1693 "vectorization factor.\n");
1694 return 0;
1695 }
1696 }
1697
1698 int min_profitable_iters, min_profitable_estimate;
1699 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1700 &min_profitable_estimate);
1701
1702 if (min_profitable_iters < 0)
1703 {
1704 if (dump_enabled_p ())
1705 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1706 "not vectorized: vectorization not profitable.\n");
1707 if (dump_enabled_p ())
1708 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1709 "not vectorized: vector version will never be "
1710 "profitable.\n");
1711 return -1;
1712 }
1713
1714 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1715 * assumed_vf);
1716
1717 /* Use the cost model only if it is more conservative than the user-specified
1718 threshold. */
1719 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1720 min_profitable_iters);
1721
1722 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1723
1724 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1725 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1726 {
1727 if (dump_enabled_p ())
1728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1729 "not vectorized: vectorization not profitable.\n");
1730 if (dump_enabled_p ())
1731 dump_printf_loc (MSG_NOTE, vect_location,
1732 "not vectorized: iteration count smaller than user "
1733 "specified loop bound parameter or minimum profitable "
1734 "iterations (whichever is more conservative).\n");
1735 return 0;
1736 }
1737
1738 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1739 if (estimated_niter == -1)
1740 estimated_niter = likely_max_stmt_executions_int (loop);
1741 if (estimated_niter != -1
1742 && ((unsigned HOST_WIDE_INT) estimated_niter
1743 < MAX (th, (unsigned) min_profitable_estimate)))
1744 {
1745 if (dump_enabled_p ())
1746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1747 "not vectorized: estimated iteration count too "
1748 "small.\n");
1749 if (dump_enabled_p ())
1750 dump_printf_loc (MSG_NOTE, vect_location,
1751 "not vectorized: estimated iteration count smaller "
1752 "than specified loop bound parameter or minimum "
1753 "profitable iterations (whichever is more "
1754 "conservative).\n");
1755 return -1;
1756 }
1757
1758 return 1;
1759 }
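
/* A worked example (illustrative numbers): with an assumed VF of 4,
   --param min-vect-loop-bound=3 gives min_scalar_loop_bound = 12, so if the
   cost model reports min_profitable_iters = 7 the threshold becomes
   th = MAX (12, 7) = 12, and a loop known to iterate only 10 times is
   rejected as not profitable.  */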
1760
1761 static bool
1762 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1763 vec<data_reference_p> *datarefs,
1764 unsigned int *n_stmts)
1765 {
1766 *n_stmts = 0;
1767 for (unsigned i = 0; i < loop->num_nodes; i++)
1768 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1769 !gsi_end_p (gsi); gsi_next (&gsi))
1770 {
1771 gimple *stmt = gsi_stmt (gsi);
1772 if (is_gimple_debug (stmt))
1773 continue;
1774 ++(*n_stmts);
1775 if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1776 {
1777 if (is_gimple_call (stmt) && loop->safelen)
1778 {
1779 tree fndecl = gimple_call_fndecl (stmt), op;
1780 if (fndecl != NULL_TREE)
1781 {
1782 cgraph_node *node = cgraph_node::get (fndecl);
1783 if (node != NULL && node->simd_clones != NULL)
1784 {
1785 unsigned int j, n = gimple_call_num_args (stmt);
1786 for (j = 0; j < n; j++)
1787 {
1788 op = gimple_call_arg (stmt, j);
1789 if (DECL_P (op)
1790 || (REFERENCE_CLASS_P (op)
1791 && get_base_address (op)))
1792 break;
1793 }
1794 op = gimple_call_lhs (stmt);
1795 /* Ignore #pragma omp declare simd functions
1796 if they don't have data references in the
1797 call stmt itself. */
1798 if (j == n
1799 && !(op
1800 && (DECL_P (op)
1801 || (REFERENCE_CLASS_P (op)
1802 && get_base_address (op)))))
1803 continue;
1804 }
1805 }
1806 }
1807 return false;
1808 }
1809 /* If dependence analysis will give up due to the limit on the
1810 number of datarefs, stop here and fail fatally. */
1811 if (datarefs->length ()
1812 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1813 return false;
1814 }
1815 return true;
1816 }
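/* Illustrative example for the simd-clone special case above (a sketch;
   the function name f and arrays a, b are hypothetical):

     #pragma omp declare simd
     extern float f (float);

     #pragma omp simd
     for (int i = 0; i < n; i++)
       a[i] = f (b[i]);

   The call to f carries no data reference of its own, so with loop->safelen
   set by the simd pragma it is skipped rather than making the whole loop
   unanalyzable.  */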
1817
1818 /* Function vect_analyze_loop_2.
1819
1820 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1821 for it. The different analyses will record information in the
1822 loop_vec_info struct. */
1823 static bool
1824 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1825 {
1826 bool ok;
1827 int res;
1828 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1829 poly_uint64 min_vf = 2;
1830
1831 /* The first group of checks is independent of the vector size. */
1832 fatal = true;
1833
1834 /* Find all data references in the loop (which correspond to vdefs/vuses)
1835 and analyze their evolution in the loop. */
1836
1837 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1838
1839 /* Gather the data references and count stmts in the loop. */
1840 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1841 {
1842 if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1843 &LOOP_VINFO_DATAREFS (loop_vinfo),
1844 n_stmts))
1845 {
1846 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848 "not vectorized: loop contains function "
1849 "calls or data references that cannot "
1850 "be analyzed\n");
1851 return false;
1852 }
1853 loop_vinfo->shared->save_datarefs ();
1854 }
1855 else
1856 loop_vinfo->shared->check_datarefs ();
1857
1858 /* Analyze the data references and also adjust the minimal
1859 vectorization factor according to the loads and stores. */
1860
1861 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1862 if (!ok)
1863 {
1864 if (dump_enabled_p ())
1865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1866 "bad data references.\n");
1867 return false;
1868 }
1869
1870 /* Classify all cross-iteration scalar data-flow cycles.
1871 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1872 vect_analyze_scalar_cycles (loop_vinfo);
1873
1874 vect_pattern_recog (loop_vinfo);
1875
1876 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1877
1878 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1879 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1880
1881 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1882 if (!ok)
1883 {
1884 if (dump_enabled_p ())
1885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1886 "bad data access.\n");
1887 return false;
1888 }
1889
1890 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1891
1892 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1893 if (!ok)
1894 {
1895 if (dump_enabled_p ())
1896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1897 "unexpected pattern.\n");
1898 return false;
1899 }
1900
1901 /* The rest of the analysis below depends on the vector size, so a failure from here on is not fatal: we may retry with a different vector size. */
1902 fatal = false;
1903
1904 /* Analyze data dependences between the data-refs in the loop
1905 and adjust the maximum vectorization factor according to
1906 the dependences.
1907 FORNOW: fail at the first data dependence that we encounter. */
1908
1909 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1910 if (!ok
1911 || (max_vf != MAX_VECTORIZATION_FACTOR
1912 && maybe_lt (max_vf, min_vf)))
1913 {
1914 if (dump_enabled_p ())
1915 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1916 "bad data dependence.\n");
1917 return false;
1918 }
1919 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1920
1921 ok = vect_determine_vectorization_factor (loop_vinfo);
1922 if (!ok)
1923 {
1924 if (dump_enabled_p ())
1925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1926 "can't determine vectorization factor.\n");
1927 return false;
1928 }
1929 if (max_vf != MAX_VECTORIZATION_FACTOR
1930 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1931 {
1932 if (dump_enabled_p ())
1933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1934 "bad data dependence.\n");
1935 return false;
1936 }
1937
1938 /* Compute the scalar iteration cost. */
1939 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1940
1941 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1942 unsigned th;
1943
1944 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1945 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1946 if (!ok)
1947 return false;
1948
1949 /* If there are any SLP instances mark them as pure_slp. */
1950 bool slp = vect_make_slp_decision (loop_vinfo);
1951 if (slp)
1952 {
1953 /* Find stmts that need to be both vectorized and SLPed. */
1954 vect_detect_hybrid_slp (loop_vinfo);
1955
1956 /* Update the vectorization factor based on the SLP decision. */
1957 vect_update_vf_for_slp (loop_vinfo);
1958 }
1959
1960 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1961
1962 /* We don't expect to have to roll back to anything other than an empty
1963 set of rgroups. */
1964 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1965
1966 /* This is the point where we can re-start analysis with SLP forced off. */
1967 start_over:
1968
1969 /* Now the vectorization factor is final. */
1970 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1971 gcc_assert (known_ne (vectorization_factor, 0U));
1972
1973 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1974 {
1975 dump_printf_loc (MSG_NOTE, vect_location,
1976 "vectorization_factor = ");
1977 dump_dec (MSG_NOTE, vectorization_factor);
1978 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1979 LOOP_VINFO_INT_NITERS (loop_vinfo));
1980 }
1981
1982 HOST_WIDE_INT max_niter
1983 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1984
1985 /* Analyze the alignment of the data-refs in the loop.
1986 Fail if a data reference is found that cannot be vectorized. */
1987
1988 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1989 if (!ok)
1990 {
1991 if (dump_enabled_p ())
1992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1993 "bad data alignment.\n");
1994 return false;
1995 }
1996
1997 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1998 It is important to call pruning after vect_analyze_data_ref_accesses,
1999 since we use grouping information gathered by interleaving analysis. */
2000 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2001 if (!ok)
2002 return false;
2003
2004 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2005 vectorization. */
2006 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2007 {
2008 /* This pass will decide on using loop versioning and/or loop peeling in
2009 order to enhance the alignment of data references in the loop. */
2010 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2011 if (!ok)
2012 {
2013 if (dump_enabled_p ())
2014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2015 "bad data alignment.\n");
2016 return false;
2017 }
2018 }
2019
2020 if (slp)
2021 {
2022 /* Analyze operations in the SLP instances. Note this may
2023 remove unsupported SLP instances which makes the above
2024 SLP kind detection invalid. */
2025 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2026 vect_slp_analyze_operations (loop_vinfo);
2027 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2028 goto again;
2029 }
2030
2031 /* Scan all the remaining operations in the loop that are not subject
2032 to SLP and make sure they are vectorizable. */
2033 ok = vect_analyze_loop_operations (loop_vinfo);
2034 if (!ok)
2035 {
2036 if (dump_enabled_p ())
2037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2038 "bad operation or unsupported loop bound.\n");
2039 return false;
2040 }
2041
2042 /* Decide whether to use a fully-masked loop for this vectorization
2043 factor. */
2044 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2045 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2046 && vect_verify_full_masking (loop_vinfo));
2047 if (dump_enabled_p ())
2048 {
2049 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2050 dump_printf_loc (MSG_NOTE, vect_location,
2051 "using a fully-masked loop.\n");
2052 else
2053 dump_printf_loc (MSG_NOTE, vect_location,
2054 "not using a fully-masked loop.\n");
2055 }
2056
2057 /* If an epilogue loop is required because of data accesses with gaps,
2058 one additional iteration needs to be peeled. Check if there are
2059 enough iterations for vectorization. */
2060 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2061 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2062 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2063 {
2064 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2065 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2066
2067 if (known_lt (wi::to_widest (scalar_niters), vf))
2068 {
2069 if (dump_enabled_p ())
2070 dump_printf_loc (MSG_NOTE, vect_location,
2071 "loop does not have enough iterations to support"
2072 " peeling for gaps.\n");
2073 return false;
2074 }
2075 }
2076
2077 /* Check the costings of the loop make vectorizing worthwhile. */
2078 res = vect_analyze_loop_costing (loop_vinfo);
2079 if (res < 0)
2080 goto again;
2081 if (!res)
2082 {
2083 if (dump_enabled_p ())
2084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2085 "Loop costings not worthwhile.\n");
2086 return false;
2087 }
2088
2089 /* Decide whether we need to create an epilogue loop to handle
2090 remaining scalar iterations. */
2091 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2092
2093 unsigned HOST_WIDE_INT const_vf;
2094 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2095 /* The main loop handles all iterations. */
2096 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2097 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2098 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2099 {
2100 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2101 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2102 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2103 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2104 }
2105 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2106 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2107 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2108 < (unsigned) exact_log2 (const_vf))
2109 /* In case of versioning, check if the maximum number of
2110 iterations is greater than th. If they are identical,
2111 the epilogue is unnecessary. */
2112 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2113 || ((unsigned HOST_WIDE_INT) max_niter
2114 > (th / const_vf) * const_vf))))
2115 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
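  /* For example (illustrative numbers only): with a known iteration count
     of 17, no peeling for alignment and a constant VF of 4,
     tree_ctz (17) == 0 is smaller than exact_log2 (4) == 2, so 17 is not a
     multiple of the VF and an epilogue loop is needed for the remaining
     17 % 4 == 1 iteration.  */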
2116
2117 /* If an epilogue loop is required make sure we can create one. */
2118 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2119 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2120 {
2121 if (dump_enabled_p ())
2122 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2123 if (!vect_can_advance_ivs_p (loop_vinfo)
2124 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2125 single_exit (LOOP_VINFO_LOOP
2126 (loop_vinfo))))
2127 {
2128 if (dump_enabled_p ())
2129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2130 "not vectorized: can't create required "
2131 "epilog loop\n");
2132 goto again;
2133 }
2134 }
2135
2136 /* During peeling, we need to check whether the number of loop iterations
2137 is enough for both the peeled prolog loop and the vector loop. This
2138 check can be merged with the threshold check of loop versioning, so
2139 increase the threshold for this case if necessary. */
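  /* Sketch of the resulting threshold (illustrative numbers): with a VF of
     4, an unknown misalignment peel amount (so up to
     TYPE_VECTOR_SUBPARTS - 1 == 3 prologue iterations), no full masking and
     peeling for gaps enabled, niters_th becomes 3 + 4 + 1 == 8 below.  */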
2140 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2141 {
2142 poly_uint64 niters_th = 0;
2143
2144 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2145 {
2146 /* Niters for peeled prolog loop. */
2147 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2148 {
2149 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2150 tree vectype
2151 = STMT_VINFO_VECTYPE (vinfo_for_stmt (vect_dr_stmt (dr)));
2152 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2153 }
2154 else
2155 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2156 }
2157
2158 /* Niters for at least one iteration of vectorized loop. */
2159 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2160 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2161 /* One additional iteration because of peeling for gap. */
2162 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2163 niters_th += 1;
2164 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2165 }
2166
2167 gcc_assert (known_eq (vectorization_factor,
2168 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2169
2170 /* Ok to vectorize! */
2171 return true;
2172
2173 again:
2174 /* Try again with SLP forced off but if we didn't do any SLP there is
2175 no point in re-trying. */
2176 if (!slp)
2177 return false;
2178
2179 /* If there are reduction chains re-trying will fail anyway. */
2180 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2181 return false;
2182
2183 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2184 via interleaving or lane instructions. */
2185 slp_instance instance;
2186 slp_tree node;
2187 unsigned i, j;
2188 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2189 {
2190 stmt_vec_info vinfo;
2191 vinfo = vinfo_for_stmt
2192 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2193 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2194 continue;
2195 vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2196 unsigned int size = DR_GROUP_SIZE (vinfo);
2197 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2198 if (! vect_store_lanes_supported (vectype, size, false)
2199 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2200 && ! vect_grouped_store_supported (vectype, size))
2201 return false;
2202 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2203 {
2204 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2205 vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2206 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2207 size = DR_GROUP_SIZE (vinfo);
2208 vectype = STMT_VINFO_VECTYPE (vinfo);
2209 if (! vect_load_lanes_supported (vectype, size, false)
2210 && ! vect_grouped_load_supported (vectype, single_element_p,
2211 size))
2212 return false;
2213 }
2214 }
2215
2216 if (dump_enabled_p ())
2217 dump_printf_loc (MSG_NOTE, vect_location,
2218 "re-trying with SLP disabled\n");
2219
2220 /* Roll back state appropriately. No SLP this time. */
2221 slp = false;
2222 /* Restore the vectorization factor as it was without SLP. */
2223 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2224 /* Free the SLP instances. */
2225 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2226 vect_free_slp_instance (instance, false);
2227 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2228 /* Reset SLP type to loop_vect on all stmts. */
2229 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2230 {
2231 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2232 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2233 !gsi_end_p (si); gsi_next (&si))
2234 {
2235 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2236 STMT_SLP_TYPE (stmt_info) = loop_vect;
2237 }
2238 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2239 !gsi_end_p (si); gsi_next (&si))
2240 {
2241 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2242 STMT_SLP_TYPE (stmt_info) = loop_vect;
2243 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2244 {
2245 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2246 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2247 STMT_SLP_TYPE (stmt_info) = loop_vect;
2248 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2249 !gsi_end_p (pi); gsi_next (&pi))
2250 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2251 = loop_vect;
2252 }
2253 }
2254 }
2255 /* Free optimized alias test DDRS. */
2256 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2257 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2258 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2259 /* Reset target cost data. */
2260 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2261 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2262 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2263 /* Reset accumulated rgroup information. */
2264 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2265 /* Reset assorted flags. */
2266 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2267 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2268 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2269 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2270 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2271
2272 goto start_over;
2273 }
2274
2275 /* Function vect_analyze_loop.
2276
2277 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2278 for it. The different analyses will record information in the
2279 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, then LOOP is an
2280 epilogue of the loop described by ORIG_LOOP_VINFO and must be vectorized. */
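/* As an illustrative, target-dependent sketch: on a target whose
   targetm.vectorize.autovectorize_vector_sizes hook reports both 32-byte
   and 16-byte vectors, the analysis below is first run with the
   autodetected vector size and, if that fails non-fatally, re-run with the
   next size in the list before giving up.  */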
2281 loop_vec_info
2282 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2283 vec_info_shared *shared)
2284 {
2285 loop_vec_info loop_vinfo;
2286 auto_vector_sizes vector_sizes;
2287
2288 /* Autodetect first vector size we try. */
2289 current_vector_size = 0;
2290 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2291 unsigned int next_size = 0;
2292
2293 DUMP_VECT_SCOPE ("analyze_loop_nest");
2294
2295 if (loop_outer (loop)
2296 && loop_vec_info_for_loop (loop_outer (loop))
2297 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2298 {
2299 if (dump_enabled_p ())
2300 dump_printf_loc (MSG_NOTE, vect_location,
2301 "outer-loop already vectorized.\n");
2302 return NULL;
2303 }
2304
2305 if (!find_loop_nest (loop, &shared->loop_nest))
2306 {
2307 if (dump_enabled_p ())
2308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2309 "not vectorized: loop nest containing two "
2310 "or more consecutive inner loops cannot be "
2311 "vectorized\n");
2312 return NULL;
2313 }
2314
2315 unsigned n_stmts = 0;
2316 poly_uint64 autodetected_vector_size = 0;
2317 while (1)
2318 {
2319 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2320 loop_vinfo = vect_analyze_loop_form (loop, shared);
2321 if (!loop_vinfo)
2322 {
2323 if (dump_enabled_p ())
2324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2325 "bad loop form.\n");
2326 return NULL;
2327 }
2328
2329 bool fatal = false;
2330
2331 if (orig_loop_vinfo)
2332 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2333
2334 if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2335 {
2336 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2337
2338 return loop_vinfo;
2339 }
2340
2341 delete loop_vinfo;
2342
2343 if (next_size == 0)
2344 autodetected_vector_size = current_vector_size;
2345
2346 if (next_size < vector_sizes.length ()
2347 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2348 next_size += 1;
2349
2350 if (fatal
2351 || next_size == vector_sizes.length ()
2352 || known_eq (current_vector_size, 0U))
2353 return NULL;
2354
2355 /* Try the next biggest vector size. */
2356 current_vector_size = vector_sizes[next_size++];
2357 if (dump_enabled_p ())
2358 {
2359 dump_printf_loc (MSG_NOTE, vect_location,
2360 "***** Re-trying analysis with "
2361 "vector size ");
2362 dump_dec (MSG_NOTE, current_vector_size);
2363 dump_printf (MSG_NOTE, "\n");
2364 }
2365 }
2366 }
2367
2368 /* Return true if there is an in-order reduction function for CODE, storing
2369 it in *REDUC_FN if so. */
2370
2371 static bool
2372 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2373 {
2374 switch (code)
2375 {
2376 case PLUS_EXPR:
2377 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2378 return true;
2379
2380 default:
2381 return false;
2382 }
2383 }
2384
2385 /* Function reduction_fn_for_scalar_code
2386
2387 Input:
2388 CODE - tree_code of a reduction operation.
2389
2390 Output:
2391 REDUC_FN - the corresponding internal function to be used to reduce the
2392 vector of partial results into a single scalar result, or IFN_LAST
2393 if the operation is a supported reduction operation, but does not have
2394 such an internal function.
2395
2396 Return FALSE if CODE currently cannot be vectorized as reduction. */
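/* For example, MAX_EXPR maps to IFN_REDUC_MAX below, while MULT_EXPR is
   accepted but gets IFN_LAST, meaning the caller must reduce the vector of
   partial products by some other means (this is a reading of the contract
   above, not a statement about any particular fallback strategy).  */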
2397
2398 static bool
2399 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2400 {
2401 switch (code)
2402 {
2403 case MAX_EXPR:
2404 *reduc_fn = IFN_REDUC_MAX;
2405 return true;
2406
2407 case MIN_EXPR:
2408 *reduc_fn = IFN_REDUC_MIN;
2409 return true;
2410
2411 case PLUS_EXPR:
2412 *reduc_fn = IFN_REDUC_PLUS;
2413 return true;
2414
2415 case BIT_AND_EXPR:
2416 *reduc_fn = IFN_REDUC_AND;
2417 return true;
2418
2419 case BIT_IOR_EXPR:
2420 *reduc_fn = IFN_REDUC_IOR;
2421 return true;
2422
2423 case BIT_XOR_EXPR:
2424 *reduc_fn = IFN_REDUC_XOR;
2425 return true;
2426
2427 case MULT_EXPR:
2428 case MINUS_EXPR:
2429 *reduc_fn = IFN_LAST;
2430 return true;
2431
2432 default:
2433 return false;
2434 }
2435 }
2436
2437 /* If there is a neutral value X such that SLP reduction SLP_NODE would not
2438 be affected by the introduction of additional X elements, return that X,
2439 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2440 is true if the SLP statements perform a single reduction, false if each
2441 statement performs an independent reduction. */
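/* Illustration (hedged): for a PLUS_EXPR reduction the neutral value is 0
   and for MULT_EXPR it is 1, because, e.g.,

     s = s + a[0] + a[1] + 0 + 0

   computes the same sum as without the padding elements; for MIN_EXPR and
   MAX_EXPR no constant works in general, which is why the initial value of
   a reduction chain is used instead.  */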
2442
2443 static tree
2444 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2445 bool reduc_chain)
2446 {
2447 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2448 gimple *stmt = stmts[0];
2449 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2450 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2451 tree scalar_type = TREE_TYPE (vector_type);
2452 struct loop *loop = gimple_bb (stmt)->loop_father;
2453 gcc_assert (loop);
2454
2455 switch (code)
2456 {
2457 case WIDEN_SUM_EXPR:
2458 case DOT_PROD_EXPR:
2459 case SAD_EXPR:
2460 case PLUS_EXPR:
2461 case MINUS_EXPR:
2462 case BIT_IOR_EXPR:
2463 case BIT_XOR_EXPR:
2464 return build_zero_cst (scalar_type);
2465
2466 case MULT_EXPR:
2467 return build_one_cst (scalar_type);
2468
2469 case BIT_AND_EXPR:
2470 return build_all_ones_cst (scalar_type);
2471
2472 case MAX_EXPR:
2473 case MIN_EXPR:
2474 /* For MIN/MAX the initial values are neutral. A reduction chain
2475 has only a single initial value, so that value is neutral for
2476 all statements. */
2477 if (reduc_chain)
2478 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2479 return NULL_TREE;
2480
2481 default:
2482 return NULL_TREE;
2483 }
2484 }
2485
2486 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2487 STMT is printed with a message MSG. */
2488
2489 static void
2490 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2491 {
2492 dump_printf_loc (msg_type, vect_location, "%s", msg);
2493 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2494 }
2495
2496 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2497 operation. Return true if the results of DEF_STMT_INFO are something
2498 that can be accumulated by such a reduction. */
2499
2500 static bool
2501 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2502 {
2503 return (is_gimple_assign (def_stmt_info->stmt)
2504 || is_gimple_call (def_stmt_info->stmt)
2505 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2506 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2507 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2508 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2509 }
2510
2511 /* Detect SLP reduction of the form:
2512
2513 #a1 = phi <a5, a0>
2514 a2 = operation (a1)
2515 a3 = operation (a2)
2516 a4 = operation (a3)
2517 a5 = operation (a4)
2518
2519 #a = phi <a5>
2520
2521 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2522 FIRST_STMT is the first reduction stmt in the chain
2523 (a2 = operation (a1)).
2524
2525 Return TRUE if a reduction chain was detected. */
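/* A scalar source loop that typically gimplifies into such a chain
   (an illustrative sketch):

     for (i = 0; i < n; i++)
       s = s + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   where each '+' becomes one statement a2..a5 feeding the next, and only
   the final result a5 feeds back into the reduction phi.  */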
2526
2527 static bool
2528 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2529 gimple *first_stmt)
2530 {
2531 struct loop *loop = (gimple_bb (phi))->loop_father;
2532 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2533 enum tree_code code;
2534 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2535 stmt_vec_info use_stmt_info, current_stmt_info;
2536 tree lhs;
2537 imm_use_iterator imm_iter;
2538 use_operand_p use_p;
2539 int nloop_uses, size = 0, n_out_of_loop_uses;
2540 bool found = false;
2541
2542 if (loop != vect_loop)
2543 return false;
2544
2545 lhs = PHI_RESULT (phi);
2546 code = gimple_assign_rhs_code (first_stmt);
2547 while (1)
2548 {
2549 nloop_uses = 0;
2550 n_out_of_loop_uses = 0;
2551 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2552 {
2553 gimple *use_stmt = USE_STMT (use_p);
2554 if (is_gimple_debug (use_stmt))
2555 continue;
2556
2557 /* Check if we got back to the reduction phi. */
2558 if (use_stmt == phi)
2559 {
2560 loop_use_stmt = use_stmt;
2561 found = true;
2562 break;
2563 }
2564
2565 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2566 {
2567 loop_use_stmt = use_stmt;
2568 nloop_uses++;
2569 }
2570 else
2571 n_out_of_loop_uses++;
2572
2573 /* There can be either a single use in the loop or two uses in
2574 phi nodes. */
2575 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2576 return false;
2577 }
2578
2579 if (found)
2580 break;
2581
2582 /* We reached a statement with no loop uses. */
2583 if (nloop_uses == 0)
2584 return false;
2585
2586 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2587 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2588 return false;
2589
2590 if (!is_gimple_assign (loop_use_stmt)
2591 || code != gimple_assign_rhs_code (loop_use_stmt)
2592 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2593 return false;
2594
2595 /* Insert LOOP_USE_STMT into the reduction chain. */
2596 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2597 if (current_stmt)
2598 {
2599 current_stmt_info = vinfo_for_stmt (current_stmt);
2600 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2601 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2602 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2603 }
2604 else
2605 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2606
2607 lhs = gimple_assign_lhs (loop_use_stmt);
2608 current_stmt = loop_use_stmt;
2609 size++;
2610 }
2611
2612 if (!found || loop_use_stmt != phi || size < 2)
2613 return false;
2614
2615 /* Swap the operands, if needed, to make the reduction operand be the second
2616 operand. */
2617 lhs = PHI_RESULT (phi);
2618 next_stmt = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2619 while (next_stmt)
2620 {
2621 if (gimple_assign_rhs2 (next_stmt) == lhs)
2622 {
2623 tree op = gimple_assign_rhs1 (next_stmt);
2624 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2625
2626 /* Check that the other def is either defined in the loop
2627 ("vect_internal_def"), or it's an induction (defined by a
2628 loop-header phi-node). */
2629 if (def_stmt_info
2630 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2631 && vect_valid_reduction_input_p (def_stmt_info))
2632 {
2633 lhs = gimple_assign_lhs (next_stmt);
2634 next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2635 continue;
2636 }
2637
2638 return false;
2639 }
2640 else
2641 {
2642 tree op = gimple_assign_rhs2 (next_stmt);
2643 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2644
2645 /* Check that the other def is either defined in the loop
2646 ("vect_internal_def"), or it's an induction (defined by a
2647 loop-header phi-node). */
2648 if (def_stmt_info
2649 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2650 && vect_valid_reduction_input_p (def_stmt_info))
2651 {
2652 if (dump_enabled_p ())
2653 {
2654 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2655 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2656 }
2657
2658 swap_ssa_operands (next_stmt,
2659 gimple_assign_rhs1_ptr (next_stmt),
2660 gimple_assign_rhs2_ptr (next_stmt));
2661 update_stmt (next_stmt);
2662
2663 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2664 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2665 }
2666 else
2667 return false;
2668 }
2669
2670 lhs = gimple_assign_lhs (next_stmt);
2671 next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2672 }
2673
2674 /* Save the chain for further analysis in SLP detection. */
2675 first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2676 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2677 REDUC_GROUP_SIZE (vinfo_for_stmt (first)) = size;
2678
2679 return true;
2680 }
2681
2682 /* Return true if we need an in-order reduction for operation CODE
2683 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2684 overflow must wrap. */
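/* For instance (a hedged sketch): a double-precision accumulation

     double s = 0;
     for (i = 0; i < n; i++)
       s += a[i];

   needs an in-order (fold-left) reduction unless -fassociative-math is in
   effect, because reassociating floating-point additions can change the
   rounded result; an unsigned integer sum, whose overflow wraps, does
   not.  */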
2685
2686 static bool
2687 needs_fold_left_reduction_p (tree type, tree_code code,
2688 bool need_wrapping_integral_overflow)
2689 {
2690 /* CHECKME: check for !flag_finite_math_only too? */
2691 if (SCALAR_FLOAT_TYPE_P (type))
2692 switch (code)
2693 {
2694 case MIN_EXPR:
2695 case MAX_EXPR:
2696 return false;
2697
2698 default:
2699 return !flag_associative_math;
2700 }
2701
2702 if (INTEGRAL_TYPE_P (type))
2703 {
2704 if (!operation_no_trapping_overflow (type, code))
2705 return true;
2706 if (need_wrapping_integral_overflow
2707 && !TYPE_OVERFLOW_WRAPS (type)
2708 && operation_can_overflow (code))
2709 return true;
2710 return false;
2711 }
2712
2713 if (SAT_FIXED_POINT_TYPE_P (type))
2714 return true;
2715
2716 return false;
2717 }
2718
2719 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2720 reduction operation CODE has a handled computation expression. */
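/* Illustrative sketch of a path this accepts for CODE == PLUS_EXPR:

     s_1 = PHI <s_0 (preheader), s_3 (latch)>
     s_2 = s_1 + a[i];
     s_3 = s_2 - b[i];

   each intermediate value has a single use and every statement on the walk
   from the latch argument back to the PHI result uses PLUS_EXPR or
   MINUS_EXPR; the code below only rejects the path if the running value
   ends up negated overall or some other operation appears.  */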
2721
2722 bool
2723 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2724 tree loop_arg, enum tree_code code)
2725 {
2726 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2727 auto_bitmap visited;
2728 tree lookfor = PHI_RESULT (phi);
2729 ssa_op_iter curri;
2730 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2731 while (USE_FROM_PTR (curr) != loop_arg)
2732 curr = op_iter_next_use (&curri);
2733 curri.i = curri.numops;
2734 do
2735 {
2736 path.safe_push (std::make_pair (curri, curr));
2737 tree use = USE_FROM_PTR (curr);
2738 if (use == lookfor)
2739 break;
2740 gimple *def = SSA_NAME_DEF_STMT (use);
2741 if (gimple_nop_p (def)
2742 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2743 {
2744 pop:
2745 do
2746 {
2747 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2748 curri = x.first;
2749 curr = x.second;
2750 do
2751 curr = op_iter_next_use (&curri);
2752 /* Skip already visited or non-SSA operands (from iterating
2753 over PHI args). */
2754 while (curr != NULL_USE_OPERAND_P
2755 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2756 || ! bitmap_set_bit (visited,
2757 SSA_NAME_VERSION
2758 (USE_FROM_PTR (curr)))));
2759 }
2760 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2761 if (curr == NULL_USE_OPERAND_P)
2762 break;
2763 }
2764 else
2765 {
2766 if (gimple_code (def) == GIMPLE_PHI)
2767 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2768 else
2769 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2770 while (curr != NULL_USE_OPERAND_P
2771 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2772 || ! bitmap_set_bit (visited,
2773 SSA_NAME_VERSION
2774 (USE_FROM_PTR (curr)))))
2775 curr = op_iter_next_use (&curri);
2776 if (curr == NULL_USE_OPERAND_P)
2777 goto pop;
2778 }
2779 }
2780 while (1);
2781 if (dump_file && (dump_flags & TDF_DETAILS))
2782 {
2783 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2784 unsigned i;
2785 std::pair<ssa_op_iter, use_operand_p> *x;
2786 FOR_EACH_VEC_ELT (path, i, x)
2787 {
2788 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2789 dump_printf (MSG_NOTE, " ");
2790 }
2791 dump_printf (MSG_NOTE, "\n");
2792 }
2793
2794 /* Check whether the reduction path detected is valid. */
2795 bool fail = path.length () == 0;
2796 bool neg = false;
2797 for (unsigned i = 1; i < path.length (); ++i)
2798 {
2799 gimple *use_stmt = USE_STMT (path[i].second);
2800 tree op = USE_FROM_PTR (path[i].second);
2801 if (! has_single_use (op)
2802 || ! is_gimple_assign (use_stmt))
2803 {
2804 fail = true;
2805 break;
2806 }
2807 if (gimple_assign_rhs_code (use_stmt) != code)
2808 {
2809 if (code == PLUS_EXPR
2810 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2811 {
2812 /* Track whether we negate the reduction value each iteration. */
2813 if (gimple_assign_rhs2 (use_stmt) == op)
2814 neg = ! neg;
2815 }
2816 else
2817 {
2818 fail = true;
2819 break;
2820 }
2821 }
2822 }
2823 return ! fail && ! neg;
2824 }
2825
2826
2827 /* Function vect_is_simple_reduction
2828
2829 (1) Detect a cross-iteration def-use cycle that represents a simple
2830 reduction computation. We look for the following pattern:
2831
2832 loop_header:
2833 a1 = phi < a0, a2 >
2834 a3 = ...
2835 a2 = operation (a3, a1)
2836
2837 or
2838
2839 a3 = ...
2840 loop_header:
2841 a1 = phi < a0, a2 >
2842 a2 = operation (a3, a1)
2843
2844 such that:
2845 1. operation is commutative and associative and it is safe to
2846 change the order of the computation
2847 2. no uses for a2 in the loop (a2 is used out of the loop)
2848 3. no uses of a1 in the loop besides the reduction operation
2849 4. no uses of a1 outside the loop.
2850
2851 Conditions 1,4 are tested here.
2852 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2853
2854 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2855 nested cycles.
2856
2857 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2858 reductions:
2859
2860 a1 = phi < a0, a2 >
2861 inner loop (def of a3)
2862 a2 = phi < a3 >
2863
2864 (4) Detect condition expressions, i.e.:
2865 for (int i = 0; i < N; i++)
2866 if (a[i] < val)
2867 ret_val = a[i];
2868
2869 */
2870
2871 static gimple *
2872 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2873 bool *double_reduc,
2874 bool need_wrapping_integral_overflow,
2875 enum vect_reduction_type *v_reduc_type)
2876 {
2877 struct loop *loop = (gimple_bb (phi))->loop_father;
2878 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2879 gimple *def_stmt, *phi_use_stmt = NULL;
2880 enum tree_code orig_code, code;
2881 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2882 tree type;
2883 int nloop_uses;
2884 tree name;
2885 imm_use_iterator imm_iter;
2886 use_operand_p use_p;
2887 bool phi_def;
2888
2889 *double_reduc = false;
2890 *v_reduc_type = TREE_CODE_REDUCTION;
2891
2892 tree phi_name = PHI_RESULT (phi);
2893 /* ??? If there are no uses of the PHI result the inner loop reduction
2894 won't be detected as possibly double-reduction by vectorizable_reduction
2895 because that tries to walk the PHI arg from the preheader edge which
2896 can be constant. See PR60382. */
2897 if (has_zero_uses (phi_name))
2898 return NULL;
2899 nloop_uses = 0;
2900 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2901 {
2902 gimple *use_stmt = USE_STMT (use_p);
2903 if (is_gimple_debug (use_stmt))
2904 continue;
2905
2906 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2907 {
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2910 "intermediate value used outside loop.\n");
2911
2912 return NULL;
2913 }
2914
2915 nloop_uses++;
2916 if (nloop_uses > 1)
2917 {
2918 if (dump_enabled_p ())
2919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2920 "reduction value used in loop.\n");
2921 return NULL;
2922 }
2923
2924 phi_use_stmt = use_stmt;
2925 }
2926
2927 edge latch_e = loop_latch_edge (loop);
2928 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2929 if (TREE_CODE (loop_arg) != SSA_NAME)
2930 {
2931 if (dump_enabled_p ())
2932 {
2933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2934 "reduction: not ssa_name: ");
2935 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2936 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2937 }
2938 return NULL;
2939 }
2940
2941 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2942 if (is_gimple_assign (def_stmt))
2943 {
2944 name = gimple_assign_lhs (def_stmt);
2945 phi_def = false;
2946 }
2947 else if (gimple_code (def_stmt) == GIMPLE_PHI)
2948 {
2949 name = PHI_RESULT (def_stmt);
2950 phi_def = true;
2951 }
2952 else
2953 {
2954 if (dump_enabled_p ())
2955 {
2956 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2957 "reduction: unhandled reduction operation: ");
2958 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2959 }
2960 return NULL;
2961 }
2962
2963 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2964 return NULL;
2965
2966 nloop_uses = 0;
2967 auto_vec<gphi *, 3> lcphis;
2968 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2969 {
2970 gimple *use_stmt = USE_STMT (use_p);
2971 if (is_gimple_debug (use_stmt))
2972 continue;
2973 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2974 nloop_uses++;
2975 else
2976 /* We can have more than one loop-closed PHI. */
2977 lcphis.safe_push (as_a <gphi *> (use_stmt));
2978 if (nloop_uses > 1)
2979 {
2980 if (dump_enabled_p ())
2981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2982 "reduction used in loop.\n");
2983 return NULL;
2984 }
2985 }
2986
2987 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2988 defined in the inner loop. */
2989 if (phi_def)
2990 {
2991 op1 = PHI_ARG_DEF (def_stmt, 0);
2992
2993 if (gimple_phi_num_args (def_stmt) != 1
2994 || TREE_CODE (op1) != SSA_NAME)
2995 {
2996 if (dump_enabled_p ())
2997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2998 "unsupported phi node definition.\n");
2999
3000 return NULL;
3001 }
3002
3003 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3004 if (gimple_bb (def1)
3005 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3006 && loop->inner
3007 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3008 && is_gimple_assign (def1)
3009 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3010 {
3011 if (dump_enabled_p ())
3012 report_vect_op (MSG_NOTE, def_stmt,
3013 "detected double reduction: ");
3014
3015 *double_reduc = true;
3016 return def_stmt;
3017 }
3018
3019 return NULL;
3020 }
3021
3022 /* If we are vectorizing an inner reduction, we execute it
3023 in the original order only if we are not dealing with a
3024 double reduction. */
3025 bool check_reduction = true;
3026 if (flow_loop_nested_p (vect_loop, loop))
3027 {
3028 gphi *lcphi;
3029 unsigned i;
3030 check_reduction = false;
3031 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3032 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3033 {
3034 gimple *use_stmt = USE_STMT (use_p);
3035 if (is_gimple_debug (use_stmt))
3036 continue;
3037 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3038 check_reduction = true;
3039 }
3040 }
3041
3042 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3043 code = orig_code = gimple_assign_rhs_code (def_stmt);
3044
3045 /* We can handle "res -= x[i]", which is non-associative, by
3046 simply rewriting it into "res += -x[i]". Avoid changing the
3047 gimple instruction for the first simple tests and only do this
3048 if we're allowed to change code at all. */
3049 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3050 code = PLUS_EXPR;
3051
3052 if (code == COND_EXPR)
3053 {
3054 if (! nested_in_vect_loop)
3055 *v_reduc_type = COND_REDUCTION;
3056
3057 op3 = gimple_assign_rhs1 (def_stmt);
3058 if (COMPARISON_CLASS_P (op3))
3059 {
3060 op4 = TREE_OPERAND (op3, 1);
3061 op3 = TREE_OPERAND (op3, 0);
3062 }
3063 if (op3 == phi_name || op4 == phi_name)
3064 {
3065 if (dump_enabled_p ())
3066 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3067 "reduction: condition depends on previous"
3068 " iteration: ");
3069 return NULL;
3070 }
3071
3072 op1 = gimple_assign_rhs2 (def_stmt);
3073 op2 = gimple_assign_rhs3 (def_stmt);
3074 }
3075 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3076 {
3077 if (dump_enabled_p ())
3078 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3079 "reduction: not commutative/associative: ");
3080 return NULL;
3081 }
3082 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3083 {
3084 op1 = gimple_assign_rhs1 (def_stmt);
3085 op2 = gimple_assign_rhs2 (def_stmt);
3086 }
3087 else
3088 {
3089 if (dump_enabled_p ())
3090 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3091 "reduction: not handled operation: ");
3092 return NULL;
3093 }
3094
3095 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3096 {
3097 if (dump_enabled_p ())
3098 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3099 "reduction: both uses not ssa_names: ");
3100
3101 return NULL;
3102 }
3103
3104 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3105 if ((TREE_CODE (op1) == SSA_NAME
3106 && !types_compatible_p (type,TREE_TYPE (op1)))
3107 || (TREE_CODE (op2) == SSA_NAME
3108 && !types_compatible_p (type, TREE_TYPE (op2)))
3109 || (op3 && TREE_CODE (op3) == SSA_NAME
3110 && !types_compatible_p (type, TREE_TYPE (op3)))
3111 || (op4 && TREE_CODE (op4) == SSA_NAME
3112 && !types_compatible_p (type, TREE_TYPE (op4))))
3113 {
3114 if (dump_enabled_p ())
3115 {
3116 dump_printf_loc (MSG_NOTE, vect_location,
3117 "reduction: multiple types: operation type: ");
3118 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3119 dump_printf (MSG_NOTE, ", operands types: ");
3120 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3121 TREE_TYPE (op1));
3122 dump_printf (MSG_NOTE, ",");
3123 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3124 TREE_TYPE (op2));
3125 if (op3)
3126 {
3127 dump_printf (MSG_NOTE, ",");
3128 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3129 TREE_TYPE (op3));
3130 }
3131
3132 if (op4)
3133 {
3134 dump_printf (MSG_NOTE, ",");
3135 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3136 TREE_TYPE (op4));
3137 }
3138 dump_printf (MSG_NOTE, "\n");
3139 }
3140
3141 return NULL;
3142 }
3143
3144 /* Check whether it's ok to change the order of the computation.
3145 Generally, when vectorizing a reduction we change the order of the
3146 computation. This may change the behavior of the program in some
3147 cases, so we need to check that this is ok. One exception is when
3148 vectorizing an outer-loop: the inner-loop is executed sequentially,
3149 and therefore vectorizing reductions in the inner-loop during
3150 outer-loop vectorization is safe. */
3151 if (check_reduction
3152 && *v_reduc_type == TREE_CODE_REDUCTION
3153 && needs_fold_left_reduction_p (type, code,
3154 need_wrapping_integral_overflow))
3155 *v_reduc_type = FOLD_LEFT_REDUCTION;
3156
3157 /* Reduction is safe. We're dealing with one of the following:
3158 1) integer arithmetic and no trapv
3159 2) floating point arithmetic, and special flags permit this optimization
3160 3) nested cycle (i.e., outer loop vectorization). */
3161 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3162 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3163 if (code != COND_EXPR && !def1_info && !def2_info)
3164 {
3165 if (dump_enabled_p ())
3166 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3167 return NULL;
3168 }
3169
3170 /* Check that one def is the reduction def, defined by PHI,
3171 the other def is either defined in the loop ("vect_internal_def"),
3172 or it's an induction (defined by a loop-header phi-node). */
3173
3174 if (def2_info
3175 && def2_info->stmt == phi
3176 && (code == COND_EXPR
3177 || !def1_info
3178 || vect_valid_reduction_input_p (def1_info)))
3179 {
3180 if (dump_enabled_p ())
3181 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3182 return def_stmt;
3183 }
3184
3185 if (def1_info
3186 && def1_info->stmt == phi
3187 && (code == COND_EXPR
3188 || !def2_info
3189 || vect_valid_reduction_input_p (def2_info)))
3190 {
3191 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3192 {
3193 /* Check if we can swap operands (just for simplicity - so that
3194 the rest of the code can assume that the reduction variable
3195 is always the last (second) argument). */
3196 if (code == COND_EXPR)
3197 {
3198 /* Swap cond_expr by inverting the condition. */
3199 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3200 enum tree_code invert_code = ERROR_MARK;
3201 enum tree_code cond_code = TREE_CODE (cond_expr);
3202
3203 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3204 {
3205 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3206 invert_code = invert_tree_comparison (cond_code, honor_nans);
3207 }
3208 if (invert_code != ERROR_MARK)
3209 {
3210 TREE_SET_CODE (cond_expr, invert_code);
3211 swap_ssa_operands (def_stmt,
3212 gimple_assign_rhs2_ptr (def_stmt),
3213 gimple_assign_rhs3_ptr (def_stmt));
3214 }
3215 else
3216 {
3217 if (dump_enabled_p ())
3218 report_vect_op (MSG_NOTE, def_stmt,
3219 "detected reduction: cannot swap operands "
3220 "for cond_expr");
3221 return NULL;
3222 }
3223 }
3224 else
3225 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3226 gimple_assign_rhs2_ptr (def_stmt));
3227
3228 if (dump_enabled_p ())
3229 report_vect_op (MSG_NOTE, def_stmt,
3230 "detected reduction: need to swap operands: ");
3231
3232 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3233 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3234 }
3235 else
3236 {
3237 if (dump_enabled_p ())
3238 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3239 }
3240
3241 return def_stmt;
3242 }
3243
3244 /* Try to find SLP reduction chain. */
3245 if (! nested_in_vect_loop
3246 && code != COND_EXPR
3247 && orig_code != MINUS_EXPR
3248 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3249 {
3250 if (dump_enabled_p ())
3251 report_vect_op (MSG_NOTE, def_stmt,
3252 "reduction: detected reduction chain: ");
3253
3254 return def_stmt;
3255 }
3256
3257 /* Dissolve a group possibly half-built by vect_is_slp_reduction. */
3258 gimple *first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3259 while (first)
3260 {
3261 gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3262 REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3263 REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3264 first = next;
3265 }
3266
3267 /* Look for the expression computing loop_arg from loop PHI result. */
3268 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3269 code))
3270 return def_stmt;
3271
3272 if (dump_enabled_p ())
3273 {
3274 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3275 "reduction: unknown pattern: ");
3276 }
3277
3278 return NULL;
3279 }
3280
3281 /* Wrapper around vect_is_simple_reduction, which will modify code
3282 in-place if it enables detection of more reductions. The arguments
3283 are the same as for vect_is_simple_reduction. */
3284
3285 gimple *
3286 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3287 bool *double_reduc,
3288 bool need_wrapping_integral_overflow)
3289 {
3290 enum vect_reduction_type v_reduc_type;
3291 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3292 need_wrapping_integral_overflow,
3293 &v_reduc_type);
3294 if (def)
3295 {
3296 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3297 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3298 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3299 reduc_def_info = vinfo_for_stmt (def);
3300 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3301 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3302 }
3303 return def;
3304 }
3305
3306 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
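/* Worked example with made-up numbers: for a known iteration count of 100,
   an assumed VF of 4 and PEEL_ITERS_PROLOGUE == 3, the epilogue gets
   (100 - 3) % 4 == 1 iteration, and each entry of the scalar cost vector is
   charged 3 times into *PROLOGUE_COST_VEC and once into
   *EPILOGUE_COST_VEC.  */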
3307 int
3308 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3309 int *peel_iters_epilogue,
3310 stmt_vector_for_cost *scalar_cost_vec,
3311 stmt_vector_for_cost *prologue_cost_vec,
3312 stmt_vector_for_cost *epilogue_cost_vec)
3313 {
3314 int retval = 0;
3315 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3316
3317 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3318 {
3319 *peel_iters_epilogue = assumed_vf / 2;
3320 if (dump_enabled_p ())
3321 dump_printf_loc (MSG_NOTE, vect_location,
3322 "cost model: epilogue peel iters set to vf/2 "
3323 "because loop iterations are unknown.\n");
3324
3325 /* If peeled iterations are known but the number of scalar loop
3326 iterations is unknown, count a taken branch per peeled loop. */
3327 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3328 NULL, 0, vect_prologue);
3329 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3330 NULL, 0, vect_epilogue);
3331 }
3332 else
3333 {
3334 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3335 peel_iters_prologue = niters < peel_iters_prologue ?
3336 niters : peel_iters_prologue;
3337 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3338 /* If we need to peel for gaps but the computed epilogue peel count
3339 is zero, we have to peel VF iterations instead. */
3340 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3341 *peel_iters_epilogue = assumed_vf;
3342 }
3343
3344 stmt_info_for_cost *si;
3345 int j;
3346 if (peel_iters_prologue)
3347 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3348 {
3349 stmt_vec_info stmt_info
3350 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3351 retval += record_stmt_cost (prologue_cost_vec,
3352 si->count * peel_iters_prologue,
3353 si->kind, stmt_info, si->misalign,
3354 vect_prologue);
3355 }
3356 if (*peel_iters_epilogue)
3357 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3358 {
3359 stmt_vec_info stmt_info
3360 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3361 retval += record_stmt_cost (epilogue_cost_vec,
3362 si->count * *peel_iters_epilogue,
3363 si->kind, stmt_info, si->misalign,
3364 vect_epilogue);
3365 }
3366
3367 return retval;
3368 }
3369
3370 /* Function vect_estimate_min_profitable_iters
3371
3372 Return the number of iterations required for the vector version of the
3373 loop to be profitable relative to the cost of the scalar version of the
3374 loop.
3375
3376 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3377 of iterations for vectorization. -1 value means loop vectorization
3378 is not profitable. This returned value may be used for dynamic
3379 profitability check.
3380
3381 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3382 for static check against estimated number of iterations. */
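/* Informally (a hedged sketch of the model, not the exact formula used
   below): vectorization wins once

     SIC * niters > VOC + VIC * (niters / vf)

   where SIC is the scalar single-iteration cost, VIC and VOC the vector
   inside and outside costs and vf the vectorization factor, so the returned
   thresholds are roughly the smallest niters for which that inequality
   holds.  */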
3383
3384 static void
3385 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3386 int *ret_min_profitable_niters,
3387 int *ret_min_profitable_estimate)
3388 {
3389 int min_profitable_iters;
3390 int min_profitable_estimate;
3391 int peel_iters_prologue;
3392 int peel_iters_epilogue;
3393 unsigned vec_inside_cost = 0;
3394 int vec_outside_cost = 0;
3395 unsigned vec_prologue_cost = 0;
3396 unsigned vec_epilogue_cost = 0;
3397 int scalar_single_iter_cost = 0;
3398 int scalar_outside_cost = 0;
3399 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3400 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3401 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3402
3403 /* Cost model disabled. */
3404 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3405 {
3406 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3407 *ret_min_profitable_niters = 0;
3408 *ret_min_profitable_estimate = 0;
3409 return;
3410 }
3411
3412 /* Requires loop versioning tests to handle misalignment. */
3413 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3414 {
3415 /* FIXME: Make cost depend on complexity of individual check. */
3416 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3417 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3418 vect_prologue);
3419 dump_printf (MSG_NOTE,
3420 "cost model: Adding cost of checks for loop "
3421 "versioning to treat misalignment.\n");
3422 }
3423
3424 /* Requires loop versioning with alias checks. */
3425 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3426 {
3427 /* FIXME: Make cost depend on complexity of individual check. */
3428 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3429 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3430 vect_prologue);
3431 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3432 if (len)
3433 /* Count LEN - 1 ANDs and LEN comparisons. */
3434 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3435 NULL, 0, vect_prologue);
3436 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3437 if (len)
3438 {
3439 /* Count LEN - 1 ANDs and LEN comparisons. */
3440 unsigned int nstmts = len * 2 - 1;
3441 /* +1 for each bias that needs adding. */
3442 for (unsigned int i = 0; i < len; ++i)
3443 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3444 nstmts += 1;
3445 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3446 NULL, 0, vect_prologue);
3447 }
3448 dump_printf (MSG_NOTE,
3449 "cost model: Adding cost of checks for loop "
3450 "versioning aliasing.\n");
3451 }
3452
3453 /* Requires loop versioning with niter checks. */
3454 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3455 {
3456 /* FIXME: Make cost depend on complexity of individual check. */
3457 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3458 vect_prologue);
3459 dump_printf (MSG_NOTE,
3460 "cost model: Adding cost of checks for loop "
3461 "versioning niters.\n");
3462 }
3463
3464 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3465 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3466 vect_prologue);
3467
3468 /* Count statements in scalar loop. Using this as scalar cost for a single
3469 iteration for now.
3470
3471 TODO: Add outer loop support.
3472
3473 TODO: Consider assigning different costs to different scalar
3474 statements. */
3475
3476 scalar_single_iter_cost
3477 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3478
3479 /* Add additional cost for the peeled instructions in prologue and epilogue
3480 loop. (For fully-masked loops there will be no peeling.)
3481
3482 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3483 at compile time, we assume it's vf/2 (the worst case would be vf-1).
3484
3485 TODO: Build an expression that represents peel_iters for prologue and
3486 epilogue to be used in a run-time test. */
3487
3488 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3489 {
3490 peel_iters_prologue = 0;
3491 peel_iters_epilogue = 0;
3492
3493 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3494 {
3495 /* We need to peel exactly one iteration. */
3496 peel_iters_epilogue += 1;
3497 stmt_info_for_cost *si;
3498 int j;
3499 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3500 j, si)
3501 {
3502 struct _stmt_vec_info *stmt_info
3503 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3504 (void) add_stmt_cost (target_cost_data, si->count,
3505 si->kind, stmt_info, si->misalign,
3506 vect_epilogue);
3507 }
3508 }
3509 }
3510 else if (npeel < 0)
3511 {
3512 peel_iters_prologue = assumed_vf / 2;
3513 dump_printf (MSG_NOTE, "cost model: "
3514 "prologue peel iters set to vf/2.\n");
3515
3516 /* If peeling for alignment is unknown, the loop bound of the main loop
3517 becomes unknown. */
3518 peel_iters_epilogue = assumed_vf / 2;
3519 dump_printf (MSG_NOTE, "cost model: "
3520 "epilogue peel iters set to vf/2 because "
3521 "peeling for alignment is unknown.\n");
3522
3523 /* If peeled iterations are unknown, count a taken branch and a not taken
3524 branch per peeled loop. Even if scalar loop iterations are known,
3525 vector iterations are not known since peeled prologue iterations are
3526 not known. Hence guards remain the same. */
3527 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3528 NULL, 0, vect_prologue);
3529 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3530 NULL, 0, vect_prologue);
3531 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3532 NULL, 0, vect_epilogue);
3533 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3534 NULL, 0, vect_epilogue);
3535 stmt_info_for_cost *si;
3536 int j;
3537 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3538 {
3539 struct _stmt_vec_info *stmt_info
3540 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3541 (void) add_stmt_cost (target_cost_data,
3542 si->count * peel_iters_prologue,
3543 si->kind, stmt_info, si->misalign,
3544 vect_prologue);
3545 (void) add_stmt_cost (target_cost_data,
3546 si->count * peel_iters_epilogue,
3547 si->kind, stmt_info, si->misalign,
3548 vect_epilogue);
3549 }
3550 }
3551 else
3552 {
3553 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3554 stmt_info_for_cost *si;
3555 int j;
3556 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3557
3558 prologue_cost_vec.create (2);
3559 epilogue_cost_vec.create (2);
3560 peel_iters_prologue = npeel;
3561
3562 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3563 &peel_iters_epilogue,
3564 &LOOP_VINFO_SCALAR_ITERATION_COST
3565 (loop_vinfo),
3566 &prologue_cost_vec,
3567 &epilogue_cost_vec);
3568
3569 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3570 {
3571 struct _stmt_vec_info *stmt_info
3572 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3573 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3574 si->misalign, vect_prologue);
3575 }
3576
3577 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3578 {
3579 struct _stmt_vec_info *stmt_info
3580 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL_STMT_VEC_INFO;
3581 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3582 si->misalign, vect_epilogue);
3583 }
3584
3585 prologue_cost_vec.release ();
3586 epilogue_cost_vec.release ();
3587 }
3588
3589 /* FORNOW: The scalar outside cost is incremented in one of the
3590 following ways:
3591
3592 1. The vectorizer checks for alignment and aliasing and generates
3593 a condition that allows dynamic vectorization. A cost model
3594 check is ANDED with the versioning condition. Hence scalar code
3595 path now has the added cost of the versioning check.
3596
3597 if (cost > th & versioning_check)
3598 jmp to vector code
3599
3600 Hence run-time scalar is incremented by not-taken branch cost.
3601
3602 2. The vectorizer then checks if a prologue is required. If the
3603 cost model check was not done before during versioning, it has to
3604 be done before the prologue check.
3605
3606 if (cost <= th)
3607 prologue = scalar_iters
3608 if (prologue == 0)
3609 jmp to vector code
3610 else
3611 execute prologue
3612 if (prologue == num_iters)
3613 go to exit
3614
3615 Hence the run-time scalar cost is incremented by a taken branch,
3616 plus a not-taken branch, plus a taken branch cost.
3617
3618 3. The vectorizer then checks if an epilogue is required. If the
3619 cost model check was not done before during prologue check, it
3620 has to be done with the epilogue check.
3621
3622 if (prologue == 0)
3623 jmp to vector code
3624 else
3625 execute prologue
3626 if (prologue == num_iters)
3627 go to exit
3628 vector code:
3629 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3630 jmp to epilogue
3631
3632 Hence the run-time scalar cost should be incremented by 2 taken
3633 branches.
3634
3635 TODO: The back end may reorder the BBs differently and reverse
3636 conditions/branch directions. Change the estimates below to
3637 something more reasonable. */
3638
3639 /* If the number of iterations is known and we do not do versioning, we can
3640 decide whether to vectorize at compile time. Hence the scalar version
3641 does not carry cost model guard costs. */
3642 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3643 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3644 {
3645 /* Cost model check occurs at versioning. */
3646 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3647 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3648 else
3649 {
3650 /* Cost model check occurs at prologue generation. */
3651 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3652 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3653 + vect_get_stmt_cost (cond_branch_not_taken);
3654 /* Cost model check occurs at epilogue generation. */
3655 else
3656 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3657 }
3658 }
3659
3660 /* Complete the target-specific cost calculations. */
3661 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3662 &vec_inside_cost, &vec_epilogue_cost);
3663
3664 vec_outside_cost = (int) (vec_prologue_cost + vec_epilogue_cost);
3665
3666 if (dump_enabled_p ())
3667 {
3668 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3669 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3670 vec_inside_cost);
3671 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3672 vec_prologue_cost);
3673 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3674 vec_epilogue_cost);
3675 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3676 scalar_single_iter_cost);
3677 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3678 scalar_outside_cost);
3679 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3680 vec_outside_cost);
3681 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3682 peel_iters_prologue);
3683 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3684 peel_iters_epilogue);
3685 }
3686
3687 /* Calculate number of iterations required to make the vector version
3688 profitable, relative to the loop bodies only. The following condition
3689 must hold true:
3690 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3691 where
3692 SIC = scalar iteration cost, VIC = vector iteration cost,
3693 VOC = vector outside cost, VF = vectorization factor,
3694 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
3695 SOC = scalar outside cost for run time cost model check. */
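  /* Worked example (purely illustrative numbers, not taken from any target's
     cost model): with SIC = 1, VIC = 2, VOC = 14, SOC = 0, VF = 4 and no
     peeling, the condition above reduces to SIC * niters > VIC * (niters / VF)
     + VOC, which first holds at niters = 29.  The code below computes
     (14 - 0) * 4 / (1 * 4 - 2) = 28 and the equality check then bumps it
     to 29.  */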
3696
3697 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3698 {
3699 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3700 * assumed_vf
3701 - vec_inside_cost * peel_iters_prologue
3702 - vec_inside_cost * peel_iters_epilogue);
3703 if (min_profitable_iters <= 0)
3704 min_profitable_iters = 0;
3705 else
3706 {
3707 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3708 - vec_inside_cost);
3709
3710 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3711 <= (((int) vec_inside_cost * min_profitable_iters)
3712 + (((int) vec_outside_cost - scalar_outside_cost)
3713 * assumed_vf)))
3714 min_profitable_iters++;
3715 }
3716 }
3717 /* vector version will never be profitable. */
3718 else
3719 {
3720 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3721 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3722 "vectorization did not happen for a simd loop");
3723
3724 if (dump_enabled_p ())
3725 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3726 "cost model: the vector iteration cost = %d "
3727 "divided by the scalar iteration cost = %d "
3728 "is greater than or equal to the vectorization factor = %d"
3729 ".\n",
3730 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3731 *ret_min_profitable_niters = -1;
3732 *ret_min_profitable_estimate = -1;
3733 return;
3734 }
3735
3736 dump_printf (MSG_NOTE,
3737 " Calculated minimum iters for profitability: %d\n",
3738 min_profitable_iters);
3739
3740 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3741 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3742 /* We want the vectorized loop to execute at least once. */
3743 min_profitable_iters = assumed_vf + peel_iters_prologue;
3744
3745 if (dump_enabled_p ())
3746 dump_printf_loc (MSG_NOTE, vect_location,
3747 " Runtime profitability threshold = %d\n",
3748 min_profitable_iters);
3749
3750 *ret_min_profitable_niters = min_profitable_iters;
3751
3752 /* Calculate number of iterations required to make the vector version
3753 profitable, relative to the loop bodies only.
3754
3755 The non-vectorized variant costs SIC * niters and it must win over the vector
3756 variant on the expected loop trip count.  The following condition must hold:
3757 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
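  /* Illustrative only: with the same example numbers as above (SIC = 1,
     VIC = 2, VOC = 14, SOC = 0, VF = 4, no peeling) the division below gives
     (14 + 0) * 4 / (1 * 4 - 2) = 28, which the MAX against the runtime
     threshold then raises to 29.  */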
3758
3759 if (vec_outside_cost <= 0)
3760 min_profitable_estimate = 0;
3761 else
3762 {
3763 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3764 * assumed_vf
3765 - vec_inside_cost * peel_iters_prologue
3766 - vec_inside_cost * peel_iters_epilogue)
3767 / ((scalar_single_iter_cost * assumed_vf)
3768 - vec_inside_cost);
3769 }
3770 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3771 if (dump_enabled_p ())
3772 dump_printf_loc (MSG_NOTE, vect_location,
3773 " Static estimate profitability threshold = %d\n",
3774 min_profitable_estimate);
3775
3776 *ret_min_profitable_estimate = min_profitable_estimate;
3777 }
3778
3779 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3780 vector elements (not bits) for a vector with NELT elements. */
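/* Illustrative example: for OFFSET 2 the loop below pushes {2, 3, 4}; the
   single stepped encoding extends this to the full series {2, 3, ..., NELT + 1},
   where (by the usual two-input VEC_PERM semantics) indices of NELT and above
   select elements from the second input vector.  */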
3781 static void
3782 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3783 vec_perm_builder *sel)
3784 {
3785 /* The encoding is a single stepped pattern. Any wrap-around is handled
3786 by vec_perm_indices. */
3787 sel->new_vector (nelt, 1, 3);
3788 for (unsigned int i = 0; i < 3; i++)
3789 sel->quick_push (i + offset);
3790 }
3791
3792 /* Checks whether the target supports whole-vector shifts for vectors of mode
3793 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3794 it supports vec_perm_const with masks for all necessary shift amounts. */
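/* For example, for an 8-element vector the fallback loop below checks that
   element shifts by 4, 2 and 1 are all supported as vec_perm_const masks.  */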
3795 static bool
3796 have_whole_vector_shift (machine_mode mode)
3797 {
3798 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3799 return true;
3800
3801 /* Variable-length vectors should be handled via the optab. */
3802 unsigned int nelt;
3803 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3804 return false;
3805
3806 vec_perm_builder sel;
3807 vec_perm_indices indices;
3808 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3809 {
3810 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3811 indices.new_vector (sel, 2, nelt);
3812 if (!can_vec_perm_const_p (mode, indices, false))
3813 return false;
3814 }
3815 return true;
3816 }
3817
3818 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3819 functions. Design better to avoid maintenance issues. */
3820
3821 /* Function vect_model_reduction_cost.
3822
3823 Models cost for a reduction operation, including the vector ops
3824 generated within the strip-mine loop, the initial definition before
3825 the loop, and the epilogue code that must be generated. */
3826
3827 static void
3828 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3829 int ncopies, stmt_vector_for_cost *cost_vec)
3830 {
3831 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3832 enum tree_code code;
3833 optab optab;
3834 tree vectype;
3835 machine_mode mode;
3836 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3837 struct loop *loop = NULL;
3838
3839 if (loop_vinfo)
3840 loop = LOOP_VINFO_LOOP (loop_vinfo);
3841
3842 /* Condition reductions generate two reductions in the loop. */
3843 vect_reduction_type reduction_type
3844 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3845 if (reduction_type == COND_REDUCTION)
3846 ncopies *= 2;
3847
3848 vectype = STMT_VINFO_VECTYPE (stmt_info);
3849 mode = TYPE_MODE (vectype);
3850 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3851
3852 if (!orig_stmt_info)
3853 orig_stmt_info = stmt_info;
3854
3855 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3856
3857 if (reduction_type == EXTRACT_LAST_REDUCTION
3858 || reduction_type == FOLD_LEFT_REDUCTION)
3859 {
3860 /* No extra instructions needed in the prologue. */
3861 prologue_cost = 0;
3862
3863 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3864 /* Count one reduction-like operation per vector. */
3865 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3866 stmt_info, 0, vect_body);
3867 else
3868 {
3869 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3870 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3871 inside_cost = record_stmt_cost (cost_vec, nelements,
3872 vec_to_scalar, stmt_info, 0,
3873 vect_body);
3874 inside_cost += record_stmt_cost (cost_vec, nelements,
3875 scalar_stmt, stmt_info, 0,
3876 vect_body);
3877 }
3878 }
3879 else
3880 {
3881 /* Add in cost for initial definition.
3882 For cond reduction we have four vectors: initial index, step,
3883 initial result of the data reduction, initial value of the index
3884 reduction. */
3885 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3886 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3887 scalar_to_vec, stmt_info, 0,
3888 vect_prologue);
3889
3890 /* Cost of reduction op inside loop. */
3891 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3892 stmt_info, 0, vect_body);
3893 }
3894
3895 /* Determine cost of epilogue code.
3896
3897 We have a reduction operator that will reduce the vector in one statement.
3898 It also requires a scalar extract. */
3899
3900 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3901 {
3902 if (reduc_fn != IFN_LAST)
3903 {
3904 if (reduction_type == COND_REDUCTION)
3905 {
3906 /* An EQ stmt and a COND_EXPR stmt. */
3907 epilogue_cost += record_stmt_cost (cost_vec, 2,
3908 vector_stmt, stmt_info, 0,
3909 vect_epilogue);
3910 /* Reduction of the max index and a reduction of the found
3911 values. */
3912 epilogue_cost += record_stmt_cost (cost_vec, 2,
3913 vec_to_scalar, stmt_info, 0,
3914 vect_epilogue);
3915 /* A broadcast of the max value. */
3916 epilogue_cost += record_stmt_cost (cost_vec, 1,
3917 scalar_to_vec, stmt_info, 0,
3918 vect_epilogue);
3919 }
3920 else
3921 {
3922 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3923 stmt_info, 0, vect_epilogue);
3924 epilogue_cost += record_stmt_cost (cost_vec, 1,
3925 vec_to_scalar, stmt_info, 0,
3926 vect_epilogue);
3927 }
3928 }
3929 else if (reduction_type == COND_REDUCTION)
3930 {
3931 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3932 /* Extraction of scalar elements. */
3933 epilogue_cost += record_stmt_cost (cost_vec,
3934 2 * estimated_nunits,
3935 vec_to_scalar, stmt_info, 0,
3936 vect_epilogue);
3937 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3938 epilogue_cost += record_stmt_cost (cost_vec,
3939 2 * estimated_nunits - 3,
3940 scalar_stmt, stmt_info, 0,
3941 vect_epilogue);
3942 }
3943 else if (reduction_type == EXTRACT_LAST_REDUCTION
3944 || reduction_type == FOLD_LEFT_REDUCTION)
3945 /* No extra instructions needed in the epilogue. */
3946 ;
3947 else
3948 {
3949 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3950 tree bitsize
3951 = TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3952 int element_bitsize = tree_to_uhwi (bitsize);
3953 int nelements = vec_size_in_bits / element_bitsize;
3954
3955 if (code == COND_EXPR)
3956 code = MAX_EXPR;
3957
3958 optab = optab_for_tree_code (code, vectype, optab_default);
3959
3960 /* We have a whole vector shift available. */
3961 if (optab != unknown_optab
3962 && VECTOR_MODE_P (mode)
3963 && optab_handler (optab, mode) != CODE_FOR_nothing
3964 && have_whole_vector_shift (mode))
3965 {
3966 /* Final reduction via vector shifts and the reduction operator.
3967 Also requires scalar extract. */
3968 epilogue_cost += record_stmt_cost (cost_vec,
3969 exact_log2 (nelements) * 2,
3970 vector_stmt, stmt_info, 0,
3971 vect_epilogue);
3972 epilogue_cost += record_stmt_cost (cost_vec, 1,
3973 vec_to_scalar, stmt_info, 0,
3974 vect_epilogue);
3975 }
3976 else
3977 /* Use extracts and reduction op for final reduction. For N
3978 elements, we have N extracts and N-1 reduction ops. */
3979 epilogue_cost += record_stmt_cost (cost_vec,
3980 nelements + nelements - 1,
3981 vector_stmt, stmt_info, 0,
3982 vect_epilogue);
3983 }
3984 }
3985
3986 if (dump_enabled_p ())
3987 dump_printf (MSG_NOTE,
3988 "vect_model_reduction_cost: inside_cost = %d, "
3989 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3990 prologue_cost, epilogue_cost);
3991 }
3992
3993
3994 /* Function vect_model_induction_cost.
3995
3996 Models cost for induction operations. */
3997
3998 static void
3999 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4000 stmt_vector_for_cost *cost_vec)
4001 {
4002 unsigned inside_cost, prologue_cost;
4003
4004 if (PURE_SLP_STMT (stmt_info))
4005 return;
4006
4007 /* loop cost for vec_loop. */
4008 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4009 stmt_info, 0, vect_body);
4010
4011 /* prologue cost for vec_init and vec_step. */
4012 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4013 stmt_info, 0, vect_prologue);
4014
4015 if (dump_enabled_p ())
4016 dump_printf_loc (MSG_NOTE, vect_location,
4017 "vect_model_induction_cost: inside_cost = %d, "
4018 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4019 }
4020
4021
4022
4023 /* Function get_initial_def_for_reduction
4024
4025 Input:
4026 STMT - a stmt that performs a reduction operation in the loop.
4027 INIT_VAL - the initial value of the reduction variable
4028
4029 Output:
4030 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4031 of the reduction (used for adjusting the epilog - see below).
4032 Return a vector variable, initialized according to the operation that STMT
4033 performs. This vector will be used as the initial value of the
4034 vector of partial results.
4035
4036 Option1 (adjust in epilog): Initialize the vector as follows:
4037 add/bit or/xor: [0,0,...,0,0]
4038 mult/bit and: [1,1,...,1,1]
4039 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4040 and when necessary (e.g. add/mult case) let the caller know
4041 that it needs to adjust the result by init_val.
4042
4043 Option2: Initialize the vector as follows:
4044 add/bit or/xor: [init_val,0,0,...,0]
4045 mult/bit and: [init_val,1,1,...,1]
4046 min/max/cond_expr: [init_val,init_val,...,init_val]
4047 and no adjustments are needed.
4048
4049 For example, for the following code:
4050
4051 s = init_val;
4052 for (i=0;i<n;i++)
4053 s = s + a[i];
4054
4055 STMT is 's = s + a[i]', and the reduction variable is 's'.
4056 For a vector of 4 units, we want to return either [0,0,0,init_val],
4057 or [0,0,0,0] and let the caller know that it needs to adjust
4058 the result at the end by 'init_val'.
4059
4060 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4061 is not NULL, because this way the initialization vector is simpler (same
4062 element in all entries), and Option2 otherwise.
4063
4064 A cost model should help decide between these two schemes. */
4065
4066 tree
4067 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4068 tree *adjustment_def)
4069 {
4070 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4071 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4072 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4073 tree scalar_type = TREE_TYPE (init_val);
4074 tree vectype = get_vectype_for_scalar_type (scalar_type);
4075 enum tree_code code = gimple_assign_rhs_code (stmt);
4076 tree def_for_init;
4077 tree init_def;
4078 REAL_VALUE_TYPE real_init_val = dconst0;
4079 int int_init_val = 0;
4080 gimple_seq stmts = NULL;
4081
4082 gcc_assert (vectype);
4083
4084 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4085 || SCALAR_FLOAT_TYPE_P (scalar_type));
4086
4087 gcc_assert (nested_in_vect_loop_p (loop, stmt)
4088 || loop == (gimple_bb (stmt))->loop_father);
4089
4090 vect_reduction_type reduction_type
4091 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4092
4093 switch (code)
4094 {
4095 case WIDEN_SUM_EXPR:
4096 case DOT_PROD_EXPR:
4097 case SAD_EXPR:
4098 case PLUS_EXPR:
4099 case MINUS_EXPR:
4100 case BIT_IOR_EXPR:
4101 case BIT_XOR_EXPR:
4102 case MULT_EXPR:
4103 case BIT_AND_EXPR:
4104 {
4105 /* ADJUSTMENT_DEF is NULL when called from
4106 vect_create_epilog_for_reduction to vectorize double reduction. */
4107 if (adjustment_def)
4108 *adjustment_def = init_val;
4109
4110 if (code == MULT_EXPR)
4111 {
4112 real_init_val = dconst1;
4113 int_init_val = 1;
4114 }
4115
4116 if (code == BIT_AND_EXPR)
4117 int_init_val = -1;
4118
4119 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4120 def_for_init = build_real (scalar_type, real_init_val);
4121 else
4122 def_for_init = build_int_cst (scalar_type, int_init_val);
4123
4124 if (adjustment_def)
4125 /* Option1: the first element is '0' or '1' as well. */
4126 init_def = gimple_build_vector_from_val (&stmts, vectype,
4127 def_for_init);
4128 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4129 {
4130 /* Option2 (variable length): the first element is INIT_VAL. */
4131 init_def = gimple_build_vector_from_val (&stmts, vectype,
4132 def_for_init);
4133 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4134 vectype, init_def, init_val);
4135 }
4136 else
4137 {
4138 /* Option2: the first element is INIT_VAL. */
4139 tree_vector_builder elts (vectype, 1, 2);
4140 elts.quick_push (init_val);
4141 elts.quick_push (def_for_init);
4142 init_def = gimple_build_vector (&stmts, &elts);
4143 }
4144 }
4145 break;
4146
4147 case MIN_EXPR:
4148 case MAX_EXPR:
4149 case COND_EXPR:
4150 {
4151 if (adjustment_def)
4152 {
4153 *adjustment_def = NULL_TREE;
4154 if (reduction_type != COND_REDUCTION
4155 && reduction_type != EXTRACT_LAST_REDUCTION)
4156 {
4157 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4158 break;
4159 }
4160 }
4161 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4162 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4163 }
4164 break;
4165
4166 default:
4167 gcc_unreachable ();
4168 }
4169
4170 if (stmts)
4171 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4172 return init_def;
4173 }
4174
4175 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4176 NUMBER_OF_VECTORS is the number of vector defs to create.
4177 If NEUTRAL_OP is nonnull, introducing extra elements of that
4178 value will not change the result. */
4179
4180 static void
4181 get_initial_defs_for_reduction (slp_tree slp_node,
4182 vec<tree> *vec_oprnds,
4183 unsigned int number_of_vectors,
4184 bool reduc_chain, tree neutral_op)
4185 {
4186 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4187 gimple *stmt = stmts[0];
4188 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4189 unsigned HOST_WIDE_INT nunits;
4190 unsigned j, number_of_places_left_in_vector;
4191 tree vector_type;
4192 tree vop;
4193 int group_size = stmts.length ();
4194 unsigned int vec_num, i;
4195 unsigned number_of_copies = 1;
4196 vec<tree> voprnds;
4197 voprnds.create (number_of_vectors);
4198 struct loop *loop;
4199 auto_vec<tree, 16> permute_results;
4200
4201 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4202
4203 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4204
4205 loop = (gimple_bb (stmt))->loop_father;
4206 gcc_assert (loop);
4207 edge pe = loop_preheader_edge (loop);
4208
4209 gcc_assert (!reduc_chain || neutral_op);
4210
4211 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4212 created vectors. It is greater than 1 if unrolling is performed.
4213
4214 For example, we have two scalar operands, s1 and s2 (e.g., group of
4215 strided accesses of size two), while NUNITS is four (i.e., four scalars
4216 of this type can be packed in a vector). The output vector will contain
4217 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4218 will be 2).
4219
4220 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4221 vectors containing the operands.
4222
4223 For example, NUNITS is four as before, and the group size is 8
4224 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4225 {s5, s6, s7, s8}. */
4226
4227 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4228 nunits = group_size;
4229
4230 number_of_copies = nunits * number_of_vectors / group_size;
4231
4232 number_of_places_left_in_vector = nunits;
4233 bool constant_p = true;
4234 tree_vector_builder elts (vector_type, nunits, 1);
4235 elts.quick_grow (nunits);
4236 for (j = 0; j < number_of_copies; j++)
4237 {
4238 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4239 {
4240 tree op;
4241 /* Get the def before the loop. In reduction chain we have only
4242 one initial value. */
4243 if ((j != (number_of_copies - 1)
4244 || (reduc_chain && i != 0))
4245 && neutral_op)
4246 op = neutral_op;
4247 else
4248 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4249
4250 /* Create 'vect_ = {op0,op1,...,opn}'. */
4251 number_of_places_left_in_vector--;
4252 elts[number_of_places_left_in_vector] = op;
4253 if (!CONSTANT_CLASS_P (op))
4254 constant_p = false;
4255
4256 if (number_of_places_left_in_vector == 0)
4257 {
4258 gimple_seq ctor_seq = NULL;
4259 tree init;
4260 if (constant_p && !neutral_op
4261 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4262 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4263 /* Build the vector directly from ELTS. */
4264 init = gimple_build_vector (&ctor_seq, &elts);
4265 else if (neutral_op)
4266 {
4267 /* Build a vector of the neutral value and shift the
4268 other elements into place. */
4269 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4270 neutral_op);
4271 int k = nunits;
4272 while (k > 0 && elts[k - 1] == neutral_op)
4273 k -= 1;
4274 while (k > 0)
4275 {
4276 k -= 1;
4277 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4278 vector_type, init, elts[k]);
4279 }
4280 }
4281 else
4282 {
4283 /* First time round, duplicate ELTS to fill the
4284 required number of vectors, then cherry pick the
4285 appropriate result for each iteration. */
4286 if (vec_oprnds->is_empty ())
4287 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4288 number_of_vectors,
4289 permute_results);
4290 init = permute_results[number_of_vectors - j - 1];
4291 }
4292 if (ctor_seq != NULL)
4293 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4294 voprnds.quick_push (init);
4295
4296 number_of_places_left_in_vector = nunits;
4297 elts.new_vector (vector_type, nunits, 1);
4298 elts.quick_grow (nunits);
4299 constant_p = true;
4300 }
4301 }
4302 }
4303
4304 /* Since the vectors are created in the reverse order, we should invert
4305 them. */
4306 vec_num = voprnds.length ();
4307 for (j = vec_num; j != 0; j--)
4308 {
4309 vop = voprnds[j - 1];
4310 vec_oprnds->quick_push (vop);
4311 }
4312
4313 voprnds.release ();
4314
4315 /* In case that VF is greater than the unrolling factor needed for the SLP
4316 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4317 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4318 to replicate the vectors. */
4319 tree neutral_vec = NULL;
4320 while (number_of_vectors > vec_oprnds->length ())
4321 {
4322 if (neutral_op)
4323 {
4324 if (!neutral_vec)
4325 {
4326 gimple_seq ctor_seq = NULL;
4327 neutral_vec = gimple_build_vector_from_val
4328 (&ctor_seq, vector_type, neutral_op);
4329 if (ctor_seq != NULL)
4330 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4331 }
4332 vec_oprnds->quick_push (neutral_vec);
4333 }
4334 else
4335 {
4336 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4337 vec_oprnds->quick_push (vop);
4338 }
4339 }
4340 }
4341
4342
4343 /* Function vect_create_epilog_for_reduction
4344
4345 Create code at the loop-epilog to finalize the result of a reduction
4346 computation.
4347
4348 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4349 reduction statements.
4350 STMT is the scalar reduction stmt that is being vectorized.
4351 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4352 number of elements that we can fit in a vectype (nunits). In this case
4353 we have to generate more than one vector stmt - i.e - we need to "unroll"
4354 the vector stmt by a factor VF/nunits. For more details see documentation
4355 in vectorizable_operation.
4356 REDUC_FN is the internal function for the epilog reduction.
4357 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4358 computation.
4359 REDUC_INDEX is the index of the operand in the right hand side of the
4360 statement that is defined by REDUCTION_PHI.
4361 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4362 SLP_NODE is an SLP node containing a group of reduction statements. The
4363 first one in this group is STMT.
4364 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4365 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4366 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4367 any value of the IV in the loop.
4368 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4369 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4370 null if this is not an SLP reduction.
4371
4372 This function:
4373 1. Creates the reduction def-use cycles: sets the arguments for
4374 REDUCTION_PHIS:
4375 The loop-entry argument is the vectorized initial-value of the reduction.
4376 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4377 sums.
4378 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4379 by calling the function specified by REDUC_FN if available, or by
4380 other means (whole-vector shifts or a scalar loop).
4381 The function also creates a new phi node at the loop exit to preserve
4382 loop-closed form, as illustrated below.
4383
4384 The flow at the entry to this function:
4385
4386 loop:
4387 vec_def = phi <null, null> # REDUCTION_PHI
4388 VECT_DEF = vector_stmt # vectorized form of STMT
4389 s_loop = scalar_stmt # (scalar) STMT
4390 loop_exit:
4391 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4392 use <s_out0>
4393 use <s_out0>
4394
4395 The above is transformed by this function into:
4396
4397 loop:
4398 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4399 VECT_DEF = vector_stmt # vectorized form of STMT
4400 s_loop = scalar_stmt # (scalar) STMT
4401 loop_exit:
4402 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4403 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4404 v_out2 = reduce <v_out1>
4405 s_out3 = extract_field <v_out2, 0>
4406 s_out4 = adjust_result <s_out3>
4407 use <s_out4>
4408 use <s_out4>
4409 */
4410
4411 static void
4412 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4413 gimple *reduc_def_stmt,
4414 int ncopies, internal_fn reduc_fn,
4415 vec<stmt_vec_info> reduction_phis,
4416 bool double_reduc,
4417 slp_tree slp_node,
4418 slp_instance slp_node_instance,
4419 tree induc_val, enum tree_code induc_code,
4420 tree neutral_op)
4421 {
4422 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4423 stmt_vec_info prev_phi_info;
4424 tree vectype;
4425 machine_mode mode;
4426 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4427 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4428 basic_block exit_bb;
4429 tree scalar_dest;
4430 tree scalar_type;
4431 gimple *new_phi = NULL, *phi;
4432 stmt_vec_info phi_info;
4433 gimple_stmt_iterator exit_gsi;
4434 tree vec_dest;
4435 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4436 gimple *epilog_stmt = NULL;
4437 enum tree_code code = gimple_assign_rhs_code (stmt);
4438 gimple *exit_phi;
4439 tree bitsize;
4440 tree adjustment_def = NULL;
4441 tree vec_initial_def = NULL;
4442 tree expr, def, initial_def = NULL;
4443 tree orig_name, scalar_result;
4444 imm_use_iterator imm_iter, phi_imm_iter;
4445 use_operand_p use_p, phi_use_p;
4446 gimple *use_stmt;
4447 stmt_vec_info reduction_phi_info = NULL;
4448 bool nested_in_vect_loop = false;
4449 auto_vec<gimple *> new_phis;
4450 auto_vec<stmt_vec_info> inner_phis;
4451 enum vect_def_type dt = vect_unknown_def_type;
4452 int j, i;
4453 auto_vec<tree> scalar_results;
4454 unsigned int group_size = 1, k, ratio;
4455 auto_vec<tree> vec_initial_defs;
4456 auto_vec<gimple *> phis;
4457 bool slp_reduc = false;
4458 bool direct_slp_reduc;
4459 tree new_phi_result;
4460 stmt_vec_info inner_phi = NULL;
4461 tree induction_index = NULL_TREE;
4462
4463 if (slp_node)
4464 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4465
4466 if (nested_in_vect_loop_p (loop, stmt))
4467 {
4468 outer_loop = loop;
4469 loop = loop->inner;
4470 nested_in_vect_loop = true;
4471 gcc_assert (!slp_node);
4472 }
4473
4474 vectype = STMT_VINFO_VECTYPE (stmt_info);
4475 gcc_assert (vectype);
4476 mode = TYPE_MODE (vectype);
4477
4478 /* 1. Create the reduction def-use cycle:
4479 Set the arguments of REDUCTION_PHIS, i.e., transform
4480
4481 loop:
4482 vec_def = phi <null, null> # REDUCTION_PHI
4483 VECT_DEF = vector_stmt # vectorized form of STMT
4484 ...
4485
4486 into:
4487
4488 loop:
4489 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4490 VECT_DEF = vector_stmt # vectorized form of STMT
4491 ...
4492
4493 (in case of SLP, do it for all the phis). */
4494
4495 /* Get the loop-entry arguments. */
4496 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4497 if (slp_node)
4498 {
4499 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4500 vec_initial_defs.reserve (vec_num);
4501 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4502 &vec_initial_defs, vec_num,
4503 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4504 neutral_op);
4505 }
4506 else
4507 {
4508 /* Get at the scalar def before the loop, that defines the initial value
4509 of the reduction variable. */
4510 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4511 loop_preheader_edge (loop));
4512 /* Optimize: if initial_def is smaller than the base for REDUC_MAX
4513 and we can't use zero for induc_val, use initial_def. Similarly
4514 for REDUC_MIN, if initial_def is larger than the base. */
4515 if (TREE_CODE (initial_def) == INTEGER_CST
4516 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4517 == INTEGER_INDUC_COND_REDUCTION)
4518 && !integer_zerop (induc_val)
4519 && ((induc_code == MAX_EXPR
4520 && tree_int_cst_lt (initial_def, induc_val))
4521 || (induc_code == MIN_EXPR
4522 && tree_int_cst_lt (induc_val, initial_def))))
4523 induc_val = initial_def;
4524
4525 if (double_reduc)
4526 /* In case of double reduction we only create a vector variable
4527 to be put in the reduction phi node. The actual statement
4528 creation is done later in this function. */
4529 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4530 else if (nested_in_vect_loop)
4531 {
4532 /* Do not use an adjustment def as that case is not supported
4533 correctly if ncopies is not one. */
4534 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4535 vec_initial_def = vect_get_vec_def_for_operand (initial_def, stmt);
4536 }
4537 else
4538 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4539 &adjustment_def);
4540 vec_initial_defs.create (1);
4541 vec_initial_defs.quick_push (vec_initial_def);
4542 }
4543
4544 /* Set phi nodes arguments. */
4545 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4546 {
4547 tree vec_init_def = vec_initial_defs[i];
4548 tree def = vect_defs[i];
4549 for (j = 0; j < ncopies; j++)
4550 {
4551 if (j != 0)
4552 {
4553 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4554 if (nested_in_vect_loop)
4555 vec_init_def
4556 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4557 vec_init_def);
4558 }
4559
4560 /* Set the loop-entry arg of the reduction-phi. */
4561
4562 gphi *phi = as_a <gphi *> (phi_info->stmt);
4563 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4564 == INTEGER_INDUC_COND_REDUCTION)
4565 {
4566 /* Initialise the reduction phi to zero. This prevents non-zero
4567 initial values from interfering with the reduction op. */
4568 gcc_assert (ncopies == 1);
4569 gcc_assert (i == 0);
4570
4571 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4572 tree induc_val_vec
4573 = build_vector_from_val (vec_init_def_type, induc_val);
4574
4575 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4576 UNKNOWN_LOCATION);
4577 }
4578 else
4579 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4580 UNKNOWN_LOCATION);
4581
4582 /* Set the loop-latch arg for the reduction-phi. */
4583 if (j > 0)
4584 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4585
4586 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4587
4588 if (dump_enabled_p ())
4589 {
4590 dump_printf_loc (MSG_NOTE, vect_location,
4591 "transform reduction: created def-use cycle: ");
4592 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4593 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4594 }
4595 }
4596 }
4597
4598 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4599 which is updated with the current index of the loop for every match of
4600 the original loop's cond_expr (VEC_STMT). This results in a vector
4601 containing the last time the condition passed for that vector lane.
4602 containing, for each lane, the index of the last time the condition passed.
4603 indexes. If there are no matches at all then the vector will be all
4604 zeroes. */
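  /* Illustrative example, assuming 4-lane vectors: after two vector
     iterations the index vectors are {1, 2, 3, 4} and {5, 6, 7, 8}.  If
     lane 1 matched only in the first iteration and lane 3 only in the
     second, the accumulated result is {0, 2, 0, 8}; lanes that never
     matched stay 0.  */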
4605 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4606 {
4607 tree indx_before_incr, indx_after_incr;
4608 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4609
4610 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4611 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4612
4613 int scalar_precision
4614 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4615 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4616 tree cr_index_vector_type = build_vector_type
4617 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4618
4619 /* First we create a simple vector induction variable which starts
4620 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4621 vector size (STEP). */
4622
4623 /* Create a {1,2,3,...} vector. */
4624 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4625
4626 /* Create a vector of the step value. */
4627 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4628 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4629
4630 /* Create an induction variable. */
4631 gimple_stmt_iterator incr_gsi;
4632 bool insert_after;
4633 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4634 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4635 insert_after, &indx_before_incr, &indx_after_incr);
4636
4637 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4638 filled with zeros (VEC_ZERO). */
4639
4640 /* Create a vector of 0s. */
4641 tree zero = build_zero_cst (cr_index_scalar_type);
4642 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4643
4644 /* Create a vector phi node. */
4645 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4646 new_phi = create_phi_node (new_phi_tree, loop->header);
4647 loop_vinfo->add_stmt (new_phi);
4648 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4649 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4650
4651 /* Now take the condition from the loop's original cond_expr
4652 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4653 every match uses values from the induction variable
4654 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4655 (NEW_PHI_TREE).
4656 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4657 the new cond_expr (INDEX_COND_EXPR). */
4658
4659 /* Duplicate the condition from vec_stmt. */
4660 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4661
4662 /* Create a conditional, where the condition is taken from vec_stmt
4663 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4664 else is the phi (NEW_PHI_TREE). */
4665 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4666 ccompare, indx_before_incr,
4667 new_phi_tree);
4668 induction_index = make_ssa_name (cr_index_vector_type);
4669 gimple *index_condition = gimple_build_assign (induction_index,
4670 index_cond_expr);
4671 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4672 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4673 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4674
4675 /* Update the phi with the vec cond. */
4676 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4677 loop_latch_edge (loop), UNKNOWN_LOCATION);
4678 }
4679
4680 /* 2. Create epilog code.
4681 The reduction epilog code operates across the elements of the vector
4682 of partial results computed by the vectorized loop.
4683 The reduction epilog code consists of:
4684
4685 step 1: compute the scalar result in a vector (v_out2)
4686 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4687 step 3: adjust the scalar result (s_out3) if needed.
4688
4689 Step 1 can be accomplished using one of the following three schemes:
4690 (scheme 1) using reduc_fn, if available.
4691 (scheme 2) using whole-vector shifts, if available.
4692 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4693 combined.
4694
4695 The overall epilog code looks like this:
4696
4697 s_out0 = phi <s_loop> # original EXIT_PHI
4698 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4699 v_out2 = reduce <v_out1> # step 1
4700 s_out3 = extract_field <v_out2, 0> # step 2
4701 s_out4 = adjust_result <s_out3> # step 3
4702
4703 (step 3 is optional, and steps 1 and 2 may be combined).
4704 Lastly, the uses of s_out0 are replaced by s_out4. */
4705
4706
4707 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4708 v_out1 = phi <VECT_DEF>
4709 Store them in NEW_PHIS. */
4710
4711 exit_bb = single_exit (loop)->dest;
4712 prev_phi_info = NULL;
4713 new_phis.create (vect_defs.length ());
4714 FOR_EACH_VEC_ELT (vect_defs, i, def)
4715 {
4716 for (j = 0; j < ncopies; j++)
4717 {
4718 tree new_def = copy_ssa_name (def);
4719 phi = create_phi_node (new_def, exit_bb);
4720 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4721 if (j == 0)
4722 new_phis.quick_push (phi);
4723 else
4724 {
4725 def = vect_get_vec_def_for_stmt_copy (dt, def);
4726 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4727 }
4728
4729 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4730 prev_phi_info = phi_info;
4731 }
4732 }
4733
4734 /* The epilogue is created for the outer-loop, i.e., for the loop being
4735 vectorized. Create exit phis for the outer loop. */
4736 if (double_reduc)
4737 {
4738 loop = outer_loop;
4739 exit_bb = single_exit (loop)->dest;
4740 inner_phis.create (vect_defs.length ());
4741 FOR_EACH_VEC_ELT (new_phis, i, phi)
4742 {
4743 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4744 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4745 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4746 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4747 PHI_RESULT (phi));
4748 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4749 inner_phis.quick_push (phi_info);
4750 new_phis[i] = outer_phi;
4751 while (STMT_VINFO_RELATED_STMT (phi_info))
4752 {
4753 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4754 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4755 outer_phi = create_phi_node (new_result, exit_bb);
4756 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4757 PHI_RESULT (phi_info->stmt));
4758 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4759 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4760 prev_phi_info = outer_phi_info;
4761 }
4762 }
4763 }
4764
4765 exit_gsi = gsi_after_labels (exit_bb);
4766
4767 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4768 (i.e. when reduc_fn is not available) and in the final adjustment
4769 code (if needed). Also get the original scalar reduction variable as
4770 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4771 represents a reduction pattern), the tree-code and scalar-def are
4772 taken from the original stmt that the pattern-stmt (STMT) replaces.
4773 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4774 are taken from STMT. */
4775
4776 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4777 if (!orig_stmt_info)
4778 {
4779 /* Regular reduction */
4780 orig_stmt_info = stmt_info;
4781 }
4782 else
4783 {
4784 /* Reduction pattern */
4785 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4786 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4787 }
4788
4789 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4790 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4791 partial results are added and not subtracted. */
4792 if (code == MINUS_EXPR)
4793 code = PLUS_EXPR;
4794
4795 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4796 scalar_type = TREE_TYPE (scalar_dest);
4797 scalar_results.create (group_size);
4798 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4799 bitsize = TYPE_SIZE (scalar_type);
4800
4801 /* In case this is a reduction in an inner-loop while vectorizing an outer
4802 loop - we don't need to extract a single scalar result at the end of the
4803 inner-loop (unless it is double reduction, i.e., the use of reduction is
4804 outside the outer-loop). The final vector of partial results will be used
4805 in the vectorized outer-loop, or reduced to a scalar result at the end of
4806 the outer-loop. */
4807 if (nested_in_vect_loop && !double_reduc)
4808 goto vect_finalize_reduction;
4809
4810 /* SLP reduction without reduction chain, e.g.,
4811 # a1 = phi <a2, a0>
4812 # b1 = phi <b2, b0>
4813 a2 = operation (a1)
4814 b2 = operation (b1) */
4815 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4816
4817 /* True if we should implement SLP_REDUC using native reduction operations
4818 instead of scalar operations. */
4819 direct_slp_reduc = (reduc_fn != IFN_LAST
4820 && slp_reduc
4821 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4822
4823 /* In case of reduction chain, e.g.,
4824 # a1 = phi <a3, a0>
4825 a2 = operation (a1)
4826 a3 = operation (a2),
4827
4828 we may end up with more than one vector result. Here we reduce them to
4829 one vector. */
4830 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4831 {
4832 tree first_vect = PHI_RESULT (new_phis[0]);
4833 gassign *new_vec_stmt = NULL;
4834 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4835 for (k = 1; k < new_phis.length (); k++)
4836 {
4837 gimple *next_phi = new_phis[k];
4838 tree second_vect = PHI_RESULT (next_phi);
4839 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4840 new_vec_stmt = gimple_build_assign (tem, code,
4841 first_vect, second_vect);
4842 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4843 first_vect = tem;
4844 }
4845
4846 new_phi_result = first_vect;
4847 if (new_vec_stmt)
4848 {
4849 new_phis.truncate (0);
4850 new_phis.safe_push (new_vec_stmt);
4851 }
4852 }
4853 /* Likewise if we couldn't use a single def-use cycle. */
4854 else if (ncopies > 1)
4855 {
4856 gcc_assert (new_phis.length () == 1);
4857 tree first_vect = PHI_RESULT (new_phis[0]);
4858 gassign *new_vec_stmt = NULL;
4859 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4860 gimple *next_phi = new_phis[0];
4861 for (int k = 1; k < ncopies; ++k)
4862 {
4863 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4864 tree second_vect = PHI_RESULT (next_phi);
4865 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4866 new_vec_stmt = gimple_build_assign (tem, code,
4867 first_vect, second_vect);
4868 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4869 first_vect = tem;
4870 }
4871 new_phi_result = first_vect;
4872 new_phis.truncate (0);
4873 new_phis.safe_push (new_vec_stmt);
4874 }
4875 else
4876 new_phi_result = PHI_RESULT (new_phis[0]);
4877
4878 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4879 && reduc_fn != IFN_LAST)
4880 {
4881 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4882 various data values where the condition matched and another vector
4883 (INDUCTION_INDEX) containing all the indexes of those matches. We
4884 need to extract the last matching index (which will be the index with
4885 highest value) and use this to index into the data vector.
4886 For the case where there were no matches, the data vector will contain
4887 all default values and the index vector will be all zeros. */
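      /* Continuing the illustrative 4-lane example above: with
         INDUCTION_INDEX = {0, 2, 0, 8} the max index is 8, so only lane 3's
         data value survives the VEC_COND built below, and the final unsigned
         max reduction extracts it.  */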
4888
4889 /* Get various versions of the type of the vector of indexes. */
4890 tree index_vec_type = TREE_TYPE (induction_index);
4891 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4892 tree index_scalar_type = TREE_TYPE (index_vec_type);
4893 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4894 (index_vec_type);
4895
4896 /* Get an unsigned integer version of the type of the data vector. */
4897 int scalar_precision
4898 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4899 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4900 tree vectype_unsigned = build_vector_type
4901 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4902
4903 /* First we need to create a vector (ZERO_VEC) of zeros and another
4904 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4905 can create using a MAX reduction and then expanding.
4906 In the case where the loop never made any matches, the max index will
4907 be zero. */
4908
4909 /* Vector of {0, 0, 0,...}. */
4910 tree zero_vec = make_ssa_name (vectype);
4911 tree zero_vec_rhs = build_zero_cst (vectype);
4912 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4913 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4914
4915 /* Find maximum value from the vector of found indexes. */
4916 tree max_index = make_ssa_name (index_scalar_type);
4917 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4918 1, induction_index);
4919 gimple_call_set_lhs (max_index_stmt, max_index);
4920 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4921
4922 /* Vector of {max_index, max_index, max_index,...}. */
4923 tree max_index_vec = make_ssa_name (index_vec_type);
4924 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4925 max_index);
4926 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4927 max_index_vec_rhs);
4928 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4929
4930 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4931 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4932 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4933 otherwise. Only one value should match, resulting in a vector
4934 (VEC_COND) with one data value and the rest zeros.
4935 In the case where the loop never made any matches, every index will
4936 match, resulting in a vector with all data values (which will all be
4937 the default value). */
4938
4939 /* Compare the max index vector to the vector of found indexes to find
4940 the position of the max value. */
4941 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4942 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4943 induction_index,
4944 max_index_vec);
4945 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4946
4947 /* Use the compare to choose either values from the data vector or
4948 zero. */
4949 tree vec_cond = make_ssa_name (vectype);
4950 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4951 vec_compare, new_phi_result,
4952 zero_vec);
4953 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4954
4955 /* Finally we need to extract the data value from the vector (VEC_COND)
4956 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4957 reduction, but because this doesn't exist, we can use a MAX reduction
4958 instead. The data value might be signed or a float so we need to cast
4959 it first.
4960 In the case where the loop never made any matches, the data values are
4961 all identical, and so will reduce down correctly. */
4962
4963 /* Make the matched data values unsigned. */
4964 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4965 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4966 vec_cond);
4967 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4968 VIEW_CONVERT_EXPR,
4969 vec_cond_cast_rhs);
4970 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4971
4972 /* Reduce down to a scalar value. */
4973 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4974 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4975 1, vec_cond_cast);
4976 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4977 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4978
4979 /* Convert the reduced value back to the result type and set as the
4980 result. */
4981 gimple_seq stmts = NULL;
4982 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4983 data_reduc);
4984 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4985 scalar_results.safe_push (new_temp);
4986 }
4987 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4988 && reduc_fn == IFN_LAST)
4989 {
4990 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4991 idx = 0;
4992 idx_val = induction_index[0];
4993 val = data_reduc[0];
4994 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4995 if (induction_index[i] > idx_val)
4996 val = data_reduc[i], idx_val = induction_index[i];
4997 return val; */
4998
4999 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5000 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5001 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5002 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5003 /* Enforced by vectorizable_reduction, which ensures we have target
5004 support before allowing a conditional reduction on variable-length
5005 vectors. */
5006 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5007 tree idx_val = NULL_TREE, val = NULL_TREE;
5008 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5009 {
5010 tree old_idx_val = idx_val;
5011 tree old_val = val;
5012 idx_val = make_ssa_name (idx_eltype);
5013 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5014 build3 (BIT_FIELD_REF, idx_eltype,
5015 induction_index,
5016 bitsize_int (el_size),
5017 bitsize_int (off)));
5018 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5019 val = make_ssa_name (data_eltype);
5020 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5021 build3 (BIT_FIELD_REF,
5022 data_eltype,
5023 new_phi_result,
5024 bitsize_int (el_size),
5025 bitsize_int (off)));
5026 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5027 if (off != 0)
5028 {
5029 tree new_idx_val = idx_val;
5030 tree new_val = val;
5031 if (off != v_size - el_size)
5032 {
5033 new_idx_val = make_ssa_name (idx_eltype);
5034 epilog_stmt = gimple_build_assign (new_idx_val,
5035 MAX_EXPR, idx_val,
5036 old_idx_val);
5037 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5038 }
5039 new_val = make_ssa_name (data_eltype);
5040 epilog_stmt = gimple_build_assign (new_val,
5041 COND_EXPR,
5042 build2 (GT_EXPR,
5043 boolean_type_node,
5044 idx_val,
5045 old_idx_val),
5046 val, old_val);
5047 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5048 idx_val = new_idx_val;
5049 val = new_val;
5050 }
5051 }
5052 /* Convert the reduced value back to the result type and set as the
5053 result. */
5054 gimple_seq stmts = NULL;
5055 val = gimple_convert (&stmts, scalar_type, val);
5056 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5057 scalar_results.safe_push (val);
5058 }
5059
5060 /* 2.3 Create the reduction code, using one of the three schemes described
5061 above. In SLP we simply need to extract all the elements from the
5062 vector (without reducing them), so we use scalar shifts. */
5063 else if (reduc_fn != IFN_LAST && !slp_reduc)
5064 {
5065 tree tmp;
5066 tree vec_elem_type;
5067
5068 /* Case 1: Create:
5069 v_out2 = reduc_expr <v_out1> */
5070
5071 if (dump_enabled_p ())
5072 dump_printf_loc (MSG_NOTE, vect_location,
5073 "Reduce using direct vector reduction.\n");
5074
5075 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5076 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5077 {
5078 tree tmp_dest
5079 = vect_create_destination_var (scalar_dest, vec_elem_type);
5080 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5081 new_phi_result);
5082 gimple_set_lhs (epilog_stmt, tmp_dest);
5083 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5084 gimple_set_lhs (epilog_stmt, new_temp);
5085 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5086
5087 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5088 new_temp);
5089 }
5090 else
5091 {
5092 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5093 new_phi_result);
5094 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5095 }
5096
5097 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5098 gimple_set_lhs (epilog_stmt, new_temp);
5099 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5100
5101 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5102 == INTEGER_INDUC_COND_REDUCTION)
5103 && !operand_equal_p (initial_def, induc_val, 0))
5104 {
5105 /* Earlier we set the initial value to be a vector of induc_val
5106 values.  Check the result, and if it is induc_val then replace
5107 it with the original initial value, unless induc_val is
5108 the same as initial_def already.  */
5109 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5110 induc_val);
5111
5112 tmp = make_ssa_name (new_scalar_dest);
5113 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5114 initial_def, new_temp);
5115 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5116 new_temp = tmp;
5117 }
5118
5119 scalar_results.safe_push (new_temp);
5120 }
5121 else if (direct_slp_reduc)
5122 {
5123 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5124 with the elements for other SLP statements replaced with the
5125 neutral value. We can then do a normal reduction on each vector. */
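/* For example (illustrative): with group_size == 2, an add reduction and
   a partial-result vector { a0, b0, a1, b1 }, we build the two vectors
   { a0, 0, a1, 0 } and { 0, b0, 0, b1 } and reduce each of them with the
   target's vector reduction.  */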
5126
5127 /* Enforced by vectorizable_reduction. */
5128 gcc_assert (new_phis.length () == 1);
5129 gcc_assert (pow2p_hwi (group_size));
5130
5131 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5132 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5133 gimple_seq seq = NULL;
5134
5135 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5136 and the same element size as VECTYPE. */
5137 tree index = build_index_vector (vectype, 0, 1);
5138 tree index_type = TREE_TYPE (index);
5139 tree index_elt_type = TREE_TYPE (index_type);
5140 tree mask_type = build_same_sized_truth_vector_type (index_type);
5141
5142 /* Create a vector that, for each element, identifies which of
5143 the REDUC_GROUP_SIZE results should use it. */
5144 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5145 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5146 build_vector_from_val (index_type, index_mask));
5147
5148 /* Get a neutral vector value. This is simply a splat of the neutral
5149 scalar value if we have one, otherwise the initial scalar value
5150 is itself a neutral value. */
5151 tree vector_identity = NULL_TREE;
5152 if (neutral_op)
5153 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5154 neutral_op);
5155 for (unsigned int i = 0; i < group_size; ++i)
5156 {
5157 /* If there's no universal neutral value, we can use the
5158 initial scalar value from the original PHI.  This is used
5159 for MIN and MAX reductions, for example.  */
5160 if (!neutral_op)
5161 {
5162 tree scalar_value
5163 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5164 loop_preheader_edge (loop));
5165 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5166 scalar_value);
5167 }
5168
5169 /* Calculate the equivalent of:
5170
5171 sel[j] = (index[j] == i);
5172
5173 which selects the elements of NEW_PHI_RESULT that should
5174 be included in the result. */
5175 tree compare_val = build_int_cst (index_elt_type, i);
5176 compare_val = build_vector_from_val (index_type, compare_val);
5177 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5178 index, compare_val);
5179
5180 /* Calculate the equivalent of:
5181
5182 vec = sel ? new_phi_result : vector_identity;
5183
5184 VEC is now suitable for a full vector reduction. */
5185 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5186 sel, new_phi_result, vector_identity);
5187
5188 /* Do the reduction and convert it to the appropriate type. */
5189 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5190 TREE_TYPE (vectype), vec);
5191 scalar = gimple_convert (&seq, scalar_type, scalar);
5192 scalar_results.safe_push (scalar);
5193 }
5194 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5195 }
5196 else
5197 {
5198 bool reduce_with_shift;
5199 tree vec_temp;
5200
5201 /* COND reductions all do the final reduction with MAX_EXPR
5202 or MIN_EXPR. */
5203 if (code == COND_EXPR)
5204 {
5205 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5206 == INTEGER_INDUC_COND_REDUCTION)
5207 code = induc_code;
5208 else
5209 code = MAX_EXPR;
5210 }
5211
5212 /* See if the target wants to do the final (shift) reduction
5213 in a vector mode of smaller size and first reduce upper/lower
5214 halves against each other. */
5215 enum machine_mode mode1 = mode;
5216 tree vectype1 = vectype;
5217 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5218 unsigned sz1 = sz;
5219 if (!slp_reduc
5220 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5221 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5222
5223 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5224 reduce_with_shift = have_whole_vector_shift (mode1);
5225 if (!VECTOR_MODE_P (mode1))
5226 reduce_with_shift = false;
5227 else
5228 {
5229 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5230 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5231 reduce_with_shift = false;
5232 }
5233
5234 /* First reduce the vector to the vector size we should do the shift
5235 reduction on, by repeatedly combining the upper and lower halves.  */
5236 new_temp = new_phi_result;
5237 while (sz > sz1)
5238 {
5239 gcc_assert (!slp_reduc);
5240 sz /= 2;
5241 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5242
5243 /* The target has to make sure we support lowpart/highpart
5244 extraction, either via direct vector extract or through
5245 punning to an integer mode vector.  */
5246 tree dst1, dst2;
5247 if (convert_optab_handler (vec_extract_optab,
5248 TYPE_MODE (TREE_TYPE (new_temp)),
5249 TYPE_MODE (vectype1))
5250 != CODE_FOR_nothing)
5251 {
5252 /* Extract sub-vectors directly once vec_extract becomes
5253 a conversion optab. */
5254 dst1 = make_ssa_name (vectype1);
5255 epilog_stmt
5256 = gimple_build_assign (dst1, BIT_FIELD_REF,
5257 build3 (BIT_FIELD_REF, vectype1,
5258 new_temp, TYPE_SIZE (vectype1),
5259 bitsize_int (0)));
5260 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5261 dst2 = make_ssa_name (vectype1);
5262 epilog_stmt
5263 = gimple_build_assign (dst2, BIT_FIELD_REF,
5264 build3 (BIT_FIELD_REF, vectype1,
5265 new_temp, TYPE_SIZE (vectype1),
5266 bitsize_int (sz * BITS_PER_UNIT)));
5267 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5268 }
5269 else
5270 {
5271 /* Extract via punning to appropriately sized integer mode
5272 vector. */
5273 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5274 1);
5275 tree etype = build_vector_type (eltype, 2);
5276 gcc_assert (convert_optab_handler (vec_extract_optab,
5277 TYPE_MODE (etype),
5278 TYPE_MODE (eltype))
5279 != CODE_FOR_nothing);
5280 tree tem = make_ssa_name (etype);
5281 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5282 build1 (VIEW_CONVERT_EXPR,
5283 etype, new_temp));
5284 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5285 new_temp = tem;
5286 tem = make_ssa_name (eltype);
5287 epilog_stmt
5288 = gimple_build_assign (tem, BIT_FIELD_REF,
5289 build3 (BIT_FIELD_REF, eltype,
5290 new_temp, TYPE_SIZE (eltype),
5291 bitsize_int (0)));
5292 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5293 dst1 = make_ssa_name (vectype1);
5294 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5295 build1 (VIEW_CONVERT_EXPR,
5296 vectype1, tem));
5297 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5298 tem = make_ssa_name (eltype);
5299 epilog_stmt
5300 = gimple_build_assign (tem, BIT_FIELD_REF,
5301 build3 (BIT_FIELD_REF, eltype,
5302 new_temp, TYPE_SIZE (eltype),
5303 bitsize_int (sz * BITS_PER_UNIT)));
5304 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5305 dst2 = make_ssa_name (vectype1);
5306 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5307 build1 (VIEW_CONVERT_EXPR,
5308 vectype1, tem));
5309 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5310 }
5311
5312 new_temp = make_ssa_name (vectype1);
5313 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5314 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5315 }
5316
5317 if (reduce_with_shift && !slp_reduc)
5318 {
5319 int element_bitsize = tree_to_uhwi (bitsize);
5320 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5321 for variable-length vectors and also requires direct target support
5322 for loop reductions. */
5323 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5324 int nelements = vec_size_in_bits / element_bitsize;
5325 vec_perm_builder sel;
5326 vec_perm_indices indices;
5327
5328 int elt_offset;
5329
5330 tree zero_vec = build_zero_cst (vectype1);
5331 /* Case 2: Create:
5332 for (offset = nelements/2; offset >= 1; offset/=2)
5333 {
5334 Create: va' = vec_shift <va, offset>
5335 Create: va = vop <va, va'>
5336 } */
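/* For example (illustrative): for a 4-element add reduction of
   { a, b, c, d } this performs
     shift by 2 lanes and add  -> { a+c, b+d, ., . }
     shift by 1 lane  and add  -> { a+b+c+d, ., ., . }
   so element 0 ends up holding the full sum.  */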
5337
5338 tree rhs;
5339
5340 if (dump_enabled_p ())
5341 dump_printf_loc (MSG_NOTE, vect_location,
5342 "Reduce using vector shifts\n");
5343
5344 mode1 = TYPE_MODE (vectype1);
5345 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5346 for (elt_offset = nelements / 2;
5347 elt_offset >= 1;
5348 elt_offset /= 2)
5349 {
5350 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5351 indices.new_vector (sel, 2, nelements);
5352 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5353 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5354 new_temp, zero_vec, mask);
5355 new_name = make_ssa_name (vec_dest, epilog_stmt);
5356 gimple_assign_set_lhs (epilog_stmt, new_name);
5357 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5358
5359 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5360 new_temp);
5361 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5362 gimple_assign_set_lhs (epilog_stmt, new_temp);
5363 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5364 }
5365
5366 /* 2.4 Extract the final scalar result. Create:
5367 s_out3 = extract_field <v_out2, bitpos> */
5368
5369 if (dump_enabled_p ())
5370 dump_printf_loc (MSG_NOTE, vect_location,
5371 "extract scalar result\n");
5372
5373 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5374 bitsize, bitsize_zero_node);
5375 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5376 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5377 gimple_assign_set_lhs (epilog_stmt, new_temp);
5378 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5379 scalar_results.safe_push (new_temp);
5380 }
5381 else
5382 {
5383 /* Case 3: Create:
5384 s = extract_field <v_out2, 0>
5385 for (offset = element_size;
5386 offset < vector_size;
5387 offset += element_size;)
5388 {
5389 Create: s' = extract_field <v_out2, offset>
5390 Create: s = op <s, s'> // For non SLP cases
5391 } */
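/* For example (illustrative): a 4-element add reduction in the non-SLP
   case expands to
     s = v_out2[0];
     s = s + v_out2[1];
     s = s + v_out2[2];
     s = s + v_out2[3];
   with each element extracted via a BIT_FIELD_REF.  */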
5392
5393 if (dump_enabled_p ())
5394 dump_printf_loc (MSG_NOTE, vect_location,
5395 "Reduce using scalar code.\n");
5396
5397 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5398 int element_bitsize = tree_to_uhwi (bitsize);
5399 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5400 {
5401 int bit_offset;
5402 if (gimple_code (new_phi) == GIMPLE_PHI)
5403 vec_temp = PHI_RESULT (new_phi);
5404 else
5405 vec_temp = gimple_assign_lhs (new_phi);
5406 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5407 bitsize_zero_node);
5408 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5409 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5410 gimple_assign_set_lhs (epilog_stmt, new_temp);
5411 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5412
5413 /* In SLP we don't need to apply the reduction operation, so we just
5414 collect the s' values in SCALAR_RESULTS.  */
5415 if (slp_reduc)
5416 scalar_results.safe_push (new_temp);
5417
5418 for (bit_offset = element_bitsize;
5419 bit_offset < vec_size_in_bits;
5420 bit_offset += element_bitsize)
5421 {
5422 tree bitpos = bitsize_int (bit_offset);
5423 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5424 bitsize, bitpos);
5425
5426 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5427 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5428 gimple_assign_set_lhs (epilog_stmt, new_name);
5429 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5430
5431 if (slp_reduc)
5432 {
5433 /* In SLP we don't need to apply the reduction operation, so
5434 we just collect the s' values in SCALAR_RESULTS.  */
5435 new_temp = new_name;
5436 scalar_results.safe_push (new_name);
5437 }
5438 else
5439 {
5440 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5441 new_name, new_temp);
5442 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5443 gimple_assign_set_lhs (epilog_stmt, new_temp);
5444 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5445 }
5446 }
5447 }
5448
5449 /* The only case in which we need to reduce scalar results in SLP
5450 is unrolling.  If the size of SCALAR_RESULTS is greater than
5451 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5452 REDUC_GROUP_SIZE.  */
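/* For example (illustrative): with REDUC_GROUP_SIZE == 2 and four scalar
   results s0..s3 from unrolling, this produces
     scalar_results[0] = op (s0, s2);
     scalar_results[1] = op (s1, s3);  */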
5453 if (slp_reduc)
5454 {
5455 tree res, first_res, new_res;
5456 gimple *new_stmt;
5457
5458 /* Reduce multiple scalar results in case of SLP unrolling. */
5459 for (j = group_size; scalar_results.iterate (j, &res);
5460 j++)
5461 {
5462 first_res = scalar_results[j % group_size];
5463 new_stmt = gimple_build_assign (new_scalar_dest, code,
5464 first_res, res);
5465 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5466 gimple_assign_set_lhs (new_stmt, new_res);
5467 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5468 scalar_results[j % group_size] = new_res;
5469 }
5470 }
5471 else
5472 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5473 scalar_results.safe_push (new_temp);
5474 }
5475
5476 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5477 == INTEGER_INDUC_COND_REDUCTION)
5478 && !operand_equal_p (initial_def, induc_val, 0))
5479 {
5480 /* Earlier we set the initial value to be a vector of induc_val
5481 values.  Check the result, and if it is induc_val then replace
5482 it with the original initial value, unless induc_val is
5483 the same as initial_def already.  */
5484 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5485 induc_val);
5486
5487 tree tmp = make_ssa_name (new_scalar_dest);
5488 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5489 initial_def, new_temp);
5490 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5491 scalar_results[0] = tmp;
5492 }
5493 }
5494
5495 vect_finalize_reduction:
5496
5497 if (double_reduc)
5498 loop = loop->inner;
5499
5500 /* 2.5 Adjust the final result by the initial value of the reduction
5501 variable. (When such adjustment is not needed, then
5502 'adjustment_def' is zero). For example, if code is PLUS we create:
5503 new_temp = loop_exit_def + adjustment_def */
5504
5505 if (adjustment_def)
5506 {
5507 gcc_assert (!slp_reduc);
5508 if (nested_in_vect_loop)
5509 {
5510 new_phi = new_phis[0];
5511 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5512 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5513 new_dest = vect_create_destination_var (scalar_dest, vectype);
5514 }
5515 else
5516 {
5517 new_temp = scalar_results[0];
5518 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5519 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5520 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5521 }
5522
5523 epilog_stmt = gimple_build_assign (new_dest, expr);
5524 new_temp = make_ssa_name (new_dest, epilog_stmt);
5525 gimple_assign_set_lhs (epilog_stmt, new_temp);
5526 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5527 if (nested_in_vect_loop)
5528 {
5529 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5530 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5531 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5532
5533 if (!double_reduc)
5534 scalar_results.quick_push (new_temp);
5535 else
5536 scalar_results[0] = new_temp;
5537 }
5538 else
5539 scalar_results[0] = new_temp;
5540
5541 new_phis[0] = epilog_stmt;
5542 }
5543
5544 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5545 phis with new adjusted scalar results, i.e., replace use <s_out0>
5546 with use <s_out4>.
5547
5548 Transform:
5549 loop_exit:
5550 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5551 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5552 v_out2 = reduce <v_out1>
5553 s_out3 = extract_field <v_out2, 0>
5554 s_out4 = adjust_result <s_out3>
5555 use <s_out0>
5556 use <s_out0>
5557
5558 into:
5559
5560 loop_exit:
5561 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5562 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5563 v_out2 = reduce <v_out1>
5564 s_out3 = extract_field <v_out2, 0>
5565 s_out4 = adjust_result <s_out3>
5566 use <s_out4>
5567 use <s_out4> */
5568
5569
5570 /* In an SLP reduction chain we reduce the vector results into one
5571 vector if necessary, hence we set REDUC_GROUP_SIZE to 1 here.
5572 SCALAR_DEST is the LHS of the last stmt in the reduction chain,
5573 since we are looking for the loop exit phi node.  */
5574 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5575 {
5576 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5577 /* Handle reduction patterns. */
5578 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5579 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5580
5581 scalar_dest = gimple_assign_lhs (dest_stmt);
5582 group_size = 1;
5583 }
5584
5585 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5586 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5587 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5588 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5589 correspond to the first vector stmt, etc.
5590 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
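/* For example (illustrative): with REDUC_GROUP_SIZE == 4 and two vector
   statements, RATIO is 2; scalar results 0 and 1 then belong to the first
   vector statement and results 2 and 3 to the second.  */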
5591 if (group_size > new_phis.length ())
5592 {
5593 ratio = group_size / new_phis.length ();
5594 gcc_assert (!(group_size % new_phis.length ()));
5595 }
5596 else
5597 ratio = 1;
5598
5599 for (k = 0; k < group_size; k++)
5600 {
5601 if (k % ratio == 0)
5602 {
5603 epilog_stmt = new_phis[k / ratio];
5604 reduction_phi_info = reduction_phis[k / ratio];
5605 if (double_reduc)
5606 inner_phi = inner_phis[k / ratio];
5607 }
5608
5609 if (slp_reduc)
5610 {
5611 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5612
5613 orig_stmt_info
5614 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5615 /* SLP statements can't participate in patterns. */
5616 gcc_assert (!orig_stmt_info);
5617 scalar_dest = gimple_assign_lhs (current_stmt);
5618 }
5619
5620 phis.create (3);
5621 /* Find the loop-closed-use at the loop exit of the original scalar
5622 result. (The reduction result is expected to have two immediate uses -
5623 one at the latch block, and one at the loop exit). */
5624 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5625 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5626 && !is_gimple_debug (USE_STMT (use_p)))
5627 phis.safe_push (USE_STMT (use_p));
5628
5629 /* While we expect to have found an exit_phi because of loop-closed-ssa
5630 form we can end up without one if the scalar cycle is dead. */
5631
5632 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5633 {
5634 if (outer_loop)
5635 {
5636 stmt_vec_info exit_phi_vinfo
5637 = loop_vinfo->lookup_stmt (exit_phi);
5638 gphi *vect_phi;
5639
5640 /* FORNOW.  Currently we do not support the case in which an
5641 inner-loop reduction is not used in the outer loop (but only
5642 outside the outer loop), unless it is a double reduction.  */
5643 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5644 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5645 || double_reduc);
5646
5647 if (double_reduc)
5648 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5649 else
5650 STMT_VINFO_VEC_STMT (exit_phi_vinfo)
5651 = vinfo_for_stmt (epilog_stmt);
5652 if (!double_reduc
5653 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5654 != vect_double_reduction_def)
5655 continue;
5656
5657 /* Handle double reduction:
5658
5659 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5660 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5661 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5662 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5663
5664 At that point the regular reduction (stmt2 and stmt3) is
5665 already vectorized, as well as the exit phi node, stmt4.
5666 Here we vectorize the phi node of double reduction, stmt1, and
5667 update all relevant statements. */
5668
5669 /* Go through all the uses of s2 to find double reduction phi
5670 node, i.e., stmt1 above. */
5671 orig_name = PHI_RESULT (exit_phi);
5672 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5673 {
5674 stmt_vec_info use_stmt_vinfo;
5675 tree vect_phi_init, preheader_arg, vect_phi_res;
5676 basic_block bb = gimple_bb (use_stmt);
5677
5678 /* Check that USE_STMT is really a double reduction phi
5679 node.  */
5680 if (gimple_code (use_stmt) != GIMPLE_PHI
5681 || gimple_phi_num_args (use_stmt) != 2
5682 || bb->loop_father != outer_loop)
5683 continue;
5684 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5685 if (!use_stmt_vinfo
5686 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5687 != vect_double_reduction_def)
5688 continue;
5689
5690 /* Create vector phi node for double reduction:
5691 vs1 = phi <vs0, vs2>
5692 vs1 was created previously in this function by a call to
5693 vect_get_vec_def_for_operand and is stored in
5694 vec_initial_def;
5695 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5696 vs0 is created here. */
5697
5698 /* Create vector phi node. */
5699 vect_phi = create_phi_node (vec_initial_def, bb);
5700 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5701
5702 /* Create vs0 - initial def of the double reduction phi. */
5703 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5704 loop_preheader_edge (outer_loop));
5705 vect_phi_init = get_initial_def_for_reduction
5706 (stmt, preheader_arg, NULL);
5707
5708 /* Update phi node arguments with vs0 and vs2. */
5709 add_phi_arg (vect_phi, vect_phi_init,
5710 loop_preheader_edge (outer_loop),
5711 UNKNOWN_LOCATION);
5712 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5713 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5714 if (dump_enabled_p ())
5715 {
5716 dump_printf_loc (MSG_NOTE, vect_location,
5717 "created double reduction phi node: ");
5718 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5719 }
5720
5721 vect_phi_res = PHI_RESULT (vect_phi);
5722
5723 /* Replace the use, i.e., set the correct vs1 in the regular
5724 reduction phi node. FORNOW, NCOPIES is always 1, so the
5725 loop is redundant. */
5726 stmt_vec_info use_info = reduction_phi_info;
5727 for (j = 0; j < ncopies; j++)
5728 {
5729 edge pr_edge = loop_preheader_edge (loop);
5730 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5731 pr_edge->dest_idx, vect_phi_res);
5732 use_info = STMT_VINFO_RELATED_STMT (use_info);
5733 }
5734 }
5735 }
5736 }
5737
5738 phis.release ();
5739 if (nested_in_vect_loop)
5740 {
5741 if (double_reduc)
5742 loop = outer_loop;
5743 else
5744 continue;
5745 }
5746
5747 phis.create (3);
5748 /* Find the loop-closed-use at the loop exit of the original scalar
5749 result. (The reduction result is expected to have two immediate uses,
5750 one at the latch block, and one at the loop exit). For double
5751 reductions we are looking for exit phis of the outer loop. */
5752 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5753 {
5754 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5755 {
5756 if (!is_gimple_debug (USE_STMT (use_p)))
5757 phis.safe_push (USE_STMT (use_p));
5758 }
5759 else
5760 {
5761 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5762 {
5763 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5764
5765 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5766 {
5767 if (!flow_bb_inside_loop_p (loop,
5768 gimple_bb (USE_STMT (phi_use_p)))
5769 && !is_gimple_debug (USE_STMT (phi_use_p)))
5770 phis.safe_push (USE_STMT (phi_use_p));
5771 }
5772 }
5773 }
5774 }
5775
5776 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5777 {
5778 /* Replace the uses: */
5779 orig_name = PHI_RESULT (exit_phi);
5780 scalar_result = scalar_results[k];
5781 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5782 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5783 SET_USE (use_p, scalar_result);
5784 }
5785
5786 phis.release ();
5787 }
5788 }
5789
5790 /* Return a vector of type VECTYPE that is equal to the vector select
5791 operation "MASK ? VEC : IDENTITY". Insert the select statements
5792 before GSI. */
5793
5794 static tree
5795 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5796 tree vec, tree identity)
5797 {
5798 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5799 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5800 mask, vec, identity);
5801 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5802 return cond;
5803 }
5804
5805 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5806 order, starting with LHS. Insert the extraction statements before GSI and
5807 associate the new scalar SSA names with variable SCALAR_DEST.
5808 Return the SSA name for the result. */
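/* For example (illustrative): for a 4-element vector V and PLUS_EXPR this
   emits the chain
     lhs = (((lhs + V[0]) + V[1]) + V[2]) + V[3];
   as a series of BIT_FIELD_REF extractions and scalar adds.  */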
5809
5810 static tree
5811 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5812 tree_code code, tree lhs, tree vector_rhs)
5813 {
5814 tree vectype = TREE_TYPE (vector_rhs);
5815 tree scalar_type = TREE_TYPE (vectype);
5816 tree bitsize = TYPE_SIZE (scalar_type);
5817 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5818 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5819
5820 for (unsigned HOST_WIDE_INT bit_offset = 0;
5821 bit_offset < vec_size_in_bits;
5822 bit_offset += element_bitsize)
5823 {
5824 tree bitpos = bitsize_int (bit_offset);
5825 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5826 bitsize, bitpos);
5827
5828 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5829 rhs = make_ssa_name (scalar_dest, stmt);
5830 gimple_assign_set_lhs (stmt, rhs);
5831 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5832
5833 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5834 tree new_name = make_ssa_name (scalar_dest, stmt);
5835 gimple_assign_set_lhs (stmt, new_name);
5836 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5837 lhs = new_name;
5838 }
5839 return lhs;
5840 }
5841
5842 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
5843 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5844 statement. CODE is the operation performed by STMT and OPS are
5845 its scalar operands. REDUC_INDEX is the index of the operand in
5846 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5847 implements in-order reduction, or IFN_LAST if we should open-code it.
5848 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5849 that should be used to control the operation in a fully-masked loop. */
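/* Note (illustrative): in a fully-masked loop the inactive lanes are first
   replaced with zero via merge_with_identity, so that they do not change
   the in-order result; the reduction itself is then done either with
   REDUC_FN or by open-coding the chain with vect_expand_fold_left.  */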
5850
5851 static bool
5852 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5853 stmt_vec_info *vec_stmt, slp_tree slp_node,
5854 gimple *reduc_def_stmt,
5855 tree_code code, internal_fn reduc_fn,
5856 tree ops[3], tree vectype_in,
5857 int reduc_index, vec_loop_masks *masks)
5858 {
5859 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5860 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5861 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5862 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5863 stmt_vec_info new_stmt_info = NULL;
5864
5865 int ncopies;
5866 if (slp_node)
5867 ncopies = 1;
5868 else
5869 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5870
5871 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5872 gcc_assert (ncopies == 1);
5873 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5874 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5875 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5876 == FOLD_LEFT_REDUCTION);
5877
5878 if (slp_node)
5879 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5880 TYPE_VECTOR_SUBPARTS (vectype_in)));
5881
5882 tree op0 = ops[1 - reduc_index];
5883
5884 int group_size = 1;
5885 gimple *scalar_dest_def;
5886 auto_vec<tree> vec_oprnds0;
5887 if (slp_node)
5888 {
5889 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5890 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5891 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5892 }
5893 else
5894 {
5895 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5896 vec_oprnds0.create (1);
5897 vec_oprnds0.quick_push (loop_vec_def0);
5898 scalar_dest_def = stmt;
5899 }
5900
5901 tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
5902 tree scalar_type = TREE_TYPE (scalar_dest);
5903 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5904
5905 int vec_num = vec_oprnds0.length ();
5906 gcc_assert (vec_num == 1 || slp_node);
5907 tree vec_elem_type = TREE_TYPE (vectype_out);
5908 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5909
5910 tree vector_identity = NULL_TREE;
5911 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5912 vector_identity = build_zero_cst (vectype_out);
5913
5914 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5915 int i;
5916 tree def0;
5917 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5918 {
5919 gimple *new_stmt;
5920 tree mask = NULL_TREE;
5921 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5922 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5923
5924 /* Handle MINUS by adding the negative. */
5925 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5926 {
5927 tree negated = make_ssa_name (vectype_out);
5928 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5929 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5930 def0 = negated;
5931 }
5932
5933 if (mask)
5934 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5935 vector_identity);
5936
5937 /* On the first iteration the input is simply the scalar phi
5938 result, and for subsequent iterations it is the output of
5939 the preceding operation. */
5940 if (reduc_fn != IFN_LAST)
5941 {
5942 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5943 /* For chained SLP reductions the output of the previous reduction
5944 operation serves as the input of the next. For the final statement
5945 the output cannot be a temporary - we reuse the original
5946 scalar destination of the last statement. */
5947 if (i != vec_num - 1)
5948 {
5949 gimple_set_lhs (new_stmt, scalar_dest_var);
5950 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5951 gimple_set_lhs (new_stmt, reduc_var);
5952 }
5953 }
5954 else
5955 {
5956 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5957 reduc_var, def0);
5958 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5959 /* Remove the statement, so that we can use the same code paths
5960 as for statements that we've just created. */
5961 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5962 gsi_remove (&tmp_gsi, false);
5963 }
5964
5965 if (i == vec_num - 1)
5966 {
5967 gimple_set_lhs (new_stmt, scalar_dest);
5968 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def, new_stmt);
5969 }
5970 else
5971 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def,
5972 new_stmt, gsi);
5973
5974 if (slp_node)
5975 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5976 }
5977
5978 if (!slp_node)
5979 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5980
5981 return true;
5982 }
5983
5984 /* Function is_nonwrapping_integer_induction.
5985
5986 Check whether STMT (which is part of loop LOOP) is an induction that
5987 both increments and provably cannot overflow.  */
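/* For example (illustrative): with BASE 0, STEP 4 and at most 1000
   iterations in a 16-bit type, the maximum value 4000 fits comfortably
   in 16 bits, so the induction cannot wrap and we return true.  */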
5988
5989 static bool
5990 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5991 {
5992 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5993 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5994 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5995 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5996 widest_int ni, max_loop_value, lhs_max;
5997 wi::overflow_type overflow = wi::OVF_NONE;
5998
5999 /* Make sure the loop is integer based. */
6000 if (TREE_CODE (base) != INTEGER_CST
6001 || TREE_CODE (step) != INTEGER_CST)
6002 return false;
6003
6004 /* Check that the max size of the loop will not wrap. */
6005
6006 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6007 return true;
6008
6009 if (! max_stmt_executions (loop, &ni))
6010 return false;
6011
6012 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6013 &overflow);
6014 if (overflow)
6015 return false;
6016
6017 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6018 TYPE_SIGN (lhs_type), &overflow);
6019 if (overflow)
6020 return false;
6021
6022 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6023 <= TYPE_PRECISION (lhs_type));
6024 }
6025
6026 /* Function vectorizable_reduction.
6027
6028 Check if STMT performs a reduction operation that can be vectorized.
6029 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6030 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6031 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6032
6033 This function also handles reduction idioms (patterns) that have been
6034 recognized in advance during vect_pattern_recog. In this case, STMT may be
6035 of this form:
6036 X = pattern_expr (arg0, arg1, ..., X)
6037 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6038 sequence that had been detected and replaced by the pattern-stmt (STMT).
6039
6040 This function also handles reduction of condition expressions, for example:
6041 for (int i = 0; i < N; i++)
6042 if (a[i] < value)
6043 last = a[i];
6044 This is handled by vectorizing the loop and creating an additional vector
6045 containing the loop indexes for which "a[i] < value" was true. In the
6046 function epilogue this is reduced to a single max value and then used to
6047 index into the vector of results.
6048
6049 In some cases of reduction patterns, the type of the reduction variable X is
6050 different than the type of the other arguments of STMT.
6051 In such cases, the vectype that is used when transforming STMT into a vector
6052 stmt is different than the vectype that is used to determine the
6053 vectorization factor, because it consists of a different number of elements
6054 than the actual number of elements that are being operated upon in parallel.
6055
6056 For example, consider an accumulation of shorts into an int accumulator.
6057 On some targets it's possible to vectorize this pattern operating on 8
6058 shorts at a time (hence, the vectype for purposes of determining the
6059 vectorization factor should be V8HI); on the other hand, the vectype that
6060 is used to create the vector form is actually V4SI (the type of the result).
6061
6062 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6063 indicates what is the actual level of parallelism (V8HI in the example), so
6064 that the right vectorization factor would be derived. This vectype
6065 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6066 be used to create the vectorized stmt. The right vectype for the vectorized
6067 stmt is obtained from the type of the result X:
6068 get_vectype_for_scalar_type (TREE_TYPE (X))
6069
6070 This means that, contrary to "regular" reductions (or "regular" stmts in
6071 general), the following equation:
6072 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6073 does *NOT* necessarily hold for reduction patterns. */
6074
6075 bool
6076 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6077 stmt_vec_info *vec_stmt, slp_tree slp_node,
6078 slp_instance slp_node_instance,
6079 stmt_vector_for_cost *cost_vec)
6080 {
6081 tree vec_dest;
6082 tree scalar_dest;
6083 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6084 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6085 tree vectype_in = NULL_TREE;
6086 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6087 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6088 enum tree_code code, orig_code;
6089 internal_fn reduc_fn;
6090 machine_mode vec_mode;
6091 int op_type;
6092 optab optab;
6093 tree new_temp = NULL_TREE;
6094 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6095 gimple *cond_reduc_def_stmt = NULL;
6096 enum tree_code cond_reduc_op_code = ERROR_MARK;
6097 tree scalar_type;
6098 bool is_simple_use;
6099 int i;
6100 int ncopies;
6101 int epilog_copies;
6102 stmt_vec_info prev_stmt_info, prev_phi_info;
6103 bool single_defuse_cycle = false;
6104 stmt_vec_info new_stmt_info = NULL;
6105 int j;
6106 tree ops[3];
6107 enum vect_def_type dts[3];
6108 bool nested_cycle = false, found_nested_cycle_def = false;
6109 bool double_reduc = false;
6110 basic_block def_bb;
6111 struct loop * def_stmt_loop;
6112 tree def_arg;
6113 auto_vec<tree> vec_oprnds0;
6114 auto_vec<tree> vec_oprnds1;
6115 auto_vec<tree> vec_oprnds2;
6116 auto_vec<tree> vect_defs;
6117 auto_vec<stmt_vec_info> phis;
6118 int vec_num;
6119 tree def0, tem;
6120 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6121 tree cond_reduc_val = NULL_TREE;
6122
6123 /* Make sure it was already recognized as a reduction computation. */
6124 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6125 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6126 return false;
6127
6128 if (nested_in_vect_loop_p (loop, stmt))
6129 {
6130 loop = loop->inner;
6131 nested_cycle = true;
6132 }
6133
6134 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6135 gcc_assert (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt);
6136
6137 if (gimple_code (stmt) == GIMPLE_PHI)
6138 {
6139 tree phi_result = gimple_phi_result (stmt);
6140 /* Analysis is fully done on the reduction stmt invocation. */
6141 if (! vec_stmt)
6142 {
6143 if (slp_node)
6144 slp_node_instance->reduc_phis = slp_node;
6145
6146 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6147 return true;
6148 }
6149
6150 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6151 /* Leave the scalar phi in place. Note that checking
6152 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6153 for reductions involving a single statement. */
6154 return true;
6155
6156 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6157 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6158 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6159
6160 stmt_vec_info reduc_stmt_info = vinfo_for_stmt (reduc_stmt);
6161 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6162 == EXTRACT_LAST_REDUCTION)
6163 /* Leave the scalar phi in place. */
6164 return true;
6165
6166 gcc_assert (is_gimple_assign (reduc_stmt));
6167 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6168 {
6169 tree op = gimple_op (reduc_stmt, k);
6170 if (op == gimple_phi_result (stmt))
6171 continue;
6172 if (k == 1
6173 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6174 continue;
6175 if (!vectype_in
6176 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6177 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6178 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6179 break;
6180 }
6181 gcc_assert (vectype_in);
6182
6183 if (slp_node)
6184 ncopies = 1;
6185 else
6186 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6187
6188 stmt_vec_info use_stmt_info;
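/* With more than one copy, use a single def-use cycle if the reduction
   PHI result has a single use and that use is the reduction statement
   (possibly via its pattern statement).  In that case the loop below
   creates the reduction PHIs only for the first copy.  */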
6189 if (ncopies > 1
6190 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6191 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6192 && (use_stmt_info == reduc_stmt_info
6193 || STMT_VINFO_RELATED_STMT (use_stmt_info) == reduc_stmt))
6194 single_defuse_cycle = true;
6195
6196 /* Create the destination vector */
6197 scalar_dest = gimple_assign_lhs (reduc_stmt);
6198 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6199
6200 if (slp_node)
6201 /* The size vect_schedule_slp_instance computes is off for us. */
6202 vec_num = vect_get_num_vectors
6203 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6204 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6205 vectype_in);
6206 else
6207 vec_num = 1;
6208
6209 /* Generate the reduction PHIs upfront. */
6210 prev_phi_info = NULL;
6211 for (j = 0; j < ncopies; j++)
6212 {
6213 if (j == 0 || !single_defuse_cycle)
6214 {
6215 for (i = 0; i < vec_num; i++)
6216 {
6217 /* Create the reduction-phi that defines the reduction
6218 operand. */
6219 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6220 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6221
6222 if (slp_node)
6223 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6224 else
6225 {
6226 if (j == 0)
6227 STMT_VINFO_VEC_STMT (stmt_info)
6228 = *vec_stmt = new_phi_info;
6229 else
6230 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6231 prev_phi_info = new_phi_info;
6232 }
6233 }
6234 }
6235 }
6236
6237 return true;
6238 }
6239
6240 /* 1. Is vectorizable reduction? */
6241 /* Not supportable if the reduction variable is used in the loop, unless
6242 it's a reduction chain. */
6243 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6244 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6245 return false;
6246
6247 /* Reductions that are not used even in an enclosing outer-loop
6248 are expected to be "live" (used out of the loop).  */
6249 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6250 && !STMT_VINFO_LIVE_P (stmt_info))
6251 return false;
6252
6253 /* 2. Has this been recognized as a reduction pattern?
6254
6255 Check if STMT represents a pattern that has been recognized
6256 in earlier analysis stages. For stmts that represent a pattern,
6257 the STMT_VINFO_RELATED_STMT field records the last stmt in
6258 the original sequence that constitutes the pattern. */
6259
6260 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6261 if (orig_stmt_info)
6262 {
6263 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6264 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6265 }
6266
6267 /* 3. Check the operands of the operation. The first operands are defined
6268 inside the loop body. The last operand is the reduction variable,
6269 which is defined by the loop-header-phi. */
6270
6271 gcc_assert (is_gimple_assign (stmt));
6272
6273 /* Flatten RHS. */
6274 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6275 {
6276 case GIMPLE_BINARY_RHS:
6277 code = gimple_assign_rhs_code (stmt);
6278 op_type = TREE_CODE_LENGTH (code);
6279 gcc_assert (op_type == binary_op);
6280 ops[0] = gimple_assign_rhs1 (stmt);
6281 ops[1] = gimple_assign_rhs2 (stmt);
6282 break;
6283
6284 case GIMPLE_TERNARY_RHS:
6285 code = gimple_assign_rhs_code (stmt);
6286 op_type = TREE_CODE_LENGTH (code);
6287 gcc_assert (op_type == ternary_op);
6288 ops[0] = gimple_assign_rhs1 (stmt);
6289 ops[1] = gimple_assign_rhs2 (stmt);
6290 ops[2] = gimple_assign_rhs3 (stmt);
6291 break;
6292
6293 case GIMPLE_UNARY_RHS:
6294 return false;
6295
6296 default:
6297 gcc_unreachable ();
6298 }
6299
6300 if (code == COND_EXPR && slp_node)
6301 return false;
6302
6303 scalar_dest = gimple_assign_lhs (stmt);
6304 scalar_type = TREE_TYPE (scalar_dest);
6305 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6306 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6307 return false;
6308
6309 /* Do not try to vectorize bit-precision reductions. */
6310 if (!type_has_mode_precision_p (scalar_type))
6311 return false;
6312
6313 /* All operands but the last are expected to be defined in the loop.
6314 The last operand is the reduction variable.  In case of a nested
6315 cycle this assumption is not true: we use reduc_index to record the
6316 index of the reduction variable.  */
6317 gimple *reduc_def_stmt = NULL;
6318 int reduc_index = -1;
6319 for (i = 0; i < op_type; i++)
6320 {
6321 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6322 if (i == 0 && code == COND_EXPR)
6323 continue;
6324
6325 stmt_vec_info def_stmt_info;
6326 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6327 &def_stmt_info);
6328 dt = dts[i];
6329 gcc_assert (is_simple_use);
6330 if (dt == vect_reduction_def)
6331 {
6332 reduc_def_stmt = def_stmt_info;
6333 reduc_index = i;
6334 continue;
6335 }
6336 else if (tem)
6337 {
6338 /* To properly compute ncopies we are interested in the widest
6339 input type in case we're looking at a widening accumulation. */
6340 if (!vectype_in
6341 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6342 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6343 vectype_in = tem;
6344 }
6345
6346 if (dt != vect_internal_def
6347 && dt != vect_external_def
6348 && dt != vect_constant_def
6349 && dt != vect_induction_def
6350 && !(dt == vect_nested_cycle && nested_cycle))
6351 return false;
6352
6353 if (dt == vect_nested_cycle)
6354 {
6355 found_nested_cycle_def = true;
6356 reduc_def_stmt = def_stmt_info;
6357 reduc_index = i;
6358 }
6359
6360 if (i == 1 && code == COND_EXPR)
6361 {
6362 /* Record how value of COND_EXPR is defined. */
6363 if (dt == vect_constant_def)
6364 {
6365 cond_reduc_dt = dt;
6366 cond_reduc_val = ops[i];
6367 }
6368 if (dt == vect_induction_def
6369 && def_stmt_info
6370 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6371 {
6372 cond_reduc_dt = dt;
6373 cond_reduc_def_stmt = def_stmt_info;
6374 }
6375 }
6376 }
6377
6378 if (!vectype_in)
6379 vectype_in = vectype_out;
6380
6381 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6382 directly used in stmt.  */
6383 if (reduc_index == -1)
6384 {
6385 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6386 {
6387 if (dump_enabled_p ())
6388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6389 "in-order reduction chain without SLP.\n");
6390 return false;
6391 }
6392
6393 if (orig_stmt_info)
6394 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6395 else
6396 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6397 }
6398
6399 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6400 return false;
6401
6402 if (!(reduc_index == -1
6403 || dts[reduc_index] == vect_reduction_def
6404 || dts[reduc_index] == vect_nested_cycle
6405 || ((dts[reduc_index] == vect_internal_def
6406 || dts[reduc_index] == vect_external_def
6407 || dts[reduc_index] == vect_constant_def
6408 || dts[reduc_index] == vect_induction_def)
6409 && nested_cycle && found_nested_cycle_def)))
6410 {
6411 /* For pattern recognized stmts, orig_stmt might be a reduction,
6412 but some helper statements for the pattern might not, or
6413 might be COND_EXPRs with reduction uses in the condition. */
6414 gcc_assert (orig_stmt_info);
6415 return false;
6416 }
6417
6418 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6419 /* PHIs should not participate in patterns. */
6420 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6421 enum vect_reduction_type v_reduc_type
6422 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6423 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6424
6425 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6426 /* If we have a condition reduction, see if we can simplify it further. */
6427 if (v_reduc_type == COND_REDUCTION)
6428 {
6429 /* TODO: We can't yet handle reduction chains, since we need to treat
6430 each COND_EXPR in the chain specially, not just the last one.
6431 E.g. for:
6432
6433 x_1 = PHI <x_3, ...>
6434 x_2 = a_2 ? ... : x_1;
6435 x_3 = a_3 ? ... : x_2;
6436
6437 we're interested in the last element in x_3 for which a_2 || a_3
6438 is true, whereas the current reduction chain handling would
6439 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6440 as a reduction operation. */
6441 if (reduc_index == -1)
6442 {
6443 if (dump_enabled_p ())
6444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6445 "conditional reduction chains not supported\n");
6446 return false;
6447 }
6448
6449 /* vect_is_simple_reduction ensured that operand 2 is the
6450 loop-carried operand. */
6451 gcc_assert (reduc_index == 2);
6452
6453 /* Loop peeling modifies the initial value of the reduction PHI, which
6454 makes the reduction stmt that is transformed differ from the original
6455 stmt that was analyzed.  We therefore record the reduction code for
6456 CONST_COND_REDUCTION type reductions at analysis time so that it
6457 can be used directly at transform time.  */
6458 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6459 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6460 {
6461 /* Also set the reduction type to CONST_COND_REDUCTION. */
6462 gcc_assert (cond_reduc_dt == vect_constant_def);
6463 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6464 }
6465 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6466 vectype_in, OPTIMIZE_FOR_SPEED))
6467 {
6468 if (dump_enabled_p ())
6469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6470 "optimizing condition reduction with"
6471 " FOLD_EXTRACT_LAST.\n");
6472 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6473 }
6474 else if (cond_reduc_dt == vect_induction_def)
6475 {
6476 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6477 tree base
6478 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6479 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6480
6481 gcc_assert (TREE_CODE (base) == INTEGER_CST
6482 && TREE_CODE (step) == INTEGER_CST);
6483 cond_reduc_val = NULL_TREE;
6484 /* Find a suitable value: below BASE for MAX_EXPR, above BASE for
6485 MIN_EXPR.  Punt for now if BASE is the minimum value of the type
6486 for MAX_EXPR or the maximum value of the type for MIN_EXPR.  */
6487 if (tree_int_cst_sgn (step) == -1)
6488 {
6489 cond_reduc_op_code = MIN_EXPR;
6490 if (tree_int_cst_sgn (base) == -1)
6491 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6492 else if (tree_int_cst_lt (base,
6493 TYPE_MAX_VALUE (TREE_TYPE (base))))
6494 cond_reduc_val
6495 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6496 }
6497 else
6498 {
6499 cond_reduc_op_code = MAX_EXPR;
6500 if (tree_int_cst_sgn (base) == 1)
6501 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6502 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6503 base))
6504 cond_reduc_val
6505 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6506 }
6507 if (cond_reduc_val)
6508 {
6509 if (dump_enabled_p ())
6510 dump_printf_loc (MSG_NOTE, vect_location,
6511 "condition expression based on "
6512 "integer induction.\n");
6513 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6514 = INTEGER_INDUC_COND_REDUCTION;
6515 }
6516 }
6517 else if (cond_reduc_dt == vect_constant_def)
6518 {
6519 enum vect_def_type cond_initial_dt;
6520 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6521 tree cond_initial_val
6522 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6523
6524 gcc_assert (cond_reduc_val != NULL_TREE);
6525 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6526 if (cond_initial_dt == vect_constant_def
6527 && types_compatible_p (TREE_TYPE (cond_initial_val),
6528 TREE_TYPE (cond_reduc_val)))
6529 {
6530 tree e = fold_binary (LE_EXPR, boolean_type_node,
6531 cond_initial_val, cond_reduc_val);
6532 if (e && (integer_onep (e) || integer_zerop (e)))
6533 {
6534 if (dump_enabled_p ())
6535 dump_printf_loc (MSG_NOTE, vect_location,
6536 "condition expression based on "
6537 "compile time constant.\n");
6538 /* Record reduction code at analysis stage. */
6539 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6540 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6541 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6542 = CONST_COND_REDUCTION;
6543 }
6544 }
6545 }
6546 }
6547
6548 if (orig_stmt_info)
6549 gcc_assert (tmp == orig_stmt_info
6550 || (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp))
6551 == orig_stmt_info));
6552 else
6553 /* We changed STMT to be the first stmt in the reduction chain, hence
6554 we check that in this case the first element in the chain is STMT.  */
6555 gcc_assert (stmt == tmp
6556 || REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6557
6558 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6559 return false;
6560
6561 if (slp_node)
6562 ncopies = 1;
6563 else
6564 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6565
6566 gcc_assert (ncopies >= 1);
6567
6568 vec_mode = TYPE_MODE (vectype_in);
6569 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6570
6571 if (code == COND_EXPR)
6572 {
6573 /* Only call during the analysis stage, otherwise we'll lose
6574 STMT_VINFO_TYPE. */
6575 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6576 ops[reduc_index], 0, NULL,
6577 cost_vec))
6578 {
6579 if (dump_enabled_p ())
6580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6581 "unsupported condition in reduction\n");
6582 return false;
6583 }
6584 }
6585 else
6586 {
6587 /* 4. Supportable by target? */
6588
6589 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6590 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6591 {
6592 /* Shifts and rotates are only supported by vectorizable_shift,
6593 not vectorizable_reduction.  */
6594 if (dump_enabled_p ())
6595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6596 "unsupported shift or rotation.\n");
6597 return false;
6598 }
6599
6600 /* 4.1. check support for the operation in the loop */
6601 optab = optab_for_tree_code (code, vectype_in, optab_default);
6602 if (!optab)
6603 {
6604 if (dump_enabled_p ())
6605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6606 "no optab.\n");
6607
6608 return false;
6609 }
6610
6611 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6612 {
6613 if (dump_enabled_p ())
6614 dump_printf (MSG_NOTE, "op not supported by target.\n");
6615
6616 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6617 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6618 return false;
6619
6620 if (dump_enabled_p ())
6621 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6622 }
6623
6624 /* Worthwhile without SIMD support? */
6625 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6626 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6627 {
6628 if (dump_enabled_p ())
6629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6630 "not worthwhile without SIMD support.\n");
6631
6632 return false;
6633 }
6634 }
6635
6636 /* 4.2. Check support for the epilog operation.
6637
6638 If STMT represents a reduction pattern, then the type of the
6639 reduction variable may be different than the type of the rest
6640 of the arguments. For example, consider the case of accumulation
6641 of shorts into an int accumulator.  The original code:
6642 S1: int_a = (int) short_a;
6643 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6644
6645 was replaced with:
6646 STMT: int_acc = widen_sum <short_a, int_acc>
6647
6648 This means that:
6649 1. The tree-code that is used to create the vector operation in the
6650 epilog code (that reduces the partial results) is not the
6651 tree-code of STMT, but is rather the tree-code of the original
6652 stmt from the pattern that STMT is replacing. I.e, in the example
6653 above we want to use 'widen_sum' in the loop, but 'plus' in the
6654 epilog.
6655 2. The type (mode) we use to check available target support
6656 for the vector operation to be created in the *epilog*, is
6657 determined by the type of the reduction variable (in the example
6658 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6659 However the type (mode) we use to check available target support
6660 for the vector operation to be created *inside the loop*, is
6661 determined by the type of the other arguments to STMT (in the
6662 example we'd check this: optab_handler (widen_sum_optab,
6663 vect_short_mode)).
6664
6665 This is contrary to "regular" reductions, in which the types of all
6666 the arguments are the same as the type of the reduction variable.
6667 For "regular" reductions we can therefore use the same vector type
6668 (and also the same tree-code) when generating the epilog code and
6669 when generating the code inside the loop. */
6670
6671 vect_reduction_type reduction_type
6672 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6673 if (orig_stmt_info
6674 && (reduction_type == TREE_CODE_REDUCTION
6675 || reduction_type == FOLD_LEFT_REDUCTION))
6676 {
6677 /* This is a reduction pattern: get the vectype from the type of the
6678 reduction variable, and get the tree-code from orig_stmt. */
6679 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6680 gcc_assert (vectype_out);
6681 vec_mode = TYPE_MODE (vectype_out);
6682 }
6683 else
6684 {
6685 /* Regular reduction: the same vectype and tree-code used for the
6686 vector code inside the loop can also be used for the epilog code. */
6687 orig_code = code;
6688
6689 if (code == MINUS_EXPR)
6690 orig_code = PLUS_EXPR;
6691
6692 /* For simple condition reductions, replace with the actual expression
6693 we want to base our reduction around. */
6694 if (reduction_type == CONST_COND_REDUCTION)
6695 {
6696 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6697 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6698 }
6699 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6700 orig_code = cond_reduc_op_code;
6701 }
6702
6703 if (nested_cycle)
6704 {
6705 def_bb = gimple_bb (reduc_def_stmt);
6706 def_stmt_loop = def_bb->loop_father;
6707 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6708 loop_preheader_edge (def_stmt_loop));
6709 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6710 if (def_arg_stmt_info
6711 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6712 == vect_double_reduction_def))
6713 double_reduc = true;
6714 }
6715
6716 reduc_fn = IFN_LAST;
6717
6718 if (reduction_type == TREE_CODE_REDUCTION
6719 || reduction_type == FOLD_LEFT_REDUCTION
6720 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6721 || reduction_type == CONST_COND_REDUCTION)
6722 {
6723 if (reduction_type == FOLD_LEFT_REDUCTION
6724 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6725 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6726 {
6727 if (reduc_fn != IFN_LAST
6728 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6729 OPTIMIZE_FOR_SPEED))
6730 {
6731 if (dump_enabled_p ())
6732 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6733 "reduc op not supported by target.\n");
6734
6735 reduc_fn = IFN_LAST;
6736 }
6737 }
6738 else
6739 {
6740 if (!nested_cycle || double_reduc)
6741 {
6742 if (dump_enabled_p ())
6743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6744 "no reduc code for scalar code.\n");
6745
6746 return false;
6747 }
6748 }
6749 }
6750 else if (reduction_type == COND_REDUCTION)
6751 {
6752 int scalar_precision
6753 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6754 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6755 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6756 nunits_out);
6757
6758 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6759 OPTIMIZE_FOR_SPEED))
6760 reduc_fn = IFN_REDUC_MAX;
6761 }
6762
6763 if (reduction_type != EXTRACT_LAST_REDUCTION
6764 && reduc_fn == IFN_LAST
6765 && !nunits_out.is_constant ())
6766 {
6767 if (dump_enabled_p ())
6768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6769 "missing target support for reduction on"
6770 " variable-length vectors.\n");
6771 return false;
6772 }
6773
6774 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6775 && ncopies > 1)
6776 {
6777 if (dump_enabled_p ())
6778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6779 "multiple types in double reduction or condition "
6780 "reduction.\n");
6781 return false;
6782 }
6783
6784 /* For SLP reductions, see if there is a neutral value we can use. */
6785 tree neutral_op = NULL_TREE;
6786 if (slp_node)
6787 neutral_op = neutral_op_for_slp_reduction
6788 (slp_node_instance->reduc_phis, code,
6789 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6790
6791 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6792 {
6793 /* We can't support in-order reductions of code such as this:
6794
6795 for (int i = 0; i < n1; ++i)
6796 for (int j = 0; j < n2; ++j)
6797 l += a[j];
6798
6799 since GCC effectively transforms the loop when vectorizing:
6800
6801 for (int i = 0; i < n1 / VF; ++i)
6802 for (int j = 0; j < n2; ++j)
6803 for (int k = 0; k < VF; ++k)
6804 l += a[j];
6805
6806 which is a reassociation of the original operation. */
6807 if (dump_enabled_p ())
6808 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6809 "in-order double reduction not supported.\n");
6810
6811 return false;
6812 }
6813
6814 if (reduction_type == FOLD_LEFT_REDUCTION
6815 && slp_node
6816 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6817 {
6818 /* We cannot use in-order reductions in this case because there is
6819 an implicit reassociation of the operations involved. */
6820 if (dump_enabled_p ())
6821 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6822 "in-order unchained SLP reductions not supported.\n");
6823 return false;
6824 }
6825
6826 /* For double reductions, and for SLP reductions with a neutral value,
6827 we construct a variable-length initial vector by loading a vector
6828 full of the neutral value and then shift-and-inserting the start
6829 values into the low-numbered elements. */
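/* For instance, assuming a single accumulator with a start value (INIT, say)
   and neutral value 0, the initial vector built this way is conceptually
   { INIT, 0, 0, ... }, i.e. INIT shift-inserted into the low elements of a
   vector of neutral values.  */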
6830 if ((double_reduc || neutral_op)
6831 && !nunits_out.is_constant ()
6832 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6833 vectype_out, OPTIMIZE_FOR_SPEED))
6834 {
6835 if (dump_enabled_p ())
6836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6837 "reduction on variable-length vectors requires"
6838 " target support for a vector-shift-and-insert"
6839 " operation.\n");
6840 return false;
6841 }
6842
6843 /* Check extra constraints for variable-length unchained SLP reductions. */
6844 if (STMT_SLP_TYPE (stmt_info)
6845 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6846 && !nunits_out.is_constant ())
6847 {
6848 /* We checked above that we could build the initial vector when
6849 there's a neutral element value. Check here for the case in
6850 which each SLP statement has its own initial value and in which
6851 that value needs to be repeated for every instance of the
6852 statement within the initial vector. */
6853 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6854 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6855 if (!neutral_op
6856 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6857 {
6858 if (dump_enabled_p ())
6859 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6860 "unsupported form of SLP reduction for"
6861 " variable-length vectors: cannot build"
6862 " initial vector.\n");
6863 return false;
6864 }
6865 /* The epilogue code relies on the number of elements being a multiple
6866 of the group size. The duplicate-and-interleave approach to setting
6867 up the initial vector does too. */
6868 if (!multiple_p (nunits_out, group_size))
6869 {
6870 if (dump_enabled_p ())
6871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6872 "unsupported form of SLP reduction for"
6873 " variable-length vectors: the vector size"
6874 " is not a multiple of the number of results.\n");
6875 return false;
6876 }
6877 }
6878
6879 /* In case of widening multiplication by a constant, we update the type
6880 of the constant to be the type of the other operand. We check that the
6881 constant fits the type in the pattern recognition pass. */
6882 if (code == DOT_PROD_EXPR
6883 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6884 {
6885 if (TREE_CODE (ops[0]) == INTEGER_CST)
6886 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6887 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6888 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6889 else
6890 {
6891 if (dump_enabled_p ())
6892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6893 "invalid types in dot-prod\n");
6894
6895 return false;
6896 }
6897 }
6898
6899 if (reduction_type == COND_REDUCTION)
6900 {
6901 widest_int ni;
6902
6903 if (! max_loop_iterations (loop, &ni))
6904 {
6905 if (dump_enabled_p ())
6906 dump_printf_loc (MSG_NOTE, vect_location,
6907 "loop count not known, cannot create cond "
6908 "reduction.\n");
6909 return false;
6910 }
6911 /* Convert backedges to iterations. */
6912 ni += 1;
6913
6914 /* The additional index will be the same type as the condition. Check
6915 that the loop can fit into this type less one (because we'll use up the
6916 zero slot for when there are no matches). */
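/* E.g. assuming an 8-bit scalar type, cr_index_scalar_type is an unsigned
   8-bit type, max_index is 255, and only loops of at most 254 iterations
   can be handled this way.  */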
6917 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6918 if (wi::geu_p (ni, wi::to_widest (max_index)))
6919 {
6920 if (dump_enabled_p ())
6921 dump_printf_loc (MSG_NOTE, vect_location,
6922 "loop size is greater than data size.\n");
6923 return false;
6924 }
6925 }
6926
6927 /* In case the vectorization factor (VF) is bigger than the number
6928 of elements that we can fit in a vectype (nunits), we have to generate
6929 more than one vector stmt - i.e., we need to "unroll" the
6930 vector stmt by a factor VF/nunits. For more details see documentation
6931 in vectorizable_operation. */
6932
6933 /* If the reduction is used in an outer loop we need to generate
6934 VF intermediate results, like so (e.g. for ncopies=2):
6935 r0 = phi (init, r0)
6936 r1 = phi (init, r1)
6937 r0 = x0 + r0;
6938 r1 = x1 + r1;
6939 (i.e. we generate VF results in 2 registers).
6940 In this case we have a separate def-use cycle for each copy, and therefore
6941 for each copy we get the vector def for the reduction variable from the
6942 respective phi node created for this copy.
6943
6944 Otherwise (the reduction is unused in the loop nest), we can combine
6945 together intermediate results, like so (e.g. for ncopies=2):
6946 r = phi (init, r)
6947 r = x0 + r;
6948 r = x1 + r;
6949 (i.e. we generate VF/2 results in a single register).
6950 In this case for each copy we get the vector def for the reduction variable
6951 from the vectorized reduction operation generated in the previous iteration.
6952
6953 This only works when we see both the reduction PHI and its only consumer
6954 in vectorizable_reduction and there are no intermediate stmts
6955 participating. */
6956 stmt_vec_info use_stmt_info;
6957 tree reduc_phi_result = gimple_phi_result (reduc_def_stmt);
6958 if (ncopies > 1
6959 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6960 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6961 && (use_stmt_info == stmt_info
6962 || STMT_VINFO_RELATED_STMT (use_stmt_info) == stmt))
6963 {
6964 single_defuse_cycle = true;
6965 epilog_copies = 1;
6966 }
6967 else
6968 epilog_copies = ncopies;
6969
6970 /* If the reduction stmt is one of the patterns that have a lane
6971 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
6972 if ((ncopies > 1
6973 && ! single_defuse_cycle)
6974 && (code == DOT_PROD_EXPR
6975 || code == WIDEN_SUM_EXPR
6976 || code == SAD_EXPR))
6977 {
6978 if (dump_enabled_p ())
6979 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6980 "multi def-use cycle not possible for lane-reducing "
6981 "reduction operation\n");
6982 return false;
6983 }
6984
6985 if (slp_node)
6986 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6987 else
6988 vec_num = 1;
6989
6990 internal_fn cond_fn = get_conditional_internal_fn (code);
6991 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6992
6993 if (!vec_stmt) /* transformation not required. */
6994 {
6995 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6996 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6997 {
6998 if (reduction_type != FOLD_LEFT_REDUCTION
6999 && (cond_fn == IFN_LAST
7000 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7001 OPTIMIZE_FOR_SPEED)))
7002 {
7003 if (dump_enabled_p ())
7004 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7005 "can't use a fully-masked loop because no"
7006 " conditional operation is available.\n");
7007 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7008 }
7009 else if (reduc_index == -1)
7010 {
7011 if (dump_enabled_p ())
7012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7013 "can't use a fully-masked loop for chained"
7014 " reductions.\n");
7015 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7016 }
7017 else
7018 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7019 vectype_in);
7020 }
7021 if (dump_enabled_p ()
7022 && reduction_type == FOLD_LEFT_REDUCTION)
7023 dump_printf_loc (MSG_NOTE, vect_location,
7024 "using an in-order (fold-left) reduction.\n");
7025 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7026 return true;
7027 }
7028
7029 /* Transform. */
7030
7031 if (dump_enabled_p ())
7032 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7033
7034 /* FORNOW: Multiple types are not supported for condition. */
7035 if (code == COND_EXPR)
7036 gcc_assert (ncopies == 1);
7037
7038 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7039
7040 if (reduction_type == FOLD_LEFT_REDUCTION)
7041 return vectorize_fold_left_reduction
7042 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7043 reduc_fn, ops, vectype_in, reduc_index, masks);
7044
7045 if (reduction_type == EXTRACT_LAST_REDUCTION)
7046 {
7047 gcc_assert (!slp_node);
7048 return vectorizable_condition (stmt, gsi, vec_stmt,
7049 NULL, reduc_index, NULL, NULL);
7050 }
7051
7052 /* Create the destination vector */
7053 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7054
7055 prev_stmt_info = NULL;
7056 prev_phi_info = NULL;
7057 if (!slp_node)
7058 {
7059 vec_oprnds0.create (1);
7060 vec_oprnds1.create (1);
7061 if (op_type == ternary_op)
7062 vec_oprnds2.create (1);
7063 }
7064
7065 phis.create (vec_num);
7066 vect_defs.create (vec_num);
7067 if (!slp_node)
7068 vect_defs.quick_push (NULL_TREE);
7069
7070 if (slp_node)
7071 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7072 else
7073 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7074
7075 for (j = 0; j < ncopies; j++)
7076 {
7077 if (code == COND_EXPR)
7078 {
7079 gcc_assert (!slp_node);
7080 vectorizable_condition (stmt, gsi, vec_stmt,
7081 PHI_RESULT (phis[0]->stmt),
7082 reduc_index, NULL, NULL);
7083 /* Multiple types are not supported for condition. */
7084 break;
7085 }
7086
7087 /* Handle uses. */
7088 if (j == 0)
7089 {
7090 if (slp_node)
7091 {
7092 /* Get vec defs for all the operands except the reduction index,
7093 ensuring the ordering of the ops in the vector is kept. */
7094 auto_vec<tree, 3> slp_ops;
7095 auto_vec<vec<tree>, 3> vec_defs;
7096
7097 slp_ops.quick_push (ops[0]);
7098 slp_ops.quick_push (ops[1]);
7099 if (op_type == ternary_op)
7100 slp_ops.quick_push (ops[2]);
7101
7102 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7103
7104 vec_oprnds0.safe_splice (vec_defs[0]);
7105 vec_defs[0].release ();
7106 vec_oprnds1.safe_splice (vec_defs[1]);
7107 vec_defs[1].release ();
7108 if (op_type == ternary_op)
7109 {
7110 vec_oprnds2.safe_splice (vec_defs[2]);
7111 vec_defs[2].release ();
7112 }
7113 }
7114 else
7115 {
7116 vec_oprnds0.quick_push
7117 (vect_get_vec_def_for_operand (ops[0], stmt));
7118 vec_oprnds1.quick_push
7119 (vect_get_vec_def_for_operand (ops[1], stmt));
7120 if (op_type == ternary_op)
7121 vec_oprnds2.quick_push
7122 (vect_get_vec_def_for_operand (ops[2], stmt));
7123 }
7124 }
7125 else
7126 {
7127 if (!slp_node)
7128 {
7129 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7130
7131 if (single_defuse_cycle && reduc_index == 0)
7132 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7133 else
7134 vec_oprnds0[0]
7135 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7136 if (single_defuse_cycle && reduc_index == 1)
7137 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7138 else
7139 vec_oprnds1[0]
7140 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7141 if (op_type == ternary_op)
7142 {
7143 if (single_defuse_cycle && reduc_index == 2)
7144 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7145 else
7146 vec_oprnds2[0]
7147 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7148 }
7149 }
7150 }
7151
7152 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7153 {
7154 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7155 if (masked_loop_p)
7156 {
7157 /* Make sure that the reduction accumulator is vop[0]. */
7158 if (reduc_index == 1)
7159 {
7160 gcc_assert (commutative_tree_code (code));
7161 std::swap (vop[0], vop[1]);
7162 }
7163 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7164 vectype_in, i * ncopies + j);
7165 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7166 vop[0], vop[1],
7167 vop[0]);
7168 new_temp = make_ssa_name (vec_dest, call);
7169 gimple_call_set_lhs (call, new_temp);
7170 gimple_call_set_nothrow (call, true);
7171 new_stmt_info = vect_finish_stmt_generation (stmt, call, gsi);
7172 }
7173 else
7174 {
7175 if (op_type == ternary_op)
7176 vop[2] = vec_oprnds2[i];
7177
7178 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7179 vop[0], vop[1], vop[2]);
7180 new_temp = make_ssa_name (vec_dest, new_stmt);
7181 gimple_assign_set_lhs (new_stmt, new_temp);
7182 new_stmt_info
7183 = vect_finish_stmt_generation (stmt, new_stmt, gsi);
7184 }
7185
7186 if (slp_node)
7187 {
7188 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7189 vect_defs.quick_push (new_temp);
7190 }
7191 else
7192 vect_defs[0] = new_temp;
7193 }
7194
7195 if (slp_node)
7196 continue;
7197
7198 if (j == 0)
7199 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7200 else
7201 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7202
7203 prev_stmt_info = new_stmt_info;
7204 }
7205
7206 /* Finalize the reduction-phi (set its arguments) and create the
7207 epilog reduction code. */
7208 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7209 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7210
7211 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7212 epilog_copies, reduc_fn, phis,
7213 double_reduc, slp_node, slp_node_instance,
7214 cond_reduc_val, cond_reduc_op_code,
7215 neutral_op);
7216
7217 return true;
7218 }
7219
7220 /* Function vect_min_worthwhile_factor.
7221
7222 For a loop where we could vectorize the operation indicated by CODE,
7223 return the minimum vectorization factor that makes it worthwhile
7224 to use generic vectors. */
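/* For instance, emulating a vector PLUS_EXPR with word-mode operations is
   only treated as worthwhile (see vect_worthwhile_without_simd_p below)
   when the vectorization factor is at least 4, while bitwise operations
   only need a factor of 2.  */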
7225 static unsigned int
7226 vect_min_worthwhile_factor (enum tree_code code)
7227 {
7228 switch (code)
7229 {
7230 case PLUS_EXPR:
7231 case MINUS_EXPR:
7232 case NEGATE_EXPR:
7233 return 4;
7234
7235 case BIT_AND_EXPR:
7236 case BIT_IOR_EXPR:
7237 case BIT_XOR_EXPR:
7238 case BIT_NOT_EXPR:
7239 return 2;
7240
7241 default:
7242 return INT_MAX;
7243 }
7244 }
7245
7246 /* Return true if VINFO indicates we are doing loop vectorization and if
7247 it is worth decomposing CODE operations into scalar operations for
7248 that loop's vectorization factor. */
7249
7250 bool
7251 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7252 {
7253 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7254 unsigned HOST_WIDE_INT value;
7255 return (loop_vinfo
7256 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7257 && value >= vect_min_worthwhile_factor (code));
7258 }
7259
7260 /* Function vectorizable_induction
7261
7262 Check if PHI performs an induction computation that can be vectorized.
7263 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7264 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7265 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7266
7267 bool
7268 vectorizable_induction (gimple *phi,
7269 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7270 stmt_vec_info *vec_stmt, slp_tree slp_node,
7271 stmt_vector_for_cost *cost_vec)
7272 {
7273 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7274 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7275 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7276 unsigned ncopies;
7277 bool nested_in_vect_loop = false;
7278 struct loop *iv_loop;
7279 tree vec_def;
7280 edge pe = loop_preheader_edge (loop);
7281 basic_block new_bb;
7282 tree new_vec, vec_init, vec_step, t;
7283 tree new_name;
7284 gimple *new_stmt;
7285 gphi *induction_phi;
7286 tree induc_def, vec_dest;
7287 tree init_expr, step_expr;
7288 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7289 unsigned i;
7290 tree expr;
7291 gimple_seq stmts;
7292 imm_use_iterator imm_iter;
7293 use_operand_p use_p;
7294 gimple *exit_phi;
7295 edge latch_e;
7296 tree loop_arg;
7297 gimple_stmt_iterator si;
7298 basic_block bb = gimple_bb (phi);
7299
7300 if (gimple_code (phi) != GIMPLE_PHI)
7301 return false;
7302
7303 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7304 return false;
7305
7306 /* Make sure it was recognized as induction computation. */
7307 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7308 return false;
7309
7310 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7311 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7312
7313 if (slp_node)
7314 ncopies = 1;
7315 else
7316 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7317 gcc_assert (ncopies >= 1);
7318
7319 /* FORNOW. These restrictions should be relaxed. */
7320 if (nested_in_vect_loop_p (loop, phi))
7321 {
7322 imm_use_iterator imm_iter;
7323 use_operand_p use_p;
7324 gimple *exit_phi;
7325 edge latch_e;
7326 tree loop_arg;
7327
7328 if (ncopies > 1)
7329 {
7330 if (dump_enabled_p ())
7331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7332 "multiple types in nested loop.\n");
7333 return false;
7334 }
7335
7336 /* FORNOW: outer loop induction with SLP not supported. */
7337 if (STMT_SLP_TYPE (stmt_info))
7338 return false;
7339
7340 exit_phi = NULL;
7341 latch_e = loop_latch_edge (loop->inner);
7342 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7343 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7344 {
7345 gimple *use_stmt = USE_STMT (use_p);
7346 if (is_gimple_debug (use_stmt))
7347 continue;
7348
7349 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7350 {
7351 exit_phi = use_stmt;
7352 break;
7353 }
7354 }
7355 if (exit_phi)
7356 {
7357 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7358 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7359 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7360 {
7361 if (dump_enabled_p ())
7362 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7363 "inner-loop induction only used outside "
7364 "of the outer vectorized loop.\n");
7365 return false;
7366 }
7367 }
7368
7369 nested_in_vect_loop = true;
7370 iv_loop = loop->inner;
7371 }
7372 else
7373 iv_loop = loop;
7374 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7375
7376 if (slp_node && !nunits.is_constant ())
7377 {
7378 /* The current SLP code creates the initial value element-by-element. */
7379 if (dump_enabled_p ())
7380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7381 "SLP induction not supported for variable-length"
7382 " vectors.\n");
7383 return false;
7384 }
7385
7386 if (!vec_stmt) /* transformation not required. */
7387 {
7388 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7389 DUMP_VECT_SCOPE ("vectorizable_induction");
7390 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7391 return true;
7392 }
7393
7394 /* Transform. */
7395
7396 /* Compute a vector variable, initialized with the first VF values of
7397 the induction variable. E.g., for an iv with IV_PHI='X' and
7398 evolution S, for a vector of 4 units, we want to compute:
7399 [X, X + S, X + 2*S, X + 3*S]. */
7400
7401 if (dump_enabled_p ())
7402 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7403
7404 latch_e = loop_latch_edge (iv_loop);
7405 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7406
7407 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7408 gcc_assert (step_expr != NULL_TREE);
7409
7410 pe = loop_preheader_edge (iv_loop);
7411 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7412 loop_preheader_edge (iv_loop));
7413
7414 stmts = NULL;
7415 if (!nested_in_vect_loop)
7416 {
7417 /* Convert the initial value to the desired type. */
7418 tree new_type = TREE_TYPE (vectype);
7419 init_expr = gimple_convert (&stmts, new_type, init_expr);
7420
7421 /* If we are using the loop mask to "peel" for alignment then we need
7422 to adjust the start value here. */
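/* E.g. for an IV with start value init_expr and step step_expr, skipping
   the first skip_niters scalar iterations via the mask means the vector IV
   must start at init_expr - skip_niters * step_expr, which is what the
   code below builds.  */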
7423 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7424 if (skip_niters != NULL_TREE)
7425 {
7426 if (FLOAT_TYPE_P (vectype))
7427 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7428 skip_niters);
7429 else
7430 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7431 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7432 skip_niters, step_expr);
7433 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7434 init_expr, skip_step);
7435 }
7436 }
7437
7438 /* Convert the step to the desired type. */
7439 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7440
7441 if (stmts)
7442 {
7443 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7444 gcc_assert (!new_bb);
7445 }
7446
7447 /* Find the first insertion point in the BB. */
7448 si = gsi_after_labels (bb);
7449
7450 /* For SLP induction we have to generate several IVs; for example,
7451 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7452 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7453 [VF*S, VF*S, VF*S, VF*S] for all. */
7454 if (slp_node)
7455 {
7456 /* Enforced above. */
7457 unsigned int const_nunits = nunits.to_constant ();
7458
7459 /* Generate [VF*S, VF*S, ... ]. */
7460 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7461 {
7462 expr = build_int_cst (integer_type_node, vf);
7463 expr = fold_convert (TREE_TYPE (step_expr), expr);
7464 }
7465 else
7466 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7467 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7468 expr, step_expr);
7469 if (! CONSTANT_CLASS_P (new_name))
7470 new_name = vect_init_vector (phi, new_name,
7471 TREE_TYPE (step_expr), NULL);
7472 new_vec = build_vector_from_val (vectype, new_name);
7473 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7474
7475 /* Now generate the IVs. */
7476 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7477 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7478 unsigned elts = const_nunits * nvects;
7479 unsigned nivs = least_common_multiple (group_size,
7480 const_nunits) / const_nunits;
7481 gcc_assert (elts % group_size == 0);
7482 tree elt = init_expr;
7483 unsigned ivn;
7484 for (ivn = 0; ivn < nivs; ++ivn)
7485 {
7486 tree_vector_builder elts (vectype, const_nunits, 1);
7487 stmts = NULL;
7488 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7489 {
7490 if (ivn*const_nunits + eltn >= group_size
7491 && (ivn * const_nunits + eltn) % group_size == 0)
7492 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7493 elt, step_expr);
7494 elts.quick_push (elt);
7495 }
7496 vec_init = gimple_build_vector (&stmts, &elts);
7497 if (stmts)
7498 {
7499 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7500 gcc_assert (!new_bb);
7501 }
7502
7503 /* Create the induction-phi that defines the induction-operand. */
7504 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7505 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7506 stmt_vec_info induction_phi_info
7507 = loop_vinfo->add_stmt (induction_phi);
7508 induc_def = PHI_RESULT (induction_phi);
7509
7510 /* Create the iv update inside the loop */
7511 vec_def = make_ssa_name (vec_dest);
7512 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7513 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7514 loop_vinfo->add_stmt (new_stmt);
7515
7516 /* Set the arguments of the phi node: */
7517 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7518 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7519 UNKNOWN_LOCATION);
7520
7521 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7522 }
7523
7524 /* Re-use IVs when we can. */
7525 if (ivn < nvects)
7526 {
7527 unsigned vfp
7528 = least_common_multiple (group_size, const_nunits) / group_size;
7529 /* Generate [VF'*S, VF'*S, ... ]. */
7530 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7531 {
7532 expr = build_int_cst (integer_type_node, vfp);
7533 expr = fold_convert (TREE_TYPE (step_expr), expr);
7534 }
7535 else
7536 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7537 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7538 expr, step_expr);
7539 if (! CONSTANT_CLASS_P (new_name))
7540 new_name = vect_init_vector (phi, new_name,
7541 TREE_TYPE (step_expr), NULL);
7542 new_vec = build_vector_from_val (vectype, new_name);
7543 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7544 for (; ivn < nvects; ++ivn)
7545 {
7546 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7547 tree def;
7548 if (gimple_code (iv) == GIMPLE_PHI)
7549 def = gimple_phi_result (iv);
7550 else
7551 def = gimple_assign_lhs (iv);
7552 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7553 PLUS_EXPR,
7554 def, vec_step);
7555 if (gimple_code (iv) == GIMPLE_PHI)
7556 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7557 else
7558 {
7559 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7560 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7561 }
7562 SLP_TREE_VEC_STMTS (slp_node).quick_push
7563 (loop_vinfo->add_stmt (new_stmt));
7564 }
7565 }
7566
7567 return true;
7568 }
7569
7570 /* Create the vector that holds the initial_value of the induction. */
7571 if (nested_in_vect_loop)
7572 {
7573 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7574 been created during vectorization of previous stmts. We obtain it
7575 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7576 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7577 /* If the initial value is not of proper type, convert it. */
7578 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7579 {
7580 new_stmt
7581 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7582 vect_simple_var,
7583 "vec_iv_"),
7584 VIEW_CONVERT_EXPR,
7585 build1 (VIEW_CONVERT_EXPR, vectype,
7586 vec_init));
7587 vec_init = gimple_assign_lhs (new_stmt);
7588 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7589 new_stmt);
7590 gcc_assert (!new_bb);
7591 loop_vinfo->add_stmt (new_stmt);
7592 }
7593 }
7594 else
7595 {
7596 /* iv_loop is the loop to be vectorized. Create:
7597 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7598 stmts = NULL;
7599 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7600
7601 unsigned HOST_WIDE_INT const_nunits;
7602 if (nunits.is_constant (&const_nunits))
7603 {
7604 tree_vector_builder elts (vectype, const_nunits, 1);
7605 elts.quick_push (new_name);
7606 for (i = 1; i < const_nunits; i++)
7607 {
7608 /* Create: new_name_i = new_name + step_expr */
7609 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7610 new_name, step_expr);
7611 elts.quick_push (new_name);
7612 }
7613 /* Create a vector from [new_name_0, new_name_1, ...,
7614 new_name_nunits-1] */
7615 vec_init = gimple_build_vector (&stmts, &elts);
7616 }
7617 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7618 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7619 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7620 new_name, step_expr);
7621 else
7622 {
7623 /* Build:
7624 [base, base, base, ...]
7625 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7626 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7627 gcc_assert (flag_associative_math);
7628 tree index = build_index_vector (vectype, 0, 1);
7629 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7630 new_name);
7631 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7632 step_expr);
7633 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7634 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7635 vec_init, step_vec);
7636 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7637 vec_init, base_vec);
7638 }
7639
7640 if (stmts)
7641 {
7642 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7643 gcc_assert (!new_bb);
7644 }
7645 }
7646
7647
7648 /* Create the vector that holds the step of the induction. */
7649 if (nested_in_vect_loop)
7650 /* iv_loop is nested in the loop to be vectorized. Generate:
7651 vec_step = [S, S, S, S] */
7652 new_name = step_expr;
7653 else
7654 {
7655 /* iv_loop is the loop to be vectorized. Generate:
7656 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7657 gimple_seq seq = NULL;
7658 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7659 {
7660 expr = build_int_cst (integer_type_node, vf);
7661 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7662 }
7663 else
7664 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7665 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7666 expr, step_expr);
7667 if (seq)
7668 {
7669 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7670 gcc_assert (!new_bb);
7671 }
7672 }
7673
7674 t = unshare_expr (new_name);
7675 gcc_assert (CONSTANT_CLASS_P (new_name)
7676 || TREE_CODE (new_name) == SSA_NAME);
7677 new_vec = build_vector_from_val (vectype, t);
7678 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7679
7680
7681 /* Create the following def-use cycle:
7682 loop prolog:
7683 vec_init = ...
7684 vec_step = ...
7685 loop:
7686 vec_iv = PHI <vec_init, vec_loop>
7687 ...
7688 STMT
7689 ...
7690 vec_loop = vec_iv + vec_step; */
7691
7692 /* Create the induction-phi that defines the induction-operand. */
7693 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7694 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7695 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7696 induc_def = PHI_RESULT (induction_phi);
7697
7698 /* Create the iv update inside the loop */
7699 vec_def = make_ssa_name (vec_dest);
7700 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7701 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7702 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7703
7704 /* Set the arguments of the phi node: */
7705 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7706 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7707 UNKNOWN_LOCATION);
7708
7709 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7710
7711 /* In case the vectorization factor (VF) is bigger than the number
7712 of elements that we can fit in a vectype (nunits), we have to generate
7713 more than one vector stmt - i.e., we need to "unroll" the
7714 vector stmt by a factor VF/nunits. For more details see documentation
7715 in vectorizable_operation. */
7716
7717 if (ncopies > 1)
7718 {
7719 gimple_seq seq = NULL;
7720 stmt_vec_info prev_stmt_vinfo;
7721 /* FORNOW. This restriction should be relaxed. */
7722 gcc_assert (!nested_in_vect_loop);
7723
7724 /* Create the vector that holds the step of the induction. */
7725 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7726 {
7727 expr = build_int_cst (integer_type_node, nunits);
7728 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7729 }
7730 else
7731 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7732 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7733 expr, step_expr);
7734 if (seq)
7735 {
7736 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7737 gcc_assert (!new_bb);
7738 }
7739
7740 t = unshare_expr (new_name);
7741 gcc_assert (CONSTANT_CLASS_P (new_name)
7742 || TREE_CODE (new_name) == SSA_NAME);
7743 new_vec = build_vector_from_val (vectype, t);
7744 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7745
7746 vec_def = induc_def;
7747 prev_stmt_vinfo = induction_phi_info;
7748 for (i = 1; i < ncopies; i++)
7749 {
7750 /* vec_i = vec_prev + vec_step */
7751 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7752 vec_def, vec_step);
7753 vec_def = make_ssa_name (vec_dest, new_stmt);
7754 gimple_assign_set_lhs (new_stmt, vec_def);
7755
7756 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7757 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7758 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7759 prev_stmt_vinfo = new_stmt_info;
7760 }
7761 }
7762
7763 if (nested_in_vect_loop)
7764 {
7765 /* Find the loop-closed exit-phi of the induction, and record
7766 the final vector of induction results: */
7767 exit_phi = NULL;
7768 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7769 {
7770 gimple *use_stmt = USE_STMT (use_p);
7771 if (is_gimple_debug (use_stmt))
7772 continue;
7773
7774 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7775 {
7776 exit_phi = use_stmt;
7777 break;
7778 }
7779 }
7780 if (exit_phi)
7781 {
7782 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7783 /* FORNOW. Currently not supporting the case that an inner-loop induction
7784 is not used in the outer-loop (i.e. only outside the outer-loop). */
7785 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7786 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7787
7788 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7789 if (dump_enabled_p ())
7790 {
7791 dump_printf_loc (MSG_NOTE, vect_location,
7792 "vector of inductions after inner-loop:");
7793 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7794 }
7795 }
7796 }
7797
7798
7799 if (dump_enabled_p ())
7800 {
7801 dump_printf_loc (MSG_NOTE, vect_location,
7802 "transform induction: created def-use cycle: ");
7803 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7804 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7805 SSA_NAME_DEF_STMT (vec_def), 0);
7806 }
7807
7808 return true;
7809 }
7810
7811 /* Function vectorizable_live_operation.
7812
7813 STMT computes a value that is used outside the loop. Check if
7814 it can be supported. */
7815
7816 bool
7817 vectorizable_live_operation (gimple *stmt,
7818 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7819 slp_tree slp_node, int slp_index,
7820 stmt_vec_info *vec_stmt,
7821 stmt_vector_for_cost *)
7822 {
7823 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7824 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7825 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7826 imm_use_iterator imm_iter;
7827 tree lhs, lhs_type, bitsize, vec_bitsize;
7828 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7829 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7830 int ncopies;
7831 gimple *use_stmt;
7832 auto_vec<tree> vec_oprnds;
7833 int vec_entry = 0;
7834 poly_uint64 vec_index = 0;
7835
7836 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7837
7838 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7839 return false;
7840
7841 /* FORNOW. CHECKME. */
7842 if (nested_in_vect_loop_p (loop, stmt))
7843 return false;
7844
7845 /* If STMT is not relevant and it is a simple assignment and its inputs are
7846 invariant then it can remain in place, unvectorized. The original last
7847 scalar value that it computes will be used. */
7848 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7849 {
7850 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7851 if (dump_enabled_p ())
7852 dump_printf_loc (MSG_NOTE, vect_location,
7853 "statement is simple and uses invariant. Leaving in "
7854 "place.\n");
7855 return true;
7856 }
7857
7858 if (slp_node)
7859 ncopies = 1;
7860 else
7861 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7862
7863 if (slp_node)
7864 {
7865 gcc_assert (slp_index >= 0);
7866
7867 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7868 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7869
7870 /* Get the last occurrence of the scalar index from the concatenation of
7871 all the slp vectors. Calculate which slp vector it is and the index
7872 within. */
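/* E.g. with num_vec = 2, nunits = 4, num_scalar = 3 and slp_index = 1,
   pos = 2 * 4 - 3 + 1 = 6, i.e. lane 2 of vector 1.  */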
7873 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7874
7875 /* Calculate which vector contains the result, and which lane of
7876 that vector we need. */
7877 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7878 {
7879 if (dump_enabled_p ())
7880 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7881 "Cannot determine which vector holds the"
7882 " final result.\n");
7883 return false;
7884 }
7885 }
7886
7887 if (!vec_stmt)
7888 {
7889 /* No transformation required. */
7890 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7891 {
7892 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7893 OPTIMIZE_FOR_SPEED))
7894 {
7895 if (dump_enabled_p ())
7896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7897 "can't use a fully-masked loop because "
7898 "the target doesn't support extract last "
7899 "reduction.\n");
7900 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7901 }
7902 else if (slp_node)
7903 {
7904 if (dump_enabled_p ())
7905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7906 "can't use a fully-masked loop because an "
7907 "SLP statement is live after the loop.\n");
7908 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7909 }
7910 else if (ncopies > 1)
7911 {
7912 if (dump_enabled_p ())
7913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7914 "can't use a fully-masked loop because"
7915 " ncopies is greater than 1.\n");
7916 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7917 }
7918 else
7919 {
7920 gcc_assert (ncopies == 1 && !slp_node);
7921 vect_record_loop_mask (loop_vinfo,
7922 &LOOP_VINFO_MASKS (loop_vinfo),
7923 1, vectype);
7924 }
7925 }
7926 return true;
7927 }
7928
7929 /* If stmt has a related stmt, then use that for getting the lhs. */
7930 if (is_pattern_stmt_p (stmt_info))
7931 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7932
7933 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7934 : gimple_get_lhs (stmt);
7935 lhs_type = TREE_TYPE (lhs);
7936
7937 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7938 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7939 : TYPE_SIZE (TREE_TYPE (vectype)));
7940 vec_bitsize = TYPE_SIZE (vectype);
7941
7942 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7943 tree vec_lhs, bitstart;
7944 if (slp_node)
7945 {
7946 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7947
7948 /* Get the correct slp vectorized stmt. */
7949 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7950 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7951 vec_lhs = gimple_phi_result (phi);
7952 else
7953 vec_lhs = gimple_get_lhs (vec_stmt);
7954
7955 /* Get entry to use. */
7956 bitstart = bitsize_int (vec_index);
7957 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7958 }
7959 else
7960 {
7961 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7962 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7963 gcc_checking_assert (ncopies == 1
7964 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7965
7966 /* For multiple copies, get the last copy. */
7967 for (int i = 1; i < ncopies; ++i)
7968 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7969 vec_lhs);
7970
7971 /* Get the last lane in the vector. */
7972 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7973 }
7974
7975 gimple_seq stmts = NULL;
7976 tree new_tree;
7977 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7978 {
7979 /* Emit:
7980
7981 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7982
7983 where VEC_LHS is the vectorized live-out result and MASK is
7984 the loop mask for the final iteration. */
7985 gcc_assert (ncopies == 1 && !slp_node);
7986 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7987 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7988 1, vectype, 0);
7989 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7990 scalar_type, mask, vec_lhs);
7991
7992 /* Convert the extracted vector element to the required scalar type. */
7993 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7994 }
7995 else
7996 {
7997 tree bftype = TREE_TYPE (vectype);
7998 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7999 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8000 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8001 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8002 &stmts, true, NULL_TREE);
8003 }
8004
8005 if (stmts)
8006 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8007
8008 /* Replace the use of lhs with the newly computed result. If the use stmt
8009 is a single-arg PHI, just replace all uses of the PHI result. This is
8010 necessary because the lcssa PHI defining lhs may precede the new stmt. */
8011 use_operand_p use_p;
8012 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8013 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8014 && !is_gimple_debug (use_stmt))
8015 {
8016 if (gimple_code (use_stmt) == GIMPLE_PHI
8017 && gimple_phi_num_args (use_stmt) == 1)
8018 {
8019 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8020 }
8021 else
8022 {
8023 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8024 SET_USE (use_p, new_tree);
8025 }
8026 update_stmt (use_stmt);
8027 }
8028
8029 return true;
8030 }
8031
8032 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8033
8034 static void
8035 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8036 {
8037 ssa_op_iter op_iter;
8038 imm_use_iterator imm_iter;
8039 def_operand_p def_p;
8040 gimple *ustmt;
8041
8042 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8043 {
8044 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8045 {
8046 basic_block bb;
8047
8048 if (!is_gimple_debug (ustmt))
8049 continue;
8050
8051 bb = gimple_bb (ustmt);
8052
8053 if (!flow_bb_inside_loop_p (loop, bb))
8054 {
8055 if (gimple_debug_bind_p (ustmt))
8056 {
8057 if (dump_enabled_p ())
8058 dump_printf_loc (MSG_NOTE, vect_location,
8059 "killing debug use\n");
8060
8061 gimple_debug_bind_reset_value (ustmt);
8062 update_stmt (ustmt);
8063 }
8064 else
8065 gcc_unreachable ();
8066 }
8067 }
8068 }
8069 }
8070
8071 /* Given the loop represented by LOOP_VINFO, return true if computation of
8072 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8073 otherwise. */
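/* E.g. assuming a 32-bit unsigned niters type, an NITERSM1 of 0xffffffff
   would make NITERSM1 + 1 wrap to zero, so the checks below would
   return false.  */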
8074
8075 static bool
8076 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8077 {
8078 /* Constant case. */
8079 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8080 {
8081 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8082 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8083
8084 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8085 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8086 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8087 return true;
8088 }
8089
8090 widest_int max;
8091 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8092 /* Check the upper bound of loop niters. */
8093 if (get_max_loop_iterations (loop, &max))
8094 {
8095 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8096 signop sgn = TYPE_SIGN (type);
8097 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8098 if (max < type_max)
8099 return true;
8100 }
8101 return false;
8102 }
8103
8104 /* Return a mask type with half the number of elements as TYPE. */
8105
8106 tree
8107 vect_halve_mask_nunits (tree type)
8108 {
8109 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8110 return build_truth_vector_type (nunits, current_vector_size);
8111 }
8112
8113 /* Return a mask type with twice as many elements as TYPE. */
8114
8115 tree
8116 vect_double_mask_nunits (tree type)
8117 {
8118 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8119 return build_truth_vector_type (nunits, current_vector_size);
8120 }
8121
8122 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8123 contain a sequence of NVECTORS masks that each control a vector of type
8124 VECTYPE. */
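/* For instance, assuming a vectorization factor of 16, an rgroup of 2 masks
   that each control a vector of 8 elements works out below as
   2 * 8 / 16 = 1 scalar per iteration, while 2 vectors of 16 elements
   with the same VF would record 2 scalars per iteration.  */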
8125
8126 void
8127 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8128 unsigned int nvectors, tree vectype)
8129 {
8130 gcc_assert (nvectors != 0);
8131 if (masks->length () < nvectors)
8132 masks->safe_grow_cleared (nvectors);
8133 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8134 /* The number of scalars per iteration and the number of vectors are
8135 both compile-time constants. */
8136 unsigned int nscalars_per_iter
8137 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8138 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8139 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8140 {
8141 rgm->max_nscalars_per_iter = nscalars_per_iter;
8142 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8143 }
8144 }
8145
8146 /* Given a complete set of masks MASKS, extract mask number INDEX
8147 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8148 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8149
8150 See the comment above vec_loop_masks for more details about the mask
8151 arrangement. */
8152
8153 tree
8154 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8155 unsigned int nvectors, tree vectype, unsigned int index)
8156 {
8157 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8158 tree mask_type = rgm->mask_type;
8159
8160 /* Populate the rgroup's mask array, if this is the first time we've
8161 used it. */
8162 if (rgm->masks.is_empty ())
8163 {
8164 rgm->masks.safe_grow_cleared (nvectors);
8165 for (unsigned int i = 0; i < nvectors; ++i)
8166 {
8167 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8168 /* Provide a dummy definition until the real one is available. */
8169 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8170 rgm->masks[i] = mask;
8171 }
8172 }
8173
8174 tree mask = rgm->masks[index];
8175 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8176 TYPE_VECTOR_SUBPARTS (vectype)))
8177 {
8178 /* A loop mask for data type X can be reused for data type Y
8179 if X has N times more elements than Y and if Y's elements
8180 are N times bigger than X's. In this case each sequence
8181 of N elements in the loop mask will be all-zero or all-one.
8182 We can then view-convert the mask so that each sequence of
8183 N elements is replaced by a single element. */
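/* E.g. a mask created for 16 single-byte elements can be reused for
   8 two-byte elements: each pair of mask elements is known to be
   all-zero or all-one, so each pair collapses to one element of the
   8-element mask type.  */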
8184 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8185 TYPE_VECTOR_SUBPARTS (vectype)));
8186 gimple_seq seq = NULL;
8187 mask_type = build_same_sized_truth_vector_type (vectype);
8188 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8189 if (seq)
8190 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8191 }
8192 return mask;
8193 }
8194
8195 /* Scale profiling counters by estimation for LOOP which is vectorized
8196 by factor VF. */
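/* E.g. if new_est_niter below works out to 24 vector iterations, the exit
   edge probability becomes 1/25 and the loop body counts are scaled to
   match the reduced iteration count.  */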
8197
8198 static void
8199 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8200 {
8201 edge preheader = loop_preheader_edge (loop);
8202 /* Reduce loop iterations by the vectorization factor. */
8203 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8204 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8205
8206 if (freq_h.nonzero_p ())
8207 {
8208 profile_probability p;
8209
8210 /* Avoid dropping loop body profile counter to 0 because of zero count
8211 in loop's preheader. */
8212 if (!(freq_e == profile_count::zero ()))
8213 freq_e = freq_e.force_nonzero ();
8214 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8215 scale_loop_frequencies (loop, p);
8216 }
8217
8218 edge exit_e = single_exit (loop);
8219 exit_e->probability = profile_probability::always ()
8220 .apply_scale (1, new_est_niter + 1);
8221
8222 edge exit_l = single_pred_edge (loop->latch);
8223 profile_probability prob = exit_l->probability;
8224 exit_l->probability = exit_e->probability.invert ();
8225 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8226 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8227 }
8228
8229 /* Vectorize STMT if relevant, inserting any new instructions before GSI.
8230 When vectorizing STMT as a store, set *SEEN_STORE to its stmt_vec_info.
8231 *SLP_SCHEDULED is a running record of whether we have called
8232 vect_schedule_slp. */
8233
8234 static void
8235 vect_transform_loop_stmt (loop_vec_info loop_vinfo, gimple *stmt,
8236 gimple_stmt_iterator *gsi,
8237 stmt_vec_info *seen_store, bool *slp_scheduled)
8238 {
8239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8240 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8241 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
8242 if (!stmt_info)
8243 return;
8244
8245 if (dump_enabled_p ())
8246 {
8247 dump_printf_loc (MSG_NOTE, vect_location,
8248 "------>vectorizing statement: ");
8249 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8250 }
8251
8252 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8253 vect_loop_kill_debug_uses (loop, stmt);
8254
8255 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8256 && !STMT_VINFO_LIVE_P (stmt_info))
8257 return;
8258
8259 if (STMT_VINFO_VECTYPE (stmt_info))
8260 {
8261 poly_uint64 nunits
8262 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8263 if (!STMT_SLP_TYPE (stmt_info)
8264 && maybe_ne (nunits, vf)
8265 && dump_enabled_p ())
8266 /* For SLP, VF is set according to the unrolling factor, and not
8267 to the vector size, hence for SLP this print is not valid. */
8268 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8269 }
8270
8271 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8272 reached. */
8273 if (slp_vect_type slptype = STMT_SLP_TYPE (stmt_info))
8274 {
8275
8276 if (!*slp_scheduled)
8277 {
8278 *slp_scheduled = true;
8279
8280 DUMP_VECT_SCOPE ("scheduling SLP instances");
8281
8282 vect_schedule_slp (loop_vinfo);
8283 }
8284
8285 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8286 if (slptype == pure_slp)
8287 return;
8288 }
8289
8290 if (dump_enabled_p ())
8291 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8292
8293 bool grouped_store = false;
8294 if (vect_transform_stmt (stmt, gsi, &grouped_store, NULL, NULL))
8295 *seen_store = stmt_info;
8296 }
8297
8298 /* Function vect_transform_loop.
8299
8300 The analysis phase has determined that the loop is vectorizable.
8301 Vectorize the loop - create vectorized stmts to replace the scalar
8302 stmts in the loop, and update the loop exit condition.
8303 Returns scalar epilogue loop if any. */
8304
8305 struct loop *
8306 vect_transform_loop (loop_vec_info loop_vinfo)
8307 {
8308 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8309 struct loop *epilogue = NULL;
8310 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8311 int nbbs = loop->num_nodes;
8312 int i;
8313 tree niters_vector = NULL_TREE;
8314 tree step_vector = NULL_TREE;
8315 tree niters_vector_mult_vf = NULL_TREE;
8316 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8317 unsigned int lowest_vf = constant_lower_bound (vf);
8318 bool slp_scheduled = false;
8319 gimple *stmt;
8320 bool check_profitability = false;
8321 unsigned int th;
8322
8323 DUMP_VECT_SCOPE ("vec_transform_loop");
8324
8325 loop_vinfo->shared->check_datarefs ();
8326
8327 /* Use the more conservative vectorization threshold. If the number
8328 of iterations is constant, assume the cost check has been performed
8329 by our caller. If the threshold makes all loops profitable that
8330 run at least the (estimated) vectorization factor number of times,
8331 checking is pointless, too. */
8332 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8333 if (th >= vect_vf_for_cost (loop_vinfo)
8334 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8335 {
8336 if (dump_enabled_p ())
8337 dump_printf_loc (MSG_NOTE, vect_location,
8338 "Profitability threshold is %d loop iterations.\n",
8339 th);
8340 check_profitability = true;
8341 }
8342
8343 /* Make sure there exists a single-predecessor exit bb. Do this before
8344 versioning. */
8345 edge e = single_exit (loop);
8346 if (! single_pred_p (e->dest))
8347 {
8348 split_loop_exit_edge (e);
8349 if (dump_enabled_p ())
8350 dump_printf (MSG_NOTE, "split exit edge\n");
8351 }
8352
8353 /* Version the loop first, if required, so the profitability check
8354 comes first. */
8355
8356 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8357 {
8358 poly_uint64 versioning_threshold
8359 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8360 if (check_profitability
8361 && ordered_p (poly_uint64 (th), versioning_threshold))
8362 {
8363 versioning_threshold = ordered_max (poly_uint64 (th),
8364 versioning_threshold);
8365 check_profitability = false;
8366 }
8367 vect_loop_versioning (loop_vinfo, th, check_profitability,
8368 versioning_threshold);
8369 check_profitability = false;
8370 }
8371
8372 /* Make sure there exists a single-predecessor exit bb also on the
8373 scalar loop copy. Do this after versioning but before peeling
8374 so the CFG structure is fine for both the scalar and the if-converted
8375 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8376 loop-closed PHI nodes on the exit. */
8377 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8378 {
8379 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8380 if (! single_pred_p (e->dest))
8381 {
8382 split_loop_exit_edge (e);
8383 if (dump_enabled_p ())
8384 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8385 }
8386 }
8387
8388 tree niters = vect_build_loop_niters (loop_vinfo);
8389 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8390 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8391 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8392 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8393 &step_vector, &niters_vector_mult_vf, th,
8394 check_profitability, niters_no_overflow);
8395
8396 if (niters_vector == NULL_TREE)
8397 {
8398 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8399 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8400 && known_eq (lowest_vf, vf))
8401 {
8402 niters_vector
8403 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8404 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8405 step_vector = build_one_cst (TREE_TYPE (niters));
8406 }
8407 else
8408 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8409 &step_vector, niters_no_overflow);
8410 }
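/* Illustrative example (hypothetical numbers): with a compile-time
   iteration count of 96 and VF == 8, the vector loop runs the constant
   96 / 8 == 12 iterations with a step of 1; otherwise the run-time
   count is computed by vect_gen_vector_loop_niters above.  */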
8411
8412 /* 1) Make sure the loop header has exactly two entries
8413 2) Make sure we have a preheader basic block. */
8414
8415 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8416
8417 split_edge (loop_preheader_edge (loop));
8418
8419 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8420 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8421 /* This will deal with any possible peeling. */
8422 vect_prepare_for_masked_peels (loop_vinfo);
8423
8424 /* FORNOW: the vectorizer supports only loops whose body consists
8425 of one basic block (header + empty latch). When the vectorizer
8426 supports more involved loop forms, the order in which the BBs are
8427 traversed needs to be reconsidered. */
8428
8429 for (i = 0; i < nbbs; i++)
8430 {
8431 basic_block bb = bbs[i];
8432 stmt_vec_info stmt_info;
8433
8434 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8435 gsi_next (&si))
8436 {
8437 gphi *phi = si.phi ();
8438 if (dump_enabled_p ())
8439 {
8440 dump_printf_loc (MSG_NOTE, vect_location,
8441 "------>vectorizing phi: ");
8442 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8443 }
8444 stmt_info = loop_vinfo->lookup_stmt (phi);
8445 if (!stmt_info)
8446 continue;
8447
8448 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8449 vect_loop_kill_debug_uses (loop, phi);
8450
8451 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8452 && !STMT_VINFO_LIVE_P (stmt_info))
8453 continue;
8454
8455 if (STMT_VINFO_VECTYPE (stmt_info)
8456 && (maybe_ne
8457 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8458 && dump_enabled_p ())
8459 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8460
8461 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8462 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8463 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8464 && ! PURE_SLP_STMT (stmt_info))
8465 {
8466 if (dump_enabled_p ())
8467 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8468 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8469 }
8470 }
8471
8472 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8473 !gsi_end_p (si);)
8474 {
8475 stmt = gsi_stmt (si);
8476 /* During vectorization remove existing clobber stmts. */
8477 if (gimple_clobber_p (stmt))
8478 {
8479 unlink_stmt_vdef (stmt);
8480 gsi_remove (&si, true);
8481 release_defs (stmt);
8482 }
8483 else
8484 {
8485 stmt_info = loop_vinfo->lookup_stmt (stmt);
8486
8487 /* vector stmts created in the outer-loop during vectorization of
8488 stmts in an inner-loop may not have a stmt_info, and do not
8489 need to be vectorized. */
8490 stmt_vec_info seen_store = NULL;
8491 if (stmt_info)
8492 {
8493 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8494 {
8495 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8496 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8497 !gsi_end_p (subsi); gsi_next (&subsi))
8498 vect_transform_loop_stmt (loop_vinfo,
8499 gsi_stmt (subsi), &si,
8500 &seen_store,
8501 &slp_scheduled);
8502 gimple *pat_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8503 vect_transform_loop_stmt (loop_vinfo, pat_stmt, &si,
8504 &seen_store, &slp_scheduled);
8505 }
8506 vect_transform_loop_stmt (loop_vinfo, stmt, &si,
8507 &seen_store, &slp_scheduled);
8508 }
8509 if (seen_store)
8510 {
8511 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8512 {
8513 /* Interleaving. The vectorization of the whole
8514 interleaving chain has been completed - free
8515 all the scalar stores in the chain. */
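/* Editorial illustration: for an interleaved pair such as
     a[2*i] = x;  a[2*i+1] = y;
   the vector stores for the whole group are emitted once the last
   scalar store of the group has been transformed; vect_remove_stores
   then deletes the now-redundant scalar stores of the chain.  */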
8516 gsi_next (&si);
8517 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8518 }
8519 else
8520 {
8521 /* Free the attached stmt_vec_info and remove the
8522 stmt. */
8523 free_stmt_vec_info (stmt);
8524 unlink_stmt_vdef (stmt);
8525 gsi_remove (&si, true);
8526 release_defs (stmt);
8527 }
8528 }
8529 else
8530 gsi_next (&si);
8531 }
8532 }
8533
8534 /* Stub out scalar statements that must not survive vectorization.
8535 Doing this here helps with grouped statements, or statements that
8536 are involved in patterns. */
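/* Editorial illustration (with made-up SSA names): a scalar masked load
   such as
     _5 = .MASK_LOAD (ptr_3, 0B, mask_7);
   whose work is now done by its vector counterpart is replaced below by
     _5 = 0;
   i.e. by the zero constant of _5's type, so no scalar masked load
   survives vectorization.  */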
8537 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8538 !gsi_end_p (gsi); gsi_next (&gsi))
8539 {
8540 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8541 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8542 {
8543 tree lhs = gimple_get_lhs (call);
8544 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8545 {
8546 tree zero = build_zero_cst (TREE_TYPE (lhs));
8547 gimple *new_stmt = gimple_build_assign (lhs, zero);
8548 gsi_replace (&gsi, new_stmt, true);
8549 }
8550 }
8551 }
8552 } /* BBs in loop */
8553
8554 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8555 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8556 if (integer_onep (step_vector))
8557 niters_no_overflow = true;
8558 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8559 niters_vector_mult_vf, !niters_no_overflow);
8560
8561 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8562 scale_profile_for_vect_loop (loop, assumed_vf);
8563
8564 /* True if the final iteration might not handle a full vector's
8565 worth of scalar iterations. */
8566 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8567 /* The minimum number of iterations performed by the epilogue. This
8568 is 1 when peeling for gaps because we always need a final scalar
8569 iteration. */
8570 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8571 /* +1 to convert latch counts to loop iteration counts,
8572 -min_epilogue_iters to remove iterations that cannot be performed
8573 by the vector code. */
8574 int bias_for_lowest = 1 - min_epilogue_iters;
8575 int bias_for_assumed = bias_for_lowest;
8576 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8577 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8578 {
8579 /* When the amount of peeling is known at compile time, the first
8580 iteration will have exactly alignment_npeels active elements.
8581 In the worst case it will have at least one. */
8582 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8583 bias_for_lowest += lowest_vf - min_first_active;
8584 bias_for_assumed += assumed_vf - min_first_active;
8585 }
8586 /* In these calculations the "- 1" converts loop iteration counts
8587 back to latch counts. */
8588 if (loop->any_upper_bound)
8589 loop->nb_iterations_upper_bound
8590 = (final_iter_may_be_partial
8591 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8592 lowest_vf) - 1
8593 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8594 lowest_vf) - 1);
8595 if (loop->any_likely_upper_bound)
8596 loop->nb_iterations_likely_upper_bound
8597 = (final_iter_may_be_partial
8598 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8599 + bias_for_lowest, lowest_vf) - 1
8600 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8601 + bias_for_lowest, lowest_vf) - 1);
8602 if (loop->any_estimate)
8603 loop->nb_iterations_estimate
8604 = (final_iter_may_be_partial
8605 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8606 assumed_vf) - 1
8607 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8608 assumed_vf) - 1);
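/* Worked example (illustrative numbers): with VF == 4, no peeling for
   gaps (so the bias is 1) and a scalar latch bound of 10, i.e. at most
   11 iterations, the vector latch bound becomes 11 / 4 - 1 == 1 when
   the final iteration must be a full vector, and CEIL (11, 4) - 1 == 2
   for a fully-masked loop whose final iteration may be partial.  */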
8609
8610 if (dump_enabled_p ())
8611 {
8612 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8613 {
8614 dump_printf_loc (MSG_NOTE, vect_location,
8615 "LOOP VECTORIZED\n");
8616 if (loop->inner)
8617 dump_printf_loc (MSG_NOTE, vect_location,
8618 "OUTER LOOP VECTORIZED\n");
8619 dump_printf (MSG_NOTE, "\n");
8620 }
8621 else
8622 {
8623 dump_printf_loc (MSG_NOTE, vect_location,
8624 "LOOP EPILOGUE VECTORIZED (VS=");
8625 dump_dec (MSG_NOTE, current_vector_size);
8626 dump_printf (MSG_NOTE, ")\n");
8627 }
8628 }
8629
8630 /* Free SLP instances here because otherwise stmt reference counting
8631 won't work. */
8632 slp_instance instance;
8633 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8634 vect_free_slp_instance (instance, true);
8635 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8636 /* Clear the safelen field since its value is invalid after vectorization,
8637 as the vectorized loop can have loop-carried dependencies. */
8638 loop->safelen = 0;
8639
8640 /* Don't vectorize the epilogue of an epilogue loop. */
8641 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8642 epilogue = NULL;
8643
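/* Epilogue vectorization can be disabled with
   --param vect-epilogues-nomask=0.  */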
8644 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8645 epilogue = NULL;
8646
8647 if (epilogue)
8648 {
8649 auto_vector_sizes vector_sizes;
8650 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8651 unsigned int next_size = 0;
8652
8653 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8654 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8655 && known_eq (vf, lowest_vf))
8656 {
8657 unsigned int eiters
8658 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8659 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8660 eiters = eiters % lowest_vf;
8661 epilogue->nb_iterations_upper_bound = eiters - 1;
8662
8663 unsigned int ratio;
8664 while (next_size < vector_sizes.length ()
8665 && !(constant_multiple_p (current_vector_size,
8666 vector_sizes[next_size], &ratio)
8667 && eiters >= lowest_vf / ratio))
8668 next_size += 1;
8669 }
8670 else
8671 while (next_size < vector_sizes.length ()
8672 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8673 next_size += 1;
8674
8675 if (next_size == vector_sizes.length ())
8676 epilogue = NULL;
8677 }
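/* Illustration (hypothetical target and numbers): if the main loop used
   32-byte vectors with VF == 8 and 6 scalar iterations remain for the
   epilogue, a 16-byte candidate size gives ratio == 2 and requires at
   least 8 / 2 == 4 remaining iterations, so it qualifies; when no size
   reported by autovectorize_vector_sizes qualifies, EPILOGUE is set to
   NULL above and no epilogue vectorization is attempted.  */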
8678
8679 if (epilogue)
8680 {
8681 epilogue->force_vectorize = loop->force_vectorize;
8682 epilogue->safelen = loop->safelen;
8683 epilogue->dont_vectorize = false;
8684
8685 /* We may need to if-convert the epilogue to vectorize it. */
8686 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8687 tree_if_conversion (epilogue);
8688 }
8689
8690 return epilogue;
8691 }
8692
8693 /* The code below tries to perform a simple optimization - revert
8694 if-conversion for masked stores: if the mask of a store is zero, do
8695 not perform the store and, if possible, skip its value producers too.
8696 For example,
8697 for (i=0; i<n; i++)
8698 if (c[i])
8699 {
8700 p1[i] += 1;
8701 p2[i] = p3[i] + 2;
8702 }
8703 this transformation will produce the following semi-hammock:
8704
8705 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
8706 {
8707 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8708 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8709 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8710 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8711 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8712 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8713 }
8714 */
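/* Editorial sketch of the control flow produced below (illustrative):

     bb:
       if (mask == { 0, ... }) goto join_bb;   <- all stores skipped
       else goto store_bb;
     store_bb:
       MASK_STORE (...);                       <- plus movable producers
     join_bb:
       .MEM_2 = PHI <.MEM_1 (bb), .MEM_3 (store_bb)>  */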
8715
8716 void
8717 optimize_mask_stores (struct loop *loop)
8718 {
8719 basic_block *bbs = get_loop_body (loop);
8720 unsigned nbbs = loop->num_nodes;
8721 unsigned i;
8722 basic_block bb;
8723 struct loop *bb_loop;
8724 gimple_stmt_iterator gsi;
8725 gimple *stmt;
8726 auto_vec<gimple *> worklist;
8727
8728 vect_location = find_loop_location (loop);
8729 /* Pick up all masked stores in loop if any. */
8730 for (i = 0; i < nbbs; i++)
8731 {
8732 bb = bbs[i];
8733 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8734 gsi_next (&gsi))
8735 {
8736 stmt = gsi_stmt (gsi);
8737 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8738 worklist.safe_push (stmt);
8739 }
8740 }
8741
8742 free (bbs);
8743 if (worklist.is_empty ())
8744 return;
8745
8746 /* Loop has masked stores. */
8747 while (!worklist.is_empty ())
8748 {
8749 gimple *last, *last_store;
8750 edge e, efalse;
8751 tree mask;
8752 basic_block store_bb, join_bb;
8753 gimple_stmt_iterator gsi_to;
8754 tree vdef, new_vdef;
8755 gphi *phi;
8756 tree vectype;
8757 tree zero;
8758
8759 last = worklist.pop ();
8760 mask = gimple_call_arg (last, 2);
8761 bb = gimple_bb (last);
8762 /* Create then_bb and the if-then structure in the CFG; then_bb
8763 belongs to the same loop as if_bb. That loop can be different
8764 from LOOP when a two-level loop nest is vectorized and the
8765 mask_store belongs to the inner one. */
8766 e = split_block (bb, last);
8767 bb_loop = bb->loop_father;
8768 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8769 join_bb = e->dest;
8770 store_bb = create_empty_bb (bb);
8771 add_bb_to_loop (store_bb, bb_loop);
8772 e->flags = EDGE_TRUE_VALUE;
8773 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8774 /* Mark the edge into STORE_BB as statically unlikely. */
8775 efalse->probability = profile_probability::unlikely ();
8776 store_bb->count = efalse->count ();
8777 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8778 if (dom_info_available_p (CDI_DOMINATORS))
8779 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8780 if (dump_enabled_p ())
8781 dump_printf_loc (MSG_NOTE, vect_location,
8782 "Create new block %d to sink mask stores.",
8783 store_bb->index);
8784 /* Create vector comparison with boolean result. */
8785 vectype = TREE_TYPE (mask);
8786 zero = build_zero_cst (vectype);
8787 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8788 gsi = gsi_last_bb (bb);
8789 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8790 /* Create a new PHI node for the vdef of the last masked store:
8791 .MEM_2 = VDEF <.MEM_1>
8792 will be converted to
8793 .MEM_3 = VDEF <.MEM_1>
8794 and a new PHI node will be created in the join bb
8795 .MEM_2 = PHI <.MEM_1, .MEM_3>
8796 */
8797 vdef = gimple_vdef (last);
8798 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8799 gimple_set_vdef (last, new_vdef);
8800 phi = create_phi_node (vdef, join_bb);
8801 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8802
8803 /* Put all masked stores with the same mask to STORE_BB if possible. */
8804 while (true)
8805 {
8806 gimple_stmt_iterator gsi_from;
8807 gimple *stmt1 = NULL;
8808
8809 /* Move masked store to STORE_BB. */
8810 last_store = last;
8811 gsi = gsi_for_stmt (last);
8812 gsi_from = gsi;
8813 /* Shift GSI to the previous stmt for further traversal. */
8814 gsi_prev (&gsi);
8815 gsi_to = gsi_start_bb (store_bb);
8816 gsi_move_before (&gsi_from, &gsi_to);
8817 /* Setup GSI_TO to the non-empty block start. */
8818 gsi_to = gsi_start_bb (store_bb);
8819 if (dump_enabled_p ())
8820 {
8821 dump_printf_loc (MSG_NOTE, vect_location,
8822 "Move stmt to created bb\n");
8823 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8824 }
8825 /* Move all stored value producers if possible. */
8826 while (!gsi_end_p (gsi))
8827 {
8828 tree lhs;
8829 imm_use_iterator imm_iter;
8830 use_operand_p use_p;
8831 bool res;
8832
8833 /* Skip debug statements. */
8834 if (is_gimple_debug (gsi_stmt (gsi)))
8835 {
8836 gsi_prev (&gsi);
8837 continue;
8838 }
8839 stmt1 = gsi_stmt (gsi);
8840 /* Do not consider statements writing to memory or having a
8841 volatile operand. */
8842 if (gimple_vdef (stmt1)
8843 || gimple_has_volatile_ops (stmt1))
8844 break;
8845 gsi_from = gsi;
8846 gsi_prev (&gsi);
8847 lhs = gimple_get_lhs (stmt1);
8848 if (!lhs)
8849 break;
8850
8851 /* LHS of vectorized stmt must be SSA_NAME. */
8852 if (TREE_CODE (lhs) != SSA_NAME)
8853 break;
8854
8855 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8856 {
8857 /* Remove dead scalar statement. */
8858 if (has_zero_uses (lhs))
8859 {
8860 gsi_remove (&gsi_from, true);
8861 continue;
8862 }
8863 }
8864
8865 /* Check that LHS does not have uses outside of STORE_BB. */
8866 res = true;
8867 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8868 {
8869 gimple *use_stmt;
8870 use_stmt = USE_STMT (use_p);
8871 if (is_gimple_debug (use_stmt))
8872 continue;
8873 if (gimple_bb (use_stmt) != store_bb)
8874 {
8875 res = false;
8876 break;
8877 }
8878 }
8879 if (!res)
8880 break;
8881
8882 if (gimple_vuse (stmt1)
8883 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8884 break;
8885
8886 /* Can move STMT1 to STORE_BB. */
8887 if (dump_enabled_p ())
8888 {
8889 dump_printf_loc (MSG_NOTE, vect_location,
8890 "Move stmt to created bb\n");
8891 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8892 }
8893 gsi_move_before (&gsi_from, &gsi_to);
8894 /* Shift GSI_TO for further insertion. */
8895 gsi_prev (&gsi_to);
8896 }
8897 /* Put other masked stores with the same mask to STORE_BB. */
8898 if (worklist.is_empty ()
8899 || gimple_call_arg (worklist.last (), 2) != mask
8900 || worklist.last () != stmt1)
8901 break;
8902 last = worklist.pop ();
8903 }
8904 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8905 }
8906 }