[03/11] Remove vect_transform_stmt grouped_store argument
[gcc.git] / gcc / tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it was manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
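
/* For illustration only: to decide whether the V8HI addition in the example
   above is supported, the vectorizer in effect performs a query like

       if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
         the target has no V8HI add instruction, so the stmt cannot
         be vectorized.

   This is just a sketch of the optab query pattern described above; the
   real checks are performed during stmt analysis (e.g. in
   tree-vect-stmts.c).  */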
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
183
184 if (stmt_vectype)
185 {
186 if (STMT_VINFO_VECTYPE (stmt_info))
187 /* The only case when a vectype has already been set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return true;
203 }
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
211
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info > *mask_producers)
215 {
216 vec_info *vinfo = stmt_info->vinfo;
217 if (dump_enabled_p ())
218 {
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
220 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
221 }
222 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
223 return false;
224
225 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
226 && STMT_VINFO_RELATED_STMT (stmt_info))
227 {
228 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
229 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
230
231 /* If a pattern statement has def stmts, analyze them too. */
232 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
233 !gsi_end_p (si); gsi_next (&si))
234 {
235 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
236 if (dump_enabled_p ())
237 {
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: ");
240 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
241 def_stmt_info->stmt, 0);
242 }
243 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
244 vf, mask_producers))
245 return false;
246 }
247
248 if (dump_enabled_p ())
249 {
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: ");
252 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
253 }
254 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
255 return false;
256 }
257
258 return true;
259 }
260
261 /* Function vect_determine_vectorization_factor
262
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4-byte elements,
266 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
267 elements can fit in a single vector register.
268
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
273
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
278 }
279
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
283 }
284 */
285
286 static bool
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 {
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
299
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301
302 for (i = 0; i < nbbs; i++)
303 {
304 basic_block bb = bbs[i];
305
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
308 {
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 {
313 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
314 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
315 }
316
317 gcc_assert (stmt_info);
318
319 if (STMT_VINFO_RELEVANT_P (stmt_info)
320 || STMT_VINFO_LIVE_P (stmt_info))
321 {
322 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
323 scalar_type = TREE_TYPE (PHI_RESULT (phi));
324
325 if (dump_enabled_p ())
326 {
327 dump_printf_loc (MSG_NOTE, vect_location,
328 "get vectype for scalar type: ");
329 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
330 dump_printf (MSG_NOTE, "\n");
331 }
332
333 vectype = get_vectype_for_scalar_type (scalar_type);
334 if (!vectype)
335 {
336 if (dump_enabled_p ())
337 {
338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
339 "not vectorized: unsupported "
340 "data-type ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
342 scalar_type);
343 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
344 }
345 return false;
346 }
347 STMT_VINFO_VECTYPE (stmt_info) = vectype;
348
349 if (dump_enabled_p ())
350 {
351 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
353 dump_printf (MSG_NOTE, "\n");
354 }
355
356 if (dump_enabled_p ())
357 {
358 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
359 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
360 dump_printf (MSG_NOTE, "\n");
361 }
362
363 vect_update_max_nunits (&vectorization_factor, vectype);
364 }
365 }
366
367 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
368 gsi_next (&si))
369 {
370 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
371 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
372 &mask_producers))
373 return false;
374 }
375 }
376
377 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
378 if (dump_enabled_p ())
379 {
380 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
381 dump_dec (MSG_NOTE, vectorization_factor);
382 dump_printf (MSG_NOTE, "\n");
383 }
384
385 if (known_le (vectorization_factor, 1U))
386 {
387 if (dump_enabled_p ())
388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
389 "not vectorized: unsupported data-type\n");
390 return false;
391 }
392 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
393
394 for (i = 0; i < mask_producers.length (); i++)
395 {
396 stmt_info = mask_producers[i];
397 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
398 if (!mask_type)
399 return false;
400 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
401 }
402
403 return true;
404 }
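
/* A worked example of the computation above: for the example loop at the
   head of this file on a target with 16-byte vectors, the only scalar type
   operated on is "short" (2 bytes), so get_vectype_for_scalar_type returns
   a V8HI vector type, TYPE_VECTOR_SUBPARTS is 8, and the vectorization
   factor becomes 8 - each vector iteration covers 8 scalar iterations,
   giving the N/8 loop shown above.  The 16-byte/V8HI numbers are only one
   possible target configuration.  */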
405
406
407 /* Function vect_is_simple_iv_evolution.
408
409 FORNOW: A simple evolution of an induction variable in the loop is
410 considered a polynomial evolution. */
411
412 static bool
413 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
414 tree * step)
415 {
416 tree init_expr;
417 tree step_expr;
418 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
419 basic_block bb;
420
421 /* When there is no evolution in this loop, the evolution function
422 is not "simple". */
423 if (evolution_part == NULL_TREE)
424 return false;
425
426 /* When the evolution is a polynomial of degree >= 2
427 the evolution function is not "simple". */
428 if (tree_is_chrec (evolution_part))
429 return false;
430
431 step_expr = evolution_part;
432 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
433
434 if (dump_enabled_p ())
435 {
436 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
437 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
438 dump_printf (MSG_NOTE, ", init: ");
439 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
440 dump_printf (MSG_NOTE, "\n");
441 }
442
443 *init = init_expr;
444 *step = step_expr;
445
446 if (TREE_CODE (step_expr) != INTEGER_CST
447 && (TREE_CODE (step_expr) != SSA_NAME
448 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
449 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
450 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
451 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
452 || !flag_associative_math)))
453 && (TREE_CODE (step_expr) != REAL_CST
454 || !flag_associative_math))
455 {
456 if (dump_enabled_p ())
457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
458 "step unknown.\n");
459 return false;
460 }
461
462 return true;
463 }
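
/* For example, the counter of "for (i = 0; i < n; i++)" in loop 1 has the
   access function {0, +, 1}_1, so *INIT is 0 and *STEP is 1.  By contrast,
   an accumulator updated as "p += i" has the access function
   {p_0, +, {0, +, 1}_1}_1: its evolution part is itself a chrec (a degree-2
   polynomial), so it is rejected above.  The chrec notation here is the one
   used by the scalar evolution analyzer (see tree-scalar-evolution.c).  */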
464
465 /* Function vect_analyze_scalar_cycles_1.
466
467 Examine the cross iteration def-use cycles of scalar variables
468 in LOOP. LOOP_VINFO represents the loop that is now being
469 considered for vectorization (can be LOOP, or an outer-loop
470 enclosing LOOP). */
471
472 static void
473 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
474 {
475 basic_block bb = loop->header;
476 tree init, step;
477 auto_vec<stmt_vec_info, 64> worklist;
478 gphi_iterator gsi;
479 bool double_reduc;
480
481 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
482
483 /* First - identify all inductions. Reduction detection assumes that all the
484 inductions have been identified; therefore, this order must not be
485 changed. */
486 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
487 {
488 gphi *phi = gsi.phi ();
489 tree access_fn = NULL;
490 tree def = PHI_RESULT (phi);
491 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
492
493 if (dump_enabled_p ())
494 {
495 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
496 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
497 }
498
499 /* Skip virtual phis. The data dependences that are associated with
500 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
501 if (virtual_operand_p (def))
502 continue;
503
504 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
505
506 /* Analyze the evolution function. */
507 access_fn = analyze_scalar_evolution (loop, def);
508 if (access_fn)
509 {
510 STRIP_NOPS (access_fn);
511 if (dump_enabled_p ())
512 {
513 dump_printf_loc (MSG_NOTE, vect_location,
514 "Access function of PHI: ");
515 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
516 dump_printf (MSG_NOTE, "\n");
517 }
518 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
519 = initial_condition_in_loop_num (access_fn, loop->num);
520 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
521 = evolution_part_in_loop_num (access_fn, loop->num);
522 }
523
524 if (!access_fn
525 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
526 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
527 && TREE_CODE (step) != INTEGER_CST))
528 {
529 worklist.safe_push (stmt_vinfo);
530 continue;
531 }
532
533 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
534 != NULL_TREE);
535 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
536
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
539 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
540 }
541
542
543 /* Second - identify all reductions and nested cycles. */
544 while (worklist.length () > 0)
545 {
546 stmt_vec_info stmt_vinfo = worklist.pop ();
547 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
548 tree def = PHI_RESULT (phi);
549
550 if (dump_enabled_p ())
551 {
552 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
553 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
554 }
555
556 gcc_assert (!virtual_operand_p (def)
557 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
558
559 stmt_vec_info reduc_stmt_info
560 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
561 &double_reduc, false);
562 if (reduc_stmt_info)
563 {
564 if (double_reduc)
565 {
566 if (dump_enabled_p ())
567 dump_printf_loc (MSG_NOTE, vect_location,
568 "Detected double reduction.\n");
569
570 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
571 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
572 = vect_double_reduction_def;
573 }
574 else
575 {
576 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
577 {
578 if (dump_enabled_p ())
579 dump_printf_loc (MSG_NOTE, vect_location,
580 "Detected vectorizable nested cycle.\n");
581
582 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
583 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
584 }
585 else
586 {
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_NOTE, vect_location,
589 "Detected reduction.\n");
590
591 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
592 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
593 /* Store the reduction cycles for possible vectorization in
594 loop-aware SLP if it was not detected as a reduction
595 chain. */
596 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
597 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
598 (reduc_stmt_info);
599 }
600 }
601 }
602 else
603 if (dump_enabled_p ())
604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
605 "Unknown def-use cycle pattern.\n");
606 }
607 }
608
609
610 /* Function vect_analyze_scalar_cycles.
611
612 Examine the cross iteration def-use cycles of scalar variables, by
613 analyzing the loop-header PHIs of scalar variables. Classify each
614 cycle as one of the following: invariant, induction, reduction, unknown.
615 We do that for the loop represented by LOOP_VINFO, and also for its
616 inner-loop, if it exists.
617 Examples for scalar cycles:
618
619 Example1: reduction:
620
621 loop1:
622 for (i=0; i<N; i++)
623 sum += a[i];
624
625 Example2: induction:
626
627 loop2:
628 for (i=0; i<N; i++)
629 a[i] = i; */
630
631 static void
632 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
633 {
634 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
635
636 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
637
638 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
639 Reductions in such inner-loop therefore have different properties than
640 the reductions in the nest that gets vectorized:
641 1. When vectorized, they are executed in the same order as in the original
642 scalar loop, so we can't change the order of computation when
643 vectorizing them.
644 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
645 current checks are too strict. */
646
647 if (loop->inner)
648 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
649 }
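
/* As an illustration of the classification done above when an outer loop
   is being considered for vectorization, in

       for (i = 0; i < n; i++)
         for (j = 0; j < m; j++)
           sum += a[i][j];

   the outer-loop header phi for "sum" is detected as a double reduction
   (the inner loop reduces into a value that the outer loop accumulates),
   while the phi for "i" is a plain induction.  This is only a sketch of
   the common case; the precise classification is done by
   vect_force_simple_reduction.  */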
650
651 /* Transfer group and reduction information from STMT_INFO to its
652 pattern stmt. */
653
654 static void
655 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
656 {
657 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
658 stmt_vec_info stmtp;
659 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
660 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
661 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
662 do
663 {
664 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
665 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
666 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
667 if (stmt_info)
668 REDUC_GROUP_NEXT_ELEMENT (stmtp)
669 = STMT_VINFO_RELATED_STMT (stmt_info);
670 }
671 while (stmt_info);
672 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
673 }
674
675 /* Fixup scalar cycles that now have their stmts detected as patterns. */
676
677 static void
678 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
679 {
680 stmt_vec_info first;
681 unsigned i;
682
683 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
684 if (STMT_VINFO_IN_PATTERN_P (first))
685 {
686 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
687 while (next)
688 {
689 if (! STMT_VINFO_IN_PATTERN_P (next))
690 break;
691 next = REDUC_GROUP_NEXT_ELEMENT (next);
692 }
693 /* If not all stmts in the chain are patterns, try to handle
694 the chain without patterns. */
695 if (! next)
696 {
697 vect_fixup_reduc_chain (first);
698 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
699 = STMT_VINFO_RELATED_STMT (first);
700 }
701 }
702 }
703
704 /* Function vect_get_loop_niters.
705
706 Determine the number of iterations the loop executes and place it
707 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
708 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
709 niter information holds in ASSUMPTIONS.
710
711 Return the loop exit condition. */
712
713
714 static gcond *
715 vect_get_loop_niters (struct loop *loop, tree *assumptions,
716 tree *number_of_iterations, tree *number_of_iterationsm1)
717 {
718 edge exit = single_exit (loop);
719 struct tree_niter_desc niter_desc;
720 tree niter_assumptions, niter, may_be_zero;
721 gcond *cond = get_loop_exit_condition (loop);
722
723 *assumptions = boolean_true_node;
724 *number_of_iterationsm1 = chrec_dont_know;
725 *number_of_iterations = chrec_dont_know;
726 DUMP_VECT_SCOPE ("get_loop_niters");
727
728 if (!exit)
729 return cond;
730
731 niter = chrec_dont_know;
732 may_be_zero = NULL_TREE;
733 niter_assumptions = boolean_true_node;
734 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
735 || chrec_contains_undetermined (niter_desc.niter))
736 return cond;
737
738 niter_assumptions = niter_desc.assumptions;
739 may_be_zero = niter_desc.may_be_zero;
740 niter = niter_desc.niter;
741
742 if (may_be_zero && integer_zerop (may_be_zero))
743 may_be_zero = NULL_TREE;
744
745 if (may_be_zero)
746 {
747 if (COMPARISON_CLASS_P (may_be_zero))
748 {
749 /* Try to combine may_be_zero with assumptions, this can simplify
750 computation of niter expression. */
751 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
752 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
753 niter_assumptions,
754 fold_build1 (TRUTH_NOT_EXPR,
755 boolean_type_node,
756 may_be_zero));
757 else
758 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
759 build_int_cst (TREE_TYPE (niter), 0),
760 rewrite_to_non_trapping_overflow (niter));
761
762 may_be_zero = NULL_TREE;
763 }
764 else if (integer_nonzerop (may_be_zero))
765 {
766 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
767 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
768 return cond;
769 }
770 else
771 return cond;
772 }
773
774 *assumptions = niter_assumptions;
775 *number_of_iterationsm1 = niter;
776
777 /* We want the number of loop header executions which is the number
778 of latch executions plus one.
779 ??? For UINT_MAX latch executions this number overflows to zero
780 for loops like do { n++; } while (n != 0); */
781 if (niter && !chrec_contains_undetermined (niter))
782 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
783 build_int_cst (TREE_TYPE (niter), 1));
784 *number_of_iterations = niter;
785
786 return cond;
787 }
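
/* A small worked example of the distinction above: for a loop whose body
   executes n times (n > 0) in the do-while form we require, the latch
   executes n - 1 times, so the niter analysis computes n - 1;
   NUMBER_OF_ITERATIONSM1 is therefore n - 1 and NUMBER_OF_ITERATIONS, the
   number of header executions, is n.  */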
788
789 /* Function bb_in_loop_p
790
791 Used as predicate for dfs order traversal of the loop bbs. */
792
793 static bool
794 bb_in_loop_p (const_basic_block bb, const void *data)
795 {
796 const struct loop *const loop = (const struct loop *)data;
797 if (flow_bb_inside_loop_p (loop, bb))
798 return true;
799 return false;
800 }
801
802
803 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
804 stmt_vec_info structs for all the stmts in LOOP_IN. */
805
806 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
807 : vec_info (vec_info::loop, init_cost (loop_in), shared),
808 loop (loop_in),
809 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
810 num_itersm1 (NULL_TREE),
811 num_iters (NULL_TREE),
812 num_iters_unchanged (NULL_TREE),
813 num_iters_assumptions (NULL_TREE),
814 th (0),
815 versioning_threshold (0),
816 vectorization_factor (0),
817 max_vectorization_factor (0),
818 mask_skip_niters (NULL_TREE),
819 mask_compare_type (NULL_TREE),
820 unaligned_dr (NULL),
821 peeling_for_alignment (0),
822 ptr_mask (0),
823 ivexpr_map (NULL),
824 slp_unrolling_factor (1),
825 single_scalar_iteration_cost (0),
826 vectorizable (false),
827 can_fully_mask_p (true),
828 fully_masked_p (false),
829 peeling_for_gaps (false),
830 peeling_for_niter (false),
831 operands_swapped (false),
832 no_data_dependencies (false),
833 has_mask_store (false),
834 scalar_loop (NULL),
835 orig_loop_info (NULL)
836 {
837 /* Create/Update stmt_info for all stmts in the loop. */
838 basic_block *body = get_loop_body (loop);
839 for (unsigned int i = 0; i < loop->num_nodes; i++)
840 {
841 basic_block bb = body[i];
842 gimple_stmt_iterator si;
843
844 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
845 {
846 gimple *phi = gsi_stmt (si);
847 gimple_set_uid (phi, 0);
848 add_stmt (phi);
849 }
850
851 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
852 {
853 gimple *stmt = gsi_stmt (si);
854 gimple_set_uid (stmt, 0);
855 add_stmt (stmt);
856 }
857 }
858 free (body);
859
860 /* CHECKME: We want to visit all BBs before their successors (except for
861 latch blocks, for which this assertion wouldn't hold). In the simple
862 case of the loop forms we allow, a dfs order of the BBs would be the same
863 as reversed postorder traversal, so we are safe. */
864
865 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
866 bbs, loop->num_nodes, loop);
867 gcc_assert (nbbs == loop->num_nodes);
868 }
869
870 /* Free all levels of MASKS. */
871
872 void
873 release_vec_loop_masks (vec_loop_masks *masks)
874 {
875 rgroup_masks *rgm;
876 unsigned int i;
877 FOR_EACH_VEC_ELT (*masks, i, rgm)
878 rgm->masks.release ();
879 masks->release ();
880 }
881
882 /* Free all memory used by the _loop_vec_info, as well as all the
883 stmt_vec_info structs of all the stmts in the loop. */
884
885 _loop_vec_info::~_loop_vec_info ()
886 {
887 int nbbs;
888 gimple_stmt_iterator si;
889 int j;
890
891 nbbs = loop->num_nodes;
892 for (j = 0; j < nbbs; j++)
893 {
894 basic_block bb = bbs[j];
895 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
896 {
897 gimple *stmt = gsi_stmt (si);
898
899 /* We may have broken canonical form by moving a constant
900 into RHS1 of a commutative op. Fix such occurrences. */
901 if (operands_swapped && is_gimple_assign (stmt))
902 {
903 enum tree_code code = gimple_assign_rhs_code (stmt);
904
905 if ((code == PLUS_EXPR
906 || code == POINTER_PLUS_EXPR
907 || code == MULT_EXPR)
908 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
909 swap_ssa_operands (stmt,
910 gimple_assign_rhs1_ptr (stmt),
911 gimple_assign_rhs2_ptr (stmt));
912 else if (code == COND_EXPR
913 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
914 {
915 tree cond_expr = gimple_assign_rhs1 (stmt);
916 enum tree_code cond_code = TREE_CODE (cond_expr);
917
918 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
919 {
920 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
921 0));
922 cond_code = invert_tree_comparison (cond_code,
923 honor_nans);
924 if (cond_code != ERROR_MARK)
925 {
926 TREE_SET_CODE (cond_expr, cond_code);
927 swap_ssa_operands (stmt,
928 gimple_assign_rhs2_ptr (stmt),
929 gimple_assign_rhs3_ptr (stmt));
930 }
931 }
932 }
933 }
934 gsi_next (&si);
935 }
936 }
937
938 free (bbs);
939
940 release_vec_loop_masks (&masks);
941 delete ivexpr_map;
942
943 loop->aux = NULL;
944 }
945
946 /* Return an invariant or register for EXPR and emit necessary
947 computations in the LOOP_VINFO loop preheader. */
948
949 tree
950 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
951 {
952 if (is_gimple_reg (expr)
953 || is_gimple_min_invariant (expr))
954 return expr;
955
956 if (! loop_vinfo->ivexpr_map)
957 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
958 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
959 if (! cached)
960 {
961 gimple_seq stmts = NULL;
962 cached = force_gimple_operand (unshare_expr (expr),
963 &stmts, true, NULL_TREE);
964 if (stmts)
965 {
966 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
967 gsi_insert_seq_on_edge_immediate (e, stmts);
968 }
969 }
970 return cached;
971 }
972
973 /* Return true if we can use CMP_TYPE as the comparison type to produce
974 all masks required to mask LOOP_VINFO. */
975
976 static bool
977 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
978 {
979 rgroup_masks *rgm;
980 unsigned int i;
981 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
982 if (rgm->mask_type != NULL_TREE
983 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
984 cmp_type, rgm->mask_type,
985 OPTIMIZE_FOR_SPEED))
986 return false;
987 return true;
988 }
989
990 /* Calculate the maximum number of scalars per iteration for every
991 rgroup in LOOP_VINFO. */
992
993 static unsigned int
994 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
995 {
996 unsigned int res = 1;
997 unsigned int i;
998 rgroup_masks *rgm;
999 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1000 res = MAX (res, rgm->max_nscalars_per_iter);
1001 return res;
1002 }
1003
1004 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1005 whether we can actually generate the masks required. Return true if so,
1006 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1007
1008 static bool
1009 vect_verify_full_masking (loop_vec_info loop_vinfo)
1010 {
1011 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1012 unsigned int min_ni_width;
1013
1014 /* Use a normal loop if there are no statements that need masking.
1015 This only happens in rare degenerate cases: it means that the loop
1016 has no loads, no stores, and no live-out values. */
1017 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1018 return false;
1019
1020 /* Get the maximum number of iterations that is representable
1021 in the counter type. */
1022 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1023 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1024
1025 /* Get a more refined estimate for the number of iterations. */
1026 widest_int max_back_edges;
1027 if (max_loop_iterations (loop, &max_back_edges))
1028 max_ni = wi::smin (max_ni, max_back_edges + 1);
1029
1030 /* Account for rgroup masks, in which each bit is replicated N times. */
1031 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1032
1033 /* Work out how many bits we need to represent the limit. */
1034 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1035
1036 /* Find a scalar mode for which WHILE_ULT is supported. */
1037 opt_scalar_int_mode cmp_mode_iter;
1038 tree cmp_type = NULL_TREE;
1039 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1040 {
1041 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1042 if (cmp_bits >= min_ni_width
1043 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1044 {
1045 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1046 if (this_type
1047 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1048 {
1049 /* Although we could stop as soon as we find a valid mode,
1050 it's often better to continue until we hit Pmode, since the
1051 operands to the WHILE are more likely to be reusable in
1052 address calculations. */
1053 cmp_type = this_type;
1054 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1055 break;
1056 }
1057 }
1058 }
1059
1060 if (!cmp_type)
1061 return false;
1062
1063 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1064 return true;
1065 }
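
/* A worked example of the width computation above, with made-up numbers:
   if the loop is known to execute at most 1000 iterations and the widest
   rgroup needs 2 scalars per iteration, max_ni is 2000 and min_ni_width is
   11 bits (2000 < 2^11).  The mode walk then starts at the narrowest
   integer mode of at least 11 bits (a 16-bit mode on typical targets),
   remembers the widest mode seen so far for which WHILE_ULT can produce
   every required mask type, and stops once that candidate is at least as
   wide as Pmode.  */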
1066
1067 /* Calculate the cost of one scalar iteration of the loop. */
1068 static void
1069 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1070 {
1071 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1072 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1073 int nbbs = loop->num_nodes, factor;
1074 int innerloop_iters, i;
1075
1076 /* Gather costs for statements in the scalar loop. */
1077
1078 /* FORNOW. */
1079 innerloop_iters = 1;
1080 if (loop->inner)
1081 innerloop_iters = 50; /* FIXME */
1082
1083 for (i = 0; i < nbbs; i++)
1084 {
1085 gimple_stmt_iterator si;
1086 basic_block bb = bbs[i];
1087
1088 if (bb->loop_father == loop->inner)
1089 factor = innerloop_iters;
1090 else
1091 factor = 1;
1092
1093 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1094 {
1095 gimple *stmt = gsi_stmt (si);
1096 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1097
1098 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1099 continue;
1100
1101 /* Skip stmts that are not vectorized inside the loop. */
1102 if (stmt_info
1103 && !STMT_VINFO_RELEVANT_P (stmt_info)
1104 && (!STMT_VINFO_LIVE_P (stmt_info)
1105 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1106 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1107 continue;
1108
1109 vect_cost_for_stmt kind;
1110 if (STMT_VINFO_DATA_REF (stmt_info))
1111 {
1112 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1113 kind = scalar_load;
1114 else
1115 kind = scalar_store;
1116 }
1117 else
1118 kind = scalar_stmt;
1119
1120 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1121 factor, kind, stmt_info, 0, vect_prologue);
1122 }
1123 }
1124
1125 /* Now accumulate cost. */
1126 void *target_cost_data = init_cost (loop);
1127 stmt_info_for_cost *si;
1128 int j;
1129 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1130 j, si)
1131 (void) add_stmt_cost (target_cost_data, si->count,
1132 si->kind, si->stmt_info, si->misalign,
1133 vect_body);
1134 unsigned dummy, body_cost = 0;
1135 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1136 destroy_cost_data (target_cost_data);
1137 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1138 }
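
/* For illustration: when an outer loop is being costed, a scalar load in
   its inner loop is recorded above with a count of 50 (the FORNOW/FIXME
   weighting), so if the target charges 1 unit per scalar_load, that single
   stmt alone contributes 50 units to the accumulated single scalar
   iteration cost.  The per-stmt charge of 1 is only an assumed example;
   the real value comes from the target cost model via add_stmt_cost.  */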
1139
1140
1141 /* Function vect_analyze_loop_form_1.
1142
1143 Verify that certain CFG restrictions hold, including:
1144 - the loop has a pre-header
1145 - the loop has a single entry and exit
1146 - the loop exit condition is simple enough
1147 - the number of iterations can be analyzed, i.e., a countable loop. The
1148 niter could be analyzed under some assumptions. */
1149
1150 bool
1151 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1152 tree *assumptions, tree *number_of_iterationsm1,
1153 tree *number_of_iterations, gcond **inner_loop_cond)
1154 {
1155 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1156
1157 /* Different restrictions apply when we are considering an inner-most loop,
1158 vs. an outer (nested) loop.
1159 (FORNOW. May want to relax some of these restrictions in the future). */
1160
1161 if (!loop->inner)
1162 {
1163 /* Inner-most loop. We currently require that the number of BBs is
1164 exactly 2 (the header and latch). Vectorizable inner-most loops
1165 look like this:
1166
1167 (pre-header)
1168 |
1169 header <--------+
1170 | | |
1171 | +--> latch --+
1172 |
1173 (exit-bb) */
1174
1175 if (loop->num_nodes != 2)
1176 {
1177 if (dump_enabled_p ())
1178 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1179 "not vectorized: control flow in loop.\n");
1180 return false;
1181 }
1182
1183 if (empty_block_p (loop->header))
1184 {
1185 if (dump_enabled_p ())
1186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1187 "not vectorized: empty loop.\n");
1188 return false;
1189 }
1190 }
1191 else
1192 {
1193 struct loop *innerloop = loop->inner;
1194 edge entryedge;
1195
1196 /* Nested loop. We currently require that the loop is doubly-nested,
1197 contains a single inner loop, and the number of BBs is exactly 5.
1198 Vectorizable outer-loops look like this:
1199
1200 (pre-header)
1201 |
1202 header <---+
1203 | |
1204 inner-loop |
1205 | |
1206 tail ------+
1207 |
1208 (exit-bb)
1209
1210 The inner-loop has the properties expected of inner-most loops
1211 as described above. */
1212
1213 if ((loop->inner)->inner || (loop->inner)->next)
1214 {
1215 if (dump_enabled_p ())
1216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1217 "not vectorized: multiple nested loops.\n");
1218 return false;
1219 }
1220
1221 if (loop->num_nodes != 5)
1222 {
1223 if (dump_enabled_p ())
1224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1225 "not vectorized: control flow in loop.\n");
1226 return false;
1227 }
1228
1229 entryedge = loop_preheader_edge (innerloop);
1230 if (entryedge->src != loop->header
1231 || !single_exit (innerloop)
1232 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1233 {
1234 if (dump_enabled_p ())
1235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1236 "not vectorized: unsupported outerloop form.\n");
1237 return false;
1238 }
1239
1240 /* Analyze the inner-loop. */
1241 tree inner_niterm1, inner_niter, inner_assumptions;
1242 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1243 &inner_assumptions, &inner_niterm1,
1244 &inner_niter, NULL)
1245 /* Don't support analyzing niter under assumptions for inner
1246 loop. */
1247 || !integer_onep (inner_assumptions))
1248 {
1249 if (dump_enabled_p ())
1250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1251 "not vectorized: Bad inner loop.\n");
1252 return false;
1253 }
1254
1255 if (!expr_invariant_in_loop_p (loop, inner_niter))
1256 {
1257 if (dump_enabled_p ())
1258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1259 "not vectorized: inner-loop count not"
1260 " invariant.\n");
1261 return false;
1262 }
1263
1264 if (dump_enabled_p ())
1265 dump_printf_loc (MSG_NOTE, vect_location,
1266 "Considering outer-loop vectorization.\n");
1267 }
1268
1269 if (!single_exit (loop)
1270 || EDGE_COUNT (loop->header->preds) != 2)
1271 {
1272 if (dump_enabled_p ())
1273 {
1274 if (!single_exit (loop))
1275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1276 "not vectorized: multiple exits.\n");
1277 else if (EDGE_COUNT (loop->header->preds) != 2)
1278 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1279 "not vectorized: too many incoming edges.\n");
1280 }
1281 return false;
1282 }
1283
1284 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1285 that the loop is represented as a do-while (with a proper if-guard
1286 before the loop if needed), where the loop header contains all the
1287 executable statements, and the latch is empty. */
1288 if (!empty_block_p (loop->latch)
1289 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1290 {
1291 if (dump_enabled_p ())
1292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1293 "not vectorized: latch block not empty.\n");
1294 return false;
1295 }
1296
1297 /* Make sure the exit is not abnormal. */
1298 edge e = single_exit (loop);
1299 if (e->flags & EDGE_ABNORMAL)
1300 {
1301 if (dump_enabled_p ())
1302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1303 "not vectorized: abnormal loop exit edge.\n");
1304 return false;
1305 }
1306
1307 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1308 number_of_iterationsm1);
1309 if (!*loop_cond)
1310 {
1311 if (dump_enabled_p ())
1312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1313 "not vectorized: complicated exit condition.\n");
1314 return false;
1315 }
1316
1317 if (integer_zerop (*assumptions)
1318 || !*number_of_iterations
1319 || chrec_contains_undetermined (*number_of_iterations))
1320 {
1321 if (dump_enabled_p ())
1322 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1323 "not vectorized: number of iterations cannot be "
1324 "computed.\n");
1325 return false;
1326 }
1327
1328 if (integer_zerop (*number_of_iterations))
1329 {
1330 if (dump_enabled_p ())
1331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1332 "not vectorized: number of iterations = 0.\n");
1333 return false;
1334 }
1335
1336 return true;
1337 }
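
/* Source-level sketches of the restrictions checked above (assuming
   if-conversion has already flattened the loop body):

       accepted - a single-exit counted loop with an empty latch:
           for (i = 0; i < n; i++)
             a[i] = b[i] + c[i];

       rejected - an early exit gives the loop two exit edges:
           for (i = 0; i < n; i++)
             if (a[i] == key)
               break;

   These are only approximations of the CFG forms the checks actually
   inspect.  */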
1338
1339 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1340
1341 loop_vec_info
1342 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1343 {
1344 tree assumptions, number_of_iterations, number_of_iterationsm1;
1345 gcond *loop_cond, *inner_loop_cond = NULL;
1346
1347 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1348 &assumptions, &number_of_iterationsm1,
1349 &number_of_iterations, &inner_loop_cond))
1350 return NULL;
1351
1352 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1353 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1354 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1355 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1356 if (!integer_onep (assumptions))
1357 {
1358 /* We consider vectorizing this loop by versioning it under
1359 some assumptions. In order to do this, we need to clear
1360 existing information computed by scev and niter analyzer. */
1361 scev_reset_htab ();
1362 free_numbers_of_iterations_estimates (loop);
1363 /* Also set a flag for this loop so that the following scev and niter
1364 analyses are done under the assumptions. */
1365 loop_constraint_set (loop, LOOP_C_FINITE);
1366 /* Also record the assumptions for versioning. */
1367 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1368 }
1369
1370 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1371 {
1372 if (dump_enabled_p ())
1373 {
1374 dump_printf_loc (MSG_NOTE, vect_location,
1375 "Symbolic number of iterations is ");
1376 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1377 dump_printf (MSG_NOTE, "\n");
1378 }
1379 }
1380
1381 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1382 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1383 if (inner_loop_cond)
1384 {
1385 stmt_vec_info inner_loop_cond_info
1386 = loop_vinfo->lookup_stmt (inner_loop_cond);
1387 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1388 }
1389
1390 gcc_assert (!loop->aux);
1391 loop->aux = loop_vinfo;
1392 return loop_vinfo;
1393 }
1394
1395
1396
1397 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1398 statements, update the vectorization factor. */
1399
1400 static void
1401 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1402 {
1403 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1404 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1405 int nbbs = loop->num_nodes;
1406 poly_uint64 vectorization_factor;
1407 int i;
1408
1409 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1410
1411 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1412 gcc_assert (known_ne (vectorization_factor, 0U));
1413
1414 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1415 vectorization factor of the loop is the unrolling factor required by
1416 the SLP instances. If that unrolling factor is 1, we say that we
1417 perform pure SLP on the loop - cross-iteration parallelism is not
1418 exploited. */
1419 bool only_slp_in_loop = true;
1420 for (i = 0; i < nbbs; i++)
1421 {
1422 basic_block bb = bbs[i];
1423 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1424 gsi_next (&si))
1425 {
1426 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1427 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1428 && STMT_VINFO_RELATED_STMT (stmt_info))
1429 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
1430 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1431 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1432 && !PURE_SLP_STMT (stmt_info))
1433 /* STMT needs both SLP and loop-based vectorization. */
1434 only_slp_in_loop = false;
1435 }
1436 }
1437
1438 if (only_slp_in_loop)
1439 {
1440 dump_printf_loc (MSG_NOTE, vect_location,
1441 "Loop contains only SLP stmts\n");
1442 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1443 }
1444 else
1445 {
1446 dump_printf_loc (MSG_NOTE, vect_location,
1447 "Loop contains SLP and non-SLP stmts\n");
1448 /* Both the vectorization factor and unroll factor have the form
1449 current_vector_size * X for some rational X, so they must have
1450 a common multiple. */
1451 vectorization_factor
1452 = force_common_multiple (vectorization_factor,
1453 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1454 }
1455
1456 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1457 if (dump_enabled_p ())
1458 {
1459 dump_printf_loc (MSG_NOTE, vect_location,
1460 "Updating vectorization factor to ");
1461 dump_dec (MSG_NOTE, vectorization_factor);
1462 dump_printf (MSG_NOTE, ".\n");
1463 }
1464 }
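
/* The adjustment above amounts to taking a common multiple (for constant
   factors, the least common multiple): if, say, the non-SLP stmts require
   a vectorization factor of 2 while the SLP instances need an unrolling
   factor of 8, the updated vectorization factor is 8; for factors of 4 and
   6 it would be 12.  The concrete numbers are purely illustrative.  */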
1465
1466 /* Return true if STMT_INFO describes a double reduction phi and if
1467 the other phi in the reduction is also relevant for vectorization.
1468 This rejects cases such as:
1469
1470 outer1:
1471 x_1 = PHI <x_3(outer2), ...>;
1472 ...
1473
1474 inner:
1475 x_2 = ...;
1476 ...
1477
1478 outer2:
1479 x_3 = PHI <x_2(inner)>;
1480
1481 if nothing in x_2 or elsewhere makes x_1 relevant. */
1482
1483 static bool
1484 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1485 {
1486 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1487 return false;
1488
1489 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1490 }
1491
1492 /* Function vect_analyze_loop_operations.
1493
1494 Scan the loop stmts and make sure they are all vectorizable. */
1495
1496 static bool
1497 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1498 {
1499 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1500 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1501 int nbbs = loop->num_nodes;
1502 int i;
1503 stmt_vec_info stmt_info;
1504 bool need_to_vectorize = false;
1505 bool ok;
1506
1507 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1508
1509 stmt_vector_for_cost cost_vec;
1510 cost_vec.create (2);
1511
1512 for (i = 0; i < nbbs; i++)
1513 {
1514 basic_block bb = bbs[i];
1515
1516 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1517 gsi_next (&si))
1518 {
1519 gphi *phi = si.phi ();
1520 ok = true;
1521
1522 stmt_info = loop_vinfo->lookup_stmt (phi);
1523 if (dump_enabled_p ())
1524 {
1525 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1526 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1527 }
1528 if (virtual_operand_p (gimple_phi_result (phi)))
1529 continue;
1530
1531 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1532 (i.e., a phi in the tail of the outer-loop). */
1533 if (! is_loop_header_bb_p (bb))
1534 {
1535 /* FORNOW: we currently don't support the case that these phis
1536 are not used in the outer-loop (unless it is a double reduction,
1537 i.e., this phi is vect_reduction_def), because this case
1538 requires us to actually do something here. */
1539 if (STMT_VINFO_LIVE_P (stmt_info)
1540 && !vect_active_double_reduction_p (stmt_info))
1541 {
1542 if (dump_enabled_p ())
1543 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1544 "Unsupported loop-closed phi in "
1545 "outer-loop.\n");
1546 return false;
1547 }
1548
1549 /* If PHI is used in the outer loop, we check that its operand
1550 is defined in the inner loop. */
1551 if (STMT_VINFO_RELEVANT_P (stmt_info))
1552 {
1553 tree phi_op;
1554
1555 if (gimple_phi_num_args (phi) != 1)
1556 return false;
1557
1558 phi_op = PHI_ARG_DEF (phi, 0);
1559 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1560 if (!op_def_info)
1561 return false;
1562
1563 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1564 && (STMT_VINFO_RELEVANT (op_def_info)
1565 != vect_used_in_outer_by_reduction))
1566 return false;
1567 }
1568
1569 continue;
1570 }
1571
1572 gcc_assert (stmt_info);
1573
1574 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1575 || STMT_VINFO_LIVE_P (stmt_info))
1576 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1577 {
1578 /* A scalar-dependence cycle that we don't support. */
1579 if (dump_enabled_p ())
1580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1581 "not vectorized: scalar dependence cycle.\n");
1582 return false;
1583 }
1584
1585 if (STMT_VINFO_RELEVANT_P (stmt_info))
1586 {
1587 need_to_vectorize = true;
1588 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1589 && ! PURE_SLP_STMT (stmt_info))
1590 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1591 &cost_vec);
1592 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1593 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1594 && ! PURE_SLP_STMT (stmt_info))
1595 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1596 &cost_vec);
1597 }
1598
1599 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1600 if (ok
1601 && STMT_VINFO_LIVE_P (stmt_info)
1602 && !PURE_SLP_STMT (stmt_info))
1603 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1604 &cost_vec);
1605
1606 if (!ok)
1607 {
1608 if (dump_enabled_p ())
1609 {
1610 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1611 "not vectorized: relevant phi not "
1612 "supported: ");
1613 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1614 }
1615 return false;
1616 }
1617 }
1618
1619 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1620 gsi_next (&si))
1621 {
1622 gimple *stmt = gsi_stmt (si);
1623 if (!gimple_clobber_p (stmt)
1624 && !vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1625 &need_to_vectorize,
1626 NULL, NULL, &cost_vec))
1627 return false;
1628 }
1629 } /* bbs */
1630
1631 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1632 cost_vec.release ();
1633
1634 /* All operations in the loop are either irrelevant (they deal with loop
1635 control, or are dead), or only used outside the loop and can be moved
1636 out of the loop (e.g. invariants, inductions). The loop can be
1637 optimized away by scalar optimizations. We're better off not
1638 touching this loop. */
1639 if (!need_to_vectorize)
1640 {
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_NOTE, vect_location,
1643 "All the computation can be taken out of the loop.\n");
1644 if (dump_enabled_p ())
1645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1646 "not vectorized: redundant loop. no profit to "
1647 "vectorize.\n");
1648 return false;
1649 }
1650
1651 return true;
1652 }
1653
1654 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1655 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1656 definitely no, or -1 if it's worth retrying. */
1657
1658 static int
1659 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1660 {
1661 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1662 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1663
1664 /* Only fully-masked loops can have iteration counts less than the
1665 vectorization factor. */
1666 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1667 {
1668 HOST_WIDE_INT max_niter;
1669
1670 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1671 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1672 else
1673 max_niter = max_stmt_executions_int (loop);
1674
1675 if (max_niter != -1
1676 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1677 {
1678 if (dump_enabled_p ())
1679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1680 "not vectorized: iteration count smaller than "
1681 "vectorization factor.\n");
1682 return 0;
1683 }
1684 }
1685
1686 int min_profitable_iters, min_profitable_estimate;
1687 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1688 &min_profitable_estimate);
1689
1690 if (min_profitable_iters < 0)
1691 {
1692 if (dump_enabled_p ())
1693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1694 "not vectorized: vectorization not profitable.\n");
1695 if (dump_enabled_p ())
1696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1697 "not vectorized: vector version will never be "
1698 "profitable.\n");
1699 return -1;
1700 }
1701
1702 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1703 * assumed_vf);
1704
1705 /* Use the cost model only if it is more conservative than the
1706 user-specified threshold. */
1707 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1708 min_profitable_iters);
1709
1710 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1711
1712 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1713 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1714 {
1715 if (dump_enabled_p ())
1716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1717 "not vectorized: vectorization not profitable.\n");
1718 if (dump_enabled_p ())
1719 dump_printf_loc (MSG_NOTE, vect_location,
1720 "not vectorized: iteration count smaller than user "
1721 "specified loop bound parameter or minimum profitable "
1722 "iterations (whichever is more conservative).\n");
1723 return 0;
1724 }
1725
1726 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1727 if (estimated_niter == -1)
1728 estimated_niter = likely_max_stmt_executions_int (loop);
1729 if (estimated_niter != -1
1730 && ((unsigned HOST_WIDE_INT) estimated_niter
1731 < MAX (th, (unsigned) min_profitable_estimate)))
1732 {
1733 if (dump_enabled_p ())
1734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1735 "not vectorized: estimated iteration count too "
1736 "small.\n");
1737 if (dump_enabled_p ())
1738 dump_printf_loc (MSG_NOTE, vect_location,
1739 "not vectorized: estimated iteration count smaller "
1740 "than specified loop bound parameter or minimum "
1741 "profitable iterations (whichever is more "
1742 "conservative).\n");
1743 return -1;
1744 }
1745
1746 return 1;
1747 }
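
/* A worked example of the threshold logic above, with made-up numbers:
   with --param min-vect-loop-bound=2 and an assumed vectorization factor
   of 4, min_scalar_loop_bound is 8; if the cost model reports
   min_profitable_iters = 10, the threshold becomes MAX (8, 10) = 10, and a
   loop whose iteration count is known to be 6 is rejected as not
   profitable.  */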
1748
1749 static bool
1750 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1751 vec<data_reference_p> *datarefs,
1752 unsigned int *n_stmts)
1753 {
1754 *n_stmts = 0;
1755 for (unsigned i = 0; i < loop->num_nodes; i++)
1756 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1757 !gsi_end_p (gsi); gsi_next (&gsi))
1758 {
1759 gimple *stmt = gsi_stmt (gsi);
1760 if (is_gimple_debug (stmt))
1761 continue;
1762 ++(*n_stmts);
1763 if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1764 {
1765 if (is_gimple_call (stmt) && loop->safelen)
1766 {
1767 tree fndecl = gimple_call_fndecl (stmt), op;
1768 if (fndecl != NULL_TREE)
1769 {
1770 cgraph_node *node = cgraph_node::get (fndecl);
1771 if (node != NULL && node->simd_clones != NULL)
1772 {
1773 unsigned int j, n = gimple_call_num_args (stmt);
1774 for (j = 0; j < n; j++)
1775 {
1776 op = gimple_call_arg (stmt, j);
1777 if (DECL_P (op)
1778 || (REFERENCE_CLASS_P (op)
1779 && get_base_address (op)))
1780 break;
1781 }
1782 op = gimple_call_lhs (stmt);
1783 /* Ignore #pragma omp declare simd functions
1784 if they don't have data references in the
1785 call stmt itself. */
1786 if (j == n
1787 && !(op
1788 && (DECL_P (op)
1789 || (REFERENCE_CLASS_P (op)
1790 && get_base_address (op)))))
1791 continue;
1792 }
1793 }
1794 }
1795 return false;
1796 }
1797 /* If dependence analysis will give up due to the limit on the
1798 number of datarefs, stop here and fail fatally. */
1799 if (datarefs->length ()
1800 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1801 return false;
1802 }
1803 return true;
1804 }
1805
1806 /* Function vect_analyze_loop_2.
1807
1808 Apply a set of analyses on the loop described by LOOP_VINFO. The
1809 different analyses will record information in the loop_vec_info
1810 struct. */
1811 static bool
1812 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1813 {
1814 bool ok;
1815 int res;
1816 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1817 poly_uint64 min_vf = 2;
1818
1819 /* The first group of checks is independent of the vector size. */
1820 fatal = true;
1821
1822 /* Find all data references in the loop (which correspond to vdefs/vuses)
1823 and analyze their evolution in the loop. */
1824
1825 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1826
1827 /* Gather the data references and count stmts in the loop. */
1828 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1829 {
1830 if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1831 &LOOP_VINFO_DATAREFS (loop_vinfo),
1832 n_stmts))
1833 {
1834 if (dump_enabled_p ())
1835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1836 "not vectorized: loop contains function "
1837 "calls or data references that cannot "
1838 "be analyzed\n");
1839 return false;
1840 }
1841 loop_vinfo->shared->save_datarefs ();
1842 }
1843 else
1844 loop_vinfo->shared->check_datarefs ();
1845
1846 /* Analyze the data references and also adjust the minimal
1847 vectorization factor according to the loads and stores. */
1848
1849 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1850 if (!ok)
1851 {
1852 if (dump_enabled_p ())
1853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854 "bad data references.\n");
1855 return false;
1856 }
1857
1858 /* Classify all cross-iteration scalar data-flow cycles.
1859 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1860 vect_analyze_scalar_cycles (loop_vinfo);
1861
1862 vect_pattern_recog (loop_vinfo);
1863
1864 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1865
1866 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1867 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1868
1869 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1870 if (!ok)
1871 {
1872 if (dump_enabled_p ())
1873 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1874 "bad data access.\n");
1875 return false;
1876 }
1877
1878 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1879
1880 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1881 if (!ok)
1882 {
1883 if (dump_enabled_p ())
1884 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1885 "unexpected pattern.\n");
1886 return false;
1887 }
1888
1889 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are no longer fatal. */
1890 fatal = false;
1891
1892 /* Analyze data dependences between the data-refs in the loop
1893 and adjust the maximum vectorization factor according to
1894 the dependences.
1895 FORNOW: fail at the first data dependence that we encounter. */
1896
1897 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1898 if (!ok
1899 || (max_vf != MAX_VECTORIZATION_FACTOR
1900 && maybe_lt (max_vf, min_vf)))
1901 {
1902 if (dump_enabled_p ())
1903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1904 "bad data dependence.\n");
1905 return false;
1906 }
1907 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1908
1909 ok = vect_determine_vectorization_factor (loop_vinfo);
1910 if (!ok)
1911 {
1912 if (dump_enabled_p ())
1913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1914 "can't determine vectorization factor.\n");
1915 return false;
1916 }
1917 if (max_vf != MAX_VECTORIZATION_FACTOR
1918 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1919 {
1920 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1922 "bad data dependence.\n");
1923 return false;
1924 }
1925
1926 /* Compute the scalar iteration cost. */
1927 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1928
1929 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1930 unsigned th;
1931
1932 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1933 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1934 if (!ok)
1935 return false;
1936
1937 /* If there are any SLP instances mark them as pure_slp. */
1938 bool slp = vect_make_slp_decision (loop_vinfo);
1939 if (slp)
1940 {
1941 /* Find stmts that need to be both vectorized and SLPed. */
1942 vect_detect_hybrid_slp (loop_vinfo);
1943
1944 /* Update the vectorization factor based on the SLP decision. */
1945 vect_update_vf_for_slp (loop_vinfo);
1946 }
1947
1948 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1949
1950 /* We don't expect to have to roll back to anything other than an empty
1951 set of rgroups. */
1952 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1953
1954 /* This is the point where we can re-start analysis with SLP forced off. */
1955 start_over:
1956
1957 /* Now the vectorization factor is final. */
1958 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1959 gcc_assert (known_ne (vectorization_factor, 0U));
1960
1961 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1962 {
1963 dump_printf_loc (MSG_NOTE, vect_location,
1964 "vectorization_factor = ");
1965 dump_dec (MSG_NOTE, vectorization_factor);
1966 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1967 LOOP_VINFO_INT_NITERS (loop_vinfo));
1968 }
1969
1970 HOST_WIDE_INT max_niter
1971 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1972
1973 /* Analyze the alignment of the data-refs in the loop.
1974 Fail if a data reference is found that cannot be vectorized. */
1975
1976 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1977 if (!ok)
1978 {
1979 if (dump_enabled_p ())
1980 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1981 "bad data alignment.\n");
1982 return false;
1983 }
1984
1985 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1986 It is important to call pruning after vect_analyze_data_ref_accesses,
1987 since we use grouping information gathered by interleaving analysis. */
1988 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1989 if (!ok)
1990 return false;
1991
1992 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1993 vectorization. */
1994 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1995 {
1996 /* This pass will decide on using loop versioning and/or loop peeling in
1997 order to enhance the alignment of data references in the loop. */
1998 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1999 if (!ok)
2000 {
2001 if (dump_enabled_p ())
2002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2003 "bad data alignment.\n");
2004 return false;
2005 }
2006 }
2007
2008 if (slp)
2009 {
2010 /* Analyze operations in the SLP instances. Note this may
2011 remove unsupported SLP instances, which makes the SLP kind
2012 detection above invalid. */
2013 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2014 vect_slp_analyze_operations (loop_vinfo);
2015 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2016 goto again;
2017 }
2018
2019 /* Scan all the remaining operations in the loop that are not subject
2020 to SLP and make sure they are vectorizable. */
2021 ok = vect_analyze_loop_operations (loop_vinfo);
2022 if (!ok)
2023 {
2024 if (dump_enabled_p ())
2025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2026 "bad operation or unsupported loop bound.\n");
2027 return false;
2028 }
2029
2030 /* Decide whether to use a fully-masked loop for this vectorization
2031 factor. */
2032 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2033 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2034 && vect_verify_full_masking (loop_vinfo));
2035 if (dump_enabled_p ())
2036 {
2037 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2038 dump_printf_loc (MSG_NOTE, vect_location,
2039 "using a fully-masked loop.\n");
2040 else
2041 dump_printf_loc (MSG_NOTE, vect_location,
2042 "not using a fully-masked loop.\n");
2043 }
2044
2045 /* If epilog loop is required because of data accesses with gaps,
2046 one additional iteration needs to be peeled. Check if there are
2047 enough iterations for vectorization. */
2048 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2049 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2050 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2051 {
2052 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2053 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2054
2055 if (known_lt (wi::to_widest (scalar_niters), vf))
2056 {
2057 if (dump_enabled_p ())
2058 dump_printf_loc (MSG_NOTE, vect_location,
2059 "loop has no enough iterations to support"
2060 " peeling for gaps.\n");
2061 return false;
2062 }
2063 }
2064
2065 /* Check the costings of the loop make vectorizing worthwhile. */
2066 res = vect_analyze_loop_costing (loop_vinfo);
2067 if (res < 0)
2068 goto again;
2069 if (!res)
2070 {
2071 if (dump_enabled_p ())
2072 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2073 "Loop costings not worthwhile.\n");
2074 return false;
2075 }
2076
2077 /* Decide whether we need to create an epilogue loop to handle
2078 remaining scalar iterations. */
2079 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2080
2081 unsigned HOST_WIDE_INT const_vf;
2082 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2083 /* The main loop handles all iterations. */
2084 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2085 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2086 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2087 {
2088 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2089 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2090 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2091 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2092 }
2093 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2094 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2095 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2096 < (unsigned) exact_log2 (const_vf))
2097 /* In case of versioning, check if the maximum number of
2098 iterations is greater than th. If they are identical,
2099 the epilogue is unnecessary. */
2100 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2101 || ((unsigned HOST_WIDE_INT) max_niter
2102 > (th / const_vf) * const_vf))))
2103 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
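  /* Worked example (illustrative values, assuming no peeling for alignment
     and no versioning): with a constant VF of 8 and NITERS known to be 100,
     tree_ctz (100) == 2 is smaller than exact_log2 (8) == 3, i.e. 100 is
     not a multiple of 8, so an epilogue loop is needed for the remaining
     100 % 8 == 4 scalar iterations.  */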
2104
2105 /* If an epilogue loop is required make sure we can create one. */
2106 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2107 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2108 {
2109 if (dump_enabled_p ())
2110 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2111 if (!vect_can_advance_ivs_p (loop_vinfo)
2112 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2113 single_exit (LOOP_VINFO_LOOP
2114 (loop_vinfo))))
2115 {
2116 if (dump_enabled_p ())
2117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2118 "not vectorized: can't create required "
2119 "epilog loop\n");
2120 goto again;
2121 }
2122 }
2123
2124 /* During peeling, we need to check whether the number of loop iterations
2125 is enough for both the peeled prolog loop and the vector loop. This
2126 check can be merged with the threshold check of loop versioning, so
2127 increase the threshold for this case if necessary. */
2128 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2129 {
2130 poly_uint64 niters_th = 0;
2131
2132 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2133 {
2134 /* Niters for peeled prolog loop. */
2135 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2136 {
2137 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2138 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2139 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2140 }
2141 else
2142 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2143 }
2144
2145 /* Niters for at least one iteration of vectorized loop. */
2146 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2147 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2148 /* One additional iteration because of peeling for gaps. */
2149 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2150 niters_th += 1;
2151 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2152 }
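  /* Illustrative example only: for a loop that requires versioning, with a
     known alignment peel of 3 iterations, a VF of 8, no full masking and
     peeling for gaps required, the versioning threshold computed above is
     3 + 8 + 1 = 12 iterations.  */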
2153
2154 gcc_assert (known_eq (vectorization_factor,
2155 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2156
2157 /* Ok to vectorize! */
2158 return true;
2159
2160 again:
2161 /* Try again with SLP forced off but if we didn't do any SLP there is
2162 no point in re-trying. */
2163 if (!slp)
2164 return false;
2165
2166 /* If there are reduction chains re-trying will fail anyway. */
2167 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2168 return false;
2169
2170 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2171 via interleaving or lane instructions. */
2172 slp_instance instance;
2173 slp_tree node;
2174 unsigned i, j;
2175 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2176 {
2177 stmt_vec_info vinfo;
2178 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2179 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2180 continue;
2181 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2182 unsigned int size = DR_GROUP_SIZE (vinfo);
2183 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2184 if (! vect_store_lanes_supported (vectype, size, false)
2185 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2186 && ! vect_grouped_store_supported (vectype, size))
2187 return false;
2188 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2189 {
2190 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2191 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2192 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2193 size = DR_GROUP_SIZE (vinfo);
2194 vectype = STMT_VINFO_VECTYPE (vinfo);
2195 if (! vect_load_lanes_supported (vectype, size, false)
2196 && ! vect_grouped_load_supported (vectype, single_element_p,
2197 size))
2198 return false;
2199 }
2200 }
2201
2202 if (dump_enabled_p ())
2203 dump_printf_loc (MSG_NOTE, vect_location,
2204 "re-trying with SLP disabled\n");
2205
2206 /* Roll back state appropriately. No SLP this time. */
2207 slp = false;
2208 /* Restore the vectorization factor as it was without SLP. */
2209 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2210 /* Free the SLP instances. */
2211 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2212 vect_free_slp_instance (instance, false);
2213 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2214 /* Reset SLP type to loop_vect on all stmts. */
2215 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2216 {
2217 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2218 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2219 !gsi_end_p (si); gsi_next (&si))
2220 {
2221 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2222 STMT_SLP_TYPE (stmt_info) = loop_vect;
2223 }
2224 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2225 !gsi_end_p (si); gsi_next (&si))
2226 {
2227 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2228 STMT_SLP_TYPE (stmt_info) = loop_vect;
2229 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2230 {
2231 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2232 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2233 STMT_SLP_TYPE (stmt_info) = loop_vect;
2234 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2235 !gsi_end_p (pi); gsi_next (&pi))
2236 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2237 = loop_vect;
2238 }
2239 }
2240 }
2241 /* Free optimized alias test DDRS. */
2242 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2243 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2244 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2245 /* Reset target cost data. */
2246 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2247 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2248 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2249 /* Reset accumulated rgroup information. */
2250 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2251 /* Reset assorted flags. */
2252 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2253 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2254 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2255 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2256 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2257
2258 goto start_over;
2259 }
2260
2261 /* Function vect_analyze_loop.
2262
2263 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2264 for it. The different analyses will record information in the
2265 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, then LOOP is
2266 the epilogue of ORIG_LOOP_VINFO's loop and must be vectorized. */
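/* For example, on a target whose autovectorize_vector_sizes hook reports
   several vector sizes (say 64, 32 and 16 bytes -- illustrative values,
   the actual list is target-specific), a non-fatal analysis failure with
   the autodetected size is retried with the remaining sizes in the order
   the hook reports them, giving up after the last one.  */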
2267 loop_vec_info
2268 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2269 vec_info_shared *shared)
2270 {
2271 loop_vec_info loop_vinfo;
2272 auto_vector_sizes vector_sizes;
2273
2274 /* Autodetect first vector size we try. */
2275 current_vector_size = 0;
2276 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2277 unsigned int next_size = 0;
2278
2279 DUMP_VECT_SCOPE ("analyze_loop_nest");
2280
2281 if (loop_outer (loop)
2282 && loop_vec_info_for_loop (loop_outer (loop))
2283 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2284 {
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_NOTE, vect_location,
2287 "outer-loop already vectorized.\n");
2288 return NULL;
2289 }
2290
2291 if (!find_loop_nest (loop, &shared->loop_nest))
2292 {
2293 if (dump_enabled_p ())
2294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2295 "not vectorized: loop nest containing two "
2296 "or more consecutive inner loops cannot be "
2297 "vectorized\n");
2298 return NULL;
2299 }
2300
2301 unsigned n_stmts = 0;
2302 poly_uint64 autodetected_vector_size = 0;
2303 while (1)
2304 {
2305 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2306 loop_vinfo = vect_analyze_loop_form (loop, shared);
2307 if (!loop_vinfo)
2308 {
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 "bad loop form.\n");
2312 return NULL;
2313 }
2314
2315 bool fatal = false;
2316
2317 if (orig_loop_vinfo)
2318 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2319
2320 if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2321 {
2322 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2323
2324 return loop_vinfo;
2325 }
2326
2327 delete loop_vinfo;
2328
2329 if (next_size == 0)
2330 autodetected_vector_size = current_vector_size;
2331
2332 if (next_size < vector_sizes.length ()
2333 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2334 next_size += 1;
2335
2336 if (fatal
2337 || next_size == vector_sizes.length ()
2338 || known_eq (current_vector_size, 0U))
2339 return NULL;
2340
2341 /* Try the next biggest vector size. */
2342 current_vector_size = vector_sizes[next_size++];
2343 if (dump_enabled_p ())
2344 {
2345 dump_printf_loc (MSG_NOTE, vect_location,
2346 "***** Re-trying analysis with "
2347 "vector size ");
2348 dump_dec (MSG_NOTE, current_vector_size);
2349 dump_printf (MSG_NOTE, "\n");
2350 }
2351 }
2352 }
2353
2354 /* Return true if there is an in-order reduction function for CODE, storing
2355 it in *REDUC_FN if so. */
2356
2357 static bool
2358 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2359 {
2360 switch (code)
2361 {
2362 case PLUS_EXPR:
2363 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2364 return true;
2365
2366 default:
2367 return false;
2368 }
2369 }
2370
2371 /* Function reduction_fn_for_scalar_code
2372
2373 Input:
2374 CODE - tree_code of a reduction operation.
2375
2376 Output:
2377 REDUC_FN - the corresponding internal function to be used to reduce the
2378 vector of partial results into a single scalar result, or IFN_LAST
2379 if the operation is a supported reduction operation, but does not have
2380 such an internal function.
2381
2382 Return FALSE if CODE currently cannot be vectorized as a reduction. */
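/* For example (illustrative only), a scalar accumulation

       for (i = 0; i < n; i++)
	 s += a[i];

   uses PLUS_EXPR, so the vector of partial sums is reduced in the epilogue
   with IFN_REDUC_PLUS, whereas MULT_EXPR is accepted but mapped to
   IFN_LAST, meaning no single internal function performs the final
   reduction.  */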
2383
2384 static bool
2385 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2386 {
2387 switch (code)
2388 {
2389 case MAX_EXPR:
2390 *reduc_fn = IFN_REDUC_MAX;
2391 return true;
2392
2393 case MIN_EXPR:
2394 *reduc_fn = IFN_REDUC_MIN;
2395 return true;
2396
2397 case PLUS_EXPR:
2398 *reduc_fn = IFN_REDUC_PLUS;
2399 return true;
2400
2401 case BIT_AND_EXPR:
2402 *reduc_fn = IFN_REDUC_AND;
2403 return true;
2404
2405 case BIT_IOR_EXPR:
2406 *reduc_fn = IFN_REDUC_IOR;
2407 return true;
2408
2409 case BIT_XOR_EXPR:
2410 *reduc_fn = IFN_REDUC_XOR;
2411 return true;
2412
2413 case MULT_EXPR:
2414 case MINUS_EXPR:
2415 *reduc_fn = IFN_LAST;
2416 return true;
2417
2418 default:
2419 return false;
2420 }
2421 }
2422
2423 /* If there is a neutral value X such that SLP reduction SLP_NODE would not
2424 be affected by the introduction of additional X elements, return that X,
2425 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2426 is true if the SLP statements perform a single reduction, false if each
2427 statement performs an independent reduction. */
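/* For instance, padding an addition or bitwise IOR/XOR reduction with extra
   zero elements, a multiplication with extra one elements, or a bitwise AND
   with all-ones elements leaves the final result unchanged; those are the
   neutral values returned by the switch below (informal summary).  */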
2428
2429 static tree
2430 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2431 bool reduc_chain)
2432 {
2433 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2434 stmt_vec_info stmt_vinfo = stmts[0];
2435 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2436 tree scalar_type = TREE_TYPE (vector_type);
2437 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2438 gcc_assert (loop);
2439
2440 switch (code)
2441 {
2442 case WIDEN_SUM_EXPR:
2443 case DOT_PROD_EXPR:
2444 case SAD_EXPR:
2445 case PLUS_EXPR:
2446 case MINUS_EXPR:
2447 case BIT_IOR_EXPR:
2448 case BIT_XOR_EXPR:
2449 return build_zero_cst (scalar_type);
2450
2451 case MULT_EXPR:
2452 return build_one_cst (scalar_type);
2453
2454 case BIT_AND_EXPR:
2455 return build_all_ones_cst (scalar_type);
2456
2457 case MAX_EXPR:
2458 case MIN_EXPR:
2459 /* For MIN/MAX the initial values are neutral. A reduction chain
2460 has only a single initial value, so that value is neutral for
2461 all statements. */
2462 if (reduc_chain)
2463 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2464 loop_preheader_edge (loop));
2465 return NULL_TREE;
2466
2467 default:
2468 return NULL_TREE;
2469 }
2470 }
2471
2472 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2473 STMT is printed with a message MSG. */
2474
2475 static void
2476 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2477 {
2478 dump_printf_loc (msg_type, vect_location, "%s", msg);
2479 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2480 }
2481
2482 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2483 operation. Return true if the results of DEF_STMT_INFO are something
2484 that can be accumulated by such a reduction. */
2485
2486 static bool
2487 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2488 {
2489 return (is_gimple_assign (def_stmt_info->stmt)
2490 || is_gimple_call (def_stmt_info->stmt)
2491 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2492 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2493 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2494 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2495 }
2496
2497 /* Detect SLP reduction of the form:
2498
2499 #a1 = phi <a5, a0>
2500 a2 = operation (a1)
2501 a3 = operation (a2)
2502 a4 = operation (a3)
2503 a5 = operation (a4)
2504
2505 #a = phi <a5>
2506
2507 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2508 FIRST_STMT is the first reduction stmt in the chain
2509 (a2 = operation (a1)).
2510
2511 Return TRUE if a reduction chain was detected. */
2512
2513 static bool
2514 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2515 gimple *first_stmt)
2516 {
2517 struct loop *loop = (gimple_bb (phi))->loop_father;
2518 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2519 enum tree_code code;
2520 gimple *loop_use_stmt = NULL;
2521 stmt_vec_info use_stmt_info, current_stmt_info = NULL;
2522 tree lhs;
2523 imm_use_iterator imm_iter;
2524 use_operand_p use_p;
2525 int nloop_uses, size = 0, n_out_of_loop_uses;
2526 bool found = false;
2527
2528 if (loop != vect_loop)
2529 return false;
2530
2531 lhs = PHI_RESULT (phi);
2532 code = gimple_assign_rhs_code (first_stmt);
2533 while (1)
2534 {
2535 nloop_uses = 0;
2536 n_out_of_loop_uses = 0;
2537 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2538 {
2539 gimple *use_stmt = USE_STMT (use_p);
2540 if (is_gimple_debug (use_stmt))
2541 continue;
2542
2543 /* Check if we got back to the reduction phi. */
2544 if (use_stmt == phi)
2545 {
2546 loop_use_stmt = use_stmt;
2547 found = true;
2548 break;
2549 }
2550
2551 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2552 {
2553 loop_use_stmt = use_stmt;
2554 nloop_uses++;
2555 }
2556 else
2557 n_out_of_loop_uses++;
2558
2559 /* There can be either a single use in the loop or two uses in
2560 phi nodes. */
2561 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2562 return false;
2563 }
2564
2565 if (found)
2566 break;
2567
2568 /* We reached a statement with no loop uses. */
2569 if (nloop_uses == 0)
2570 return false;
2571
2572 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2573 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2574 return false;
2575
2576 if (!is_gimple_assign (loop_use_stmt)
2577 || code != gimple_assign_rhs_code (loop_use_stmt)
2578 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2579 return false;
2580
2581 /* Insert USE_STMT into reduction chain. */
2582 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2583 if (current_stmt_info)
2584 {
2585 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = use_stmt_info;
2586 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2587 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2588 }
2589 else
2590 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = use_stmt_info;
2591
2592 lhs = gimple_assign_lhs (loop_use_stmt);
2593 current_stmt_info = use_stmt_info;
2594 size++;
2595 }
2596
2597 if (!found || loop_use_stmt != phi || size < 2)
2598 return false;
2599
2600 /* Swap the operands, if needed, to make the reduction operand the second
2601 operand. */
2602 lhs = PHI_RESULT (phi);
2603 stmt_vec_info next_stmt_info = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2604 while (next_stmt_info)
2605 {
2606 gassign *next_stmt = as_a <gassign *> (next_stmt_info->stmt);
2607 if (gimple_assign_rhs2 (next_stmt) == lhs)
2608 {
2609 tree op = gimple_assign_rhs1 (next_stmt);
2610 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2611
2612 /* Check that the other def is either defined in the loop
2613 ("vect_internal_def"), or it's an induction (defined by a
2614 loop-header phi-node). */
2615 if (def_stmt_info
2616 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2617 && vect_valid_reduction_input_p (def_stmt_info))
2618 {
2619 lhs = gimple_assign_lhs (next_stmt);
2620 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2621 continue;
2622 }
2623
2624 return false;
2625 }
2626 else
2627 {
2628 tree op = gimple_assign_rhs2 (next_stmt);
2629 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2630
2631 /* Check that the other def is either defined in the loop
2632 ("vect_internal_def"), or it's an induction (defined by a
2633 loop-header phi-node). */
2634 if (def_stmt_info
2635 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2636 && vect_valid_reduction_input_p (def_stmt_info))
2637 {
2638 if (dump_enabled_p ())
2639 {
2640 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2641 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2642 }
2643
2644 swap_ssa_operands (next_stmt,
2645 gimple_assign_rhs1_ptr (next_stmt),
2646 gimple_assign_rhs2_ptr (next_stmt));
2647 update_stmt (next_stmt);
2648
2649 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2650 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2651 }
2652 else
2653 return false;
2654 }
2655
2656 lhs = gimple_assign_lhs (next_stmt);
2657 next_stmt_info = REDUC_GROUP_NEXT_ELEMENT (next_stmt_info);
2658 }
2659
2660 /* Save the chain for further analysis in SLP detection. */
2661 stmt_vec_info first_stmt_info
2662 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2663 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first_stmt_info);
2664 REDUC_GROUP_SIZE (first_stmt_info) = size;
2665
2666 return true;
2667 }
2668
2669 /* Return true if we need an in-order reduction for operation CODE
2670 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2671 overflow must wrap. */
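/* For example, without -fassociative-math (e.g. without -ffast-math) a
   float accumulation such as

       float s = 0.0f;
       for (i = 0; i < n; i++)
	 s += x[i];

   must preserve the original left-to-right order of the additions, so it
   needs an in-order (fold-left) reduction rather than a reassociating
   one.  */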
2672
2673 static bool
2674 needs_fold_left_reduction_p (tree type, tree_code code,
2675 bool need_wrapping_integral_overflow)
2676 {
2677 /* CHECKME: check for !flag_finite_math_only too? */
2678 if (SCALAR_FLOAT_TYPE_P (type))
2679 switch (code)
2680 {
2681 case MIN_EXPR:
2682 case MAX_EXPR:
2683 return false;
2684
2685 default:
2686 return !flag_associative_math;
2687 }
2688
2689 if (INTEGRAL_TYPE_P (type))
2690 {
2691 if (!operation_no_trapping_overflow (type, code))
2692 return true;
2693 if (need_wrapping_integral_overflow
2694 && !TYPE_OVERFLOW_WRAPS (type)
2695 && operation_can_overflow (code))
2696 return true;
2697 return false;
2698 }
2699
2700 if (SAT_FIXED_POINT_TYPE_P (type))
2701 return true;
2702
2703 return false;
2704 }
2705
2706 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2707 reduction operation CODE has a handled computation expression. */
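/* For example, for the cycle

       s_1 = PHI <s_0, s_4>
       s_2 = s_1 + x_1;
       s_3 = s_2 + x_2;
       s_4 = s_3 + x_3;

   the path from s_4 back to s_1 consists solely of single-use PLUS_EXPR
   statements, so the reduction is accepted; a MINUS_EXPR along the path is
   only tolerated for a PLUS_EXPR reduction, and only if the accumulated
   value is not left negated overall (illustrative sketch).  */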
2708
2709 bool
2710 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2711 tree loop_arg, enum tree_code code)
2712 {
2713 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2714 auto_bitmap visited;
2715 tree lookfor = PHI_RESULT (phi);
2716 ssa_op_iter curri;
2717 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2718 while (USE_FROM_PTR (curr) != loop_arg)
2719 curr = op_iter_next_use (&curri);
2720 curri.i = curri.numops;
2721 do
2722 {
2723 path.safe_push (std::make_pair (curri, curr));
2724 tree use = USE_FROM_PTR (curr);
2725 if (use == lookfor)
2726 break;
2727 gimple *def = SSA_NAME_DEF_STMT (use);
2728 if (gimple_nop_p (def)
2729 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2730 {
2731 pop:
2732 do
2733 {
2734 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2735 curri = x.first;
2736 curr = x.second;
2737 do
2738 curr = op_iter_next_use (&curri);
2739 /* Skip already visited or non-SSA operands (from iterating
2740 over PHI args). */
2741 while (curr != NULL_USE_OPERAND_P
2742 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2743 || ! bitmap_set_bit (visited,
2744 SSA_NAME_VERSION
2745 (USE_FROM_PTR (curr)))));
2746 }
2747 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2748 if (curr == NULL_USE_OPERAND_P)
2749 break;
2750 }
2751 else
2752 {
2753 if (gimple_code (def) == GIMPLE_PHI)
2754 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2755 else
2756 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2757 while (curr != NULL_USE_OPERAND_P
2758 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2759 || ! bitmap_set_bit (visited,
2760 SSA_NAME_VERSION
2761 (USE_FROM_PTR (curr)))))
2762 curr = op_iter_next_use (&curri);
2763 if (curr == NULL_USE_OPERAND_P)
2764 goto pop;
2765 }
2766 }
2767 while (1);
2768 if (dump_file && (dump_flags & TDF_DETAILS))
2769 {
2770 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2771 unsigned i;
2772 std::pair<ssa_op_iter, use_operand_p> *x;
2773 FOR_EACH_VEC_ELT (path, i, x)
2774 {
2775 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2776 dump_printf (MSG_NOTE, " ");
2777 }
2778 dump_printf (MSG_NOTE, "\n");
2779 }
2780
2781 /* Check whether the reduction path detected is valid. */
2782 bool fail = path.length () == 0;
2783 bool neg = false;
2784 for (unsigned i = 1; i < path.length (); ++i)
2785 {
2786 gimple *use_stmt = USE_STMT (path[i].second);
2787 tree op = USE_FROM_PTR (path[i].second);
2788 if (! has_single_use (op)
2789 || ! is_gimple_assign (use_stmt))
2790 {
2791 fail = true;
2792 break;
2793 }
2794 if (gimple_assign_rhs_code (use_stmt) != code)
2795 {
2796 if (code == PLUS_EXPR
2797 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2798 {
2799 /* Track whether we negate the reduction value each iteration. */
2800 if (gimple_assign_rhs2 (use_stmt) == op)
2801 neg = ! neg;
2802 }
2803 else
2804 {
2805 fail = true;
2806 break;
2807 }
2808 }
2809 }
2810 return ! fail && ! neg;
2811 }
2812
2813
2814 /* Function vect_is_simple_reduction
2815
2816 (1) Detect a cross-iteration def-use cycle that represents a simple
2817 reduction computation. We look for the following pattern:
2818
2819 loop_header:
2820 a1 = phi < a0, a2 >
2821 a3 = ...
2822 a2 = operation (a3, a1)
2823
2824 or
2825
2826 a3 = ...
2827 loop_header:
2828 a1 = phi < a0, a2 >
2829 a2 = operation (a3, a1)
2830
2831 such that:
2832 1. operation is commutative and associative and it is safe to
2833 change the order of the computation
2834 2. no uses for a2 in the loop (a2 is used out of the loop)
2835 3. no uses of a1 in the loop besides the reduction operation
2836 4. no uses of a1 outside the loop.
2837
2838 Conditions 1,4 are tested here.
2839 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2840
2841 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2842 nested cycles.
2843
2844 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2845 reductions:
2846
2847 a1 = phi < a0, a2 >
2848 inner loop (def of a3)
2849 a2 = phi < a3 >
2850
2851 (4) Detect condition expressions, i.e.:
2852 for (int i = 0; i < N; i++)
2853 if (a[i] < val)
2854 ret_val = a[i];
2855
2856 */
2857
2858 static stmt_vec_info
2859 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2860 bool *double_reduc,
2861 bool need_wrapping_integral_overflow,
2862 enum vect_reduction_type *v_reduc_type)
2863 {
2864 gphi *phi = as_a <gphi *> (phi_info->stmt);
2865 struct loop *loop = (gimple_bb (phi))->loop_father;
2866 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2867 gimple *phi_use_stmt = NULL;
2868 enum tree_code orig_code, code;
2869 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2870 tree type;
2871 int nloop_uses;
2872 tree name;
2873 imm_use_iterator imm_iter;
2874 use_operand_p use_p;
2875 bool phi_def;
2876
2877 *double_reduc = false;
2878 *v_reduc_type = TREE_CODE_REDUCTION;
2879
2880 tree phi_name = PHI_RESULT (phi);
2881 /* ??? If there are no uses of the PHI result the inner loop reduction
2882 won't be detected as possibly double-reduction by vectorizable_reduction
2883 because that tries to walk the PHI arg from the preheader edge which
2884 can be constant. See PR60382. */
2885 if (has_zero_uses (phi_name))
2886 return NULL;
2887 nloop_uses = 0;
2888 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2889 {
2890 gimple *use_stmt = USE_STMT (use_p);
2891 if (is_gimple_debug (use_stmt))
2892 continue;
2893
2894 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2895 {
2896 if (dump_enabled_p ())
2897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2898 "intermediate value used outside loop.\n");
2899
2900 return NULL;
2901 }
2902
2903 nloop_uses++;
2904 if (nloop_uses > 1)
2905 {
2906 if (dump_enabled_p ())
2907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2908 "reduction value used in loop.\n");
2909 return NULL;
2910 }
2911
2912 phi_use_stmt = use_stmt;
2913 }
2914
2915 edge latch_e = loop_latch_edge (loop);
2916 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2917 if (TREE_CODE (loop_arg) != SSA_NAME)
2918 {
2919 if (dump_enabled_p ())
2920 {
2921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2922 "reduction: not ssa_name: ");
2923 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2924 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2925 }
2926 return NULL;
2927 }
2928
2929 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2930 if (!def_stmt_info)
2931 return NULL;
2932
2933 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2934 {
2935 name = gimple_assign_lhs (def_stmt);
2936 phi_def = false;
2937 }
2938 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2939 {
2940 name = PHI_RESULT (def_stmt);
2941 phi_def = true;
2942 }
2943 else
2944 {
2945 if (dump_enabled_p ())
2946 {
2947 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2948 "reduction: unhandled reduction operation: ");
2949 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2950 def_stmt_info->stmt, 0);
2951 }
2952 return NULL;
2953 }
2954
2955 nloop_uses = 0;
2956 auto_vec<gphi *, 3> lcphis;
2957 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2958 {
2959 gimple *use_stmt = USE_STMT (use_p);
2960 if (is_gimple_debug (use_stmt))
2961 continue;
2962 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2963 nloop_uses++;
2964 else
2965 /* We can have more than one loop-closed PHI. */
2966 lcphis.safe_push (as_a <gphi *> (use_stmt));
2967 if (nloop_uses > 1)
2968 {
2969 if (dump_enabled_p ())
2970 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2971 "reduction used in loop.\n");
2972 return NULL;
2973 }
2974 }
2975
2976 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2977 defined in the inner loop. */
2978 if (phi_def)
2979 {
2980 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2981 op1 = PHI_ARG_DEF (def_stmt, 0);
2982
2983 if (gimple_phi_num_args (def_stmt) != 1
2984 || TREE_CODE (op1) != SSA_NAME)
2985 {
2986 if (dump_enabled_p ())
2987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2988 "unsupported phi node definition.\n");
2989
2990 return NULL;
2991 }
2992
2993 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2994 if (gimple_bb (def1)
2995 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2996 && loop->inner
2997 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2998 && is_gimple_assign (def1)
2999 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3000 {
3001 if (dump_enabled_p ())
3002 report_vect_op (MSG_NOTE, def_stmt,
3003 "detected double reduction: ");
3004
3005 *double_reduc = true;
3006 return def_stmt_info;
3007 }
3008
3009 return NULL;
3010 }
3011
3012 /* If we are vectorizing an inner reduction, it is executed in its original
3013 order, so we only need to check whether reordering is safe when its
3014 result is also used outside the vectorized loop (a double reduction). */
3015 bool check_reduction = true;
3016 if (flow_loop_nested_p (vect_loop, loop))
3017 {
3018 gphi *lcphi;
3019 unsigned i;
3020 check_reduction = false;
3021 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3022 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3023 {
3024 gimple *use_stmt = USE_STMT (use_p);
3025 if (is_gimple_debug (use_stmt))
3026 continue;
3027 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3028 check_reduction = true;
3029 }
3030 }
3031
3032 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3033 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3034 code = orig_code = gimple_assign_rhs_code (def_stmt);
3035
3036 /* We can handle "res -= x[i]", which is non-associative, by
3037 simply rewriting it into "res += -x[i]". Avoid changing the
3038 gimple instruction for the first simple tests and only do this
3039 if we're allowed to change code at all. */
3040 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3041 code = PLUS_EXPR;
3042
3043 if (code == COND_EXPR)
3044 {
3045 if (! nested_in_vect_loop)
3046 *v_reduc_type = COND_REDUCTION;
3047
3048 op3 = gimple_assign_rhs1 (def_stmt);
3049 if (COMPARISON_CLASS_P (op3))
3050 {
3051 op4 = TREE_OPERAND (op3, 1);
3052 op3 = TREE_OPERAND (op3, 0);
3053 }
3054 if (op3 == phi_name || op4 == phi_name)
3055 {
3056 if (dump_enabled_p ())
3057 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3058 "reduction: condition depends on previous"
3059 " iteration: ");
3060 return NULL;
3061 }
3062
3063 op1 = gimple_assign_rhs2 (def_stmt);
3064 op2 = gimple_assign_rhs3 (def_stmt);
3065 }
3066 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3067 {
3068 if (dump_enabled_p ())
3069 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3070 "reduction: not commutative/associative: ");
3071 return NULL;
3072 }
3073 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3074 {
3075 op1 = gimple_assign_rhs1 (def_stmt);
3076 op2 = gimple_assign_rhs2 (def_stmt);
3077 }
3078 else
3079 {
3080 if (dump_enabled_p ())
3081 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3082 "reduction: not handled operation: ");
3083 return NULL;
3084 }
3085
3086 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3087 {
3088 if (dump_enabled_p ())
3089 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3090 "reduction: both uses not ssa_names: ");
3091
3092 return NULL;
3093 }
3094
3095 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3096 if ((TREE_CODE (op1) == SSA_NAME
3097 && !types_compatible_p (type,TREE_TYPE (op1)))
3098 || (TREE_CODE (op2) == SSA_NAME
3099 && !types_compatible_p (type, TREE_TYPE (op2)))
3100 || (op3 && TREE_CODE (op3) == SSA_NAME
3101 && !types_compatible_p (type, TREE_TYPE (op3)))
3102 || (op4 && TREE_CODE (op4) == SSA_NAME
3103 && !types_compatible_p (type, TREE_TYPE (op4))))
3104 {
3105 if (dump_enabled_p ())
3106 {
3107 dump_printf_loc (MSG_NOTE, vect_location,
3108 "reduction: multiple types: operation type: ");
3109 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3110 dump_printf (MSG_NOTE, ", operands types: ");
3111 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3112 TREE_TYPE (op1));
3113 dump_printf (MSG_NOTE, ",");
3114 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3115 TREE_TYPE (op2));
3116 if (op3)
3117 {
3118 dump_printf (MSG_NOTE, ",");
3119 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3120 TREE_TYPE (op3));
3121 }
3122
3123 if (op4)
3124 {
3125 dump_printf (MSG_NOTE, ",");
3126 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3127 TREE_TYPE (op4));
3128 }
3129 dump_printf (MSG_NOTE, "\n");
3130 }
3131
3132 return NULL;
3133 }
3134
3135 /* Check whether it's ok to change the order of the computation.
3136 Generally, when vectorizing a reduction we change the order of the
3137 computation. This may change the behavior of the program in some
3138 cases, so we need to check that this is ok. One exception is when
3139 vectorizing an outer-loop: the inner-loop is executed sequentially,
3140 and therefore vectorizing reductions in the inner-loop during
3141 outer-loop vectorization is safe. */
3142 if (check_reduction
3143 && *v_reduc_type == TREE_CODE_REDUCTION
3144 && needs_fold_left_reduction_p (type, code,
3145 need_wrapping_integral_overflow))
3146 *v_reduc_type = FOLD_LEFT_REDUCTION;
3147
3148 /* Reduction is safe. We're dealing with one of the following:
3149 1) integer arithmetic and no trapv
3150 2) floating point arithmetic, and special flags permit this optimization
3151 3) nested cycle (i.e., outer loop vectorization). */
3152 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3153 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3154 if (code != COND_EXPR && !def1_info && !def2_info)
3155 {
3156 if (dump_enabled_p ())
3157 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3158 return NULL;
3159 }
3160
3161 /* Check that one def is the reduction def, defined by PHI,
3162 the other def is either defined in the loop ("vect_internal_def"),
3163 or it's an induction (defined by a loop-header phi-node). */
3164
3165 if (def2_info
3166 && def2_info->stmt == phi
3167 && (code == COND_EXPR
3168 || !def1_info
3169 || vect_valid_reduction_input_p (def1_info)))
3170 {
3171 if (dump_enabled_p ())
3172 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3173 return def_stmt_info;
3174 }
3175
3176 if (def1_info
3177 && def1_info->stmt == phi
3178 && (code == COND_EXPR
3179 || !def2_info
3180 || vect_valid_reduction_input_p (def2_info)))
3181 {
3182 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3183 {
3184 /* Check if we can swap operands (just for simplicity - so that
3185 the rest of the code can assume that the reduction variable
3186 is always the last (second) argument). */
3187 if (code == COND_EXPR)
3188 {
3189 /* Swap cond_expr by inverting the condition. */
3190 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3191 enum tree_code invert_code = ERROR_MARK;
3192 enum tree_code cond_code = TREE_CODE (cond_expr);
3193
3194 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3195 {
3196 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3197 invert_code = invert_tree_comparison (cond_code, honor_nans);
3198 }
3199 if (invert_code != ERROR_MARK)
3200 {
3201 TREE_SET_CODE (cond_expr, invert_code);
3202 swap_ssa_operands (def_stmt,
3203 gimple_assign_rhs2_ptr (def_stmt),
3204 gimple_assign_rhs3_ptr (def_stmt));
3205 }
3206 else
3207 {
3208 if (dump_enabled_p ())
3209 report_vect_op (MSG_NOTE, def_stmt,
3210 "detected reduction: cannot swap operands "
3211 "for cond_expr");
3212 return NULL;
3213 }
3214 }
3215 else
3216 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3217 gimple_assign_rhs2_ptr (def_stmt));
3218
3219 if (dump_enabled_p ())
3220 report_vect_op (MSG_NOTE, def_stmt,
3221 "detected reduction: need to swap operands: ");
3222
3223 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3224 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3225 }
3226 else
3227 {
3228 if (dump_enabled_p ())
3229 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3230 }
3231
3232 return def_stmt_info;
3233 }
3234
3235 /* Try to find SLP reduction chain. */
3236 if (! nested_in_vect_loop
3237 && code != COND_EXPR
3238 && orig_code != MINUS_EXPR
3239 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3240 {
3241 if (dump_enabled_p ())
3242 report_vect_op (MSG_NOTE, def_stmt,
3243 "reduction: detected reduction chain: ");
3244
3245 return def_stmt_info;
3246 }
3247
3248 /* Dissolve any group that vect_is_slp_reduction may have half-built. */
3249 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (def_stmt_info);
3250 while (first)
3251 {
3252 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
3253 REDUC_GROUP_FIRST_ELEMENT (first) = NULL;
3254 REDUC_GROUP_NEXT_ELEMENT (first) = NULL;
3255 first = next;
3256 }
3257
3258 /* Look for the expression computing loop_arg from loop PHI result. */
3259 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3260 return def_stmt_info;
3261
3262 if (dump_enabled_p ())
3263 {
3264 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3265 "reduction: unknown pattern: ");
3266 }
3267
3268 return NULL;
3269 }
3270
3271 /* Wrapper around vect_is_simple_reduction, which will modify code
3272 in-place if it enables detection of more reductions. Arguments
3273 as there. */
3274
3275 stmt_vec_info
3276 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3277 bool *double_reduc,
3278 bool need_wrapping_integral_overflow)
3279 {
3280 enum vect_reduction_type v_reduc_type;
3281 stmt_vec_info def_info
3282 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3283 need_wrapping_integral_overflow,
3284 &v_reduc_type);
3285 if (def_info)
3286 {
3287 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3288 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3289 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3290 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3291 }
3292 return def_info;
3293 }
3294
3295 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
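/* For example (illustrative numbers): with an assumed VF of 8, a known
   iteration count of 23 and a prologue peel of 3 iterations, the epilogue
   peels (23 - 3) % 8 == 4 iterations, and each scalar statement's cost is
   counted 3 times in the prologue cost vector and 4 times in the epilogue
   cost vector.  */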
3296 int
3297 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3298 int *peel_iters_epilogue,
3299 stmt_vector_for_cost *scalar_cost_vec,
3300 stmt_vector_for_cost *prologue_cost_vec,
3301 stmt_vector_for_cost *epilogue_cost_vec)
3302 {
3303 int retval = 0;
3304 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3305
3306 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3307 {
3308 *peel_iters_epilogue = assumed_vf / 2;
3309 if (dump_enabled_p ())
3310 dump_printf_loc (MSG_NOTE, vect_location,
3311 "cost model: epilogue peel iters set to vf/2 "
3312 "because loop iterations are unknown .\n");
3313
3314 /* If peeled iterations are known but the number of scalar loop
3315 iterations is unknown, count a taken branch per peeled loop. */
3316 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3317 NULL, 0, vect_prologue);
3318 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3319 NULL, 0, vect_epilogue);
3320 }
3321 else
3322 {
3323 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3324 peel_iters_prologue = niters < peel_iters_prologue ?
3325 niters : peel_iters_prologue;
3326 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3327 /* If we need to peel for gaps but the epilogue would otherwise peel no
3328 iterations, we have to peel VF iterations instead. */
3329 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3330 *peel_iters_epilogue = assumed_vf;
3331 }
3332
3333 stmt_info_for_cost *si;
3334 int j;
3335 if (peel_iters_prologue)
3336 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3337 retval += record_stmt_cost (prologue_cost_vec,
3338 si->count * peel_iters_prologue,
3339 si->kind, si->stmt_info, si->misalign,
3340 vect_prologue);
3341 if (*peel_iters_epilogue)
3342 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3343 retval += record_stmt_cost (epilogue_cost_vec,
3344 si->count * *peel_iters_epilogue,
3345 si->kind, si->stmt_info, si->misalign,
3346 vect_epilogue);
3347
3348 return retval;
3349 }
3350
3351 /* Function vect_estimate_min_profitable_iters
3352
3353 Return the number of iterations required for the vector version of the
3354 loop to be profitable relative to the cost of the scalar version of the
3355 loop.
3356
3357 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3358 of iterations for vectorization. A value of -1 means loop vectorization
3359 is not profitable. This returned value may be used for a dynamic
3360 profitability check.
3361
3362 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3363 for static check against estimated number of iterations. */
3364
3365 static void
3366 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3367 int *ret_min_profitable_niters,
3368 int *ret_min_profitable_estimate)
3369 {
3370 int min_profitable_iters;
3371 int min_profitable_estimate;
3372 int peel_iters_prologue;
3373 int peel_iters_epilogue;
3374 unsigned vec_inside_cost = 0;
3375 int vec_outside_cost = 0;
3376 unsigned vec_prologue_cost = 0;
3377 unsigned vec_epilogue_cost = 0;
3378 int scalar_single_iter_cost = 0;
3379 int scalar_outside_cost = 0;
3380 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3381 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3382 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3383
3384 /* Cost model disabled. */
3385 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3386 {
3387 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3388 *ret_min_profitable_niters = 0;
3389 *ret_min_profitable_estimate = 0;
3390 return;
3391 }
3392
3393 /* Requires loop versioning tests to handle misalignment. */
3394 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3395 {
3396 /* FIXME: Make cost depend on complexity of individual check. */
3397 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3398 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3399 vect_prologue);
3400 dump_printf (MSG_NOTE,
3401 "cost model: Adding cost of checks for loop "
3402 "versioning to treat misalignment.\n");
3403 }
3404
3405 /* Requires loop versioning with alias checks. */
3406 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3407 {
3408 /* FIXME: Make cost depend on complexity of individual check. */
3409 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3410 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3411 vect_prologue);
3412 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3413 if (len)
3414 /* Count LEN - 1 ANDs and LEN comparisons. */
3415 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3416 NULL, 0, vect_prologue);
3417 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3418 if (len)
3419 {
3420 /* Count LEN - 1 ANDs and LEN comparisons. */
3421 unsigned int nstmts = len * 2 - 1;
3422 /* +1 for each bias that needs adding. */
3423 for (unsigned int i = 0; i < len; ++i)
3424 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3425 nstmts += 1;
3426 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3427 NULL, 0, vect_prologue);
3428 }
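	  /* E.g. (illustrative count): three lower-bound checks give
	     3 * 2 - 1 == 5 statements (2 ANDs and 3 comparisons), plus one
	     extra statement for each bound that is signed and therefore
	     needs a bias added.  */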
3429 dump_printf (MSG_NOTE,
3430 "cost model: Adding cost of checks for loop "
3431 "versioning aliasing.\n");
3432 }
3433
3434 /* Requires loop versioning with niter checks. */
3435 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3436 {
3437 /* FIXME: Make cost depend on complexity of individual check. */
3438 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3439 vect_prologue);
3440 dump_printf (MSG_NOTE,
3441 "cost model: Adding cost of checks for loop "
3442 "versioning niters.\n");
3443 }
3444
3445 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3446 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3447 vect_prologue);
3448
3449 /* Count statements in scalar loop. Using this as scalar cost for a single
3450 iteration for now.
3451
3452 TODO: Add outer loop support.
3453
3454 TODO: Consider assigning different costs to different scalar
3455 statements. */
3456
3457 scalar_single_iter_cost
3458 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3459
3460 /* Add additional cost for the peeled instructions in prologue and epilogue
3461 loop. (For fully-masked loops there will be no peeling.)
3462
3463 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3464 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3465
3466 TODO: Build an expression that represents peel_iters for prologue and
3467 epilogue to be used in a run-time test. */
3468
3469 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3470 {
3471 peel_iters_prologue = 0;
3472 peel_iters_epilogue = 0;
3473
3474 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3475 {
3476 /* We need to peel exactly one iteration. */
3477 peel_iters_epilogue += 1;
3478 stmt_info_for_cost *si;
3479 int j;
3480 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3481 j, si)
3482 (void) add_stmt_cost (target_cost_data, si->count,
3483 si->kind, si->stmt_info, si->misalign,
3484 vect_epilogue);
3485 }
3486 }
3487 else if (npeel < 0)
3488 {
3489 peel_iters_prologue = assumed_vf / 2;
3490 dump_printf (MSG_NOTE, "cost model: "
3491 "prologue peel iters set to vf/2.\n");
3492
3493 /* If peeling for alignment is unknown, the loop bound of the main loop
3494 becomes unknown. */
3495 peel_iters_epilogue = assumed_vf / 2;
3496 dump_printf (MSG_NOTE, "cost model: "
3497 "epilogue peel iters set to vf/2 because "
3498 "peeling for alignment is unknown.\n");
3499
3500 /* If peeled iterations are unknown, count a taken branch and a not taken
3501 branch per peeled loop. Even if scalar loop iterations are known,
3502 vector iterations are not known since peeled prologue iterations are
3503 not known. Hence guards remain the same. */
3504 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3505 NULL, 0, vect_prologue);
3506 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3507 NULL, 0, vect_prologue);
3508 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3509 NULL, 0, vect_epilogue);
3510 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3511 NULL, 0, vect_epilogue);
3512 stmt_info_for_cost *si;
3513 int j;
3514 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3515 {
3516 (void) add_stmt_cost (target_cost_data,
3517 si->count * peel_iters_prologue,
3518 si->kind, si->stmt_info, si->misalign,
3519 vect_prologue);
3520 (void) add_stmt_cost (target_cost_data,
3521 si->count * peel_iters_epilogue,
3522 si->kind, si->stmt_info, si->misalign,
3523 vect_epilogue);
3524 }
3525 }
3526 else
3527 {
3528 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3529 stmt_info_for_cost *si;
3530 int j;
3531 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3532
3533 prologue_cost_vec.create (2);
3534 epilogue_cost_vec.create (2);
3535 peel_iters_prologue = npeel;
3536
3537 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3538 &peel_iters_epilogue,
3539 &LOOP_VINFO_SCALAR_ITERATION_COST
3540 (loop_vinfo),
3541 &prologue_cost_vec,
3542 &epilogue_cost_vec);
3543
3544 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3545 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3546 si->misalign, vect_prologue);
3547
3548 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3549 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3550 si->misalign, vect_epilogue);
3551
3552 prologue_cost_vec.release ();
3553 epilogue_cost_vec.release ();
3554 }
3555
3556 /* FORNOW: The scalar outside cost is incremented in one of the
3557 following ways:
3558
3559 1. The vectorizer checks for alignment and aliasing and generates
3560 a condition that allows dynamic vectorization. A cost model
3561      check is ANDed with the versioning condition.  Hence the scalar
3562      code path now has the added cost of the versioning check.
3563
3564 if (cost > th & versioning_check)
3565 jmp to vector code
3566
3567      Hence the run-time scalar cost is incremented by a not-taken branch cost.
3568
3569 2. The vectorizer then checks if a prologue is required. If the
3570 cost model check was not done before during versioning, it has to
3571 be done before the prologue check.
3572
3573 if (cost <= th)
3574 prologue = scalar_iters
3575 if (prologue == 0)
3576 jmp to vector code
3577 else
3578 execute prologue
3579 if (prologue == num_iters)
3580 go to exit
3581
3582      Hence the run-time scalar cost is incremented by two taken branches
3583      plus a not-taken branch.
3584
3585 3. The vectorizer then checks if an epilogue is required. If the
3586 cost model check was not done before during prologue check, it
3587 has to be done with the epilogue check.
3588
3589 if (prologue == 0)
3590 jmp to vector code
3591 else
3592 execute prologue
3593 if (prologue == num_iters)
3594 go to exit
3595 vector code:
3596 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3597 jmp to epilogue
3598
3599 Hence the run-time scalar cost should be incremented by 2 taken
3600 branches.
3601
3602      TODO: The back end may reorder the BBs differently and reverse
3603 conditions/branch directions. Change the estimates below to
3604 something more reasonable. */
3605
3606 /* If the number of iterations is known and we do not do versioning, we can
3607 decide whether to vectorize at compile time. Hence the scalar version
3608      does not carry cost model guard costs.  */
3609 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3610 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3611 {
3612 /* Cost model check occurs at versioning. */
3613 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3614 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3615 else
3616 {
3617 /* Cost model check occurs at prologue generation. */
3618 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3619 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3620 + vect_get_stmt_cost (cond_branch_not_taken);
3621 /* Cost model check occurs at epilogue generation. */
3622 else
3623 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3624 }
3625 }
3626
3627 /* Complete the target-specific cost calculations. */
3628 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3629 &vec_inside_cost, &vec_epilogue_cost);
3630
3631 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3632
3633 if (dump_enabled_p ())
3634 {
3635 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3636 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3637 vec_inside_cost);
3638 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3639 vec_prologue_cost);
3640 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3641 vec_epilogue_cost);
3642 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3643 scalar_single_iter_cost);
3644 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3645 scalar_outside_cost);
3646 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3647 vec_outside_cost);
3648 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3649 peel_iters_prologue);
3650 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3651 peel_iters_epilogue);
3652 }
3653
3654 /* Calculate number of iterations required to make the vector version
3655 profitable, relative to the loop bodies only. The following condition
3656 must hold true:
3657 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3658 where
3659 SIC = scalar iteration cost, VIC = vector iteration cost,
3660 VOC = vector outside cost, VF = vectorization factor,
3661 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3662 SOC = scalar outside cost for run time cost model check. */
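     A sketch of the algebra behind the computation below: multiplying the
     condition above by VF and rearranging (assuming SIC * VF > VIC, the
     case handled here) gives

       niters > ((VOC - SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
		/ (SIC * VF - VIC)

     The code performs the truncating division and then increments the
     result once more if the inequality is still not satisfied.  */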
3663
3664 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3665 {
3666 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3667 * assumed_vf
3668 - vec_inside_cost * peel_iters_prologue
3669 - vec_inside_cost * peel_iters_epilogue);
3670 if (min_profitable_iters <= 0)
3671 min_profitable_iters = 0;
3672 else
3673 {
3674 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3675 - vec_inside_cost);
3676
3677 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3678 <= (((int) vec_inside_cost * min_profitable_iters)
3679 + (((int) vec_outside_cost - scalar_outside_cost)
3680 * assumed_vf)))
3681 min_profitable_iters++;
3682 }
3683 }
3684   /* The vector version will never be profitable.  */
3685 else
3686 {
3687 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3688 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3689 "vectorization did not happen for a simd loop");
3690
3691 if (dump_enabled_p ())
3692 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3693 "cost model: the vector iteration cost = %d "
3694 "divided by the scalar iteration cost = %d "
3695 			 "is greater than or equal to the vectorization factor = %d"
3696 ".\n",
3697 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3698 *ret_min_profitable_niters = -1;
3699 *ret_min_profitable_estimate = -1;
3700 return;
3701 }
3702
3703 dump_printf (MSG_NOTE,
3704 " Calculated minimum iters for profitability: %d\n",
3705 min_profitable_iters);
3706
3707 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3708 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3709 /* We want the vectorized loop to execute at least once. */
3710 min_profitable_iters = assumed_vf + peel_iters_prologue;
3711
3712 if (dump_enabled_p ())
3713 dump_printf_loc (MSG_NOTE, vect_location,
3714 " Runtime profitability threshold = %d\n",
3715 min_profitable_iters);
3716
3717 *ret_min_profitable_niters = min_profitable_iters;
3718
3719 /* Calculate number of iterations required to make the vector version
3720 profitable, relative to the loop bodies only.
3721
3722      The non-vectorized variant costs SIC * niters and it must win over the
3723      vector variant on the expected loop trip count.  The following must hold:
3724 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
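  /* As with the runtime threshold above, rearranging this condition
     (assuming SIC * VF > VIC) gives

       niters > ((VOC + SOC) * VF - VIC * (PL_ITERS + EP_ITERS))
		/ (SIC * VF - VIC)

     which is the quotient computed in the else branch below.  */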
3725
3726 if (vec_outside_cost <= 0)
3727 min_profitable_estimate = 0;
3728 else
3729 {
3730 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3731 * assumed_vf
3732 - vec_inside_cost * peel_iters_prologue
3733 - vec_inside_cost * peel_iters_epilogue)
3734 / ((scalar_single_iter_cost * assumed_vf)
3735 - vec_inside_cost);
3736 }
3737 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3738 if (dump_enabled_p ())
3739 dump_printf_loc (MSG_NOTE, vect_location,
3740 " Static estimate profitability threshold = %d\n",
3741 min_profitable_estimate);
3742
3743 *ret_min_profitable_estimate = min_profitable_estimate;
3744 }
3745
3746 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3747 vector elements (not bits) for a vector with NELT elements. */
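/* For instance, OFFSET == 2 with NELT == 8 pushes {2, 3, 4}, which the
   single stepped encoding extends to the selector {2, 3, 4, 5, 6, 7, 8, 9};
   indices 8 and 9 then select elements of the second input when the mask is
   used for a two-input permutation, as in have_whole_vector_shift below.  */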
3748 static void
3749 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3750 vec_perm_builder *sel)
3751 {
3752 /* The encoding is a single stepped pattern. Any wrap-around is handled
3753 by vec_perm_indices. */
3754 sel->new_vector (nelt, 1, 3);
3755 for (unsigned int i = 0; i < 3; i++)
3756 sel->quick_push (i + offset);
3757 }
3758
3759 /* Checks whether the target supports whole-vector shifts for vectors of mode
3760 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3761 it supports vec_perm_const with masks for all necessary shift amounts. */
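/* Note that only the power-of-two shift amounts NELT/2, NELT/4, ..., 1 are
   checked below; those are the offsets needed by the log2-step reduction
   epilogue.  */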
3762 static bool
3763 have_whole_vector_shift (machine_mode mode)
3764 {
3765 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3766 return true;
3767
3768 /* Variable-length vectors should be handled via the optab. */
3769 unsigned int nelt;
3770 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3771 return false;
3772
3773 vec_perm_builder sel;
3774 vec_perm_indices indices;
3775 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3776 {
3777 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3778 indices.new_vector (sel, 2, nelt);
3779 if (!can_vec_perm_const_p (mode, indices, false))
3780 return false;
3781 }
3782 return true;
3783 }
3784
3785 /* TODO: There is a close dependency between the vect_model_*_cost and
3786    vectorizable_* functions; design this better to avoid maintenance issues.  */
3787
3788 /* Function vect_model_reduction_cost.
3789
3790 Models cost for a reduction operation, including the vector ops
3791 generated within the strip-mine loop, the initial definition before
3792 the loop, and the epilogue code that must be generated. */
3793
3794 static void
3795 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3796 int ncopies, stmt_vector_for_cost *cost_vec)
3797 {
3798 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3799 enum tree_code code;
3800 optab optab;
3801 tree vectype;
3802 machine_mode mode;
3803 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3804 struct loop *loop = NULL;
3805
3806 if (loop_vinfo)
3807 loop = LOOP_VINFO_LOOP (loop_vinfo);
3808
3809 /* Condition reductions generate two reductions in the loop. */
3810 vect_reduction_type reduction_type
3811 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3812 if (reduction_type == COND_REDUCTION)
3813 ncopies *= 2;
3814
3815 vectype = STMT_VINFO_VECTYPE (stmt_info);
3816 mode = TYPE_MODE (vectype);
3817 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
3818
3819 if (!orig_stmt_info)
3820 orig_stmt_info = stmt_info;
3821
3822 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3823
3824 if (reduction_type == EXTRACT_LAST_REDUCTION
3825 || reduction_type == FOLD_LEFT_REDUCTION)
3826 {
3827 /* No extra instructions needed in the prologue. */
3828 prologue_cost = 0;
3829
3830 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3831 /* Count one reduction-like operation per vector. */
3832 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3833 stmt_info, 0, vect_body);
3834 else
3835 {
3836 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3837 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3838 inside_cost = record_stmt_cost (cost_vec, nelements,
3839 vec_to_scalar, stmt_info, 0,
3840 vect_body);
3841 inside_cost += record_stmt_cost (cost_vec, nelements,
3842 scalar_stmt, stmt_info, 0,
3843 vect_body);
3844 }
3845 }
3846 else
3847 {
3848 /* Add in cost for initial definition.
3849 For cond reduction we have four vectors: initial index, step,
3850 initial result of the data reduction, initial value of the index
3851 reduction. */
3852 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3853 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3854 scalar_to_vec, stmt_info, 0,
3855 vect_prologue);
3856
3857 /* Cost of reduction op inside loop. */
3858 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3859 stmt_info, 0, vect_body);
3860 }
3861
3862 /* Determine cost of epilogue code.
3863
3864 We have a reduction operator that will reduce the vector in one statement.
3865 Also requires scalar extract. */
3866
3867 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3868 {
3869 if (reduc_fn != IFN_LAST)
3870 {
3871 if (reduction_type == COND_REDUCTION)
3872 {
3873 	      /* An EQ stmt and a COND_EXPR stmt.  */
3874 epilogue_cost += record_stmt_cost (cost_vec, 2,
3875 vector_stmt, stmt_info, 0,
3876 vect_epilogue);
3877 /* Reduction of the max index and a reduction of the found
3878 values. */
3879 epilogue_cost += record_stmt_cost (cost_vec, 2,
3880 vec_to_scalar, stmt_info, 0,
3881 vect_epilogue);
3882 /* A broadcast of the max value. */
3883 epilogue_cost += record_stmt_cost (cost_vec, 1,
3884 scalar_to_vec, stmt_info, 0,
3885 vect_epilogue);
3886 }
3887 else
3888 {
3889 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3890 stmt_info, 0, vect_epilogue);
3891 epilogue_cost += record_stmt_cost (cost_vec, 1,
3892 vec_to_scalar, stmt_info, 0,
3893 vect_epilogue);
3894 }
3895 }
3896 else if (reduction_type == COND_REDUCTION)
3897 {
3898 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3899 /* Extraction of scalar elements. */
3900 epilogue_cost += record_stmt_cost (cost_vec,
3901 2 * estimated_nunits,
3902 vec_to_scalar, stmt_info, 0,
3903 vect_epilogue);
3904 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3905 epilogue_cost += record_stmt_cost (cost_vec,
3906 2 * estimated_nunits - 3,
3907 scalar_stmt, stmt_info, 0,
3908 vect_epilogue);
3909 }
3910 else if (reduction_type == EXTRACT_LAST_REDUCTION
3911 || reduction_type == FOLD_LEFT_REDUCTION)
3912 	/* No extra instructions are needed in the epilogue.  */
3913 ;
3914 else
3915 {
3916 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3917 tree bitsize =
3918 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3919 int element_bitsize = tree_to_uhwi (bitsize);
3920 int nelements = vec_size_in_bits / element_bitsize;
3921
3922 if (code == COND_EXPR)
3923 code = MAX_EXPR;
3924
3925 optab = optab_for_tree_code (code, vectype, optab_default);
3926
3927 /* We have a whole vector shift available. */
3928 if (optab != unknown_optab
3929 && VECTOR_MODE_P (mode)
3930 && optab_handler (optab, mode) != CODE_FOR_nothing
3931 && have_whole_vector_shift (mode))
3932 {
3933 /* Final reduction via vector shifts and the reduction operator.
3934 Also requires scalar extract. */
3935 epilogue_cost += record_stmt_cost (cost_vec,
3936 exact_log2 (nelements) * 2,
3937 vector_stmt, stmt_info, 0,
3938 vect_epilogue);
3939 epilogue_cost += record_stmt_cost (cost_vec, 1,
3940 vec_to_scalar, stmt_info, 0,
3941 vect_epilogue);
3942 }
3943 else
3944 /* Use extracts and reduction op for final reduction. For N
3945 elements, we have N extracts and N-1 reduction ops. */
3946 epilogue_cost += record_stmt_cost (cost_vec,
3947 nelements + nelements - 1,
3948 vector_stmt, stmt_info, 0,
3949 vect_epilogue);
3950 }
3951 }
3952
3953 if (dump_enabled_p ())
3954 dump_printf (MSG_NOTE,
3955 "vect_model_reduction_cost: inside_cost = %d, "
3956 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3957 prologue_cost, epilogue_cost);
3958 }
3959
3960
3961 /* Function vect_model_induction_cost.
3962
3963 Models cost for induction operations. */
3964
3965 static void
3966 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3967 stmt_vector_for_cost *cost_vec)
3968 {
3969 unsigned inside_cost, prologue_cost;
3970
3971 if (PURE_SLP_STMT (stmt_info))
3972 return;
3973
3974 /* loop cost for vec_loop. */
3975 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3976 stmt_info, 0, vect_body);
3977
3978 /* prologue cost for vec_init and vec_step. */
3979 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3980 stmt_info, 0, vect_prologue);
3981
3982 if (dump_enabled_p ())
3983 dump_printf_loc (MSG_NOTE, vect_location,
3984 "vect_model_induction_cost: inside_cost = %d, "
3985 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3986 }
3987
3988
3989
3990 /* Function get_initial_def_for_reduction
3991
3992 Input:
3993 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3994 INIT_VAL - the initial value of the reduction variable
3995
3996 Output:
3997 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3998 of the reduction (used for adjusting the epilog - see below).
3999 Return a vector variable, initialized according to the operation that
4000 STMT_VINFO performs. This vector will be used as the initial value
4001 of the vector of partial results.
4002
4003 Option1 (adjust in epilog): Initialize the vector as follows:
4004 add/bit or/xor: [0,0,...,0,0]
4005 mult/bit and: [1,1,...,1,1]
4006 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4007 and when necessary (e.g. add/mult case) let the caller know
4008 that it needs to adjust the result by init_val.
4009
4010 Option2: Initialize the vector as follows:
4011 add/bit or/xor: [init_val,0,0,...,0]
4012 mult/bit and: [init_val,1,1,...,1]
4013 min/max/cond_expr: [init_val,init_val,...,init_val]
4014 and no adjustments are needed.
4015
4016 For example, for the following code:
4017
4018 s = init_val;
4019 for (i=0;i<n;i++)
4020 s = s + a[i];
4021
4022 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4023 For a vector of 4 units, we want to return either [0,0,0,init_val],
4024 or [0,0,0,0] and let the caller know that it needs to adjust
4025 the result at the end by 'init_val'.
4026
4027    FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4028    is not NULL, because this way the initialization vector is simpler (same
4029    element in all entries), and Option2 otherwise.
4030
4031 A cost model should help decide between these two schemes. */
4032
4033 tree
4034 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4035 tree *adjustment_def)
4036 {
4037 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4038 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4039 tree scalar_type = TREE_TYPE (init_val);
4040 tree vectype = get_vectype_for_scalar_type (scalar_type);
4041 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4042 tree def_for_init;
4043 tree init_def;
4044 REAL_VALUE_TYPE real_init_val = dconst0;
4045 int int_init_val = 0;
4046 gimple_seq stmts = NULL;
4047
4048 gcc_assert (vectype);
4049
4050 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4051 || SCALAR_FLOAT_TYPE_P (scalar_type));
4052
4053 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4054 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4055
4056 vect_reduction_type reduction_type
4057 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4058
4059 switch (code)
4060 {
4061 case WIDEN_SUM_EXPR:
4062 case DOT_PROD_EXPR:
4063 case SAD_EXPR:
4064 case PLUS_EXPR:
4065 case MINUS_EXPR:
4066 case BIT_IOR_EXPR:
4067 case BIT_XOR_EXPR:
4068 case MULT_EXPR:
4069 case BIT_AND_EXPR:
4070 {
4071 /* ADJUSTMENT_DEF is NULL when called from
4072 vect_create_epilog_for_reduction to vectorize double reduction. */
4073 if (adjustment_def)
4074 *adjustment_def = init_val;
4075
4076 if (code == MULT_EXPR)
4077 {
4078 real_init_val = dconst1;
4079 int_init_val = 1;
4080 }
4081
4082 if (code == BIT_AND_EXPR)
4083 int_init_val = -1;
4084
4085 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4086 def_for_init = build_real (scalar_type, real_init_val);
4087 else
4088 def_for_init = build_int_cst (scalar_type, int_init_val);
4089
4090 if (adjustment_def)
4091 /* Option1: the first element is '0' or '1' as well. */
4092 init_def = gimple_build_vector_from_val (&stmts, vectype,
4093 def_for_init);
4094 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4095 {
4096 /* Option2 (variable length): the first element is INIT_VAL. */
4097 init_def = gimple_build_vector_from_val (&stmts, vectype,
4098 def_for_init);
4099 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4100 vectype, init_def, init_val);
4101 }
4102 else
4103 {
4104 /* Option2: the first element is INIT_VAL. */
4105 tree_vector_builder elts (vectype, 1, 2);
4106 elts.quick_push (init_val);
4107 elts.quick_push (def_for_init);
4108 init_def = gimple_build_vector (&stmts, &elts);
4109 }
4110 }
4111 break;
4112
4113 case MIN_EXPR:
4114 case MAX_EXPR:
4115 case COND_EXPR:
4116 {
4117 if (adjustment_def)
4118 {
4119 *adjustment_def = NULL_TREE;
4120 if (reduction_type != COND_REDUCTION
4121 && reduction_type != EXTRACT_LAST_REDUCTION)
4122 {
4123 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4124 break;
4125 }
4126 }
4127 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4128 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4129 }
4130 break;
4131
4132 default:
4133 gcc_unreachable ();
4134 }
4135
4136 if (stmts)
4137 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4138 return init_def;
4139 }
4140
4141 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4142 NUMBER_OF_VECTORS is the number of vector defs to create.
4143 If NEUTRAL_OP is nonnull, introducing extra elements of that
4144 value will not change the result. */
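/* For instance, 0 is such a neutral value for a PLUS reduction and 1 for a
   MULT reduction.  MIN and MAX have no universal neutral value; in that case
   NEUTRAL_OP is null and the initial value from each PHI is used instead
   (see the else branch below).  */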
4145
4146 static void
4147 get_initial_defs_for_reduction (slp_tree slp_node,
4148 vec<tree> *vec_oprnds,
4149 unsigned int number_of_vectors,
4150 bool reduc_chain, tree neutral_op)
4151 {
4152 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4153 stmt_vec_info stmt_vinfo = stmts[0];
4154 unsigned HOST_WIDE_INT nunits;
4155 unsigned j, number_of_places_left_in_vector;
4156 tree vector_type;
4157 tree vop;
4158 int group_size = stmts.length ();
4159 unsigned int vec_num, i;
4160 unsigned number_of_copies = 1;
4161 vec<tree> voprnds;
4162 voprnds.create (number_of_vectors);
4163 struct loop *loop;
4164 auto_vec<tree, 16> permute_results;
4165
4166 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4167
4168 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4169
4170 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4171 gcc_assert (loop);
4172 edge pe = loop_preheader_edge (loop);
4173
4174 gcc_assert (!reduc_chain || neutral_op);
4175
4176 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4177 created vectors. It is greater than 1 if unrolling is performed.
4178
4179 For example, we have two scalar operands, s1 and s2 (e.g., group of
4180 strided accesses of size two), while NUNITS is four (i.e., four scalars
4181 of this type can be packed in a vector). The output vector will contain
4182 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4183 will be 2).
4184
4185 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4186 vectors containing the operands.
4187
4188 For example, NUNITS is four as before, and the group size is 8
4189 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4190 {s5, s6, s7, s8}. */
4191
4192 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4193 nunits = group_size;
4194
4195 number_of_copies = nunits * number_of_vectors / group_size;
4196
4197 number_of_places_left_in_vector = nunits;
4198 bool constant_p = true;
4199 tree_vector_builder elts (vector_type, nunits, 1);
4200 elts.quick_grow (nunits);
4201 for (j = 0; j < number_of_copies; j++)
4202 {
4203 for (i = group_size - 1; stmts.iterate (i, &stmt_vinfo); i--)
4204 {
4205 tree op;
4206 	  /* Get the def before the loop.  In a reduction chain we have only
4207 one initial value. */
4208 if ((j != (number_of_copies - 1)
4209 || (reduc_chain && i != 0))
4210 && neutral_op)
4211 op = neutral_op;
4212 else
4213 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4214
4215 /* Create 'vect_ = {op0,op1,...,opn}'. */
4216 number_of_places_left_in_vector--;
4217 elts[number_of_places_left_in_vector] = op;
4218 if (!CONSTANT_CLASS_P (op))
4219 constant_p = false;
4220
4221 if (number_of_places_left_in_vector == 0)
4222 {
4223 gimple_seq ctor_seq = NULL;
4224 tree init;
4225 if (constant_p && !neutral_op
4226 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4227 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4228 /* Build the vector directly from ELTS. */
4229 init = gimple_build_vector (&ctor_seq, &elts);
4230 else if (neutral_op)
4231 {
4232 /* Build a vector of the neutral value and shift the
4233 other elements into place. */
4234 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4235 neutral_op);
4236 int k = nunits;
4237 while (k > 0 && elts[k - 1] == neutral_op)
4238 k -= 1;
4239 while (k > 0)
4240 {
4241 k -= 1;
4242 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4243 vector_type, init, elts[k]);
4244 }
4245 }
4246 else
4247 {
4248 /* First time round, duplicate ELTS to fill the
4249 required number of vectors, then cherry pick the
4250 appropriate result for each iteration. */
4251 if (vec_oprnds->is_empty ())
4252 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4253 number_of_vectors,
4254 permute_results);
4255 init = permute_results[number_of_vectors - j - 1];
4256 }
4257 if (ctor_seq != NULL)
4258 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4259 voprnds.quick_push (init);
4260
4261 number_of_places_left_in_vector = nunits;
4262 elts.new_vector (vector_type, nunits, 1);
4263 elts.quick_grow (nunits);
4264 constant_p = true;
4265 }
4266 }
4267 }
4268
4269 /* Since the vectors are created in the reverse order, we should invert
4270 them. */
4271 vec_num = voprnds.length ();
4272 for (j = vec_num; j != 0; j--)
4273 {
4274 vop = voprnds[j - 1];
4275 vec_oprnds->quick_push (vop);
4276 }
4277
4278 voprnds.release ();
4279
4280 /* In case that VF is greater than the unrolling factor needed for the SLP
4281 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4282 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4283 to replicate the vectors. */
4284 tree neutral_vec = NULL;
4285 while (number_of_vectors > vec_oprnds->length ())
4286 {
4287 if (neutral_op)
4288 {
4289 if (!neutral_vec)
4290 {
4291 gimple_seq ctor_seq = NULL;
4292 neutral_vec = gimple_build_vector_from_val
4293 (&ctor_seq, vector_type, neutral_op);
4294 if (ctor_seq != NULL)
4295 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4296 }
4297 vec_oprnds->quick_push (neutral_vec);
4298 }
4299 else
4300 {
4301 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4302 vec_oprnds->quick_push (vop);
4303 }
4304 }
4305 }
4306
4307
4308 /* Function vect_create_epilog_for_reduction
4309
4310 Create code at the loop-epilog to finalize the result of a reduction
4311 computation.
4312
4313 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4314 reduction statements.
4315 STMT_INFO is the scalar reduction stmt that is being vectorized.
4316 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4317 number of elements that we can fit in a vectype (nunits). In this case
4318 we have to generate more than one vector stmt - i.e - we need to "unroll"
4319 the vector stmt by a factor VF/nunits. For more details see documentation
4320 in vectorizable_operation.
4321 REDUC_FN is the internal function for the epilog reduction.
4322 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4323 computation.
4324 REDUC_INDEX is the index of the operand in the right hand side of the
4325 statement that is defined by REDUCTION_PHI.
4326 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4327 SLP_NODE is an SLP node containing a group of reduction statements. The
4328 first one in this group is STMT_INFO.
4329 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4330 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4331 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4332 any value of the IV in the loop.
4333 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4334 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4335 null if this is not an SLP reduction
4336
4337 This function:
4338 1. Creates the reduction def-use cycles: sets the arguments for
4339 REDUCTION_PHIS:
4340 The loop-entry argument is the vectorized initial-value of the reduction.
4341 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4342 sums.
4343 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4344 by calling the function specified by REDUC_FN if available, or by
4345 other means (whole-vector shifts or a scalar loop).
4346 The function also creates a new phi node at the loop exit to preserve
4347 loop-closed form, as illustrated below.
4348
4349 The flow at the entry to this function:
4350
4351 loop:
4352 vec_def = phi <null, null> # REDUCTION_PHI
4353 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4354 s_loop = scalar_stmt # (scalar) STMT_INFO
4355 loop_exit:
4356 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4357 use <s_out0>
4358 use <s_out0>
4359
4360 The above is transformed by this function into:
4361
4362 loop:
4363 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4364 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4365 s_loop = scalar_stmt # (scalar) STMT_INFO
4366 loop_exit:
4367 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4368 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4369 v_out2 = reduce <v_out1>
4370 s_out3 = extract_field <v_out2, 0>
4371 s_out4 = adjust_result <s_out3>
4372 use <s_out4>
4373 use <s_out4>
4374 */
4375
4376 static void
4377 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4378 stmt_vec_info stmt_info,
4379 gimple *reduc_def_stmt,
4380 int ncopies, internal_fn reduc_fn,
4381 vec<stmt_vec_info> reduction_phis,
4382 bool double_reduc,
4383 slp_tree slp_node,
4384 slp_instance slp_node_instance,
4385 tree induc_val, enum tree_code induc_code,
4386 tree neutral_op)
4387 {
4388 stmt_vec_info prev_phi_info;
4389 tree vectype;
4390 machine_mode mode;
4391 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4392 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4393 basic_block exit_bb;
4394 tree scalar_dest;
4395 tree scalar_type;
4396 gimple *new_phi = NULL, *phi;
4397 stmt_vec_info phi_info;
4398 gimple_stmt_iterator exit_gsi;
4399 tree vec_dest;
4400 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4401 gimple *epilog_stmt = NULL;
4402 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4403 gimple *exit_phi;
4404 tree bitsize;
4405 tree adjustment_def = NULL;
4406 tree vec_initial_def = NULL;
4407 tree expr, def, initial_def = NULL;
4408 tree orig_name, scalar_result;
4409 imm_use_iterator imm_iter, phi_imm_iter;
4410 use_operand_p use_p, phi_use_p;
4411 gimple *use_stmt;
4412 stmt_vec_info reduction_phi_info = NULL;
4413 bool nested_in_vect_loop = false;
4414 auto_vec<gimple *> new_phis;
4415 auto_vec<stmt_vec_info> inner_phis;
4416 int j, i;
4417 auto_vec<tree> scalar_results;
4418 unsigned int group_size = 1, k, ratio;
4419 auto_vec<tree> vec_initial_defs;
4420 auto_vec<gimple *> phis;
4421 bool slp_reduc = false;
4422 bool direct_slp_reduc;
4423 tree new_phi_result;
4424 stmt_vec_info inner_phi = NULL;
4425 tree induction_index = NULL_TREE;
4426
4427 if (slp_node)
4428 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4429
4430 if (nested_in_vect_loop_p (loop, stmt_info))
4431 {
4432 outer_loop = loop;
4433 loop = loop->inner;
4434 nested_in_vect_loop = true;
4435 gcc_assert (!slp_node);
4436 }
4437
4438 vectype = STMT_VINFO_VECTYPE (stmt_info);
4439 gcc_assert (vectype);
4440 mode = TYPE_MODE (vectype);
4441
4442 /* 1. Create the reduction def-use cycle:
4443 Set the arguments of REDUCTION_PHIS, i.e., transform
4444
4445 loop:
4446 vec_def = phi <null, null> # REDUCTION_PHI
4447 VECT_DEF = vector_stmt # vectorized form of STMT
4448 ...
4449
4450 into:
4451
4452 loop:
4453 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4454 VECT_DEF = vector_stmt # vectorized form of STMT
4455 ...
4456
4457 (in case of SLP, do it for all the phis). */
4458
4459 /* Get the loop-entry arguments. */
4460 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4461 if (slp_node)
4462 {
4463 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4464 vec_initial_defs.reserve (vec_num);
4465 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4466 &vec_initial_defs, vec_num,
4467 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4468 neutral_op);
4469 }
4470 else
4471 {
4472 /* Get at the scalar def before the loop, that defines the initial value
4473 of the reduction variable. */
4474 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4475 loop_preheader_edge (loop));
4476 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4477 and we can't use zero for induc_val, use initial_def. Similarly
4478 for REDUC_MIN and initial_def larger than the base. */
4479 if (TREE_CODE (initial_def) == INTEGER_CST
4480 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4481 == INTEGER_INDUC_COND_REDUCTION)
4482 && !integer_zerop (induc_val)
4483 && ((induc_code == MAX_EXPR
4484 && tree_int_cst_lt (initial_def, induc_val))
4485 || (induc_code == MIN_EXPR
4486 && tree_int_cst_lt (induc_val, initial_def))))
4487 induc_val = initial_def;
4488
4489 if (double_reduc)
4490 /* In case of double reduction we only create a vector variable
4491 to be put in the reduction phi node. The actual statement
4492 creation is done later in this function. */
4493 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4494 else if (nested_in_vect_loop)
4495 {
4496 /* Do not use an adjustment def as that case is not supported
4497 correctly if ncopies is not one. */
4498 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4499 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4500 stmt_info);
4501 }
4502 else
4503 vec_initial_def
4504 = get_initial_def_for_reduction (stmt_info, initial_def,
4505 &adjustment_def);
4506 vec_initial_defs.create (1);
4507 vec_initial_defs.quick_push (vec_initial_def);
4508 }
4509
4510 /* Set phi nodes arguments. */
4511 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4512 {
4513 tree vec_init_def = vec_initial_defs[i];
4514 tree def = vect_defs[i];
4515 for (j = 0; j < ncopies; j++)
4516 {
4517 if (j != 0)
4518 {
4519 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4520 if (nested_in_vect_loop)
4521 vec_init_def
4522 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4523 }
4524
4525 /* Set the loop-entry arg of the reduction-phi. */
4526
4527 gphi *phi = as_a <gphi *> (phi_info->stmt);
4528 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4529 == INTEGER_INDUC_COND_REDUCTION)
4530 {
4531 	      /* Initialise the reduction phi to zero.  This prevents non-zero
4532 		 initial values from interfering with the reduction op.  */
4533 gcc_assert (ncopies == 1);
4534 gcc_assert (i == 0);
4535
4536 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4537 tree induc_val_vec
4538 = build_vector_from_val (vec_init_def_type, induc_val);
4539
4540 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4541 UNKNOWN_LOCATION);
4542 }
4543 else
4544 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4545 UNKNOWN_LOCATION);
4546
4547 /* Set the loop-latch arg for the reduction-phi. */
4548 if (j > 0)
4549 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4550
4551 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4552
4553 if (dump_enabled_p ())
4554 {
4555 dump_printf_loc (MSG_NOTE, vect_location,
4556 "transform reduction: created def-use cycle: ");
4557 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4558 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4559 }
4560 }
4561 }
4562
4563 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4564 which is updated with the current index of the loop for every match of
4565 the original loop's cond_expr (VEC_STMT). This results in a vector
4566 containing the last time the condition passed for that vector lane.
4567 The first match will be a 1 to allow 0 to be used for non-matching
4568 indexes. If there are no matches at all then the vector will be all
4569 zeroes. */
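  /* As an illustrative sketch with a 4-lane vector: the index IV below
     produces {1,2,3,4} in the first vector iteration and {5,6,7,8} in the
     second.  If lane 0 matches in both iterations, lane 1 only in the first,
     lane 3 only in the second and lane 2 never, the phi holds {5, 2, 0, 8}
     after two iterations.  */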
4570 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4571 {
4572 tree indx_before_incr, indx_after_incr;
4573 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4574
4575 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4576 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4577
4578 int scalar_precision
4579 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4580 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4581 tree cr_index_vector_type = build_vector_type
4582 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4583
4584 /* First we create a simple vector induction variable which starts
4585 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4586 vector size (STEP). */
4587
4588 /* Create a {1,2,3,...} vector. */
4589 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4590
4591 /* Create a vector of the step value. */
4592 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4593 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4594
4595 /* Create an induction variable. */
4596 gimple_stmt_iterator incr_gsi;
4597 bool insert_after;
4598 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4599 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4600 insert_after, &indx_before_incr, &indx_after_incr);
4601
4602 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4603 filled with zeros (VEC_ZERO). */
4604
4605 /* Create a vector of 0s. */
4606 tree zero = build_zero_cst (cr_index_scalar_type);
4607 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4608
4609 /* Create a vector phi node. */
4610 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4611 new_phi = create_phi_node (new_phi_tree, loop->header);
4612 loop_vinfo->add_stmt (new_phi);
4613 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4614 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4615
4616       /* Now take the condition from the loop's original cond_expr
4617 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4618 every match uses values from the induction variable
4619 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4620 (NEW_PHI_TREE).
4621 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4622 the new cond_expr (INDEX_COND_EXPR). */
4623
4624 /* Duplicate the condition from vec_stmt. */
4625 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4626
4627       /* Create a conditional, where the condition is taken from vec_stmt
4628 	 (CCOMPARE), the then value is the induction index (INDEX_BEFORE_INCR)
4629 	 and the else value is the phi (NEW_PHI_TREE).  */
4630 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4631 ccompare, indx_before_incr,
4632 new_phi_tree);
4633 induction_index = make_ssa_name (cr_index_vector_type);
4634 gimple *index_condition = gimple_build_assign (induction_index,
4635 index_cond_expr);
4636 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4637 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4638 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4639
4640 /* Update the phi with the vec cond. */
4641 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4642 loop_latch_edge (loop), UNKNOWN_LOCATION);
4643 }
4644
4645 /* 2. Create epilog code.
4646 The reduction epilog code operates across the elements of the vector
4647 of partial results computed by the vectorized loop.
4648 The reduction epilog code consists of:
4649
4650 step 1: compute the scalar result in a vector (v_out2)
4651 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4652 step 3: adjust the scalar result (s_out3) if needed.
4653
4654      Step 1 can be accomplished using one of the following three schemes:
4655 (scheme 1) using reduc_fn, if available.
4656 (scheme 2) using whole-vector shifts, if available.
4657 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4658 combined.
4659
4660 The overall epilog code looks like this:
4661
4662 s_out0 = phi <s_loop> # original EXIT_PHI
4663 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4664 v_out2 = reduce <v_out1> # step 1
4665 s_out3 = extract_field <v_out2, 0> # step 2
4666 s_out4 = adjust_result <s_out3> # step 3
4667
4668 (step 3 is optional, and steps 1 and 2 may be combined).
4669 Lastly, the uses of s_out0 are replaced by s_out4. */
4670
4671
4672 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4673 v_out1 = phi <VECT_DEF>
4674 Store them in NEW_PHIS. */
4675
4676 exit_bb = single_exit (loop)->dest;
4677 prev_phi_info = NULL;
4678 new_phis.create (vect_defs.length ());
4679 FOR_EACH_VEC_ELT (vect_defs, i, def)
4680 {
4681 for (j = 0; j < ncopies; j++)
4682 {
4683 tree new_def = copy_ssa_name (def);
4684 phi = create_phi_node (new_def, exit_bb);
4685 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4686 if (j == 0)
4687 new_phis.quick_push (phi);
4688 else
4689 {
4690 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4691 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4692 }
4693
4694 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4695 prev_phi_info = phi_info;
4696 }
4697 }
4698
4699 /* The epilogue is created for the outer-loop, i.e., for the loop being
4700 vectorized. Create exit phis for the outer loop. */
4701 if (double_reduc)
4702 {
4703 loop = outer_loop;
4704 exit_bb = single_exit (loop)->dest;
4705 inner_phis.create (vect_defs.length ());
4706 FOR_EACH_VEC_ELT (new_phis, i, phi)
4707 {
4708 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4709 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4710 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4711 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4712 PHI_RESULT (phi));
4713 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4714 inner_phis.quick_push (phi_info);
4715 new_phis[i] = outer_phi;
4716 while (STMT_VINFO_RELATED_STMT (phi_info))
4717 {
4718 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4719 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4720 outer_phi = create_phi_node (new_result, exit_bb);
4721 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4722 PHI_RESULT (phi_info->stmt));
4723 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4724 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4725 prev_phi_info = outer_phi_info;
4726 }
4727 }
4728 }
4729
4730 exit_gsi = gsi_after_labels (exit_bb);
4731
4732 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4733 (i.e. when reduc_fn is not available) and in the final adjustment
4734 code (if needed). Also get the original scalar reduction variable as
4735      defined in the loop.  In case STMT is a "pattern-stmt" (i.e. it
4736 represents a reduction pattern), the tree-code and scalar-def are
4737 taken from the original stmt that the pattern-stmt (STMT) replaces.
4738 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4739 are taken from STMT. */
4740
4741 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
4742 if (!orig_stmt_info)
4743 {
4744 /* Regular reduction */
4745 orig_stmt_info = stmt_info;
4746 }
4747 else
4748 {
4749 /* Reduction pattern */
4750 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4751 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4752 }
4753
4754 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4755 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4756 partial results are added and not subtracted. */
4757 if (code == MINUS_EXPR)
4758 code = PLUS_EXPR;
4759
4760 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4761 scalar_type = TREE_TYPE (scalar_dest);
4762 scalar_results.create (group_size);
4763 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4764 bitsize = TYPE_SIZE (scalar_type);
4765
4766 /* In case this is a reduction in an inner-loop while vectorizing an outer
4767 loop - we don't need to extract a single scalar result at the end of the
4768 inner-loop (unless it is double reduction, i.e., the use of reduction is
4769 outside the outer-loop). The final vector of partial results will be used
4770 in the vectorized outer-loop, or reduced to a scalar result at the end of
4771 the outer-loop. */
4772 if (nested_in_vect_loop && !double_reduc)
4773 goto vect_finalize_reduction;
4774
4775 /* SLP reduction without reduction chain, e.g.,
4776 # a1 = phi <a2, a0>
4777 # b1 = phi <b2, b0>
4778 a2 = operation (a1)
4779 b2 = operation (b1) */
4780 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4781
4782 /* True if we should implement SLP_REDUC using native reduction operations
4783 instead of scalar operations. */
4784 direct_slp_reduc = (reduc_fn != IFN_LAST
4785 && slp_reduc
4786 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4787
4788 /* In case of reduction chain, e.g.,
4789 # a1 = phi <a3, a0>
4790 a2 = operation (a1)
4791 a3 = operation (a2),
4792
4793 we may end up with more than one vector result. Here we reduce them to
4794 one vector. */
4795 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4796 {
4797 tree first_vect = PHI_RESULT (new_phis[0]);
4798 gassign *new_vec_stmt = NULL;
4799 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4800 for (k = 1; k < new_phis.length (); k++)
4801 {
4802 gimple *next_phi = new_phis[k];
4803 tree second_vect = PHI_RESULT (next_phi);
4804 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4805 new_vec_stmt = gimple_build_assign (tem, code,
4806 first_vect, second_vect);
4807 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4808 first_vect = tem;
4809 }
4810
4811 new_phi_result = first_vect;
4812 if (new_vec_stmt)
4813 {
4814 new_phis.truncate (0);
4815 new_phis.safe_push (new_vec_stmt);
4816 }
4817 }
4818   /* Likewise if we couldn't use a single def-use cycle.  */
4819 else if (ncopies > 1)
4820 {
4821 gcc_assert (new_phis.length () == 1);
4822 tree first_vect = PHI_RESULT (new_phis[0]);
4823 gassign *new_vec_stmt = NULL;
4824 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4825 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4826 for (int k = 1; k < ncopies; ++k)
4827 {
4828 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4829 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4830 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4831 new_vec_stmt = gimple_build_assign (tem, code,
4832 first_vect, second_vect);
4833 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4834 first_vect = tem;
4835 }
4836 new_phi_result = first_vect;
4837 new_phis.truncate (0);
4838 new_phis.safe_push (new_vec_stmt);
4839 }
4840 else
4841 new_phi_result = PHI_RESULT (new_phis[0]);
4842
4843 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4844 && reduc_fn != IFN_LAST)
4845 {
4846 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4847 various data values where the condition matched and another vector
4848 (INDUCTION_INDEX) containing all the indexes of those matches. We
4849 need to extract the last matching index (which will be the index with
4850 highest value) and use this to index into the data vector.
4851 For the case where there were no matches, the data vector will contain
4852 all default values and the index vector will be all zeros. */
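      /* Continuing the illustrative sketch above: with data vector
	 {d0, d1, d2, d3} and index vector {5, 2, 0, 8}, the REDUC_MAX over
	 the indexes yields 8, so the code below selects d3, the data value
	 recorded by the lane whose condition held last.  */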
4853
4854 /* Get various versions of the type of the vector of indexes. */
4855 tree index_vec_type = TREE_TYPE (induction_index);
4856 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4857 tree index_scalar_type = TREE_TYPE (index_vec_type);
4858 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4859 (index_vec_type);
4860
4861 /* Get an unsigned integer version of the type of the data vector. */
4862 int scalar_precision
4863 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4864 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4865 tree vectype_unsigned = build_vector_type
4866 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4867
4868 /* First we need to create a vector (ZERO_VEC) of zeros and another
4869 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4870 can create using a MAX reduction and then expanding.
4871 In the case where the loop never made any matches, the max index will
4872 be zero. */
4873
4874 /* Vector of {0, 0, 0,...}. */
4875 tree zero_vec = make_ssa_name (vectype);
4876 tree zero_vec_rhs = build_zero_cst (vectype);
4877 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4878 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4879
4880 /* Find maximum value from the vector of found indexes. */
4881 tree max_index = make_ssa_name (index_scalar_type);
4882 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4883 1, induction_index);
4884 gimple_call_set_lhs (max_index_stmt, max_index);
4885 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4886
4887 /* Vector of {max_index, max_index, max_index,...}. */
4888 tree max_index_vec = make_ssa_name (index_vec_type);
4889 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4890 max_index);
4891 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4892 max_index_vec_rhs);
4893 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4894
4895 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4896 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4897 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4898 otherwise. Only one value should match, resulting in a vector
4899 (VEC_COND) with one data value and the rest zeros.
4900 In the case where the loop never made any matches, every index will
4901 match, resulting in a vector with all data values (which will all be
4902 the default value). */
4903
4904 /* Compare the max index vector to the vector of found indexes to find
4905 the position of the max value. */
4906 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4907 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4908 induction_index,
4909 max_index_vec);
4910 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4911
4912 /* Use the compare to choose either values from the data vector or
4913 zero. */
4914 tree vec_cond = make_ssa_name (vectype);
4915 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4916 vec_compare, new_phi_result,
4917 zero_vec);
4918 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4919
4920 /* Finally we need to extract the data value from the vector (VEC_COND)
4921 	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4922 reduction, but because this doesn't exist, we can use a MAX reduction
4923 instead. The data value might be signed or a float so we need to cast
4924 it first.
4925 In the case where the loop never made any matches, the data values are
4926 all identical, and so will reduce down correctly. */
4927
4928 /* Make the matched data values unsigned. */
4929 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4930 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4931 vec_cond);
4932 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4933 VIEW_CONVERT_EXPR,
4934 vec_cond_cast_rhs);
4935 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4936
4937 /* Reduce down to a scalar value. */
4938 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4939 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4940 1, vec_cond_cast);
4941 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4942 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4943
4944 /* Convert the reduced value back to the result type and set as the
4945 result. */
4946 gimple_seq stmts = NULL;
4947 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4948 data_reduc);
4949 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4950 scalar_results.safe_push (new_temp);
4951 }
4952 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4953 && reduc_fn == IFN_LAST)
4954 {
4955 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4956 idx = 0;
4957 idx_val = induction_index[0];
4958 val = data_reduc[0];
4959 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4960 if (induction_index[i] > idx_val)
4961 val = data_reduc[i], idx_val = induction_index[i];
4962 return val; */
4963
4964 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4965 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4966 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4967 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4968 /* Enforced by vectorizable_reduction, which ensures we have target
4969 support before allowing a conditional reduction on variable-length
4970 vectors. */
4971 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4972 tree idx_val = NULL_TREE, val = NULL_TREE;
4973 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4974 {
4975 tree old_idx_val = idx_val;
4976 tree old_val = val;
4977 idx_val = make_ssa_name (idx_eltype);
4978 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4979 build3 (BIT_FIELD_REF, idx_eltype,
4980 induction_index,
4981 bitsize_int (el_size),
4982 bitsize_int (off)));
4983 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4984 val = make_ssa_name (data_eltype);
4985 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4986 build3 (BIT_FIELD_REF,
4987 data_eltype,
4988 new_phi_result,
4989 bitsize_int (el_size),
4990 bitsize_int (off)));
4991 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4992 if (off != 0)
4993 {
4994 tree new_idx_val = idx_val;
4995 tree new_val = val;
4996 if (off != v_size - el_size)
4997 {
4998 new_idx_val = make_ssa_name (idx_eltype);
4999 epilog_stmt = gimple_build_assign (new_idx_val,
5000 MAX_EXPR, idx_val,
5001 old_idx_val);
5002 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5003 }
5004 new_val = make_ssa_name (data_eltype);
5005 epilog_stmt = gimple_build_assign (new_val,
5006 COND_EXPR,
5007 build2 (GT_EXPR,
5008 boolean_type_node,
5009 idx_val,
5010 old_idx_val),
5011 val, old_val);
5012 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5013 idx_val = new_idx_val;
5014 val = new_val;
5015 }
5016 }
5017 /* Convert the reduced value back to the result type and set as the
5018 result. */
5019 gimple_seq stmts = NULL;
5020 val = gimple_convert (&stmts, scalar_type, val);
5021 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5022 scalar_results.safe_push (val);
5023 }
5024
5025 /* 2.3 Create the reduction code, using one of the three schemes described
5026 above. In SLP we simply need to extract all the elements from the
5027 vector (without reducing them), so we use scalar shifts. */
5028 else if (reduc_fn != IFN_LAST && !slp_reduc)
5029 {
5030 tree tmp;
5031 tree vec_elem_type;
5032
5033 /* Case 1: Create:
5034 v_out2 = reduc_expr <v_out1> */
5035
5036 if (dump_enabled_p ())
5037 dump_printf_loc (MSG_NOTE, vect_location,
5038 "Reduce using direct vector reduction.\n");
5039
5040 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5041 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5042 {
5043 tree tmp_dest
5044 = vect_create_destination_var (scalar_dest, vec_elem_type);
5045 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5046 new_phi_result);
5047 gimple_set_lhs (epilog_stmt, tmp_dest);
5048 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5049 gimple_set_lhs (epilog_stmt, new_temp);
5050 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5051
5052 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5053 new_temp);
5054 }
5055 else
5056 {
5057 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5058 new_phi_result);
5059 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5060 }
5061
5062 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5063 gimple_set_lhs (epilog_stmt, new_temp);
5064 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5065
5066 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5067 == INTEGER_INDUC_COND_REDUCTION)
5068 && !operand_equal_p (initial_def, induc_val, 0))
5069 {
5070 /* Earlier we set the initial value to be a vector of induc_val
5071 values. Check the result and if it is induc_val then replace
5072 it with the original initial value, unless induc_val is
5073 the same as initial_def already. */
5074 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5075 induc_val);
5076
5077 tmp = make_ssa_name (new_scalar_dest);
5078 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5079 initial_def, new_temp);
5080 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5081 new_temp = tmp;
5082 }
5083
5084 scalar_results.safe_push (new_temp);
5085 }
5086 else if (direct_slp_reduc)
5087 {
5088 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5089 with the elements for other SLP statements replaced with the
5090 neutral value. We can then do a normal reduction on each vector. */
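/* For example (an illustrative sketch only): with REDUC_GROUP_SIZE 2 and
   a vector accumulator <a0, b0, a1, b1>, the code below builds
   <a0, neutral, a1, neutral> for the first SLP statement and
   <neutral, b0, neutral, b1> for the second, and then reduces each of
   those vectors with REDUC_FN.  */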
5091
5092 /* Enforced by vectorizable_reduction. */
5093 gcc_assert (new_phis.length () == 1);
5094 gcc_assert (pow2p_hwi (group_size));
5095
5096 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5097 vec<stmt_vec_info> orig_phis
5098 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5099 gimple_seq seq = NULL;
5100
5101 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5102 and the same element size as VECTYPE. */
5103 tree index = build_index_vector (vectype, 0, 1);
5104 tree index_type = TREE_TYPE (index);
5105 tree index_elt_type = TREE_TYPE (index_type);
5106 tree mask_type = build_same_sized_truth_vector_type (index_type);
5107
5108 /* Create a vector that, for each element, identifies which of
5109 the REDUC_GROUP_SIZE results should use it. */
5110 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5111 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5112 build_vector_from_val (index_type, index_mask));
5113
5114 /* Get a neutral vector value. This is simply a splat of the neutral
5115 scalar value if we have one, otherwise the initial scalar value
5116 is itself a neutral value. */
5117 tree vector_identity = NULL_TREE;
5118 if (neutral_op)
5119 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5120 neutral_op);
5121 for (unsigned int i = 0; i < group_size; ++i)
5122 {
5123 /* If there's no universal neutral value, we can use the
5124 initial scalar value from the original PHI. This is used
5125 for MIN and MAX reduction, for example. */
5126 if (!neutral_op)
5127 {
5128 tree scalar_value
5129 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5130 loop_preheader_edge (loop));
5131 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5132 scalar_value);
5133 }
5134
5135 /* Calculate the equivalent of:
5136
5137 sel[j] = (index[j] == i);
5138
5139 which selects the elements of NEW_PHI_RESULT that should
5140 be included in the result. */
5141 tree compare_val = build_int_cst (index_elt_type, i);
5142 compare_val = build_vector_from_val (index_type, compare_val);
5143 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5144 index, compare_val);
5145
5146 /* Calculate the equivalent of:
5147
5148 vec = sel ? new_phi_result : vector_identity;
5149
5150 VEC is now suitable for a full vector reduction. */
5151 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5152 sel, new_phi_result, vector_identity);
5153
5154 /* Do the reduction and convert it to the appropriate type. */
5155 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5156 TREE_TYPE (vectype), vec);
5157 scalar = gimple_convert (&seq, scalar_type, scalar);
5158 scalar_results.safe_push (scalar);
5159 }
5160 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5161 }
5162 else
5163 {
5164 bool reduce_with_shift;
5165 tree vec_temp;
5166
5167 /* COND reductions all do the final reduction with MAX_EXPR
5168 or MIN_EXPR. */
5169 if (code == COND_EXPR)
5170 {
5171 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5172 == INTEGER_INDUC_COND_REDUCTION)
5173 code = induc_code;
5174 else
5175 code = MAX_EXPR;
5176 }
5177
5178 /* See if the target wants to do the final (shift) reduction
5179 in a vector mode of smaller size and first reduce upper/lower
5180 halves against each other. */
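/* For instance (illustrative only, not a statement about any particular
   target): a target could request that a 256-bit V8SI accumulator be
   split into two 128-bit V4SI halves, which are combined with a single
   vector operation before the shift-based reduction below continues on
   the narrower vector.  */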
5181 enum machine_mode mode1 = mode;
5182 tree vectype1 = vectype;
5183 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5184 unsigned sz1 = sz;
5185 if (!slp_reduc
5186 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5187 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5188
5189 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5190 reduce_with_shift = have_whole_vector_shift (mode1);
5191 if (!VECTOR_MODE_P (mode1))
5192 reduce_with_shift = false;
5193 else
5194 {
5195 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5196 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5197 reduce_with_shift = false;
5198 }
5199
5200 /* First reduce the vector to the size we want to do the final shift
5201 reduction on, by repeatedly combining upper and lower halves. */
5202 new_temp = new_phi_result;
5203 while (sz > sz1)
5204 {
5205 gcc_assert (!slp_reduc);
5206 sz /= 2;
5207 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5208
5209 /* The target has to make sure we support lowpart/highpart
5210 extraction, either via direct vector extract or through
5211 an integer mode punning. */
5212 tree dst1, dst2;
5213 if (convert_optab_handler (vec_extract_optab,
5214 TYPE_MODE (TREE_TYPE (new_temp)),
5215 TYPE_MODE (vectype1))
5216 != CODE_FOR_nothing)
5217 {
5218 /* Extract sub-vectors directly once vec_extract becomes
5219 a conversion optab. */
5220 dst1 = make_ssa_name (vectype1);
5221 epilog_stmt
5222 = gimple_build_assign (dst1, BIT_FIELD_REF,
5223 build3 (BIT_FIELD_REF, vectype1,
5224 new_temp, TYPE_SIZE (vectype1),
5225 bitsize_int (0)));
5226 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5227 dst2 = make_ssa_name (vectype1);
5228 epilog_stmt
5229 = gimple_build_assign (dst2, BIT_FIELD_REF,
5230 build3 (BIT_FIELD_REF, vectype1,
5231 new_temp, TYPE_SIZE (vectype1),
5232 bitsize_int (sz * BITS_PER_UNIT)));
5233 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5234 }
5235 else
5236 {
5237 /* Extract via punning to appropriately sized integer mode
5238 vector. */
5239 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5240 1);
5241 tree etype = build_vector_type (eltype, 2);
5242 gcc_assert (convert_optab_handler (vec_extract_optab,
5243 TYPE_MODE (etype),
5244 TYPE_MODE (eltype))
5245 != CODE_FOR_nothing);
5246 tree tem = make_ssa_name (etype);
5247 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5248 build1 (VIEW_CONVERT_EXPR,
5249 etype, new_temp));
5250 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5251 new_temp = tem;
5252 tem = make_ssa_name (eltype);
5253 epilog_stmt
5254 = gimple_build_assign (tem, BIT_FIELD_REF,
5255 build3 (BIT_FIELD_REF, eltype,
5256 new_temp, TYPE_SIZE (eltype),
5257 bitsize_int (0)));
5258 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5259 dst1 = make_ssa_name (vectype1);
5260 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5261 build1 (VIEW_CONVERT_EXPR,
5262 vectype1, tem));
5263 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5264 tem = make_ssa_name (eltype);
5265 epilog_stmt
5266 = gimple_build_assign (tem, BIT_FIELD_REF,
5267 build3 (BIT_FIELD_REF, eltype,
5268 new_temp, TYPE_SIZE (eltype),
5269 bitsize_int (sz * BITS_PER_UNIT)));
5270 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5271 dst2 = make_ssa_name (vectype1);
5272 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5273 build1 (VIEW_CONVERT_EXPR,
5274 vectype1, tem));
5275 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5276 }
5277
5278 new_temp = make_ssa_name (vectype1);
5279 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5280 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5281 }
5282
5283 if (reduce_with_shift && !slp_reduc)
5284 {
5285 int element_bitsize = tree_to_uhwi (bitsize);
5286 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5287 for variable-length vectors and also requires direct target support
5288 for loop reductions. */
5289 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5290 int nelements = vec_size_in_bits / element_bitsize;
5291 vec_perm_builder sel;
5292 vec_perm_indices indices;
5293
5294 int elt_offset;
5295
5296 tree zero_vec = build_zero_cst (vectype1);
5297 /* Case 2: Create:
5298 for (offset = nelements/2; offset >= 1; offset/=2)
5299 {
5300 Create: va' = vec_shift <va, offset>
5301 Create: va = vop <va, va'>
5302 } */
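/* As a concrete illustration (values invented), for a four-element
   vector va = <a, b, c, d> and a PLUS_EXPR reduction this produces:

     va' = <c, d, 0, 0>        va = <a+c, b+d, c, d>
     va' = <b+d, c, d, 0>      va = <a+b+c+d, ...>

   after which only element 0 of VA is meaningful and is extracted
   below as the scalar result.  */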
5303
5304 tree rhs;
5305
5306 if (dump_enabled_p ())
5307 dump_printf_loc (MSG_NOTE, vect_location,
5308 "Reduce using vector shifts\n");
5309
5310 mode1 = TYPE_MODE (vectype1);
5311 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5312 for (elt_offset = nelements / 2;
5313 elt_offset >= 1;
5314 elt_offset /= 2)
5315 {
5316 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5317 indices.new_vector (sel, 2, nelements);
5318 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5319 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5320 new_temp, zero_vec, mask);
5321 new_name = make_ssa_name (vec_dest, epilog_stmt);
5322 gimple_assign_set_lhs (epilog_stmt, new_name);
5323 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5324
5325 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5326 new_temp);
5327 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5328 gimple_assign_set_lhs (epilog_stmt, new_temp);
5329 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5330 }
5331
5332 /* 2.4 Extract the final scalar result. Create:
5333 s_out3 = extract_field <v_out2, bitpos> */
5334
5335 if (dump_enabled_p ())
5336 dump_printf_loc (MSG_NOTE, vect_location,
5337 "extract scalar result\n");
5338
5339 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5340 bitsize, bitsize_zero_node);
5341 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5342 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5343 gimple_assign_set_lhs (epilog_stmt, new_temp);
5344 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5345 scalar_results.safe_push (new_temp);
5346 }
5347 else
5348 {
5349 /* Case 3: Create:
5350 s = extract_field <v_out2, 0>
5351 for (offset = element_size;
5352 offset < vector_size;
5353 offset += element_size;)
5354 {
5355 Create: s' = extract_field <v_out2, offset>
5356 Create: s = op <s, s'> // For non SLP cases
5357 } */
5358
5359 if (dump_enabled_p ())
5360 dump_printf_loc (MSG_NOTE, vect_location,
5361 "Reduce using scalar code.\n");
5362
5363 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5364 int element_bitsize = tree_to_uhwi (bitsize);
5365 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5366 {
5367 int bit_offset;
5368 if (gimple_code (new_phi) == GIMPLE_PHI)
5369 vec_temp = PHI_RESULT (new_phi);
5370 else
5371 vec_temp = gimple_assign_lhs (new_phi);
5372 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5373 bitsize_zero_node);
5374 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5375 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5376 gimple_assign_set_lhs (epilog_stmt, new_temp);
5377 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5378
5379 /* In SLP we don't need to apply the reduction operation, so we just
5380 collect s' values in SCALAR_RESULTS. */
5381 if (slp_reduc)
5382 scalar_results.safe_push (new_temp);
5383
5384 for (bit_offset = element_bitsize;
5385 bit_offset < vec_size_in_bits;
5386 bit_offset += element_bitsize)
5387 {
5388 tree bitpos = bitsize_int (bit_offset);
5389 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5390 bitsize, bitpos);
5391
5392 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5393 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5394 gimple_assign_set_lhs (epilog_stmt, new_name);
5395 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5396
5397 if (slp_reduc)
5398 {
5399 /* In SLP we don't need to apply the reduction operation, so
5400 we just collect s' values in SCALAR_RESULTS. */
5401 new_temp = new_name;
5402 scalar_results.safe_push (new_name);
5403 }
5404 else
5405 {
5406 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5407 new_name, new_temp);
5408 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5409 gimple_assign_set_lhs (epilog_stmt, new_temp);
5410 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5411 }
5412 }
5413 }
5414
5415 /* The only case where we need to reduce scalar results in SLP is
5416 unrolling. If the size of SCALAR_RESULTS is greater than
5417 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5418 REDUC_GROUP_SIZE. */
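/* For example (illustrative): with REDUC_GROUP_SIZE 2 and four scalar
   results s0, s1, s2, s3 from an unrolled SLP reduction, the loop below
   combines s0 with s2 and s1 with s3, leaving the two final values in
   scalar_results[0] and scalar_results[1].  */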
5419 if (slp_reduc)
5420 {
5421 tree res, first_res, new_res;
5422 gimple *new_stmt;
5423
5424 /* Reduce multiple scalar results in case of SLP unrolling. */
5425 for (j = group_size; scalar_results.iterate (j, &res);
5426 j++)
5427 {
5428 first_res = scalar_results[j % group_size];
5429 new_stmt = gimple_build_assign (new_scalar_dest, code,
5430 first_res, res);
5431 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5432 gimple_assign_set_lhs (new_stmt, new_res);
5433 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5434 scalar_results[j % group_size] = new_res;
5435 }
5436 }
5437 else
5438 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5439 scalar_results.safe_push (new_temp);
5440 }
5441
5442 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5443 == INTEGER_INDUC_COND_REDUCTION)
5444 && !operand_equal_p (initial_def, induc_val, 0))
5445 {
5446 /* Earlier we set the initial value to be a vector of induc_val
5447 values. Check the result and if it is induc_val then replace
5448 it with the original initial value, unless induc_val is
5449 the same as initial_def already. */
5450 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5451 induc_val);
5452
5453 tree tmp = make_ssa_name (new_scalar_dest);
5454 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5455 initial_def, new_temp);
5456 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5457 scalar_results[0] = tmp;
5458 }
5459 }
5460
5461 vect_finalize_reduction:
5462
5463 if (double_reduc)
5464 loop = loop->inner;
5465
5466 /* 2.5 Adjust the final result by the initial value of the reduction
5467 variable. (When such adjustment is not needed, then
5468 'adjustment_def' is zero). For example, if code is PLUS we create:
5469 new_temp = loop_exit_def + adjustment_def */
5470
5471 if (adjustment_def)
5472 {
5473 gcc_assert (!slp_reduc);
5474 if (nested_in_vect_loop)
5475 {
5476 new_phi = new_phis[0];
5477 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5478 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5479 new_dest = vect_create_destination_var (scalar_dest, vectype);
5480 }
5481 else
5482 {
5483 new_temp = scalar_results[0];
5484 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5485 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5486 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5487 }
5488
5489 epilog_stmt = gimple_build_assign (new_dest, expr);
5490 new_temp = make_ssa_name (new_dest, epilog_stmt);
5491 gimple_assign_set_lhs (epilog_stmt, new_temp);
5492 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5493 if (nested_in_vect_loop)
5494 {
5495 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5496 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5497 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5498
5499 if (!double_reduc)
5500 scalar_results.quick_push (new_temp);
5501 else
5502 scalar_results[0] = new_temp;
5503 }
5504 else
5505 scalar_results[0] = new_temp;
5506
5507 new_phis[0] = epilog_stmt;
5508 }
5509
5510 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5511 phis with new adjusted scalar results, i.e., replace use <s_out0>
5512 with use <s_out4>.
5513
5514 Transform:
5515 loop_exit:
5516 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5517 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5518 v_out2 = reduce <v_out1>
5519 s_out3 = extract_field <v_out2, 0>
5520 s_out4 = adjust_result <s_out3>
5521 use <s_out0>
5522 use <s_out0>
5523
5524 into:
5525
5526 loop_exit:
5527 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5528 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5529 v_out2 = reduce <v_out1>
5530 s_out3 = extract_field <v_out2, 0>
5531 s_out4 = adjust_result <s_out3>
5532 use <s_out4>
5533 use <s_out4> */
5534
5535
5536 /* In an SLP reduction chain we reduce the vector results into one vector
5537 if necessary; hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5538 LHS of the last stmt in the reduction chain, since we are looking for
5539 the loop exit phi node. */
5540 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5541 {
5542 stmt_vec_info dest_stmt_info
5543 = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5544 /* Handle reduction patterns. */
5545 if (STMT_VINFO_RELATED_STMT (dest_stmt_info))
5546 dest_stmt_info = STMT_VINFO_RELATED_STMT (dest_stmt_info);
5547
5548 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5549 group_size = 1;
5550 }
5551
5552 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5553 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5554 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5555 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5556 correspond to the first vector stmt, etc.
5557 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
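/* For example (illustrative): with REDUC_GROUP_SIZE 4 and two statements
   in NEW_PHIS, RATIO is 2, so scalar results 0 and 1 are taken from the
   first vector statement and scalar results 2 and 3 from the second.  */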
5558 if (group_size > new_phis.length ())
5559 {
5560 ratio = group_size / new_phis.length ();
5561 gcc_assert (!(group_size % new_phis.length ()));
5562 }
5563 else
5564 ratio = 1;
5565
5566 stmt_vec_info epilog_stmt_info = NULL;
5567 for (k = 0; k < group_size; k++)
5568 {
5569 if (k % ratio == 0)
5570 {
5571 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5572 reduction_phi_info = reduction_phis[k / ratio];
5573 if (double_reduc)
5574 inner_phi = inner_phis[k / ratio];
5575 }
5576
5577 if (slp_reduc)
5578 {
5579 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5580
5581 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5582 /* SLP statements can't participate in patterns. */
5583 gcc_assert (!orig_stmt_info);
5584 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5585 }
5586
5587 phis.create (3);
5588 /* Find the loop-closed-use at the loop exit of the original scalar
5589 result. (The reduction result is expected to have two immediate uses -
5590 one at the latch block, and one at the loop exit). */
5591 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5592 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5593 && !is_gimple_debug (USE_STMT (use_p)))
5594 phis.safe_push (USE_STMT (use_p));
5595
5596 /* While we expect to have found an exit_phi because of loop-closed-ssa
5597 form, we can end up without one if the scalar cycle is dead. */
5598
5599 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5600 {
5601 if (outer_loop)
5602 {
5603 stmt_vec_info exit_phi_vinfo
5604 = loop_vinfo->lookup_stmt (exit_phi);
5605 gphi *vect_phi;
5606
5607 /* FORNOW. Currently not supporting the case that an inner-loop
5608 reduction is not used in the outer-loop (but only outside the
5609 outer-loop), unless it is a double reduction. */
5610 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5611 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5612 || double_reduc);
5613
5614 if (double_reduc)
5615 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5616 else
5617 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5618 if (!double_reduc
5619 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5620 != vect_double_reduction_def)
5621 continue;
5622
5623 /* Handle double reduction:
5624
5625 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5626 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5627 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5628 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5629
5630 At that point the regular reduction (stmt2 and stmt3) is
5631 already vectorized, as well as the exit phi node, stmt4.
5632 Here we vectorize the phi node of double reduction, stmt1, and
5633 update all relevant statements. */
5634
5635 /* Go through all the uses of s2 to find double reduction phi
5636 node, i.e., stmt1 above. */
5637 orig_name = PHI_RESULT (exit_phi);
5638 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5639 {
5640 stmt_vec_info use_stmt_vinfo;
5641 tree vect_phi_init, preheader_arg, vect_phi_res;
5642 basic_block bb = gimple_bb (use_stmt);
5643
5644 /* Check that USE_STMT is really a double reduction phi
5645 node. */
5646 if (gimple_code (use_stmt) != GIMPLE_PHI
5647 || gimple_phi_num_args (use_stmt) != 2
5648 || bb->loop_father != outer_loop)
5649 continue;
5650 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5651 if (!use_stmt_vinfo
5652 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5653 != vect_double_reduction_def)
5654 continue;
5655
5656 /* Create vector phi node for double reduction:
5657 vs1 = phi <vs0, vs2>
5658 vs1 was created previously in this function by a call to
5659 vect_get_vec_def_for_operand and is stored in
5660 vec_initial_def;
5661 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5662 vs0 is created here. */
5663
5664 /* Create vector phi node. */
5665 vect_phi = create_phi_node (vec_initial_def, bb);
5666 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5667
5668 /* Create vs0 - initial def of the double reduction phi. */
5669 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5670 loop_preheader_edge (outer_loop));
5671 vect_phi_init = get_initial_def_for_reduction
5672 (stmt_info, preheader_arg, NULL);
5673
5674 /* Update phi node arguments with vs0 and vs2. */
5675 add_phi_arg (vect_phi, vect_phi_init,
5676 loop_preheader_edge (outer_loop),
5677 UNKNOWN_LOCATION);
5678 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5679 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5680 if (dump_enabled_p ())
5681 {
5682 dump_printf_loc (MSG_NOTE, vect_location,
5683 "created double reduction phi node: ");
5684 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5685 }
5686
5687 vect_phi_res = PHI_RESULT (vect_phi);
5688
5689 /* Replace the use, i.e., set the correct vs1 in the regular
5690 reduction phi node. FORNOW, NCOPIES is always 1, so the
5691 loop is redundant. */
5692 stmt_vec_info use_info = reduction_phi_info;
5693 for (j = 0; j < ncopies; j++)
5694 {
5695 edge pr_edge = loop_preheader_edge (loop);
5696 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5697 pr_edge->dest_idx, vect_phi_res);
5698 use_info = STMT_VINFO_RELATED_STMT (use_info);
5699 }
5700 }
5701 }
5702 }
5703
5704 phis.release ();
5705 if (nested_in_vect_loop)
5706 {
5707 if (double_reduc)
5708 loop = outer_loop;
5709 else
5710 continue;
5711 }
5712
5713 phis.create (3);
5714 /* Find the loop-closed-use at the loop exit of the original scalar
5715 result. (The reduction result is expected to have two immediate uses,
5716 one at the latch block, and one at the loop exit). For double
5717 reductions we are looking for exit phis of the outer loop. */
5718 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5719 {
5720 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5721 {
5722 if (!is_gimple_debug (USE_STMT (use_p)))
5723 phis.safe_push (USE_STMT (use_p));
5724 }
5725 else
5726 {
5727 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5728 {
5729 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5730
5731 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5732 {
5733 if (!flow_bb_inside_loop_p (loop,
5734 gimple_bb (USE_STMT (phi_use_p)))
5735 && !is_gimple_debug (USE_STMT (phi_use_p)))
5736 phis.safe_push (USE_STMT (phi_use_p));
5737 }
5738 }
5739 }
5740 }
5741
5742 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5743 {
5744 /* Replace the uses: */
5745 orig_name = PHI_RESULT (exit_phi);
5746 scalar_result = scalar_results[k];
5747 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5748 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5749 SET_USE (use_p, scalar_result);
5750 }
5751
5752 phis.release ();
5753 }
5754 }
5755
5756 /* Return a vector of type VECTYPE that is equal to the vector select
5757 operation "MASK ? VEC : IDENTITY". Insert the select statements
5758 before GSI. */
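/* Illustrative use (a sketch, not a quote from an actual caller): in a
   fully-masked loop, merge_with_identity (gsi, loop_mask, vectype,
   vec_def, zero_vec) yields a VEC_COND_EXPR whose inactive lanes take
   the value of ZERO_VEC, so that they do not affect an in-order
   reduction.  */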
5759
5760 static tree
5761 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5762 tree vec, tree identity)
5763 {
5764 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5765 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5766 mask, vec, identity);
5767 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5768 return cond;
5769 }
5770
5771 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5772 order, starting with LHS. Insert the extraction statements before GSI and
5773 associate the new scalar SSA names with variable SCALAR_DEST.
5774 Return the SSA name for the result. */
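/* As an illustrative sketch (names invented, not actual generated IL),
   for a four-element VECTOR_RHS V and a PLUS_EXPR this expands to:

     s_0 = BIT_FIELD_REF <V, sz, 0*sz>;    lhs_1 = LHS + s_0;
     s_1 = BIT_FIELD_REF <V, sz, 1*sz>;    lhs_2 = lhs_1 + s_1;
     s_2 = BIT_FIELD_REF <V, sz, 2*sz>;    lhs_3 = lhs_2 + s_2;
     s_3 = BIT_FIELD_REF <V, sz, 3*sz>;    lhs_4 = lhs_3 + s_3;

   where SZ is the element size in bits, i.e. a strictly left-to-right
   fold whose result is lhs_4.  */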
5775
5776 static tree
5777 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5778 tree_code code, tree lhs, tree vector_rhs)
5779 {
5780 tree vectype = TREE_TYPE (vector_rhs);
5781 tree scalar_type = TREE_TYPE (vectype);
5782 tree bitsize = TYPE_SIZE (scalar_type);
5783 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5784 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5785
5786 for (unsigned HOST_WIDE_INT bit_offset = 0;
5787 bit_offset < vec_size_in_bits;
5788 bit_offset += element_bitsize)
5789 {
5790 tree bitpos = bitsize_int (bit_offset);
5791 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5792 bitsize, bitpos);
5793
5794 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5795 rhs = make_ssa_name (scalar_dest, stmt);
5796 gimple_assign_set_lhs (stmt, rhs);
5797 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5798
5799 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5800 tree new_name = make_ssa_name (scalar_dest, stmt);
5801 gimple_assign_set_lhs (stmt, new_name);
5802 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5803 lhs = new_name;
5804 }
5805 return lhs;
5806 }
5807
5808 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5809 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5810 statement. CODE is the operation performed by STMT_INFO and OPS are
5811 its scalar operands. REDUC_INDEX is the index of the operand in
5812 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5813 implements in-order reduction, or IFN_LAST if we should open-code it.
5814 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5815 that should be used to control the operation in a fully-masked loop. */
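/* A rough sketch of what this generates for a single non-SLP statement
   when REDUC_FN is available (SSA names are illustrative only):

     loop:
       x_1 = PHI <x_init, x_2>
       ...
       x_2 = .FOLD_LEFT_PLUS (x_1, vec_def);

   so each vector statement consumes the scalar result of the previous
   one, preserving the original in-order association.  */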
5816
5817 static bool
5818 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5819 gimple_stmt_iterator *gsi,
5820 stmt_vec_info *vec_stmt, slp_tree slp_node,
5821 gimple *reduc_def_stmt,
5822 tree_code code, internal_fn reduc_fn,
5823 tree ops[3], tree vectype_in,
5824 int reduc_index, vec_loop_masks *masks)
5825 {
5826 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5827 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5828 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5829 stmt_vec_info new_stmt_info = NULL;
5830
5831 int ncopies;
5832 if (slp_node)
5833 ncopies = 1;
5834 else
5835 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5836
5837 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5838 gcc_assert (ncopies == 1);
5839 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5840 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5841 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5842 == FOLD_LEFT_REDUCTION);
5843
5844 if (slp_node)
5845 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5846 TYPE_VECTOR_SUBPARTS (vectype_in)));
5847
5848 tree op0 = ops[1 - reduc_index];
5849
5850 int group_size = 1;
5851 stmt_vec_info scalar_dest_def_info;
5852 auto_vec<tree> vec_oprnds0;
5853 if (slp_node)
5854 {
5855 vect_get_vec_defs (op0, NULL_TREE, stmt_info, &vec_oprnds0, NULL,
5856 slp_node);
5857 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5858 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5859 }
5860 else
5861 {
5862 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5863 vec_oprnds0.create (1);
5864 vec_oprnds0.quick_push (loop_vec_def0);
5865 scalar_dest_def_info = stmt_info;
5866 }
5867
5868 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5869 tree scalar_type = TREE_TYPE (scalar_dest);
5870 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5871
5872 int vec_num = vec_oprnds0.length ();
5873 gcc_assert (vec_num == 1 || slp_node);
5874 tree vec_elem_type = TREE_TYPE (vectype_out);
5875 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5876
5877 tree vector_identity = NULL_TREE;
5878 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5879 vector_identity = build_zero_cst (vectype_out);
5880
5881 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5882 int i;
5883 tree def0;
5884 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5885 {
5886 gimple *new_stmt;
5887 tree mask = NULL_TREE;
5888 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5889 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5890
5891 /* Handle MINUS by adding the negative. */
5892 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5893 {
5894 tree negated = make_ssa_name (vectype_out);
5895 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5896 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5897 def0 = negated;
5898 }
5899
5900 if (mask)
5901 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5902 vector_identity);
5903
5904 /* On the first iteration the input is simply the scalar phi
5905 result, and for subsequent iterations it is the output of
5906 the preceding operation. */
5907 if (reduc_fn != IFN_LAST)
5908 {
5909 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5910 /* For chained SLP reductions the output of the previous reduction
5911 operation serves as the input of the next. For the final statement
5912 the output cannot be a temporary - we reuse the original
5913 scalar destination of the last statement. */
5914 if (i != vec_num - 1)
5915 {
5916 gimple_set_lhs (new_stmt, scalar_dest_var);
5917 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5918 gimple_set_lhs (new_stmt, reduc_var);
5919 }
5920 }
5921 else
5922 {
5923 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5924 reduc_var, def0);
5925 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5926 /* Remove the statement, so that we can use the same code paths
5927 as for statements that we've just created. */
5928 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5929 gsi_remove (&tmp_gsi, false);
5930 }
5931
5932 if (i == vec_num - 1)
5933 {
5934 gimple_set_lhs (new_stmt, scalar_dest);
5935 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5936 new_stmt);
5937 }
5938 else
5939 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5940 new_stmt, gsi);
5941
5942 if (slp_node)
5943 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5944 }
5945
5946 if (!slp_node)
5947 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5948
5949 return true;
5950 }
5951
5952 /* Function is_nonwrapping_integer_induction.
5953
5954 Check if STMT_VINFO (which is part of loop LOOP) both increments and
5955 does not cause overflow. */
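/* For instance (illustrative): an unsigned 8-bit induction with base 0
   and step 1 in a loop that may execute 300 iterations can reach the
   value 300, which needs 9 bits, so the function returns false; if the
   loop executes at most 200 iterations the maximum value fits in 8 bits
   and it returns true.  */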
5956
5957 static bool
5958 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5959 {
5960 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5961 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5962 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5963 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5964 widest_int ni, max_loop_value, lhs_max;
5965 wi::overflow_type overflow = wi::OVF_NONE;
5966
5967 /* Make sure the loop is integer based. */
5968 if (TREE_CODE (base) != INTEGER_CST
5969 || TREE_CODE (step) != INTEGER_CST)
5970 return false;
5971
5972 /* Check that the max size of the loop will not wrap. */
5973
5974 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5975 return true;
5976
5977 if (! max_stmt_executions (loop, &ni))
5978 return false;
5979
5980 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5981 &overflow);
5982 if (overflow)
5983 return false;
5984
5985 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5986 TYPE_SIGN (lhs_type), &overflow);
5987 if (overflow)
5988 return false;
5989
5990 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5991 <= TYPE_PRECISION (lhs_type));
5992 }
5993
5994 /* Function vectorizable_reduction.
5995
5996 Check if STMT_INFO performs a reduction operation that can be vectorized.
5997 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5998 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5999 Return true if STMT_INFO is vectorizable in this way.
6000
6001 This function also handles reduction idioms (patterns) that have been
6002 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6003 may be of this form:
6004 X = pattern_expr (arg0, arg1, ..., X)
6005 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6006 sequence that had been detected and replaced by the pattern-stmt
6007 (STMT_INFO).
6008
6009 This function also handles reduction of condition expressions, for example:
6010 for (int i = 0; i < N; i++)
6011 if (a[i] < value)
6012 last = a[i];
6013 This is handled by vectorising the loop and creating an additional vector
6014 containing the loop indexes for which "a[i] < value" was true. In the
6015 function epilogue this is reduced to a single max value and then used to
6016 index into the vector of results.
6017
6018 In some cases of reduction patterns, the type of the reduction variable X is
6019 different than the type of the other arguments of STMT_INFO.
6020 In such cases, the vectype that is used when transforming STMT_INFO into
6021 a vector stmt is different than the vectype that is used to determine the
6022 vectorization factor, because it consists of a different number of elements
6023 than the actual number of elements that are being operated upon in parallel.
6024
6025 For example, consider an accumulation of shorts into an int accumulator.
6026 On some targets it's possible to vectorize this pattern operating on 8
6027 shorts at a time (hence, the vectype for purposes of determining the
6028 vectorization factor should be V8HI); on the other hand, the vectype that
6029 is used to create the vector form is actually V4SI (the type of the result).
6030
6031 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6032 indicates what is the actual level of parallelism (V8HI in the example), so
6033 that the right vectorization factor would be derived. This vectype
6034 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6035 be used to create the vectorized stmt. The right vectype for the vectorized
6036 stmt is obtained from the type of the result X:
6037 get_vectype_for_scalar_type (TREE_TYPE (X))
6038
6039 This means that, contrary to "regular" reductions (or "regular" stmts in
6040 general), the following equation:
6041 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6042 does *NOT* necessarily hold for reduction patterns. */
6043
6044 bool
6045 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6046 stmt_vec_info *vec_stmt, slp_tree slp_node,
6047 slp_instance slp_node_instance,
6048 stmt_vector_for_cost *cost_vec)
6049 {
6050 tree vec_dest;
6051 tree scalar_dest;
6052 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6053 tree vectype_in = NULL_TREE;
6054 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6055 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6056 enum tree_code code, orig_code;
6057 internal_fn reduc_fn;
6058 machine_mode vec_mode;
6059 int op_type;
6060 optab optab;
6061 tree new_temp = NULL_TREE;
6062 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6063 stmt_vec_info cond_stmt_vinfo = NULL;
6064 enum tree_code cond_reduc_op_code = ERROR_MARK;
6065 tree scalar_type;
6066 bool is_simple_use;
6067 int i;
6068 int ncopies;
6069 int epilog_copies;
6070 stmt_vec_info prev_stmt_info, prev_phi_info;
6071 bool single_defuse_cycle = false;
6072 stmt_vec_info new_stmt_info = NULL;
6073 int j;
6074 tree ops[3];
6075 enum vect_def_type dts[3];
6076 bool nested_cycle = false, found_nested_cycle_def = false;
6077 bool double_reduc = false;
6078 basic_block def_bb;
6079 struct loop * def_stmt_loop;
6080 tree def_arg;
6081 auto_vec<tree> vec_oprnds0;
6082 auto_vec<tree> vec_oprnds1;
6083 auto_vec<tree> vec_oprnds2;
6084 auto_vec<tree> vect_defs;
6085 auto_vec<stmt_vec_info> phis;
6086 int vec_num;
6087 tree def0, tem;
6088 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6089 tree cond_reduc_val = NULL_TREE;
6090
6091 /* Make sure it was already recognized as a reduction computation. */
6092 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6093 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6094 return false;
6095
6096 if (nested_in_vect_loop_p (loop, stmt_info))
6097 {
6098 loop = loop->inner;
6099 nested_cycle = true;
6100 }
6101
6102 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6103 gcc_assert (slp_node
6104 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6105
6106 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6107 {
6108 tree phi_result = gimple_phi_result (phi);
6109 /* Analysis is fully done on the reduction stmt invocation. */
6110 if (! vec_stmt)
6111 {
6112 if (slp_node)
6113 slp_node_instance->reduc_phis = slp_node;
6114
6115 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6116 return true;
6117 }
6118
6119 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6120 /* Leave the scalar phi in place. Note that checking
6121 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6122 for reductions involving a single statement. */
6123 return true;
6124
6125 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6126 if (STMT_VINFO_IN_PATTERN_P (reduc_stmt_info))
6127 reduc_stmt_info = STMT_VINFO_RELATED_STMT (reduc_stmt_info);
6128
6129 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6130 == EXTRACT_LAST_REDUCTION)
6131 /* Leave the scalar phi in place. */
6132 return true;
6133
6134 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6135 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6136 {
6137 tree op = gimple_op (reduc_stmt, k);
6138 if (op == phi_result)
6139 continue;
6140 if (k == 1
6141 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6142 continue;
6143 if (!vectype_in
6144 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6145 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6146 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6147 break;
6148 }
6149 gcc_assert (vectype_in);
6150
6151 if (slp_node)
6152 ncopies = 1;
6153 else
6154 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6155
6156 stmt_vec_info use_stmt_info;
6157 if (ncopies > 1
6158 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6159 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6160 && (use_stmt_info == reduc_stmt_info
6161 || STMT_VINFO_RELATED_STMT (use_stmt_info) == reduc_stmt_info))
6162 single_defuse_cycle = true;
6163
6164 /* Create the destination vector */
6165 scalar_dest = gimple_assign_lhs (reduc_stmt);
6166 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6167
6168 if (slp_node)
6169 /* The size vect_schedule_slp_instance computes is off for us. */
6170 vec_num = vect_get_num_vectors
6171 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6172 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6173 vectype_in);
6174 else
6175 vec_num = 1;
6176
6177 /* Generate the reduction PHIs upfront. */
6178 prev_phi_info = NULL;
6179 for (j = 0; j < ncopies; j++)
6180 {
6181 if (j == 0 || !single_defuse_cycle)
6182 {
6183 for (i = 0; i < vec_num; i++)
6184 {
6185 /* Create the reduction-phi that defines the reduction
6186 operand. */
6187 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6188 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6189
6190 if (slp_node)
6191 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6192 else
6193 {
6194 if (j == 0)
6195 STMT_VINFO_VEC_STMT (stmt_info)
6196 = *vec_stmt = new_phi_info;
6197 else
6198 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6199 prev_phi_info = new_phi_info;
6200 }
6201 }
6202 }
6203 }
6204
6205 return true;
6206 }
6207
6208 /* 1. Is vectorizable reduction? */
6209 /* Not supportable if the reduction variable is used in the loop, unless
6210 it's a reduction chain. */
6211 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6212 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6213 return false;
6214
6215 /* Reductions that are not used even in an enclosing outer-loop
6216 are expected to be "live" (used out of the loop). */
6217 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6218 && !STMT_VINFO_LIVE_P (stmt_info))
6219 return false;
6220
6221 /* 2. Has this been recognized as a reduction pattern?
6222
6223 Check if STMT represents a pattern that has been recognized
6224 in earlier analysis stages. For stmts that represent a pattern,
6225 the STMT_VINFO_RELATED_STMT field records the last stmt in
6226 the original sequence that constitutes the pattern. */
6227
6228 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6229 if (orig_stmt_info)
6230 {
6231 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6232 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6233 }
6234
6235 /* 3. Check the operands of the operation. The first operands are defined
6236 inside the loop body. The last operand is the reduction variable,
6237 which is defined by the loop-header-phi. */
6238
6239 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6240
6241 /* Flatten RHS. */
6242 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6243 {
6244 case GIMPLE_BINARY_RHS:
6245 code = gimple_assign_rhs_code (stmt);
6246 op_type = TREE_CODE_LENGTH (code);
6247 gcc_assert (op_type == binary_op);
6248 ops[0] = gimple_assign_rhs1 (stmt);
6249 ops[1] = gimple_assign_rhs2 (stmt);
6250 break;
6251
6252 case GIMPLE_TERNARY_RHS:
6253 code = gimple_assign_rhs_code (stmt);
6254 op_type = TREE_CODE_LENGTH (code);
6255 gcc_assert (op_type == ternary_op);
6256 ops[0] = gimple_assign_rhs1 (stmt);
6257 ops[1] = gimple_assign_rhs2 (stmt);
6258 ops[2] = gimple_assign_rhs3 (stmt);
6259 break;
6260
6261 case GIMPLE_UNARY_RHS:
6262 return false;
6263
6264 default:
6265 gcc_unreachable ();
6266 }
6267
6268 if (code == COND_EXPR && slp_node)
6269 return false;
6270
6271 scalar_dest = gimple_assign_lhs (stmt);
6272 scalar_type = TREE_TYPE (scalar_dest);
6273 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6274 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6275 return false;
6276
6277 /* Do not try to vectorize bit-precision reductions. */
6278 if (!type_has_mode_precision_p (scalar_type))
6279 return false;
6280
6281 /* All uses but the last are expected to be defined in the loop.
6282 The last use is the reduction variable. In case of a nested cycle this
6283 assumption is not true: we use reduc_index to record the index of the
6284 reduction variable. */
6285 stmt_vec_info reduc_def_info = NULL;
6286 int reduc_index = -1;
6287 for (i = 0; i < op_type; i++)
6288 {
6289 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6290 if (i == 0 && code == COND_EXPR)
6291 continue;
6292
6293 stmt_vec_info def_stmt_info;
6294 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6295 &def_stmt_info);
6296 dt = dts[i];
6297 gcc_assert (is_simple_use);
6298 if (dt == vect_reduction_def)
6299 {
6300 reduc_def_info = def_stmt_info;
6301 reduc_index = i;
6302 continue;
6303 }
6304 else if (tem)
6305 {
6306 /* To properly compute ncopies we are interested in the widest
6307 input type in case we're looking at a widening accumulation. */
6308 if (!vectype_in
6309 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6310 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6311 vectype_in = tem;
6312 }
6313
6314 if (dt != vect_internal_def
6315 && dt != vect_external_def
6316 && dt != vect_constant_def
6317 && dt != vect_induction_def
6318 && !(dt == vect_nested_cycle && nested_cycle))
6319 return false;
6320
6321 if (dt == vect_nested_cycle)
6322 {
6323 found_nested_cycle_def = true;
6324 reduc_def_info = def_stmt_info;
6325 reduc_index = i;
6326 }
6327
6328 if (i == 1 && code == COND_EXPR)
6329 {
6330 /* Record how value of COND_EXPR is defined. */
6331 if (dt == vect_constant_def)
6332 {
6333 cond_reduc_dt = dt;
6334 cond_reduc_val = ops[i];
6335 }
6336 if (dt == vect_induction_def
6337 && def_stmt_info
6338 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6339 {
6340 cond_reduc_dt = dt;
6341 cond_stmt_vinfo = def_stmt_info;
6342 }
6343 }
6344 }
6345
6346 if (!vectype_in)
6347 vectype_in = vectype_out;
6348
6349 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6350 directly used in stmt. */
6351 if (reduc_index == -1)
6352 {
6353 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6354 {
6355 if (dump_enabled_p ())
6356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6357 "in-order reduction chain without SLP.\n");
6358 return false;
6359 }
6360
6361 if (orig_stmt_info)
6362 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6363 else
6364 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6365 }
6366
6367 if (! reduc_def_info)
6368 return false;
6369
6370 gphi *reduc_def_phi = dyn_cast <gphi *> (reduc_def_info->stmt);
6371 if (!reduc_def_phi)
6372 return false;
6373
6374 if (!(reduc_index == -1
6375 || dts[reduc_index] == vect_reduction_def
6376 || dts[reduc_index] == vect_nested_cycle
6377 || ((dts[reduc_index] == vect_internal_def
6378 || dts[reduc_index] == vect_external_def
6379 || dts[reduc_index] == vect_constant_def
6380 || dts[reduc_index] == vect_induction_def)
6381 && nested_cycle && found_nested_cycle_def)))
6382 {
6383 /* For pattern recognized stmts, orig_stmt might be a reduction,
6384 but some helper statements for the pattern might not, or
6385 might be COND_EXPRs with reduction uses in the condition. */
6386 gcc_assert (orig_stmt_info);
6387 return false;
6388 }
6389
6390 /* PHIs should not participate in patterns. */
6391 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6392 enum vect_reduction_type v_reduc_type
6393 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6394 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6395
6396 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6397 /* If we have a condition reduction, see if we can simplify it further. */
6398 if (v_reduc_type == COND_REDUCTION)
6399 {
6400 /* TODO: We can't yet handle reduction chains, since we need to treat
6401 each COND_EXPR in the chain specially, not just the last one.
6402 E.g. for:
6403
6404 x_1 = PHI <x_3, ...>
6405 x_2 = a_2 ? ... : x_1;
6406 x_3 = a_3 ? ... : x_2;
6407
6408 we're interested in the last element in x_3 for which a_2 || a_3
6409 is true, whereas the current reduction chain handling would
6410 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6411 as a reduction operation. */
6412 if (reduc_index == -1)
6413 {
6414 if (dump_enabled_p ())
6415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6416 "conditional reduction chains not supported\n");
6417 return false;
6418 }
6419
6420 /* vect_is_simple_reduction ensured that operand 2 is the
6421 loop-carried operand. */
6422 gcc_assert (reduc_index == 2);
6423
6424 /* Loop peeling modifies the initial value of the reduction PHI, which
6425 makes the reduction stmt to be transformed differ from the
6426 original stmt that was analyzed. We need to record the reduction code
6427 of a CONST_COND_REDUCTION type reduction at analysis time, so that
6428 it can be used directly at transform time. */
6429 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6430 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6431 {
6432 /* Also set the reduction type to CONST_COND_REDUCTION. */
6433 gcc_assert (cond_reduc_dt == vect_constant_def);
6434 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6435 }
6436 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6437 vectype_in, OPTIMIZE_FOR_SPEED))
6438 {
6439 if (dump_enabled_p ())
6440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6441 "optimizing condition reduction with"
6442 " FOLD_EXTRACT_LAST.\n");
6443 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6444 }
6445 else if (cond_reduc_dt == vect_induction_def)
6446 {
6447 tree base
6448 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6449 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6450
6451 gcc_assert (TREE_CODE (base) == INTEGER_CST
6452 && TREE_CODE (step) == INTEGER_CST);
6453 cond_reduc_val = NULL_TREE;
6454 /* Find a suitable value: below BASE for MAX_EXPR, above BASE for
6455 MIN_EXPR; for now punt if BASE is the minimum value of the type
6456 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6457 if (tree_int_cst_sgn (step) == -1)
6458 {
6459 cond_reduc_op_code = MIN_EXPR;
6460 if (tree_int_cst_sgn (base) == -1)
6461 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6462 else if (tree_int_cst_lt (base,
6463 TYPE_MAX_VALUE (TREE_TYPE (base))))
6464 cond_reduc_val
6465 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6466 }
6467 else
6468 {
6469 cond_reduc_op_code = MAX_EXPR;
6470 if (tree_int_cst_sgn (base) == 1)
6471 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6472 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6473 base))
6474 cond_reduc_val
6475 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6476 }
6477 if (cond_reduc_val)
6478 {
6479 if (dump_enabled_p ())
6480 dump_printf_loc (MSG_NOTE, vect_location,
6481 "condition expression based on "
6482 "integer induction.\n");
6483 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6484 = INTEGER_INDUC_COND_REDUCTION;
6485 }
6486 }
6487 else if (cond_reduc_dt == vect_constant_def)
6488 {
6489 enum vect_def_type cond_initial_dt;
6490 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6491 tree cond_initial_val
6492 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6493
6494 gcc_assert (cond_reduc_val != NULL_TREE);
6495 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6496 if (cond_initial_dt == vect_constant_def
6497 && types_compatible_p (TREE_TYPE (cond_initial_val),
6498 TREE_TYPE (cond_reduc_val)))
6499 {
6500 tree e = fold_binary (LE_EXPR, boolean_type_node,
6501 cond_initial_val, cond_reduc_val);
6502 if (e && (integer_onep (e) || integer_zerop (e)))
6503 {
6504 if (dump_enabled_p ())
6505 dump_printf_loc (MSG_NOTE, vect_location,
6506 "condition expression based on "
6507 "compile time constant.\n");
6508 /* Record reduction code at analysis stage. */
6509 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6510 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6511 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6512 = CONST_COND_REDUCTION;
6513 }
6514 }
6515 }
6516 }
6517
6518 if (orig_stmt_info)
6519 gcc_assert (tmp == orig_stmt_info
6520 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6521 else
6522 /* We changed STMT to be the first stmt in the reduction chain, hence we
6523 check that in this case the first element in the chain is STMT. */
6524 gcc_assert (tmp == stmt_info
6525 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6526
6527 if (STMT_VINFO_LIVE_P (reduc_def_info))
6528 return false;
6529
6530 if (slp_node)
6531 ncopies = 1;
6532 else
6533 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6534
6535 gcc_assert (ncopies >= 1);
6536
6537 vec_mode = TYPE_MODE (vectype_in);
6538 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6539
6540 if (code == COND_EXPR)
6541 {
6542 /* Only call during the analysis stage, otherwise we'll lose
6543 STMT_VINFO_TYPE. */
6544 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6545 ops[reduc_index], 0, NULL,
6546 cost_vec))
6547 {
6548 if (dump_enabled_p ())
6549 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6550 "unsupported condition in reduction\n");
6551 return false;
6552 }
6553 }
6554 else
6555 {
6556 /* 4. Supportable by target? */
6557
6558 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6559 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6560 {
6561 /* Shifts and rotates are only supported by vectorizable_shifts,
6562 not vectorizable_reduction. */
6563 if (dump_enabled_p ())
6564 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6565 "unsupported shift or rotation.\n");
6566 return false;
6567 }
6568
6569 /* 4.1. check support for the operation in the loop */
6570 optab = optab_for_tree_code (code, vectype_in, optab_default);
6571 if (!optab)
6572 {
6573 if (dump_enabled_p ())
6574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6575 "no optab.\n");
6576
6577 return false;
6578 }
6579
6580 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6581 {
6582 if (dump_enabled_p ())
6583 dump_printf (MSG_NOTE, "op not supported by target.\n");
6584
6585 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6586 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6587 return false;
6588
6589 if (dump_enabled_p ())
6590 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6591 }
6592
6593 /* Worthwhile without SIMD support? */
6594 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6595 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6596 {
6597 if (dump_enabled_p ())
6598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6599 "not worthwhile without SIMD support.\n");
6600
6601 return false;
6602 }
6603 }
6604
6605 /* 4.2. Check support for the epilog operation.
6606
6607 If STMT represents a reduction pattern, then the type of the
6608 reduction variable may be different than the type of the rest
6609 of the arguments. For example, consider the case of accumulation
6610 of shorts into an int accumulator; The original code:
6611 S1: int_a = (int) short_a;
6612 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6613
6614 was replaced with:
6615 STMT: int_acc = widen_sum <short_a, int_acc>
6616
6617 This means that:
6618 1. The tree-code that is used to create the vector operation in the
6619 epilog code (that reduces the partial results) is not the
6620 tree-code of STMT, but is rather the tree-code of the original
6621 stmt from the pattern that STMT is replacing. I.e, in the example
6622 above we want to use 'widen_sum' in the loop, but 'plus' in the
6623 epilog.
6624 2. The type (mode) we use to check available target support
6625 for the vector operation to be created in the *epilog*, is
6626 determined by the type of the reduction variable (in the example
6627 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6628 However the type (mode) we use to check available target support
6629 for the vector operation to be created *inside the loop*, is
6630 determined by the type of the other arguments to STMT (in the
6631 example we'd check this: optab_handler (widen_sum_optab,
6632 vect_short_mode)).
6633
6634 This is contrary to "regular" reductions, in which the types of all
6635 the arguments are the same as the type of the reduction variable.
6636 For "regular" reductions we can therefore use the same vector type
6637 (and also the same tree-code) when generating the epilog code and
6638 when generating the code inside the loop. */
6639
6640 vect_reduction_type reduction_type
6641 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6642 if (orig_stmt_info
6643 && (reduction_type == TREE_CODE_REDUCTION
6644 || reduction_type == FOLD_LEFT_REDUCTION))
6645 {
6646 /* This is a reduction pattern: get the vectype from the type of the
6647 reduction variable, and get the tree-code from orig_stmt. */
6648 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6649 gcc_assert (vectype_out);
6650 vec_mode = TYPE_MODE (vectype_out);
6651 }
6652 else
6653 {
6654 /* Regular reduction: the same vectype and tree-code that are used for
6655 the vector code inside the loop can also be used for the epilog code. */
6656 orig_code = code;
6657
6658 if (code == MINUS_EXPR)
6659 orig_code = PLUS_EXPR;
6660
6661 /* For simple condition reductions, replace with the actual expression
6662 we want to base our reduction around. */
6663 if (reduction_type == CONST_COND_REDUCTION)
6664 {
6665 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6666 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6667 }
6668 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6669 orig_code = cond_reduc_op_code;
6670 }
6671
6672 if (nested_cycle)
6673 {
6674 def_bb = gimple_bb (reduc_def_phi);
6675 def_stmt_loop = def_bb->loop_father;
6676 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6677 loop_preheader_edge (def_stmt_loop));
6678 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6679 if (def_arg_stmt_info
6680 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6681 == vect_double_reduction_def))
6682 double_reduc = true;
6683 }
6684
6685 reduc_fn = IFN_LAST;
6686
6687 if (reduction_type == TREE_CODE_REDUCTION
6688 || reduction_type == FOLD_LEFT_REDUCTION
6689 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6690 || reduction_type == CONST_COND_REDUCTION)
6691 {
6692 if (reduction_type == FOLD_LEFT_REDUCTION
6693 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6694 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6695 {
6696 if (reduc_fn != IFN_LAST
6697 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6698 OPTIMIZE_FOR_SPEED))
6699 {
6700 if (dump_enabled_p ())
6701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6702 "reduc op not supported by target.\n");
6703
6704 reduc_fn = IFN_LAST;
6705 }
6706 }
6707 else
6708 {
6709 if (!nested_cycle || double_reduc)
6710 {
6711 if (dump_enabled_p ())
6712 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6713 "no reduc code for scalar code.\n");
6714
6715 return false;
6716 }
6717 }
6718 }
6719 else if (reduction_type == COND_REDUCTION)
6720 {
6721 int scalar_precision
6722 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6723 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6724 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6725 nunits_out);
6726
6727 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6728 OPTIMIZE_FOR_SPEED))
6729 reduc_fn = IFN_REDUC_MAX;
6730 }
6731
6732 if (reduction_type != EXTRACT_LAST_REDUCTION
6733 && reduc_fn == IFN_LAST
6734 && !nunits_out.is_constant ())
6735 {
6736 if (dump_enabled_p ())
6737 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6738 "missing target support for reduction on"
6739 " variable-length vectors.\n");
6740 return false;
6741 }
6742
6743 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6744 && ncopies > 1)
6745 {
6746 if (dump_enabled_p ())
6747 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6748 "multiple types in double reduction or condition "
6749 "reduction.\n");
6750 return false;
6751 }
6752
6753 /* For SLP reductions, see if there is a neutral value we can use. */
6754 tree neutral_op = NULL_TREE;
6755 if (slp_node)
6756 neutral_op = neutral_op_for_slp_reduction
6757 (slp_node_instance->reduc_phis, code,
6758 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6759
6760 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6761 {
6762 /* We can't support in-order reductions of code such as this:
6763
6764 for (int i = 0; i < n1; ++i)
6765 for (int j = 0; j < n2; ++j)
6766 l += a[j];
6767
6768 since GCC effectively transforms the loop when vectorizing:
6769
6770 for (int i = 0; i < n1 / VF; ++i)
6771 for (int j = 0; j < n2; ++j)
6772 for (int k = 0; k < VF; ++k)
6773 l += a[j];
6774
6775 which is a reassociation of the original operation. */
6776 if (dump_enabled_p ())
6777 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6778 "in-order double reduction not supported.\n");
6779
6780 return false;
6781 }
6782
6783 if (reduction_type == FOLD_LEFT_REDUCTION
6784 && slp_node
6785 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6786 {
6787 /* We cannot use in-order reductions in this case because there is
6788 an implicit reassociation of the operations involved. */
6789 if (dump_enabled_p ())
6790 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6791 "in-order unchained SLP reductions not supported.\n");
6792 return false;
6793 }
6794
6795 /* For double reductions, and for SLP reductions with a neutral value,
6796 we construct a variable-length initial vector by loading a vector
6797 full of the neutral value and then shift-and-inserting the start
6798 values into the low-numbered elements. */
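      /* A hedged sketch (fixed-length analogue, not from the original
	 comments): for a sum reduction with start value INIT and neutral
	 value 0 on a 4-element vector the construction described above
	 amounts to

	   vec = { 0, 0, 0, 0 };            splat of the neutral value
	   vec = shl_insert (vec, INIT);    -> { INIT, 0, 0, 0 }

	 where shl_insert shifts the existing lanes up by one element and
	 writes the scalar into element 0.  For variable-length vectors this
	 splat + shift-and-insert sequence is the lane-count-agnostic way of
	 placing the start value(s), hence the IFN_VEC_SHL_INSERT requirement
	 checked just below.  */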
6799 if ((double_reduc || neutral_op)
6800 && !nunits_out.is_constant ()
6801 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6802 vectype_out, OPTIMIZE_FOR_SPEED))
6803 {
6804 if (dump_enabled_p ())
6805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6806 "reduction on variable-length vectors requires"
6807 " target support for a vector-shift-and-insert"
6808 " operation.\n");
6809 return false;
6810 }
6811
6812 /* Check extra constraints for variable-length unchained SLP reductions. */
6813 if (STMT_SLP_TYPE (stmt_info)
6814 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6815 && !nunits_out.is_constant ())
6816 {
6817 /* We checked above that we could build the initial vector when
6818 there's a neutral element value. Check here for the case in
6819 which each SLP statement has its own initial value and in which
6820 that value needs to be repeated for every instance of the
6821 statement within the initial vector. */
6822 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6823 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6824 if (!neutral_op
6825 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6826 {
6827 if (dump_enabled_p ())
6828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6829 "unsupported form of SLP reduction for"
6830 " variable-length vectors: cannot build"
6831 " initial vector.\n");
6832 return false;
6833 }
6834 /* The epilogue code relies on the number of elements being a multiple
6835 of the group size. The duplicate-and-interleave approach to setting
6836 up the initial vector does too. */
6837 if (!multiple_p (nunits_out, group_size))
6838 {
6839 if (dump_enabled_p ())
6840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6841 "unsupported form of SLP reduction for"
6842 " variable-length vectors: the vector size"
6843 " is not a multiple of the number of results.\n");
6844 return false;
6845 }
6846 }
6847
6848 /* In case of widening multiplication by a constant, we update the type
6849 of the constant to be the type of the other operand. We check that the
6850 constant fits the type in the pattern recognition pass. */
6851 if (code == DOT_PROD_EXPR
6852 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6853 {
6854 if (TREE_CODE (ops[0]) == INTEGER_CST)
6855 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6856 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6857 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6858 else
6859 {
6860 if (dump_enabled_p ())
6861 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6862 "invalid types in dot-prod\n");
6863
6864 return false;
6865 }
6866 }
6867
6868 if (reduction_type == COND_REDUCTION)
6869 {
6870 widest_int ni;
6871
6872 if (! max_loop_iterations (loop, &ni))
6873 {
6874 if (dump_enabled_p ())
6875 dump_printf_loc (MSG_NOTE, vect_location,
6876 "loop count not known, cannot create cond "
6877 "reduction.\n");
6878 return false;
6879 }
6880 /* Convert backedges to iterations. */
6881 ni += 1;
6882
6883 /* The additional index will be the same type as the condition. Check
6884 that the loop iteration count fits into this type less one (the zero
6885 value is reserved for the case where there are no matches). */
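      /* A hedged worked example: if cr_index_scalar_type ends up as an
	 8-bit unsigned type, the induced index values are 1, 2, ... per
	 iteration with 0 reserved for "no match", so the check below
	 requires the iteration count to stay below 255, the type's maximum
	 value.  */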
6886 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6887 if (wi::geu_p (ni, wi::to_widest (max_index)))
6888 {
6889 if (dump_enabled_p ())
6890 dump_printf_loc (MSG_NOTE, vect_location,
6891 "loop size is greater than data size.\n");
6892 return false;
6893 }
6894 }
6895
6896 /* In case the vectorization factor (VF) is bigger than the number
6897 of elements that we can fit in a vectype (nunits), we have to generate
6898 more than one vector stmt, i.e., we need to "unroll" the
6899 vector stmt by a factor VF/nunits. For more details see documentation
6900 in vectorizable_operation. */
6901
6902 /* If the reduction is used in an outer loop we need to generate
6903 VF intermediate results, like so (e.g. for ncopies=2):
6904 r0 = phi (init, r0)
6905 r1 = phi (init, r1)
6906 r0 = x0 + r0;
6907 r1 = x1 + r1;
6908 (i.e. we generate VF results in 2 registers).
6909 In this case we have a separate def-use cycle for each copy, and therefore
6910 for each copy we get the vector def for the reduction variable from the
6911 respective phi node created for this copy.
6912
6913 Otherwise (the reduction is unused in the loop nest), we can combine
6914 together intermediate results, like so (e.g. for ncopies=2):
6915 r = phi (init, r)
6916 r = x0 + r;
6917 r = x1 + r;
6918 (i.e. we generate VF/2 results in a single register).
6919 In this case for each copy we get the vector def for the reduction variable
6920 from the vectorized reduction operation generated in the previous iteration.
6921
6922 This only works when we see both the reduction PHI and its only consumer
6923 in vectorizable_reduction and there are no intermediate stmts
6924 participating. */
6925 stmt_vec_info use_stmt_info;
6926 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6927 if (ncopies > 1
6928 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6929 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6930 && (use_stmt_info == stmt_info
6931 || STMT_VINFO_RELATED_STMT (use_stmt_info) == stmt_info))
6932 {
6933 single_defuse_cycle = true;
6934 epilog_copies = 1;
6935 }
6936 else
6937 epilog_copies = ncopies;
6938
6939 /* If the reduction stmt is one of the patterns that have a lane-reducing
6940 operation embedded, we cannot handle the case of ! single_defuse_cycle. */
6941 if ((ncopies > 1
6942 && ! single_defuse_cycle)
6943 && (code == DOT_PROD_EXPR
6944 || code == WIDEN_SUM_EXPR
6945 || code == SAD_EXPR))
6946 {
6947 if (dump_enabled_p ())
6948 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6949 "multi def-use cycle not possible for lane-reducing "
6950 "reduction operation\n");
6951 return false;
6952 }
6953
6954 if (slp_node)
6955 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6956 else
6957 vec_num = 1;
6958
6959 internal_fn cond_fn = get_conditional_internal_fn (code);
6960 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6961
6962 if (!vec_stmt) /* transformation not required. */
6963 {
6964 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6965 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6966 {
6967 if (reduction_type != FOLD_LEFT_REDUCTION
6968 && (cond_fn == IFN_LAST
6969 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6970 OPTIMIZE_FOR_SPEED)))
6971 {
6972 if (dump_enabled_p ())
6973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6974 "can't use a fully-masked loop because no"
6975 " conditional operation is available.\n");
6976 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6977 }
6978 else if (reduc_index == -1)
6979 {
6980 if (dump_enabled_p ())
6981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6982 "can't use a fully-masked loop for chained"
6983 " reductions.\n");
6984 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6985 }
6986 else
6987 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6988 vectype_in);
6989 }
6990 if (dump_enabled_p ()
6991 && reduction_type == FOLD_LEFT_REDUCTION)
6992 dump_printf_loc (MSG_NOTE, vect_location,
6993 "using an in-order (fold-left) reduction.\n");
6994 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6995 return true;
6996 }
6997
6998 /* Transform. */
6999
7000 if (dump_enabled_p ())
7001 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7002
7003 /* FORNOW: Multiple types are not supported for condition. */
7004 if (code == COND_EXPR)
7005 gcc_assert (ncopies == 1);
7006
7007 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7008
7009 if (reduction_type == FOLD_LEFT_REDUCTION)
7010 return vectorize_fold_left_reduction
7011 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7012 reduc_fn, ops, vectype_in, reduc_index, masks);
7013
7014 if (reduction_type == EXTRACT_LAST_REDUCTION)
7015 {
7016 gcc_assert (!slp_node);
7017 return vectorizable_condition (stmt_info, gsi, vec_stmt,
7018 NULL, reduc_index, NULL, NULL);
7019 }
7020
7021 /* Create the destination vector */
7022 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7023
7024 prev_stmt_info = NULL;
7025 prev_phi_info = NULL;
7026 if (!slp_node)
7027 {
7028 vec_oprnds0.create (1);
7029 vec_oprnds1.create (1);
7030 if (op_type == ternary_op)
7031 vec_oprnds2.create (1);
7032 }
7033
7034 phis.create (vec_num);
7035 vect_defs.create (vec_num);
7036 if (!slp_node)
7037 vect_defs.quick_push (NULL_TREE);
7038
7039 if (slp_node)
7040 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7041 else
7042 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7043
7044 for (j = 0; j < ncopies; j++)
7045 {
7046 if (code == COND_EXPR)
7047 {
7048 gcc_assert (!slp_node);
7049 vectorizable_condition (stmt_info, gsi, vec_stmt,
7050 PHI_RESULT (phis[0]->stmt),
7051 reduc_index, NULL, NULL);
7052 /* Multiple types are not supported for condition. */
7053 break;
7054 }
7055
7056 /* Handle uses. */
7057 if (j == 0)
7058 {
7059 if (slp_node)
7060 {
7061 /* Get vec defs for all the operands except the reduction index,
7062 ensuring the ordering of the ops in the vector is kept. */
7063 auto_vec<tree, 3> slp_ops;
7064 auto_vec<vec<tree>, 3> vec_defs;
7065
7066 slp_ops.quick_push (ops[0]);
7067 slp_ops.quick_push (ops[1]);
7068 if (op_type == ternary_op)
7069 slp_ops.quick_push (ops[2]);
7070
7071 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7072
7073 vec_oprnds0.safe_splice (vec_defs[0]);
7074 vec_defs[0].release ();
7075 vec_oprnds1.safe_splice (vec_defs[1]);
7076 vec_defs[1].release ();
7077 if (op_type == ternary_op)
7078 {
7079 vec_oprnds2.safe_splice (vec_defs[2]);
7080 vec_defs[2].release ();
7081 }
7082 }
7083 else
7084 {
7085 vec_oprnds0.quick_push
7086 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7087 vec_oprnds1.quick_push
7088 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7089 if (op_type == ternary_op)
7090 vec_oprnds2.quick_push
7091 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7092 }
7093 }
7094 else
7095 {
7096 if (!slp_node)
7097 {
7098 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7099
7100 if (single_defuse_cycle && reduc_index == 0)
7101 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7102 else
7103 vec_oprnds0[0]
7104 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7105 vec_oprnds0[0]);
7106 if (single_defuse_cycle && reduc_index == 1)
7107 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7108 else
7109 vec_oprnds1[0]
7110 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7111 vec_oprnds1[0]);
7112 if (op_type == ternary_op)
7113 {
7114 if (single_defuse_cycle && reduc_index == 2)
7115 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7116 else
7117 vec_oprnds2[0]
7118 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7119 vec_oprnds2[0]);
7120 }
7121 }
7122 }
7123
7124 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7125 {
7126 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7127 if (masked_loop_p)
7128 {
7129 /* Make sure that the reduction accumulator is vop[0]. */
7130 if (reduc_index == 1)
7131 {
7132 gcc_assert (commutative_tree_code (code));
7133 std::swap (vop[0], vop[1]);
7134 }
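	      /* Added descriptive note: the conditional internal function
		 built below takes the loop mask, the two operands, and an
		 "else" value; passing the accumulator vop[0] as the else
		 value makes inactive lanes simply keep their previous
		 partial result.  */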
7135 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7136 vectype_in, i * ncopies + j);
7137 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7138 vop[0], vop[1],
7139 vop[0]);
7140 new_temp = make_ssa_name (vec_dest, call);
7141 gimple_call_set_lhs (call, new_temp);
7142 gimple_call_set_nothrow (call, true);
7143 new_stmt_info
7144 = vect_finish_stmt_generation (stmt_info, call, gsi);
7145 }
7146 else
7147 {
7148 if (op_type == ternary_op)
7149 vop[2] = vec_oprnds2[i];
7150
7151 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7152 vop[0], vop[1], vop[2]);
7153 new_temp = make_ssa_name (vec_dest, new_stmt);
7154 gimple_assign_set_lhs (new_stmt, new_temp);
7155 new_stmt_info
7156 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7157 }
7158
7159 if (slp_node)
7160 {
7161 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7162 vect_defs.quick_push (new_temp);
7163 }
7164 else
7165 vect_defs[0] = new_temp;
7166 }
7167
7168 if (slp_node)
7169 continue;
7170
7171 if (j == 0)
7172 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7173 else
7174 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7175
7176 prev_stmt_info = new_stmt_info;
7177 }
7178
7179 /* Finalize the reduction-phi (set its arguments) and create the
7180 epilog reduction code. */
7181 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7182 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7183
7184 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7185 epilog_copies, reduc_fn, phis,
7186 double_reduc, slp_node, slp_node_instance,
7187 cond_reduc_val, cond_reduc_op_code,
7188 neutral_op);
7189
7190 return true;
7191 }
7192
7193 /* Function vect_min_worthwhile_factor.
7194
7195 For a loop where we could vectorize the operation indicated by CODE,
7196 return the minimum vectorization factor that makes it worthwhile
7197 to use generic vectors. */
7198 static unsigned int
7199 vect_min_worthwhile_factor (enum tree_code code)
7200 {
7201 switch (code)
7202 {
7203 case PLUS_EXPR:
7204 case MINUS_EXPR:
7205 case NEGATE_EXPR:
7206 return 4;
7207
7208 case BIT_AND_EXPR:
7209 case BIT_IOR_EXPR:
7210 case BIT_XOR_EXPR:
7211 case BIT_NOT_EXPR:
7212 return 2;
7213
7214 default:
7215 return INT_MAX;
7216 }
7217 }
7218
7219 /* Return true if VINFO indicates we are doing loop vectorization and if
7220 it is worth decomposing CODE operations into scalar operations for
7221 that loop's vectorization factor. */
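   /* A hedged worked example: with the table above, a PLUS_EXPR is only
      considered worthwhile without real SIMD support when the loop's
      (constant) vectorization factor is at least 4, e.g.

	vect_worthwhile_without_simd_p (loop_vinfo, PLUS_EXPR)
	  -> true  for VF == 8
	  -> false for VF == 2

      while the cheaper bitwise codes only need VF >= 2.  */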
7222
7223 bool
7224 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7225 {
7226 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7227 unsigned HOST_WIDE_INT value;
7228 return (loop_vinfo
7229 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7230 && value >= vect_min_worthwhile_factor (code));
7231 }
7232
7233 /* Function vectorizable_induction
7234
7235 Check if STMT_INFO performs an induction computation that can be vectorized.
7236 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7237 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7238 Return true if STMT_INFO is vectorizable in this way. */
7239
7240 bool
7241 vectorizable_induction (stmt_vec_info stmt_info,
7242 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7243 stmt_vec_info *vec_stmt, slp_tree slp_node,
7244 stmt_vector_for_cost *cost_vec)
7245 {
7246 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7247 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7248 unsigned ncopies;
7249 bool nested_in_vect_loop = false;
7250 struct loop *iv_loop;
7251 tree vec_def;
7252 edge pe = loop_preheader_edge (loop);
7253 basic_block new_bb;
7254 tree new_vec, vec_init, vec_step, t;
7255 tree new_name;
7256 gimple *new_stmt;
7257 gphi *induction_phi;
7258 tree induc_def, vec_dest;
7259 tree init_expr, step_expr;
7260 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7261 unsigned i;
7262 tree expr;
7263 gimple_seq stmts;
7264 imm_use_iterator imm_iter;
7265 use_operand_p use_p;
7266 gimple *exit_phi;
7267 edge latch_e;
7268 tree loop_arg;
7269 gimple_stmt_iterator si;
7270
7271 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7272 if (!phi)
7273 return false;
7274
7275 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7276 return false;
7277
7278 /* Make sure it was recognized as induction computation. */
7279 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7280 return false;
7281
7282 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7283 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7284
7285 if (slp_node)
7286 ncopies = 1;
7287 else
7288 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7289 gcc_assert (ncopies >= 1);
7290
7291 /* FORNOW. These restrictions should be relaxed. */
7292 if (nested_in_vect_loop_p (loop, stmt_info))
7293 {
7294 imm_use_iterator imm_iter;
7295 use_operand_p use_p;
7296 gimple *exit_phi;
7297 edge latch_e;
7298 tree loop_arg;
7299
7300 if (ncopies > 1)
7301 {
7302 if (dump_enabled_p ())
7303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7304 "multiple types in nested loop.\n");
7305 return false;
7306 }
7307
7308 /* FORNOW: outer loop induction with SLP not supported. */
7309 if (STMT_SLP_TYPE (stmt_info))
7310 return false;
7311
7312 exit_phi = NULL;
7313 latch_e = loop_latch_edge (loop->inner);
7314 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7315 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7316 {
7317 gimple *use_stmt = USE_STMT (use_p);
7318 if (is_gimple_debug (use_stmt))
7319 continue;
7320
7321 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7322 {
7323 exit_phi = use_stmt;
7324 break;
7325 }
7326 }
7327 if (exit_phi)
7328 {
7329 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7330 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7331 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7332 {
7333 if (dump_enabled_p ())
7334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7335 "inner-loop induction only used outside "
7336 "of the outer vectorized loop.\n");
7337 return false;
7338 }
7339 }
7340
7341 nested_in_vect_loop = true;
7342 iv_loop = loop->inner;
7343 }
7344 else
7345 iv_loop = loop;
7346 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7347
7348 if (slp_node && !nunits.is_constant ())
7349 {
7350 /* The current SLP code creates the initial value element-by-element. */
7351 if (dump_enabled_p ())
7352 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7353 "SLP induction not supported for variable-length"
7354 " vectors.\n");
7355 return false;
7356 }
7357
7358 if (!vec_stmt) /* transformation not required. */
7359 {
7360 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7361 DUMP_VECT_SCOPE ("vectorizable_induction");
7362 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7363 return true;
7364 }
7365
7366 /* Transform. */
7367
7368 /* Compute a vector variable, initialized with the first VF values of
7369 the induction variable. E.g., for an iv with IV_PHI='X' and
7370 evolution S, for a vector of 4 units, we want to compute:
7371 [X, X + S, X + 2*S, X + 3*S]. */
7372
7373 if (dump_enabled_p ())
7374 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7375
7376 latch_e = loop_latch_edge (iv_loop);
7377 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7378
7379 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7380 gcc_assert (step_expr != NULL_TREE);
7381
7382 pe = loop_preheader_edge (iv_loop);
7383 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7384 loop_preheader_edge (iv_loop));
7385
7386 stmts = NULL;
7387 if (!nested_in_vect_loop)
7388 {
7389 /* Convert the initial value to the desired type. */
7390 tree new_type = TREE_TYPE (vectype);
7391 init_expr = gimple_convert (&stmts, new_type, init_expr);
7392
7393 /* If we are using the loop mask to "peel" for alignment then we need
7394 to adjust the start value here. */
7395 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7396 if (skip_niters != NULL_TREE)
7397 {
7398 if (FLOAT_TYPE_P (vectype))
7399 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7400 skip_niters);
7401 else
7402 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7403 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7404 skip_niters, step_expr);
7405 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7406 init_expr, skip_step);
7407 }
7408 }
7409
7410 /* Convert the step to the desired type. */
7411 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7412
7413 if (stmts)
7414 {
7415 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7416 gcc_assert (!new_bb);
7417 }
7418
7419 /* Find the first insertion point in the BB. */
7420 basic_block bb = gimple_bb (phi);
7421 si = gsi_after_labels (bb);
7422
7423 /* For SLP induction we have to generate several IVs.  For example,
7424 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7425 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7426 [VF*S, VF*S, VF*S, VF*S] for all. */
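      /* A hedged worked example: for the group-size-3 case above with
	 const_nunits == 4 we build nivs = lcm (3, 4) / 4 = 3 distinct IV
	 vectors; any further copies are derived below by adding a step of
	 VF' * S with VF' = lcm (3, 4) / 3 = 4, i.e. by re-using one of the
	 three IVs advanced by four scalar steps.  */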
7427 if (slp_node)
7428 {
7429 /* Enforced above. */
7430 unsigned int const_nunits = nunits.to_constant ();
7431
7432 /* Generate [VF*S, VF*S, ... ]. */
7433 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7434 {
7435 expr = build_int_cst (integer_type_node, vf);
7436 expr = fold_convert (TREE_TYPE (step_expr), expr);
7437 }
7438 else
7439 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7440 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7441 expr, step_expr);
7442 if (! CONSTANT_CLASS_P (new_name))
7443 new_name = vect_init_vector (stmt_info, new_name,
7444 TREE_TYPE (step_expr), NULL);
7445 new_vec = build_vector_from_val (vectype, new_name);
7446 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7447
7448 /* Now generate the IVs. */
7449 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7450 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7451 unsigned elts = const_nunits * nvects;
7452 unsigned nivs = least_common_multiple (group_size,
7453 const_nunits) / const_nunits;
7454 gcc_assert (elts % group_size == 0);
7455 tree elt = init_expr;
7456 unsigned ivn;
7457 for (ivn = 0; ivn < nivs; ++ivn)
7458 {
7459 tree_vector_builder elts (vectype, const_nunits, 1);
7460 stmts = NULL;
7461 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7462 {
7463 if (ivn*const_nunits + eltn >= group_size
7464 && (ivn * const_nunits + eltn) % group_size == 0)
7465 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7466 elt, step_expr);
7467 elts.quick_push (elt);
7468 }
7469 vec_init = gimple_build_vector (&stmts, &elts);
7470 if (stmts)
7471 {
7472 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7473 gcc_assert (!new_bb);
7474 }
7475
7476 /* Create the induction-phi that defines the induction-operand. */
7477 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7478 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7479 stmt_vec_info induction_phi_info
7480 = loop_vinfo->add_stmt (induction_phi);
7481 induc_def = PHI_RESULT (induction_phi);
7482
7483 /* Create the iv update inside the loop */
7484 vec_def = make_ssa_name (vec_dest);
7485 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7486 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7487 loop_vinfo->add_stmt (new_stmt);
7488
7489 /* Set the arguments of the phi node: */
7490 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7491 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7492 UNKNOWN_LOCATION);
7493
7494 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7495 }
7496
7497 /* Re-use IVs when we can. */
7498 if (ivn < nvects)
7499 {
7500 unsigned vfp
7501 = least_common_multiple (group_size, const_nunits) / group_size;
7502 /* Generate [VF'*S, VF'*S, ... ]. */
7503 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7504 {
7505 expr = build_int_cst (integer_type_node, vfp);
7506 expr = fold_convert (TREE_TYPE (step_expr), expr);
7507 }
7508 else
7509 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7510 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7511 expr, step_expr);
7512 if (! CONSTANT_CLASS_P (new_name))
7513 new_name = vect_init_vector (stmt_info, new_name,
7514 TREE_TYPE (step_expr), NULL);
7515 new_vec = build_vector_from_val (vectype, new_name);
7516 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7517 for (; ivn < nvects; ++ivn)
7518 {
7519 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7520 tree def;
7521 if (gimple_code (iv) == GIMPLE_PHI)
7522 def = gimple_phi_result (iv);
7523 else
7524 def = gimple_assign_lhs (iv);
7525 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7526 PLUS_EXPR,
7527 def, vec_step);
7528 if (gimple_code (iv) == GIMPLE_PHI)
7529 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7530 else
7531 {
7532 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7533 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7534 }
7535 SLP_TREE_VEC_STMTS (slp_node).quick_push
7536 (loop_vinfo->add_stmt (new_stmt));
7537 }
7538 }
7539
7540 return true;
7541 }
7542
7543 /* Create the vector that holds the initial_value of the induction. */
7544 if (nested_in_vect_loop)
7545 {
7546 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7547 been created during vectorization of previous stmts. We obtain it
7548 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7549 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7550 /* If the initial value is not of proper type, convert it. */
7551 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7552 {
7553 new_stmt
7554 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7555 vect_simple_var,
7556 "vec_iv_"),
7557 VIEW_CONVERT_EXPR,
7558 build1 (VIEW_CONVERT_EXPR, vectype,
7559 vec_init));
7560 vec_init = gimple_assign_lhs (new_stmt);
7561 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7562 new_stmt);
7563 gcc_assert (!new_bb);
7564 loop_vinfo->add_stmt (new_stmt);
7565 }
7566 }
7567 else
7568 {
7569 /* iv_loop is the loop to be vectorized. Create:
7570 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7571 stmts = NULL;
7572 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7573
7574 unsigned HOST_WIDE_INT const_nunits;
7575 if (nunits.is_constant (&const_nunits))
7576 {
7577 tree_vector_builder elts (vectype, const_nunits, 1);
7578 elts.quick_push (new_name);
7579 for (i = 1; i < const_nunits; i++)
7580 {
7581 /* Create: new_name_i = new_name + step_expr */
7582 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7583 new_name, step_expr);
7584 elts.quick_push (new_name);
7585 }
7586 /* Create a vector from [new_name_0, new_name_1, ...,
7587 new_name_nunits-1] */
7588 vec_init = gimple_build_vector (&stmts, &elts);
7589 }
7590 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7591 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7592 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7593 new_name, step_expr);
7594 else
7595 {
7596 /* Build:
7597 [base, base, base, ...]
7598 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7599 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7600 gcc_assert (flag_associative_math);
7601 tree index = build_index_vector (vectype, 0, 1);
7602 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7603 new_name);
7604 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7605 step_expr);
7606 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7607 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7608 vec_init, step_vec);
7609 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7610 vec_init, base_vec);
7611 }
7612
7613 if (stmts)
7614 {
7615 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7616 gcc_assert (!new_bb);
7617 }
7618 }
7619
7620
7621 /* Create the vector that holds the step of the induction. */
7622 if (nested_in_vect_loop)
7623 /* iv_loop is nested in the loop to be vectorized. Generate:
7624 vec_step = [S, S, S, S] */
7625 new_name = step_expr;
7626 else
7627 {
7628 /* iv_loop is the loop to be vectorized. Generate:
7629 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7630 gimple_seq seq = NULL;
7631 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7632 {
7633 expr = build_int_cst (integer_type_node, vf);
7634 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7635 }
7636 else
7637 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7638 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7639 expr, step_expr);
7640 if (seq)
7641 {
7642 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7643 gcc_assert (!new_bb);
7644 }
7645 }
7646
7647 t = unshare_expr (new_name);
7648 gcc_assert (CONSTANT_CLASS_P (new_name)
7649 || TREE_CODE (new_name) == SSA_NAME);
7650 new_vec = build_vector_from_val (vectype, t);
7651 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7652
7653
7654 /* Create the following def-use cycle:
7655 loop prolog:
7656 vec_init = ...
7657 vec_step = ...
7658 loop:
7659 vec_iv = PHI <vec_init, vec_loop>
7660 ...
7661 STMT
7662 ...
7663 vec_loop = vec_iv + vec_step; */
7664
7665 /* Create the induction-phi that defines the induction-operand. */
7666 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7667 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7668 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7669 induc_def = PHI_RESULT (induction_phi);
7670
7671 /* Create the iv update inside the loop */
7672 vec_def = make_ssa_name (vec_dest);
7673 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7674 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7675 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7676
7677 /* Set the arguments of the phi node: */
7678 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7679 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7680 UNKNOWN_LOCATION);
7681
7682 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7683
7684 /* In case the vectorization factor (VF) is bigger than the number
7685 of elements that we can fit in a vectype (nunits), we have to generate
7686 more than one vector stmt, i.e., we need to "unroll" the
7687 vector stmt by a factor VF/nunits. For more details see documentation
7688 in vectorizable_operation. */
7689
7690 if (ncopies > 1)
7691 {
7692 gimple_seq seq = NULL;
7693 stmt_vec_info prev_stmt_vinfo;
7694 /* FORNOW. This restriction should be relaxed. */
7695 gcc_assert (!nested_in_vect_loop);
7696
7697 /* Create the vector that holds the step of the induction. */
7698 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7699 {
7700 expr = build_int_cst (integer_type_node, nunits);
7701 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7702 }
7703 else
7704 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7705 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7706 expr, step_expr);
7707 if (seq)
7708 {
7709 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7710 gcc_assert (!new_bb);
7711 }
7712
7713 t = unshare_expr (new_name);
7714 gcc_assert (CONSTANT_CLASS_P (new_name)
7715 || TREE_CODE (new_name) == SSA_NAME);
7716 new_vec = build_vector_from_val (vectype, t);
7717 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7718
7719 vec_def = induc_def;
7720 prev_stmt_vinfo = induction_phi_info;
7721 for (i = 1; i < ncopies; i++)
7722 {
7723 /* vec_i = vec_prev + vec_step */
7724 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7725 vec_def, vec_step);
7726 vec_def = make_ssa_name (vec_dest, new_stmt);
7727 gimple_assign_set_lhs (new_stmt, vec_def);
7728
7729 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7730 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7731 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7732 prev_stmt_vinfo = new_stmt_info;
7733 }
7734 }
7735
7736 if (nested_in_vect_loop)
7737 {
7738 /* Find the loop-closed exit-phi of the induction, and record
7739 the final vector of induction results: */
7740 exit_phi = NULL;
7741 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7742 {
7743 gimple *use_stmt = USE_STMT (use_p);
7744 if (is_gimple_debug (use_stmt))
7745 continue;
7746
7747 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7748 {
7749 exit_phi = use_stmt;
7750 break;
7751 }
7752 }
7753 if (exit_phi)
7754 {
7755 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7756 /* FORNOW. Currently not supporting the case that an inner-loop induction
7757 is not used in the outer-loop (i.e. only outside the outer-loop). */
7758 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7759 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7760
7761 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7762 if (dump_enabled_p ())
7763 {
7764 dump_printf_loc (MSG_NOTE, vect_location,
7765 "vector of inductions after inner-loop:");
7766 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7767 }
7768 }
7769 }
7770
7771
7772 if (dump_enabled_p ())
7773 {
7774 dump_printf_loc (MSG_NOTE, vect_location,
7775 "transform induction: created def-use cycle: ");
7776 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7777 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7778 SSA_NAME_DEF_STMT (vec_def), 0);
7779 }
7780
7781 return true;
7782 }
7783
7784 /* Function vectorizable_live_operation.
7785
7786 STMT_INFO computes a value that is used outside the loop. Check if
7787 it can be supported. */
7788
7789 bool
7790 vectorizable_live_operation (stmt_vec_info stmt_info,
7791 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7792 slp_tree slp_node, int slp_index,
7793 stmt_vec_info *vec_stmt,
7794 stmt_vector_for_cost *)
7795 {
7796 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7797 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7798 imm_use_iterator imm_iter;
7799 tree lhs, lhs_type, bitsize, vec_bitsize;
7800 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7801 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7802 int ncopies;
7803 gimple *use_stmt;
7804 auto_vec<tree> vec_oprnds;
7805 int vec_entry = 0;
7806 poly_uint64 vec_index = 0;
7807
7808 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7809
7810 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7811 return false;
7812
7813 /* FORNOW. CHECKME. */
7814 if (nested_in_vect_loop_p (loop, stmt_info))
7815 return false;
7816
7817 /* If STMT is not relevant and it is a simple assignment and its inputs are
7818 invariant then it can remain in place, unvectorized. The original last
7819 scalar value that it computes will be used. */
7820 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7821 {
7822 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7823 if (dump_enabled_p ())
7824 dump_printf_loc (MSG_NOTE, vect_location,
7825 "statement is simple and uses invariant. Leaving in "
7826 "place.\n");
7827 return true;
7828 }
7829
7830 if (slp_node)
7831 ncopies = 1;
7832 else
7833 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7834
7835 if (slp_node)
7836 {
7837 gcc_assert (slp_index >= 0);
7838
7839 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7840 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7841
7842 /* Get the last occurrence of the scalar index from the concatenation of
7843 all the slp vectors. Calculate which slp vector it is and the index
7844 within. */
7845 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
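      /* A hedged worked example: with num_scalar == 4, num_vec == 2 and
	 nunits == 4 the concatenation of the SLP vectors holds 8 lanes, and
	 the last occurrence of scalar SLP_INDEX sits at
	 pos = 2 * 4 - 4 + slp_index, so slp_index == 1 gives pos == 5,
	 i.e. vec_entry == 1 and vec_index == 1 below.  */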
7846
7847 /* Calculate which vector contains the result, and which lane of
7848 that vector we need. */
7849 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7850 {
7851 if (dump_enabled_p ())
7852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7853 "Cannot determine which vector holds the"
7854 " final result.\n");
7855 return false;
7856 }
7857 }
7858
7859 if (!vec_stmt)
7860 {
7861 /* No transformation required. */
7862 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7863 {
7864 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7865 OPTIMIZE_FOR_SPEED))
7866 {
7867 if (dump_enabled_p ())
7868 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7869 "can't use a fully-masked loop because "
7870 "the target doesn't support extract last "
7871 "reduction.\n");
7872 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7873 }
7874 else if (slp_node)
7875 {
7876 if (dump_enabled_p ())
7877 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7878 "can't use a fully-masked loop because an "
7879 "SLP statement is live after the loop.\n");
7880 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7881 }
7882 else if (ncopies > 1)
7883 {
7884 if (dump_enabled_p ())
7885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7886 "can't use a fully-masked loop because"
7887 " ncopies is greater than 1.\n");
7888 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7889 }
7890 else
7891 {
7892 gcc_assert (ncopies == 1 && !slp_node);
7893 vect_record_loop_mask (loop_vinfo,
7894 &LOOP_VINFO_MASKS (loop_vinfo),
7895 1, vectype);
7896 }
7897 }
7898 return true;
7899 }
7900
7901 /* If stmt has a related stmt, then use that for getting the lhs. */
7902 gimple *stmt = (is_pattern_stmt_p (stmt_info)
7903 ? STMT_VINFO_RELATED_STMT (stmt_info)->stmt
7904 : stmt_info->stmt);
7905
7906 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7907 : gimple_get_lhs (stmt);
7908 lhs_type = TREE_TYPE (lhs);
7909
7910 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7911 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7912 : TYPE_SIZE (TREE_TYPE (vectype)));
7913 vec_bitsize = TYPE_SIZE (vectype);
7914
7915 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7916 tree vec_lhs, bitstart;
7917 if (slp_node)
7918 {
7919 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7920
7921 /* Get the correct slp vectorized stmt. */
7922 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7923 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7924 vec_lhs = gimple_phi_result (phi);
7925 else
7926 vec_lhs = gimple_get_lhs (vec_stmt);
7927
7928 /* Get entry to use. */
7929 bitstart = bitsize_int (vec_index);
7930 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7931 }
7932 else
7933 {
7934 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7935 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7936 gcc_checking_assert (ncopies == 1
7937 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7938
7939 /* For multiple copies, get the last copy. */
7940 for (int i = 1; i < ncopies; ++i)
7941 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7942
7943 /* Get the last lane in the vector. */
7944 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7945 }
7946
7947 gimple_seq stmts = NULL;
7948 tree new_tree;
7949 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7950 {
7951 /* Emit:
7952
7953 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7954
7955 where VEC_LHS is the vectorized live-out result and MASK is
7956 the loop mask for the final iteration. */
7957 gcc_assert (ncopies == 1 && !slp_node);
7958 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7959 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7960 1, vectype, 0);
7961 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7962 scalar_type, mask, vec_lhs);
7963
7964 /* Convert the extracted vector element to the required scalar type. */
7965 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7966 }
7967 else
7968 {
7969 tree bftype = TREE_TYPE (vectype);
7970 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7971 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7972 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7973 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7974 &stmts, true, NULL_TREE);
7975 }
7976
7977 if (stmts)
7978 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7979
7980 /* Replace use of lhs with newly computed result. If the use stmt is a
7981 single arg PHI, just replace all uses of PHI result. It's necessary
7982 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
7983 use_operand_p use_p;
7984 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7985 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7986 && !is_gimple_debug (use_stmt))
7987 {
7988 if (gimple_code (use_stmt) == GIMPLE_PHI
7989 && gimple_phi_num_args (use_stmt) == 1)
7990 {
7991 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7992 }
7993 else
7994 {
7995 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7996 SET_USE (use_p, new_tree);
7997 }
7998 update_stmt (use_stmt);
7999 }
8000
8001 return true;
8002 }
8003
8004 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8005
8006 static void
8007 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
8008 {
8009 ssa_op_iter op_iter;
8010 imm_use_iterator imm_iter;
8011 def_operand_p def_p;
8012 gimple *ustmt;
8013
8014 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8015 {
8016 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8017 {
8018 basic_block bb;
8019
8020 if (!is_gimple_debug (ustmt))
8021 continue;
8022
8023 bb = gimple_bb (ustmt);
8024
8025 if (!flow_bb_inside_loop_p (loop, bb))
8026 {
8027 if (gimple_debug_bind_p (ustmt))
8028 {
8029 if (dump_enabled_p ())
8030 dump_printf_loc (MSG_NOTE, vect_location,
8031 "killing debug use\n");
8032
8033 gimple_debug_bind_reset_value (ustmt);
8034 update_stmt (ustmt);
8035 }
8036 else
8037 gcc_unreachable ();
8038 }
8039 }
8040 }
8041 }
8042
8043 /* Given loop represented by LOOP_VINFO, return true if computation of
8044 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8045 otherwise. */
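/* A hedged worked example: if the loop's niters expression has a 16-bit
   unsigned type and the loop runs 0x10000 times, then NITERSM1 is 0xffff
   but NITERSM1 + 1 wraps around to 0; the checks below only return true
   when that kind of wrap-around provably cannot happen.  */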
8046
8047 static bool
8048 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8049 {
8050 /* Constant case. */
8051 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8052 {
8053 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8054 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8055
8056 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8057 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8058 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8059 return true;
8060 }
8061
8062 widest_int max;
8063 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8064 /* Check the upper bound of loop niters. */
8065 if (get_max_loop_iterations (loop, &max))
8066 {
8067 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8068 signop sgn = TYPE_SIGN (type);
8069 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8070 if (max < type_max)
8071 return true;
8072 }
8073 return false;
8074 }
8075
8076 /* Return a mask type with half the number of elements as TYPE. */
8077
8078 tree
8079 vect_halve_mask_nunits (tree type)
8080 {
8081 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8082 return build_truth_vector_type (nunits, current_vector_size);
8083 }
8084
8085 /* Return a mask type with twice as many elements as TYPE. */
8086
8087 tree
8088 vect_double_mask_nunits (tree type)
8089 {
8090 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8091 return build_truth_vector_type (nunits, current_vector_size);
8092 }
8093
8094 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8095 contain a sequence of NVECTORS masks that each control a vector of type
8096 VECTYPE. */
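/* A hedged worked example: for a loop with vectorization factor 16 where a
   statement operates on 8-element vectors and needs two copies, the caller
   passes NVECTORS = 2 and the computation below gives
   nscalars_per_iter = 2 * 8 / 16 = 1; an rgroup whose statements access two
   scalars per iteration (say NVECTORS = 4 with the same vector type)
   records max_nscalars_per_iter = 2 instead.  */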
8097
8098 void
8099 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8100 unsigned int nvectors, tree vectype)
8101 {
8102 gcc_assert (nvectors != 0);
8103 if (masks->length () < nvectors)
8104 masks->safe_grow_cleared (nvectors);
8105 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8106 /* The number of scalars per iteration and the number of vectors are
8107 both compile-time constants. */
8108 unsigned int nscalars_per_iter
8109 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8110 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8111 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8112 {
8113 rgm->max_nscalars_per_iter = nscalars_per_iter;
8114 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8115 }
8116 }
8117
8118 /* Given a complete set of masks MASKS, extract mask number INDEX
8119 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8120 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8121
8122 See the comment above vec_loop_masks for more details about the mask
8123 arrangement. */
8124
8125 tree
8126 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8127 unsigned int nvectors, tree vectype, unsigned int index)
8128 {
8129 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8130 tree mask_type = rgm->mask_type;
8131
8132 /* Populate the rgroup's mask array, if this is the first time we've
8133 used it. */
8134 if (rgm->masks.is_empty ())
8135 {
8136 rgm->masks.safe_grow_cleared (nvectors);
8137 for (unsigned int i = 0; i < nvectors; ++i)
8138 {
8139 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8140 /* Provide a dummy definition until the real one is available. */
8141 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8142 rgm->masks[i] = mask;
8143 }
8144 }
8145
8146 tree mask = rgm->masks[index];
8147 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8148 TYPE_VECTOR_SUBPARTS (vectype)))
8149 {
8150 /* A loop mask for data type X can be reused for data type Y
8151 if X has N times more elements than Y and if Y's elements
8152 are N times bigger than X's. In this case each sequence
8153 of N elements in the loop mask will be all-zero or all-one.
8154 We can then view-convert the mask so that each sequence of
8155 N elements is replaced by a single element. */
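      /* A hedged illustration: e.g. a mask created for a 16 x QI vector can
	 control an 8 x HI vector in the same loop; every pair of QI mask
	 elements is known to be all-zero or all-one, so the VIEW_CONVERT
	 below just reinterprets each such pair as one HI mask element.  */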
8156 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8157 TYPE_VECTOR_SUBPARTS (vectype)));
8158 gimple_seq seq = NULL;
8159 mask_type = build_same_sized_truth_vector_type (vectype);
8160 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8161 if (seq)
8162 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8163 }
8164 return mask;
8165 }
8166
8167 /* Scale profiling counters by estimation for LOOP which is vectorized
8168 by factor VF. */
8169
8170 static void
8171 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8172 {
8173 edge preheader = loop_preheader_edge (loop);
8174 /* Reduce loop iterations by the vectorization factor. */
8175 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8176 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8177
8178 if (freq_h.nonzero_p ())
8179 {
8180 profile_probability p;
8181
8182 /* Avoid dropping loop body profile counter to 0 because of zero count
8183 in loop's preheader. */
8184 if (!(freq_e == profile_count::zero ()))
8185 freq_e = freq_e.force_nonzero ();
8186 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8187 scale_loop_frequencies (loop, p);
8188 }
8189
8190 edge exit_e = single_exit (loop);
8191 exit_e->probability = profile_probability::always ()
8192 .apply_scale (1, new_est_niter + 1);
8193
8194 edge exit_l = single_pred_edge (loop->latch);
8195 profile_probability prob = exit_l->probability;
8196 exit_l->probability = exit_e->probability.invert ();
8197 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8198 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8199 }
8200
8201 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8202 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8203 stmt_vec_info. */
8204
8205 static void
8206 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8207 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8208 {
8209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8210 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8211
8212 if (dump_enabled_p ())
8213 {
8214 dump_printf_loc (MSG_NOTE, vect_location,
8215 "------>vectorizing statement: ");
8216 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
8217 }
8218
8219 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8220 vect_loop_kill_debug_uses (loop, stmt_info);
8221
8222 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8223 && !STMT_VINFO_LIVE_P (stmt_info))
8224 return;
8225
8226 if (STMT_VINFO_VECTYPE (stmt_info))
8227 {
8228 poly_uint64 nunits
8229 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8230 if (!STMT_SLP_TYPE (stmt_info)
8231 && maybe_ne (nunits, vf)
8232 && dump_enabled_p ())
8233 /* For SLP, VF is set according to the unrolling factor, and not
8234 to the vector size, hence this print is not valid for SLP. */
8235 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8236 }
8237
8238 /* Pure SLP statements have already been vectorized. We still need
8239 to apply loop vectorization to hybrid SLP statements. */
8240 if (PURE_SLP_STMT (stmt_info))
8241 return;
8242
8243 if (dump_enabled_p ())
8244 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8245
8246 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8247 *seen_store = stmt_info;
8248 }
8249
8250 /* Function vect_transform_loop.
8251
8252 The analysis phase has determined that the loop is vectorizable.
8253 Vectorize the loop: create vectorized stmts to replace the scalar
8254 stmts in the loop, and update the loop exit condition.
8255 Returns the scalar epilogue loop, if any.
8256
8257 struct loop *
8258 vect_transform_loop (loop_vec_info loop_vinfo)
8259 {
8260 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8261 struct loop *epilogue = NULL;
8262 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8263 int nbbs = loop->num_nodes;
8264 int i;
8265 tree niters_vector = NULL_TREE;
8266 tree step_vector = NULL_TREE;
8267 tree niters_vector_mult_vf = NULL_TREE;
8268 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8269 unsigned int lowest_vf = constant_lower_bound (vf);
8270 gimple *stmt;
8271 bool check_profitability = false;
8272 unsigned int th;
8273
8274 DUMP_VECT_SCOPE ("vec_transform_loop");
8275
8276 loop_vinfo->shared->check_datarefs ();
8277
8278 /* Use the more conservative vectorization threshold. If the number
8279 of iterations is constant, assume the cost check has been performed
8280 by our caller. If the threshold makes all loops profitable that
8281 run at least the (estimated) vectorization factor number of times,
8282 checking is pointless, too. */
8283 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8284 if (th >= vect_vf_for_cost (loop_vinfo)
8285 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8286 {
8287 if (dump_enabled_p ())
8288 dump_printf_loc (MSG_NOTE, vect_location,
8289 "Profitability threshold is %d loop iterations.\n",
8290 th);
8291 check_profitability = true;
8292 }
8293
8294 /* Make sure there exists a single-predecessor exit bb. Do this before
8295 versioning. */
8296 edge e = single_exit (loop);
8297 if (! single_pred_p (e->dest))
8298 {
8299 split_loop_exit_edge (e);
8300 if (dump_enabled_p ())
8301 dump_printf (MSG_NOTE, "split exit edge\n");
8302 }
8303
8304 /* Version the loop first, if required, so the profitability check
8305 comes first. */
8306
8307 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8308 {
8309 poly_uint64 versioning_threshold
8310 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8311 if (check_profitability
8312 && ordered_p (poly_uint64 (th), versioning_threshold))
8313 {
8314 versioning_threshold = ordered_max (poly_uint64 (th),
8315 versioning_threshold);
8316 check_profitability = false;
8317 }
8318 vect_loop_versioning (loop_vinfo, th, check_profitability,
8319 versioning_threshold);
8320 check_profitability = false;
8321 }
8322
8323 /* Make sure there exists a single-predecessor exit bb also on the
8324 scalar loop copy. Do this after versioning but before peeling
8325 so the CFG structure is fine for both the scalar and the if-converted
8326 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8327 loop-closed PHI nodes on the exit. */
8328 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8329 {
8330 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8331 if (! single_pred_p (e->dest))
8332 {
8333 split_loop_exit_edge (e);
8334 if (dump_enabled_p ())
8335 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8336 }
8337 }
8338
8339 tree niters = vect_build_loop_niters (loop_vinfo);
8340 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8341 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8342 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8343 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8344 &step_vector, &niters_vector_mult_vf, th,
8345 check_profitability, niters_no_overflow);
8346
8347 if (niters_vector == NULL_TREE)
8348 {
8349 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8350 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8351 && known_eq (lowest_vf, vf))
8352 {
8353 niters_vector
8354 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8355 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8356 step_vector = build_one_cst (TREE_TYPE (niters));
8357 }
8358 else
8359 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8360 &step_vector, niters_no_overflow);
8361 }
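/* As an illustration (and assuming no prologue peeling for alignment):
with a compile-time NITERS of 17 and VF == 4, the constant path above gives
NITERS_VECTOR == 17 / 4 == 4 with STEP_VECTOR == 1; the remaining scalar
iteration is left for the epilogue loop. */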
8362
8363 /* 1) Make sure the loop header has exactly two entries
8364 2) Make sure we have a preheader basic block. */
8365
8366 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8367
8368 split_edge (loop_preheader_edge (loop));
8369
8370 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8371 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8372 /* This will deal with any possible peeling. */
8373 vect_prepare_for_masked_peels (loop_vinfo);
8374
8375 /* Schedule the SLP instances first, then handle loop vectorization
8376 below. */
8377 if (!loop_vinfo->slp_instances.is_empty ())
8378 {
8379 DUMP_VECT_SCOPE ("scheduling SLP instances");
8380 vect_schedule_slp (loop_vinfo);
8381 }
8382
8383 /* FORNOW: the vectorizer supports only loops whose body consists
8384 of one basic block (header + empty latch). When the vectorizer
8385 supports more involved loop forms, the order in which the BBs are
8386 traversed needs to be reconsidered. */
8387
8388 for (i = 0; i < nbbs; i++)
8389 {
8390 basic_block bb = bbs[i];
8391 stmt_vec_info stmt_info;
8392
8393 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8394 gsi_next (&si))
8395 {
8396 gphi *phi = si.phi ();
8397 if (dump_enabled_p ())
8398 {
8399 dump_printf_loc (MSG_NOTE, vect_location,
8400 "------>vectorizing phi: ");
8401 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8402 }
8403 stmt_info = loop_vinfo->lookup_stmt (phi);
8404 if (!stmt_info)
8405 continue;
8406
8407 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8408 vect_loop_kill_debug_uses (loop, stmt_info);
8409
8410 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8411 && !STMT_VINFO_LIVE_P (stmt_info))
8412 continue;
8413
8414 if (STMT_VINFO_VECTYPE (stmt_info)
8415 && (maybe_ne
8416 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8417 && dump_enabled_p ())
8418 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8419
8420 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8421 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8422 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8423 && ! PURE_SLP_STMT (stmt_info))
8424 {
8425 if (dump_enabled_p ())
8426 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8427 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8428 }
8429 }
8430
8431 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8432 !gsi_end_p (si);)
8433 {
8434 stmt = gsi_stmt (si);
8435 /* During vectorization remove existing clobber stmts. */
8436 if (gimple_clobber_p (stmt))
8437 {
8438 unlink_stmt_vdef (stmt);
8439 gsi_remove (&si, true);
8440 release_defs (stmt);
8441 }
8442 else
8443 {
8444 stmt_info = loop_vinfo->lookup_stmt (stmt);
8445
8446 /* Vector stmts created in the outer-loop during vectorization of
8447 stmts in an inner-loop may not have a stmt_info and do not
8448 need to be vectorized. */
8449 stmt_vec_info seen_store = NULL;
8450 if (stmt_info)
8451 {
8452 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8453 {
8454 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8455 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8456 !gsi_end_p (subsi); gsi_next (&subsi))
8457 {
8458 stmt_vec_info pat_stmt_info
8459 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8460 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8461 &si, &seen_store);
8462 }
8463 stmt_vec_info pat_stmt_info
8464 = STMT_VINFO_RELATED_STMT (stmt_info);
8465 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8466 &seen_store);
8467 }
8468 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8469 &seen_store);
8470 }
8471 gsi_next (&si);
8472 if (seen_store)
8473 {
8474 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8475 /* Interleaving. The vectorization of the
8476 interleaving chain has been completed - free
8477 all the stores in the chain. */
8478 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8479 else
8480 /* Free the attached stmt_vec_info and remove the stmt. */
8481 loop_vinfo->remove_stmt (stmt_info);
8482 }
8483 }
8484 }
8485
8486 /* Stub out scalar statements that must not survive vectorization.
8487 Doing this here helps with grouped statements, or statements that
8488 are involved in patterns. */
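/* For instance (names purely illustrative), a scalar internal-function
call produced by if-conversion such as
_1 = .MASK_LOAD (ptr_2, 4B, mask_3);
has a non-vector LHS, so it is replaced below by the plain assignment
_1 = 0; */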
8489 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8490 !gsi_end_p (gsi); gsi_next (&gsi))
8491 {
8492 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8493 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8494 {
8495 tree lhs = gimple_get_lhs (call);
8496 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8497 {
8498 tree zero = build_zero_cst (TREE_TYPE (lhs));
8499 gimple *new_stmt = gimple_build_assign (lhs, zero);
8500 gsi_replace (&gsi, new_stmt, true);
8501 }
8502 }
8503 }
8504 } /* BBs in loop */
8505
8506 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8507 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8508 if (integer_onep (step_vector))
8509 niters_no_overflow = true;
8510 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8511 niters_vector_mult_vf, !niters_no_overflow);
8512
8513 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8514 scale_profile_for_vect_loop (loop, assumed_vf);
8515
8516 /* True if the final iteration might not handle a full vector's
8517 worth of scalar iterations. */
8518 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8519 /* The minimum number of iterations performed by the epilogue. This
8520 is 1 when peeling for gaps because we always need a final scalar
8521 iteration. */
8522 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8523 /* +1 to convert latch counts to loop iteration counts,
8524 -min_epilogue_iters to remove iterations that cannot be performed
8525 by the vector code. */
8526 int bias_for_lowest = 1 - min_epilogue_iters;
8527 int bias_for_assumed = bias_for_lowest;
8528 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8529 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8530 {
8531 /* When the amount of peeling is known at compile time, the first
8532 iteration will have exactly alignment_npeels active elements.
8533 In the worst case it will have at least one. */
8534 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8535 bias_for_lowest += lowest_vf - min_first_active;
8536 bias_for_assumed += assumed_vf - min_first_active;
8537 }
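/* E.g. (illustrative numbers): with compile-time peeling of 3 iterations
for alignment and lowest_vf == 8, the first masked iteration is known to
process exactly 3 elements, so bias_for_lowest grows by 8 - 3 == 5. */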
8538 /* In these calculations the "- 1" converts loop iteration counts
8539 back to latch counts. */
8540 if (loop->any_upper_bound)
8541 loop->nb_iterations_upper_bound
8542 = (final_iter_may_be_partial
8543 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8544 lowest_vf) - 1
8545 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8546 lowest_vf) - 1);
8547 if (loop->any_likely_upper_bound)
8548 loop->nb_iterations_likely_upper_bound
8549 = (final_iter_may_be_partial
8550 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8551 + bias_for_lowest, lowest_vf) - 1
8552 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8553 + bias_for_lowest, lowest_vf) - 1);
8554 if (loop->any_estimate)
8555 loop->nb_iterations_estimate
8556 = (final_iter_may_be_partial
8557 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8558 assumed_vf) - 1
8559 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8560 assumed_vf) - 1);
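/* Worked example with illustrative numbers: for a latch-count upper bound
of 9 (at most 10 scalar iterations), lowest_vf == 4, no peeling for gaps and
no full masking, bias_for_lowest == 1 and the new bound is
floor ((9 + 1) / 4) - 1 == 1, i.e. at most two vector iterations. */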
8561
8562 if (dump_enabled_p ())
8563 {
8564 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8565 {
8566 dump_printf_loc (MSG_NOTE, vect_location,
8567 "LOOP VECTORIZED\n");
8568 if (loop->inner)
8569 dump_printf_loc (MSG_NOTE, vect_location,
8570 "OUTER LOOP VECTORIZED\n");
8571 dump_printf (MSG_NOTE, "\n");
8572 }
8573 else
8574 {
8575 dump_printf_loc (MSG_NOTE, vect_location,
8576 "LOOP EPILOGUE VECTORIZED (VS=");
8577 dump_dec (MSG_NOTE, current_vector_size);
8578 dump_printf (MSG_NOTE, ")\n");
8579 }
8580 }
8581
8582 /* Free SLP instances here because otherwise stmt reference counting
8583 won't work. */
8584 slp_instance instance;
8585 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8586 vect_free_slp_instance (instance, true);
8587 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8588 /* Clear the safelen field since its value is invalid after vectorization:
8589 the vectorized loop can have loop-carried dependencies. */
8590 loop->safelen = 0;
8591
8592 /* Don't vectorize epilogue for epilogue. */
8593 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8594 epilogue = NULL;
8595
8596 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8597 epilogue = NULL;
8598
8599 if (epilogue)
8600 {
8601 auto_vector_sizes vector_sizes;
8602 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8603 unsigned int next_size = 0;
8604
8605 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8606 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8607 && known_eq (vf, lowest_vf))
8608 {
8609 unsigned int eiters
8610 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8611 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8612 eiters = eiters % lowest_vf;
8613 epilogue->nb_iterations_upper_bound = eiters - 1;
8614
8615 unsigned int ratio;
8616 while (next_size < vector_sizes.length ()
8617 && !(constant_multiple_p (current_vector_size,
8618 vector_sizes[next_size], &ratio)
8619 && eiters >= lowest_vf / ratio))
8620 next_size += 1;
8621 }
8622 else
8623 while (next_size < vector_sizes.length ()
8624 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8625 next_size += 1;
8626
8627 if (next_size == vector_sizes.length ())
8628 epilogue = NULL;
8629 }
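/* For instance, on a target whose autovectorize_vector_sizes hook offers
64-, 32- and 16-byte vectors (an assumption for illustration), a main loop
that used 32-byte vectors with lowest_vf == 8 and eiters == 5 skips the 64-
and 32-byte candidates and settles on the 16-byte size, whose vectorization
factor of 8 / 2 == 4 still fits within the 5 remaining iterations. */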
8630
8631 if (epilogue)
8632 {
8633 epilogue->force_vectorize = loop->force_vectorize;
8634 epilogue->safelen = loop->safelen;
8635 epilogue->dont_vectorize = false;
8636
8637 /* We may need to if-convert the epilogue in order to vectorize it. */
8638 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8639 tree_if_conversion (epilogue);
8640 }
8641
8642 return epilogue;
8643 }
8644
8645 /* The code below performs a simple optimization: it reverts
8646 if-conversion for masked stores, i.e. if the mask of a store is zero,
8647 the store and, where possible, the producers of the stored values are skipped.
8648 For example,
8649 for (i=0; i<n; i++)
8650 if (c[i])
8651 {
8652 p1[i] += 1;
8653 p2[i] = p3[i] +2;
8654 }
8655 this transformation will produce the following semi-hammock:
8656
8657 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
8658 {
8659 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8660 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8661 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8662 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8663 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8664 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8665 }
8666 */
8667
8668 void
8669 optimize_mask_stores (struct loop *loop)
8670 {
8671 basic_block *bbs = get_loop_body (loop);
8672 unsigned nbbs = loop->num_nodes;
8673 unsigned i;
8674 basic_block bb;
8675 struct loop *bb_loop;
8676 gimple_stmt_iterator gsi;
8677 gimple *stmt;
8678 auto_vec<gimple *> worklist;
8679
8680 vect_location = find_loop_location (loop);
8681 /* Pick up all masked stores in the loop, if any. */
8682 for (i = 0; i < nbbs; i++)
8683 {
8684 bb = bbs[i];
8685 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8686 gsi_next (&gsi))
8687 {
8688 stmt = gsi_stmt (gsi);
8689 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8690 worklist.safe_push (stmt);
8691 }
8692 }
8693
8694 free (bbs);
8695 if (worklist.is_empty ())
8696 return;
8697
8698 /* Loop has masked stores. */
8699 while (!worklist.is_empty ())
8700 {
8701 gimple *last, *last_store;
8702 edge e, efalse;
8703 tree mask;
8704 basic_block store_bb, join_bb;
8705 gimple_stmt_iterator gsi_to;
8706 tree vdef, new_vdef;
8707 gphi *phi;
8708 tree vectype;
8709 tree zero;
8710
8711 last = worklist.pop ();
8712 mask = gimple_call_arg (last, 2);
8713 bb = gimple_bb (last);
8714 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
8715 to the same loop as if_bb. That loop can differ from LOOP when a
8716 two-level loop nest is vectorized and the mask_store belongs to the
8717 inner one. */
8718 e = split_block (bb, last);
8719 bb_loop = bb->loop_father;
8720 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8721 join_bb = e->dest;
8722 store_bb = create_empty_bb (bb);
8723 add_bb_to_loop (store_bb, bb_loop);
8724 e->flags = EDGE_TRUE_VALUE;
8725 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8726 /* Put STORE_BB in the likely part. */
8727 efalse->probability = profile_probability::unlikely ();
8728 store_bb->count = efalse->count ();
8729 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8730 if (dom_info_available_p (CDI_DOMINATORS))
8731 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8732 if (dump_enabled_p ())
8733 dump_printf_loc (MSG_NOTE, vect_location,
8734 "Create new block %d to sink mask stores.",
8735 store_bb->index);
8736 /* Create vector comparison with boolean result. */
8737 vectype = TREE_TYPE (mask);
8738 zero = build_zero_cst (vectype);
8739 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8740 gsi = gsi_last_bb (bb);
8741 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
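/* At this point the CFG has been reshaped as follows (a sketch of the
result): BB ends in "if (mask == { 0, ... })"; its TRUE edge goes straight
to JOIN_BB, its FALSE edge goes to the new STORE_BB, and STORE_BB falls
through to JOIN_BB. The masked stores are sunk into STORE_BB below. */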
8742 /* Create new PHI node for vdef of the last masked store:
8743 .MEM_2 = VDEF <.MEM_1>
8744 will be converted to
8745 .MEM.3 = VDEF <.MEM_1>
8746 and new PHI node will be created in join bb
8747 .MEM_2 = PHI <.MEM_1, .MEM_3>
8748 */
8749 vdef = gimple_vdef (last);
8750 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8751 gimple_set_vdef (last, new_vdef);
8752 phi = create_phi_node (vdef, join_bb);
8753 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8754
8755 /* Put all masked stores with the same mask into STORE_BB if possible. */
8756 while (true)
8757 {
8758 gimple_stmt_iterator gsi_from;
8759 gimple *stmt1 = NULL;
8760
8761 /* Move masked store to STORE_BB. */
8762 last_store = last;
8763 gsi = gsi_for_stmt (last);
8764 gsi_from = gsi;
8765 /* Shift GSI to the previous stmt for further traversal. */
8766 gsi_prev (&gsi);
8767 gsi_to = gsi_start_bb (store_bb);
8768 gsi_move_before (&gsi_from, &gsi_to);
8769 /* Set GSI_TO to the start of the now non-empty block. */
8770 gsi_to = gsi_start_bb (store_bb);
8771 if (dump_enabled_p ())
8772 {
8773 dump_printf_loc (MSG_NOTE, vect_location,
8774 "Move stmt to created bb\n");
8775 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8776 }
8777 /* Move all stored value producers if possible. */
8778 while (!gsi_end_p (gsi))
8779 {
8780 tree lhs;
8781 imm_use_iterator imm_iter;
8782 use_operand_p use_p;
8783 bool res;
8784
8785 /* Skip debug statements. */
8786 if (is_gimple_debug (gsi_stmt (gsi)))
8787 {
8788 gsi_prev (&gsi);
8789 continue;
8790 }
8791 stmt1 = gsi_stmt (gsi);
8792 /* Do not consider statements writing to memory or having a
8793 volatile operand. */
8794 if (gimple_vdef (stmt1)
8795 || gimple_has_volatile_ops (stmt1))
8796 break;
8797 gsi_from = gsi;
8798 gsi_prev (&gsi);
8799 lhs = gimple_get_lhs (stmt1);
8800 if (!lhs)
8801 break;
8802
8803 /* LHS of vectorized stmt must be SSA_NAME. */
8804 if (TREE_CODE (lhs) != SSA_NAME)
8805 break;
8806
8807 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8808 {
8809 /* Remove dead scalar statement. */
8810 if (has_zero_uses (lhs))
8811 {
8812 gsi_remove (&gsi_from, true);
8813 continue;
8814 }
8815 }
8816
8817 /* Check that LHS does not have uses outside of STORE_BB. */
8818 res = true;
8819 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8820 {
8821 gimple *use_stmt;
8822 use_stmt = USE_STMT (use_p);
8823 if (is_gimple_debug (use_stmt))
8824 continue;
8825 if (gimple_bb (use_stmt) != store_bb)
8826 {
8827 res = false;
8828 break;
8829 }
8830 }
8831 if (!res)
8832 break;
8833
8834 if (gimple_vuse (stmt1)
8835 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8836 break;
8837
8838 /* Can move STMT1 to STORE_BB. */
8839 if (dump_enabled_p ())
8840 {
8841 dump_printf_loc (MSG_NOTE, vect_location,
8842 "Move stmt to created bb\n");
8843 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8844 }
8845 gsi_move_before (&gsi_from, &gsi_to);
8846 /* Shift GSI_TO for further insertion. */
8847 gsi_prev (&gsi_to);
8848 }
8849 /* Put other masked stores with the same mask into STORE_BB. */
8850 if (worklist.is_empty ()
8851 || gimple_call_arg (worklist.last (), 2) != mask
8852 || worklist.last () != stmt1)
8853 break;
8854 last = worklist.pop ();
8855 }
8856 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8857 }
8858 }