1 /* Loop Vectorization
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
3 Free Software Foundation, Inc.
4 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
5 Ira Rosen <irar@il.ibm.com>
6
7 This file is part of GCC.
8
9 GCC is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 3, or (at your option) any later
12 version.
13
14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with GCC; see the file COPYING3. If not see
21 <http://www.gnu.org/licenses/>. */
22
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "basic-block.h"
30 #include "tree-pretty-print.h"
31 #include "gimple-pretty-print.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
34 #include "cfgloop.h"
35 #include "cfglayout.h"
36 #include "expr.h"
37 #include "recog.h"
38 #include "optabs.h"
39 #include "params.h"
40 #include "diagnostic-core.h"
41 #include "tree-chrec.h"
42 #include "tree-scalar-evolution.h"
43 #include "tree-vectorizer.h"
44 #include "target.h"
45
46 /* Loop Vectorization Pass.
47
48 This pass tries to vectorize loops.
49
50 For example, the vectorizer transforms the following simple loop:
51
52 short a[N]; short b[N]; short c[N]; int i;
53
54 for (i=0; i<N; i++){
55 a[i] = b[i] + c[i];
56 }
57
58 as if it were manually vectorized by rewriting the source code into:
59
60 typedef int __attribute__((mode(V8HI))) v8hi;
61 short a[N]; short b[N]; short c[N]; int i;
62 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
63 v8hi va, vb, vc;
64
65 for (i=0; i<N/8; i++){
66 vb = pb[i];
67 vc = pc[i];
68 va = vb + vc;
69 pa[i] = va;
70 }
71
72 The main entry to this pass is vectorize_loops(), in which
73 the vectorizer applies a set of analyses on a given set of loops,
74 followed by the actual vectorization transformation for the loops that
75 have successfully passed the analysis phase.
76 Throughout this pass we make a distinction between two types of
77 data: scalars (which are represented by SSA_NAMEs), and memory references
78 ("data-refs"). These two types of data require different handling both
79 during analysis and transformation. The types of data-refs that the
80 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
81 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
82 accesses are required to have a simple (consecutive) access pattern.
83
84 Analysis phase:
85 ===============
86 The driver for the analysis phase is vect_analyze_loop().
87 It applies a set of analyses, some of which rely on the scalar evolution
88 analyzer (scev) developed by Sebastian Pop.
89
90 During the analysis phase the vectorizer records some information
91 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
92 loop, as well as general information about the loop as a whole, which is
93 recorded in a "loop_vec_info" struct attached to each loop.
94
95 Transformation phase:
96 =====================
97 The loop transformation phase scans all the stmts in the loop, and
98 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
99 the loop that needs to be vectorized. It inserts the vector code sequence
100 just before the scalar stmt S, and records a pointer to the vector code
101 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
102 attached to S). This pointer will be used for the vectorization of following
103 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
104 otherwise, we rely on dead code elimination for removing it.
105
106 For example, say stmt S1 was vectorized into stmt VS1:
107
108 VS1: vb = px[i];
109 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
110 S2: a = b;
111
112 To vectorize stmt S2, the vectorizer first finds the stmt that defines
113 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
114 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
115 resulting sequence would be:
116
117 VS1: vb = px[i];
118 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
119 VS2: va = vb;
120 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
121
122 Operands that are not SSA_NAMEs are data-refs that appear in
123 load/store operations (like 'x[i]' in S1), and are handled differently.
124
125 Target modeling:
126 =================
127 Currently the only target specific information that is used is the
128 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
129 Targets that can support different sizes of vectors will, for now, need
130 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
131 flexibility will be added in the future.
132
133 Since we only vectorize operations whose vector form can be
134 expressed using existing tree codes, to verify that an operation is
135 supported, the vectorizer checks the relevant optab at the relevant
136 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
137 the value found is CODE_FOR_nothing, then there's no target support, and
138 we can't vectorize the stmt.
139
140 For additional information on this project see:
141 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
142 */
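/* As a concrete illustration of the optab check described above (a sketch,
   not code taken from this pass): asking whether the target can add two
   V8HI vectors amounts to something like

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;

   where CODE_FOR_nothing means there is no target support and the stmt
   cannot be vectorized.  */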
143
144 /* Function vect_determine_vectorization_factor
145
146 Determine the vectorization factor (VF). VF is the number of data elements
147 that are operated upon in parallel in a single iteration of the vectorized
148 loop. For example, when vectorizing a loop that operates on 4-byte elements,
149 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
150 elements can fit in a single vector register.
151
152 We currently support vectorization of loops in which all types operated upon
153 are of the same size. Therefore this function currently sets VF according to
154 the size of the types operated upon, and fails if there are multiple sizes
155 in the loop.
156
157 VF is also the factor by which the loop iterations are strip-mined, e.g.:
158 original loop:
159 for (i=0; i<N; i++){
160 a[i] = b[i] + c[i];
161 }
162
163 vectorized loop:
164 for (i=0; i<N; i+=VF){
165 a[i:VF] = b[i:VF] + c[i:VF];
166 }
167 */
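/* A worked example (assuming a target with 16-byte vectors): if the
   smallest scalar type operated on in the loop is "short" (2 bytes), the
   corresponding vectype is V8HI, TYPE_VECTOR_SUBPARTS is 8 and the VF
   becomes 8; with 4-byte "int" elements the vectype would be V4SI and the
   VF would be 4.  */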
168
169 static bool
170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
171 {
172 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
173 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
174 int nbbs = loop->num_nodes;
175 gimple_stmt_iterator si;
176 unsigned int vectorization_factor = 0;
177 tree scalar_type;
178 gimple phi;
179 tree vectype;
180 unsigned int nunits;
181 stmt_vec_info stmt_info;
182 int i;
183 HOST_WIDE_INT dummy;
184 gimple stmt, pattern_stmt = NULL, pattern_def_stmt = NULL;
185 bool analyze_pattern_stmt = false, pattern_def = false;
186
187 if (vect_print_dump_info (REPORT_DETAILS))
188 fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
189
190 for (i = 0; i < nbbs; i++)
191 {
192 basic_block bb = bbs[i];
193
194 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
195 {
196 phi = gsi_stmt (si);
197 stmt_info = vinfo_for_stmt (phi);
198 if (vect_print_dump_info (REPORT_DETAILS))
199 {
200 fprintf (vect_dump, "==> examining phi: ");
201 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
202 }
203
204 gcc_assert (stmt_info);
205
206 if (STMT_VINFO_RELEVANT_P (stmt_info))
207 {
208 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
209 scalar_type = TREE_TYPE (PHI_RESULT (phi));
210
211 if (vect_print_dump_info (REPORT_DETAILS))
212 {
213 fprintf (vect_dump, "get vectype for scalar type: ");
214 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
215 }
216
217 vectype = get_vectype_for_scalar_type (scalar_type);
218 if (!vectype)
219 {
220 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
221 {
222 fprintf (vect_dump,
223 "not vectorized: unsupported data-type ");
224 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
225 }
226 return false;
227 }
228 STMT_VINFO_VECTYPE (stmt_info) = vectype;
229
230 if (vect_print_dump_info (REPORT_DETAILS))
231 {
232 fprintf (vect_dump, "vectype: ");
233 print_generic_expr (vect_dump, vectype, TDF_SLIM);
234 }
235
236 nunits = TYPE_VECTOR_SUBPARTS (vectype);
237 if (vect_print_dump_info (REPORT_DETAILS))
238 fprintf (vect_dump, "nunits = %d", nunits);
239
240 if (!vectorization_factor
241 || (nunits > vectorization_factor))
242 vectorization_factor = nunits;
243 }
244 }
245
246 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
247 {
248 tree vf_vectype;
249
250 if (analyze_pattern_stmt)
251 {
252 stmt = pattern_stmt;
253 analyze_pattern_stmt = false;
254 }
255 else
256 stmt = gsi_stmt (si);
257
258 stmt_info = vinfo_for_stmt (stmt);
259
260 if (vect_print_dump_info (REPORT_DETAILS))
261 {
262 fprintf (vect_dump, "==> examining statement: ");
263 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
264 }
265
266 gcc_assert (stmt_info);
267
268 /* Skip stmts which do not need to be vectorized. */
269 if (!STMT_VINFO_RELEVANT_P (stmt_info)
270 && !STMT_VINFO_LIVE_P (stmt_info))
271 {
272 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
273 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
274 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
275 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
276 {
277 stmt = pattern_stmt;
278 stmt_info = vinfo_for_stmt (pattern_stmt);
279 if (vect_print_dump_info (REPORT_DETAILS))
280 {
281 fprintf (vect_dump, "==> examining pattern statement: ");
282 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
283 }
284 }
285 else
286 {
287 if (vect_print_dump_info (REPORT_DETAILS))
288 fprintf (vect_dump, "skip.");
289 gsi_next (&si);
290 continue;
291 }
292 }
293 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
294 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
295 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
296 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
297 analyze_pattern_stmt = true;
298
299 /* If a pattern statement has a def stmt, analyze it too. */
300 if (is_pattern_stmt_p (stmt_info)
301 && (pattern_def_stmt = STMT_VINFO_PATTERN_DEF_STMT (stmt_info))
302 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_def_stmt))
303 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_def_stmt))))
304 {
305 if (pattern_def)
306 pattern_def = false;
307 else
308 {
309 if (vect_print_dump_info (REPORT_DETAILS))
310 {
311 fprintf (vect_dump, "==> examining pattern def stmt: ");
312 print_gimple_stmt (vect_dump, pattern_def_stmt, 0,
313 TDF_SLIM);
314 }
315
316 pattern_def = true;
317 stmt = pattern_def_stmt;
318 stmt_info = vinfo_for_stmt (stmt);
319 }
320 }
321
322 if (gimple_get_lhs (stmt) == NULL_TREE)
323 {
324 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
325 {
326 fprintf (vect_dump, "not vectorized: irregular stmt.");
327 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
328 }
329 return false;
330 }
331
332 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
333 {
334 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
335 {
336 fprintf (vect_dump, "not vectorized: vector stmt in loop:");
337 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
338 }
339 return false;
340 }
341
342 if (STMT_VINFO_VECTYPE (stmt_info))
343 {
344 /* The only case when a vectype had already been set is for stmts
345 that contain a dataref, or for "pattern-stmts" (stmts
346 generated by the vectorizer to represent/replace a certain
347 idiom). */
348 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
349 || is_pattern_stmt_p (stmt_info)
350 || pattern_def);
351 vectype = STMT_VINFO_VECTYPE (stmt_info);
352 }
353 else
354 {
355 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
356 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
357 if (vect_print_dump_info (REPORT_DETAILS))
358 {
359 fprintf (vect_dump, "get vectype for scalar type: ");
360 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
361 }
362 vectype = get_vectype_for_scalar_type (scalar_type);
363 if (!vectype)
364 {
365 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
366 {
367 fprintf (vect_dump,
368 "not vectorized: unsupported data-type ");
369 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
370 }
371 return false;
372 }
373
374 STMT_VINFO_VECTYPE (stmt_info) = vectype;
375 }
376
377 /* The vectorization factor is set according to the smallest
378 scalar type (or the largest vector size, but we only
379 support one vector size per loop). */
380 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
381 &dummy);
382 if (vect_print_dump_info (REPORT_DETAILS))
383 {
384 fprintf (vect_dump, "get vectype for scalar type: ");
385 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
386 }
387 vf_vectype = get_vectype_for_scalar_type (scalar_type);
388 if (!vf_vectype)
389 {
390 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
391 {
392 fprintf (vect_dump,
393 "not vectorized: unsupported data-type ");
394 print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
395 }
396 return false;
397 }
398
399 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
400 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
401 {
402 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
403 {
404 fprintf (vect_dump,
405 "not vectorized: different sized vector "
406 "types in statement, ");
407 print_generic_expr (vect_dump, vectype, TDF_SLIM);
408 fprintf (vect_dump, " and ");
409 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
410 }
411 return false;
412 }
413
414 if (vect_print_dump_info (REPORT_DETAILS))
415 {
416 fprintf (vect_dump, "vectype: ");
417 print_generic_expr (vect_dump, vf_vectype, TDF_SLIM);
418 }
419
420 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
421 if (vect_print_dump_info (REPORT_DETAILS))
422 fprintf (vect_dump, "nunits = %d", nunits);
423
424 if (!vectorization_factor
425 || (nunits > vectorization_factor))
426 vectorization_factor = nunits;
427
428 if (!analyze_pattern_stmt && !pattern_def)
429 gsi_next (&si);
430 }
431 }
432
433 /* TODO: Analyze cost. Decide if worth while to vectorize. */
434 if (vect_print_dump_info (REPORT_DETAILS))
435 fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
436 if (vectorization_factor <= 1)
437 {
438 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
439 fprintf (vect_dump, "not vectorized: unsupported data-type");
440 return false;
441 }
442 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
443
444 return true;
445 }
446
447
448 /* Function vect_is_simple_iv_evolution.
449
450 FORNOW: A simple evolution of an induction variable in the loop is
451 considered a polynomial evolution with constant step. */
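/* For illustration: scev represents such an evolution as the chrec
   {init, +, step}_n.  For the IV of "for (i = 0; i < N; i++)" the access
   function is {0, +, 1}_1 (loop number 1 assumed here); its evolution part
   is the INTEGER_CST 1, so it is accepted.  An evolution part that is
   itself a chrec, e.g. {0, +, {1, +, 1}_1}_1, denotes a degree-2
   polynomial and is rejected below.  */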
452
453 static bool
454 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
455 tree * step)
456 {
457 tree init_expr;
458 tree step_expr;
459 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
460
461 /* When there is no evolution in this loop, the evolution function
462 is not "simple". */
463 if (evolution_part == NULL_TREE)
464 return false;
465
466 /* When the evolution is a polynomial of degree >= 2
467 the evolution function is not "simple". */
468 if (tree_is_chrec (evolution_part))
469 return false;
470
471 step_expr = evolution_part;
472 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
473
474 if (vect_print_dump_info (REPORT_DETAILS))
475 {
476 fprintf (vect_dump, "step: ");
477 print_generic_expr (vect_dump, step_expr, TDF_SLIM);
478 fprintf (vect_dump, ", init: ");
479 print_generic_expr (vect_dump, init_expr, TDF_SLIM);
480 }
481
482 *init = init_expr;
483 *step = step_expr;
484
485 if (TREE_CODE (step_expr) != INTEGER_CST)
486 {
487 if (vect_print_dump_info (REPORT_DETAILS))
488 fprintf (vect_dump, "step unknown.");
489 return false;
490 }
491
492 return true;
493 }
494
495 /* Function vect_analyze_scalar_cycles_1.
496
497 Examine the cross iteration def-use cycles of scalar variables
498 in LOOP. LOOP_VINFO represents the loop that is now being
499 considered for vectorization (can be LOOP, or an outer-loop
500 enclosing LOOP). */
501
502 static void
503 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
504 {
505 basic_block bb = loop->header;
506 tree dummy;
507 VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
508 gimple_stmt_iterator gsi;
509 bool double_reduc;
510
511 if (vect_print_dump_info (REPORT_DETAILS))
512 fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
513
514 /* First - identify all inductions. Reduction detection assumes that all the
515 inductions have been identified; therefore, this order must not be
516 changed. */
517 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
518 {
519 gimple phi = gsi_stmt (gsi);
520 tree access_fn = NULL;
521 tree def = PHI_RESULT (phi);
522 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
523
524 if (vect_print_dump_info (REPORT_DETAILS))
525 {
526 fprintf (vect_dump, "Analyze phi: ");
527 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
528 }
529
530 /* Skip virtual phis. The data dependences that are associated with
531 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
532 if (!is_gimple_reg (SSA_NAME_VAR (def)))
533 continue;
534
535 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
536
537 /* Analyze the evolution function. */
538 access_fn = analyze_scalar_evolution (loop, def);
539 if (access_fn)
540 STRIP_NOPS (access_fn);
541 if (access_fn && vect_print_dump_info (REPORT_DETAILS))
542 {
543 fprintf (vect_dump, "Access function of PHI: ");
544 print_generic_expr (vect_dump, access_fn, TDF_SLIM);
545 }
546
547 if (!access_fn
548 || !vect_is_simple_iv_evolution (loop->num, access_fn, &dummy, &dummy))
549 {
550 VEC_safe_push (gimple, heap, worklist, phi);
551 continue;
552 }
553
554 if (vect_print_dump_info (REPORT_DETAILS))
555 fprintf (vect_dump, "Detected induction.");
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
557 }
558
559
560 /* Second - identify all reductions and nested cycles. */
561 while (VEC_length (gimple, worklist) > 0)
562 {
563 gimple phi = VEC_pop (gimple, worklist);
564 tree def = PHI_RESULT (phi);
565 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
566 gimple reduc_stmt;
567 bool nested_cycle;
568
569 if (vect_print_dump_info (REPORT_DETAILS))
570 {
571 fprintf (vect_dump, "Analyze phi: ");
572 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
573 }
574
575 gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
576 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
577
578 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
579 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
580 &double_reduc);
581 if (reduc_stmt)
582 {
583 if (double_reduc)
584 {
585 if (vect_print_dump_info (REPORT_DETAILS))
586 fprintf (vect_dump, "Detected double reduction.");
587
588 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
589 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
590 vect_double_reduction_def;
591 }
592 else
593 {
594 if (nested_cycle)
595 {
596 if (vect_print_dump_info (REPORT_DETAILS))
597 fprintf (vect_dump, "Detected vectorizable nested cycle.");
598
599 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
600 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
601 vect_nested_cycle;
602 }
603 else
604 {
605 if (vect_print_dump_info (REPORT_DETAILS))
606 fprintf (vect_dump, "Detected reduction.");
607
608 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
609 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
610 vect_reduction_def;
611 /* Store the reduction cycles for possible vectorization in
612 loop-aware SLP. */
613 VEC_safe_push (gimple, heap,
614 LOOP_VINFO_REDUCTIONS (loop_vinfo),
615 reduc_stmt);
616 }
617 }
618 }
619 else
620 if (vect_print_dump_info (REPORT_DETAILS))
621 fprintf (vect_dump, "Unknown def-use cycle pattern.");
622 }
623
624 VEC_free (gimple, heap, worklist);
625 }
626
627
628 /* Function vect_analyze_scalar_cycles.
629
630 Examine the cross iteration def-use cycles of scalar variables, by
631 analyzing the loop-header PHIs of scalar variables. Classify each
632 cycle as one of the following: invariant, induction, reduction, unknown.
633 We do that for the loop represented by LOOP_VINFO, and also for its
634 inner-loop, if it exists.
635 Examples for scalar cycles:
636
637 Example1: reduction:
638
639 loop1:
640 for (i=0; i<N; i++)
641 sum += a[i];
642
643 Example2: induction:
644
645 loop2:
646 for (i=0; i<N; i++)
647 a[i] = i; */
648
649 static void
650 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
651 {
652 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
653
654 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
655
656 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
657 Reductions in such an inner-loop therefore have different properties than
658 the reductions in the nest that gets vectorized:
659 1. When vectorized, they are executed in the same order as in the original
660 scalar loop, so we can't change the order of computation when
661 vectorizing them.
662 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
663 current checks are too strict. */
664
665 if (loop->inner)
666 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
667 }
668
669 /* Function vect_get_loop_niters.
670
671 Determine how many iterations the loop is executed.
672 If an expression that represents the number of iterations
673 can be constructed, place it in NUMBER_OF_ITERATIONS.
674 Return the loop exit condition. */
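/* For instance (a sketch, assuming the do-while shape this pass requires
   and a guard guaranteeing at least one iteration): for

     i = 0;
     do { a[i] = b[i] + c[i]; i++; } while (i < n);

   the expression stored in NUMBER_OF_ITERATIONS is (essentially) 'n', and
   the returned statement is the GIMPLE_COND for 'i < n'.  */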
675
676 static gimple
677 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
678 {
679 tree niters;
680
681 if (vect_print_dump_info (REPORT_DETAILS))
682 fprintf (vect_dump, "=== get_loop_niters ===");
683
684 niters = number_of_exit_cond_executions (loop);
685
686 if (niters != NULL_TREE
687 && niters != chrec_dont_know)
688 {
689 *number_of_iterations = niters;
690
691 if (vect_print_dump_info (REPORT_DETAILS))
692 {
693 fprintf (vect_dump, "==> get_loop_niters:" );
694 print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
695 }
696 }
697
698 return get_loop_exit_condition (loop);
699 }
700
701
702 /* Function bb_in_loop_p
703
704 Used as predicate for dfs order traversal of the loop bbs. */
705
706 static bool
707 bb_in_loop_p (const_basic_block bb, const void *data)
708 {
709 const struct loop *const loop = (const struct loop *)data;
710 if (flow_bb_inside_loop_p (loop, bb))
711 return true;
712 return false;
713 }
714
715
716 /* Function new_loop_vec_info.
717
718 Create and initialize a new loop_vec_info struct for LOOP, as well as
719 stmt_vec_info structs for all the stmts in LOOP. */
720
721 static loop_vec_info
722 new_loop_vec_info (struct loop *loop)
723 {
724 loop_vec_info res;
725 basic_block *bbs;
726 gimple_stmt_iterator si;
727 unsigned int i, nbbs;
728
729 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
730 LOOP_VINFO_LOOP (res) = loop;
731
732 bbs = get_loop_body (loop);
733
734 /* Create/Update stmt_info for all stmts in the loop. */
735 for (i = 0; i < loop->num_nodes; i++)
736 {
737 basic_block bb = bbs[i];
738
739 /* BBs in a nested inner-loop will have been already processed (because
740 we will have called vect_analyze_loop_form for any nested inner-loop).
741 Therefore, for stmts in an inner-loop we just want to update the
742 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
743 loop_info of the outer-loop we are currently considering to vectorize
744 (instead of the loop_info of the inner-loop).
745 For stmts in other BBs we need to create a stmt_info from scratch. */
746 if (bb->loop_father != loop)
747 {
748 /* Inner-loop bb. */
749 gcc_assert (loop->inner && bb->loop_father == loop->inner);
750 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
751 {
752 gimple phi = gsi_stmt (si);
753 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
754 loop_vec_info inner_loop_vinfo =
755 STMT_VINFO_LOOP_VINFO (stmt_info);
756 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
757 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
758 }
759 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
760 {
761 gimple stmt = gsi_stmt (si);
762 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
763 loop_vec_info inner_loop_vinfo =
764 STMT_VINFO_LOOP_VINFO (stmt_info);
765 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
766 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
767 }
768 }
769 else
770 {
771 /* bb in current nest. */
772 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
773 {
774 gimple phi = gsi_stmt (si);
775 gimple_set_uid (phi, 0);
776 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
777 }
778
779 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
780 {
781 gimple stmt = gsi_stmt (si);
782 gimple_set_uid (stmt, 0);
783 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
784 }
785 }
786 }
787
788 /* CHECKME: We want to visit all BBs before their successors (except for
789 latch blocks, for which this assertion wouldn't hold). In the simple
790 case of the loop forms we allow, a dfs order of the BBs would be the same
791 as reversed postorder traversal, so we are safe. */
792
793 free (bbs);
794 bbs = XCNEWVEC (basic_block, loop->num_nodes);
795 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
796 bbs, loop->num_nodes, loop);
797 gcc_assert (nbbs == loop->num_nodes);
798
799 LOOP_VINFO_BBS (res) = bbs;
800 LOOP_VINFO_NITERS (res) = NULL;
801 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
802 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
803 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
804 LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
805 LOOP_VINFO_VECT_FACTOR (res) = 0;
806 LOOP_VINFO_LOOP_NEST (res) = VEC_alloc (loop_p, heap, 3);
807 LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
808 LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
809 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
810 LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
811 VEC_alloc (gimple, heap,
812 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
813 LOOP_VINFO_MAY_ALIAS_DDRS (res) =
814 VEC_alloc (ddr_p, heap,
815 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
816 LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
817 LOOP_VINFO_REDUCTIONS (res) = VEC_alloc (gimple, heap, 10);
818 LOOP_VINFO_REDUCTION_CHAINS (res) = VEC_alloc (gimple, heap, 10);
819 LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
820 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
821 LOOP_VINFO_PEELING_HTAB (res) = NULL;
822 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
823
824 return res;
825 }
826
827
828 /* Function destroy_loop_vec_info.
829
830 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
831 stmts in the loop. */
832
833 void
834 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
835 {
836 struct loop *loop;
837 basic_block *bbs;
838 int nbbs;
839 gimple_stmt_iterator si;
840 int j;
841 VEC (slp_instance, heap) *slp_instances;
842 slp_instance instance;
843
844 if (!loop_vinfo)
845 return;
846
847 loop = LOOP_VINFO_LOOP (loop_vinfo);
848
849 bbs = LOOP_VINFO_BBS (loop_vinfo);
850 nbbs = loop->num_nodes;
851
852 if (!clean_stmts)
853 {
854 free (LOOP_VINFO_BBS (loop_vinfo));
855 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
856 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
857 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
858 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
859 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
860
861 free (loop_vinfo);
862 loop->aux = NULL;
863 return;
864 }
865
866 for (j = 0; j < nbbs; j++)
867 {
868 basic_block bb = bbs[j];
869 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
870 free_stmt_vec_info (gsi_stmt (si));
871
872 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
873 {
874 gimple stmt = gsi_stmt (si);
875 /* Free stmt_vec_info. */
876 free_stmt_vec_info (stmt);
877 gsi_next (&si);
878 }
879 }
880
881 free (LOOP_VINFO_BBS (loop_vinfo));
882 free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
883 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
884 VEC_free (loop_p, heap, LOOP_VINFO_LOOP_NEST (loop_vinfo));
885 VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
886 VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
887 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
888 FOR_EACH_VEC_ELT (slp_instance, slp_instances, j, instance)
889 vect_free_slp_instance (instance);
890
891 VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
892 VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
893 VEC_free (gimple, heap, LOOP_VINFO_REDUCTIONS (loop_vinfo));
894 VEC_free (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo));
895
896 if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
897 htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
898
899 free (loop_vinfo);
900 loop->aux = NULL;
901 }
902
903
904 /* Function vect_analyze_loop_1.
905
906 Apply a set of analyses on LOOP, and create a loop_vec_info struct
907 for it. The different analyses will record information in the
908 loop_vec_info struct. This is a subset of the analyses applied in
909 vect_analyze_loop, to be applied on an inner-loop nested in the loop
910 that is now considered for (outer-loop) vectorization. */
911
912 static loop_vec_info
913 vect_analyze_loop_1 (struct loop *loop)
914 {
915 loop_vec_info loop_vinfo;
916
917 if (vect_print_dump_info (REPORT_DETAILS))
918 fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
919
920 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
921
922 loop_vinfo = vect_analyze_loop_form (loop);
923 if (!loop_vinfo)
924 {
925 if (vect_print_dump_info (REPORT_DETAILS))
926 fprintf (vect_dump, "bad inner-loop form.");
927 return NULL;
928 }
929
930 return loop_vinfo;
931 }
932
933
934 /* Function vect_analyze_loop_form.
935
936 Verify that certain CFG restrictions hold, including:
937 - the loop has a pre-header
938 - the loop has a single entry and exit
939 - the loop exit condition is simple enough, and the number of iterations
940 can be analyzed (a countable loop). */
941
942 loop_vec_info
943 vect_analyze_loop_form (struct loop *loop)
944 {
945 loop_vec_info loop_vinfo;
946 gimple loop_cond;
947 tree number_of_iterations = NULL;
948 loop_vec_info inner_loop_vinfo = NULL;
949
950 if (vect_print_dump_info (REPORT_DETAILS))
951 fprintf (vect_dump, "=== vect_analyze_loop_form ===");
952
953 /* Different restrictions apply when we are considering an inner-most loop,
954 vs. an outer (nested) loop.
955 (FORNOW. May want to relax some of these restrictions in the future). */
956
957 if (!loop->inner)
958 {
959 /* Inner-most loop. We currently require that the number of BBs is
960 exactly 2 (the header and latch). Vectorizable inner-most loops
961 look like this:
962
963 (pre-header)
964 |
965 header <--------+
966 | | |
967 | +--> latch --+
968 |
969 (exit-bb) */
970
971 if (loop->num_nodes != 2)
972 {
973 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
974 fprintf (vect_dump, "not vectorized: control flow in loop.");
975 return NULL;
976 }
977
978 if (empty_block_p (loop->header))
979 {
980 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
981 fprintf (vect_dump, "not vectorized: empty loop.");
982 return NULL;
983 }
984 }
985 else
986 {
987 struct loop *innerloop = loop->inner;
988 edge entryedge;
989
990 /* Nested loop. We currently require that the loop is doubly-nested,
991 contains a single inner loop, and the number of BBs is exactly 5.
992 Vectorizable outer-loops look like this:
993
994 (pre-header)
995 |
996 header <---+
997 | |
998 inner-loop |
999 | |
1000 tail ------+
1001 |
1002 (exit-bb)
1003
1004 The inner-loop has the properties expected of inner-most loops
1005 as described above. */
1006
1007 if ((loop->inner)->inner || (loop->inner)->next)
1008 {
1009 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1010 fprintf (vect_dump, "not vectorized: multiple nested loops.");
1011 return NULL;
1012 }
1013
1014 /* Analyze the inner-loop. */
1015 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1016 if (!inner_loop_vinfo)
1017 {
1018 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1019 fprintf (vect_dump, "not vectorized: Bad inner loop.");
1020 return NULL;
1021 }
1022
1023 if (!expr_invariant_in_loop_p (loop,
1024 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1025 {
1026 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1027 fprintf (vect_dump,
1028 "not vectorized: inner-loop count not invariant.");
1029 destroy_loop_vec_info (inner_loop_vinfo, true);
1030 return NULL;
1031 }
1032
1033 if (loop->num_nodes != 5)
1034 {
1035 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1036 fprintf (vect_dump, "not vectorized: control flow in loop.");
1037 destroy_loop_vec_info (inner_loop_vinfo, true);
1038 return NULL;
1039 }
1040
1041 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1042 entryedge = EDGE_PRED (innerloop->header, 0);
1043 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1044 entryedge = EDGE_PRED (innerloop->header, 1);
1045
1046 if (entryedge->src != loop->header
1047 || !single_exit (innerloop)
1048 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1049 {
1050 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1051 fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
1052 destroy_loop_vec_info (inner_loop_vinfo, true);
1053 return NULL;
1054 }
1055
1056 if (vect_print_dump_info (REPORT_DETAILS))
1057 fprintf (vect_dump, "Considering outer-loop vectorization.");
1058 }
1059
1060 if (!single_exit (loop)
1061 || EDGE_COUNT (loop->header->preds) != 2)
1062 {
1063 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1064 {
1065 if (!single_exit (loop))
1066 fprintf (vect_dump, "not vectorized: multiple exits.");
1067 else if (EDGE_COUNT (loop->header->preds) != 2)
1068 fprintf (vect_dump, "not vectorized: too many incoming edges.");
1069 }
1070 if (inner_loop_vinfo)
1071 destroy_loop_vec_info (inner_loop_vinfo, true);
1072 return NULL;
1073 }
1074
1075 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1076 that the loop is represented as a do-while (with a proper if-guard
1077 before the loop if needed), where the loop header contains all the
1078 executable statements, and the latch is empty. */
1079 if (!empty_block_p (loop->latch)
1080 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1081 {
1082 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1083 fprintf (vect_dump, "not vectorized: unexpected loop form.");
1084 if (inner_loop_vinfo)
1085 destroy_loop_vec_info (inner_loop_vinfo, true);
1086 return NULL;
1087 }
1088
1089 /* Make sure there exists a single-predecessor exit bb: */
1090 if (!single_pred_p (single_exit (loop)->dest))
1091 {
1092 edge e = single_exit (loop);
1093 if (!(e->flags & EDGE_ABNORMAL))
1094 {
1095 split_loop_exit_edge (e);
1096 if (vect_print_dump_info (REPORT_DETAILS))
1097 fprintf (vect_dump, "split exit edge.");
1098 }
1099 else
1100 {
1101 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1102 fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1103 if (inner_loop_vinfo)
1104 destroy_loop_vec_info (inner_loop_vinfo, true);
1105 return NULL;
1106 }
1107 }
1108
1109 loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1110 if (!loop_cond)
1111 {
1112 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1113 fprintf (vect_dump, "not vectorized: complicated exit condition.");
1114 if (inner_loop_vinfo)
1115 destroy_loop_vec_info (inner_loop_vinfo, true);
1116 return NULL;
1117 }
1118
1119 if (!number_of_iterations)
1120 {
1121 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1122 fprintf (vect_dump,
1123 "not vectorized: number of iterations cannot be computed.");
1124 if (inner_loop_vinfo)
1125 destroy_loop_vec_info (inner_loop_vinfo, true);
1126 return NULL;
1127 }
1128
1129 if (chrec_contains_undetermined (number_of_iterations))
1130 {
1131 if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1132 fprintf (vect_dump, "Infinite number of iterations.");
1133 if (inner_loop_vinfo)
1134 destroy_loop_vec_info (inner_loop_vinfo, true);
1135 return NULL;
1136 }
1137
1138 if (!NITERS_KNOWN_P (number_of_iterations))
1139 {
1140 if (vect_print_dump_info (REPORT_DETAILS))
1141 {
1142 fprintf (vect_dump, "Symbolic number of iterations is ");
1143 print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1144 }
1145 }
1146 else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1147 {
1148 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1149 fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1150 if (inner_loop_vinfo)
1151 destroy_loop_vec_info (inner_loop_vinfo, false);
1152 return NULL;
1153 }
1154
1155 loop_vinfo = new_loop_vec_info (loop);
1156 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1157 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1158
1159 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1160
1161 /* CHECKME: May want to keep it around in the future. */
1162 if (inner_loop_vinfo)
1163 destroy_loop_vec_info (inner_loop_vinfo, false);
1164
1165 gcc_assert (!loop->aux);
1166 loop->aux = loop_vinfo;
1167 return loop_vinfo;
1168 }
1169
1170
1171 /* Get cost by calling cost target builtin. */
1172
1173 static inline int
1174 vect_get_cost (enum vect_cost_for_stmt type_of_cost)
1175 {
1176 tree dummy_type = NULL;
1177 int dummy = 0;
1178
1179 return targetm.vectorize.builtin_vectorization_cost (type_of_cost,
1180 dummy_type, dummy);
1181 }
1182
1183
1184 /* Function vect_analyze_loop_operations.
1185
1186 Scan the loop stmts and make sure they are all vectorizable. */
1187
1188 static bool
1189 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1190 {
1191 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1192 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1193 int nbbs = loop->num_nodes;
1194 gimple_stmt_iterator si;
1195 unsigned int vectorization_factor = 0;
1196 int i;
1197 gimple phi;
1198 stmt_vec_info stmt_info;
1199 bool need_to_vectorize = false;
1200 int min_profitable_iters;
1201 int min_scalar_loop_bound;
1202 unsigned int th;
1203 bool only_slp_in_loop = true, ok;
1204
1205 if (vect_print_dump_info (REPORT_DETAILS))
1206 fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1207
1208 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1209 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1210 if (slp)
1211 {
1212 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1213 the vectorization factor of the loop is the unrolling factor required by
1214 the SLP instances. If that unrolling factor is 1, we say that we
1215 perform pure SLP on the loop - cross-iteration parallelism is not
1216 exploited. */
1217 for (i = 0; i < nbbs; i++)
1218 {
1219 basic_block bb = bbs[i];
1220 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1221 {
1222 gimple stmt = gsi_stmt (si);
1223 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1224 gcc_assert (stmt_info);
1225 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1226 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1227 && !PURE_SLP_STMT (stmt_info))
1228 /* STMT needs both SLP and loop-based vectorization. */
1229 only_slp_in_loop = false;
1230 }
1231 }
1232
1233 if (only_slp_in_loop)
1234 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1235 else
1236 vectorization_factor = least_common_multiple (vectorization_factor,
1237 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1238
1239 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1240 if (vect_print_dump_info (REPORT_DETAILS))
1241 fprintf (vect_dump, "Updating vectorization factor to %d ",
1242 vectorization_factor);
1243 }
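/* For instance, in the hybrid case with a loop VF of 4: an SLP unrolling
   factor of 2 leaves the factor at least_common_multiple (4, 2) = 4, while
   an unrolling factor of 3 would raise it to 12.  */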
1244
1245 for (i = 0; i < nbbs; i++)
1246 {
1247 basic_block bb = bbs[i];
1248
1249 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1250 {
1251 phi = gsi_stmt (si);
1252 ok = true;
1253
1254 stmt_info = vinfo_for_stmt (phi);
1255 if (vect_print_dump_info (REPORT_DETAILS))
1256 {
1257 fprintf (vect_dump, "examining phi: ");
1258 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1259 }
1260
1261 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1262 (i.e., a phi in the tail of the outer-loop). */
1263 if (! is_loop_header_bb_p (bb))
1264 {
1265 /* FORNOW: we currently don't support the case that these phis
1266 are not used in the outer-loop (unless it is a double reduction,
1267 i.e., this phi is vect_reduction_def), because this case
1268 requires us to actually do something here.
1269 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1270 || STMT_VINFO_LIVE_P (stmt_info))
1271 && STMT_VINFO_DEF_TYPE (stmt_info)
1272 != vect_double_reduction_def)
1273 {
1274 if (vect_print_dump_info (REPORT_DETAILS))
1275 fprintf (vect_dump,
1276 "Unsupported loop-closed phi in outer-loop.");
1277 return false;
1278 }
1279
1280 /* If PHI is used in the outer loop, we check that its operand
1281 is defined in the inner loop. */
1282 if (STMT_VINFO_RELEVANT_P (stmt_info))
1283 {
1284 tree phi_op;
1285 gimple op_def_stmt;
1286
1287 if (gimple_phi_num_args (phi) != 1)
1288 return false;
1289
1290 phi_op = PHI_ARG_DEF (phi, 0);
1291 if (TREE_CODE (phi_op) != SSA_NAME)
1292 return false;
1293
1294 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1295 if (!op_def_stmt || !vinfo_for_stmt (op_def_stmt))
1296 return false;
1297
1298 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1299 != vect_used_in_outer
1300 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1301 != vect_used_in_outer_by_reduction)
1302 return false;
1303 }
1304
1305 continue;
1306 }
1307
1308 gcc_assert (stmt_info);
1309
1310 if (STMT_VINFO_LIVE_P (stmt_info))
1311 {
1312 /* FORNOW: not yet supported. */
1313 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1314 fprintf (vect_dump, "not vectorized: value used after loop.");
1315 return false;
1316 }
1317
1318 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1319 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1320 {
1321 /* A scalar-dependence cycle that we don't support. */
1322 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1323 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1324 return false;
1325 }
1326
1327 if (STMT_VINFO_RELEVANT_P (stmt_info))
1328 {
1329 need_to_vectorize = true;
1330 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1331 ok = vectorizable_induction (phi, NULL, NULL);
1332 }
1333
1334 if (!ok)
1335 {
1336 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1337 {
1338 fprintf (vect_dump,
1339 "not vectorized: relevant phi not supported: ");
1340 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1341 }
1342 return false;
1343 }
1344 }
1345
1346 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1347 {
1348 gimple stmt = gsi_stmt (si);
1349 if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1350 return false;
1351 }
1352 } /* bbs */
1353
1354 /* All operations in the loop are either irrelevant (deal with loop
1355 control, or dead), or only used outside the loop and can be moved
1356 out of the loop (e.g. invariants, inductions). The loop can be
1357 optimized away by scalar optimizations. We're better off not
1358 touching this loop. */
1359 if (!need_to_vectorize)
1360 {
1361 if (vect_print_dump_info (REPORT_DETAILS))
1362 fprintf (vect_dump,
1363 "All the computation can be taken out of the loop.");
1364 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1365 fprintf (vect_dump,
1366 "not vectorized: redundant loop. no profit to vectorize.");
1367 return false;
1368 }
1369
1370 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1371 && vect_print_dump_info (REPORT_DETAILS))
1372 fprintf (vect_dump,
1373 "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1374 vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1375
1376 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1377 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1378 {
1379 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1380 fprintf (vect_dump, "not vectorized: iteration count too small.");
1381 if (vect_print_dump_info (REPORT_DETAILS))
1382 fprintf (vect_dump,"not vectorized: iteration count smaller than "
1383 "vectorization factor.");
1384 return false;
1385 }
1386
1387 /* Analyze cost. Decide if worth while to vectorize. */
1388
1389 /* Once VF is set, SLP costs should be updated since the number of created
1390 vector stmts depends on VF. */
1391 vect_update_slp_costs_according_to_vf (loop_vinfo);
1392
1393 min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1394 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1395
1396 if (min_profitable_iters < 0)
1397 {
1398 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1399 fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1400 if (vect_print_dump_info (REPORT_DETAILS))
1401 fprintf (vect_dump, "not vectorized: vector version will never be "
1402 "profitable.");
1403 return false;
1404 }
1405
1406 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1407 * vectorization_factor) - 1);
1408
1409 /* Use the cost model only if it is more conservative than the user-specified
1410 threshold. */
1411
1412 th = (unsigned) min_scalar_loop_bound;
1413 if (min_profitable_iters
1414 && (!min_scalar_loop_bound
1415 || min_profitable_iters > min_scalar_loop_bound))
1416 th = (unsigned) min_profitable_iters;
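/* A worked example (hypothetical numbers): with --param min-vect-loop-bound=2
   and a VF of 4, min_scalar_loop_bound is 2 * 4 - 1 = 7; if the cost model
   returned min_profitable_iters = 10, TH becomes 10, and a known iteration
   count of 10 or less makes us give up below.  */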
1417
1418 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1419 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1420 {
1421 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1422 fprintf (vect_dump, "not vectorized: vectorization not "
1423 "profitable.");
1424 if (vect_print_dump_info (REPORT_DETAILS))
1425 fprintf (vect_dump, "not vectorized: iteration count smaller than "
1426 "user specified loop bound parameter or minimum "
1427 "profitable iterations (whichever is more conservative).");
1428 return false;
1429 }
1430
1431 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1432 || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1433 || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1434 {
1435 if (vect_print_dump_info (REPORT_DETAILS))
1436 fprintf (vect_dump, "epilog loop required.");
1437 if (!vect_can_advance_ivs_p (loop_vinfo))
1438 {
1439 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1440 fprintf (vect_dump,
1441 "not vectorized: can't create epilog loop 1.");
1442 return false;
1443 }
1444 if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1445 {
1446 if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1447 fprintf (vect_dump,
1448 "not vectorized: can't create epilog loop 2.");
1449 return false;
1450 }
1451 }
1452
1453 return true;
1454 }
1455
1456
1457 /* Function vect_analyze_loop_2.
1458
1459 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1460 for it. The different analyses will record information in the
1461 loop_vec_info struct. */
1462 static bool
1463 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1464 {
1465 bool ok, slp = false;
1466 int max_vf = MAX_VECTORIZATION_FACTOR;
1467 int min_vf = 2;
1468
1469 /* Find all data references in the loop (which correspond to vdefs/vuses)
1470 and analyze their evolution in the loop. Also adjust the minimal
1471 vectorization factor according to the loads and stores.
1472
1473 FORNOW: Handle only simple array references, whose
1474 alignment can be forced, and aligned pointer-references. */
1475
1476 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1477 if (!ok)
1478 {
1479 if (vect_print_dump_info (REPORT_DETAILS))
1480 fprintf (vect_dump, "bad data references.");
1481 return false;
1482 }
1483
1484 /* Classify all cross-iteration scalar data-flow cycles.
1485 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1486
1487 vect_analyze_scalar_cycles (loop_vinfo);
1488
1489 vect_pattern_recog (loop_vinfo);
1490
1491 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1492
1493 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1494 if (!ok)
1495 {
1496 if (vect_print_dump_info (REPORT_DETAILS))
1497 fprintf (vect_dump, "unexpected pattern.");
1498 return false;
1499 }
1500
1501 /* Analyze data dependences between the data-refs in the loop
1502 and adjust the maximum vectorization factor according to
1503 the dependences.
1504 FORNOW: fail at the first data dependence that we encounter. */
1505
1506 ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
1507 if (!ok
1508 || max_vf < min_vf)
1509 {
1510 if (vect_print_dump_info (REPORT_DETAILS))
1511 fprintf (vect_dump, "bad data dependence.");
1512 return false;
1513 }
1514
1515 ok = vect_determine_vectorization_factor (loop_vinfo);
1516 if (!ok)
1517 {
1518 if (vect_print_dump_info (REPORT_DETAILS))
1519 fprintf (vect_dump, "can't determine vectorization factor.");
1520 return false;
1521 }
1522 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1523 {
1524 if (vect_print_dump_info (REPORT_DETAILS))
1525 fprintf (vect_dump, "bad data dependence.");
1526 return false;
1527 }
1528
1529 /* Analyze the alignment of the data-refs in the loop.
1530 Fail if a data reference is found that cannot be vectorized. */
1531
1532 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1533 if (!ok)
1534 {
1535 if (vect_print_dump_info (REPORT_DETAILS))
1536 fprintf (vect_dump, "bad data alignment.");
1537 return false;
1538 }
1539
1540 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1541 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1542
1543 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1544 if (!ok)
1545 {
1546 if (vect_print_dump_info (REPORT_DETAILS))
1547 fprintf (vect_dump, "bad data access.");
1548 return false;
1549 }
1550
1551 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1552 It is important to call pruning after vect_analyze_data_ref_accesses,
1553 since we use grouping information gathered by interleaving analysis. */
1554 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1555 if (!ok)
1556 {
1557 if (vect_print_dump_info (REPORT_DETAILS))
1558 fprintf (vect_dump, "too long list of versioning for alias "
1559 "run-time tests.");
1560 return false;
1561 }
1562
1563 /* This pass will decide on using loop versioning and/or loop peeling in
1564 order to enhance the alignment of data references in the loop. */
1565
1566 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1567 if (!ok)
1568 {
1569 if (vect_print_dump_info (REPORT_DETAILS))
1570 fprintf (vect_dump, "bad data alignment.");
1571 return false;
1572 }
1573
1574 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1575 ok = vect_analyze_slp (loop_vinfo, NULL);
1576 if (ok)
1577 {
1578 /* Decide which possible SLP instances to SLP. */
1579 slp = vect_make_slp_decision (loop_vinfo);
1580
1581 /* Find stmts that need to be both vectorized and SLPed. */
1582 vect_detect_hybrid_slp (loop_vinfo);
1583 }
1584 else
1585 return false;
1586
1587 /* Scan all the operations in the loop and make sure they are
1588 vectorizable. */
1589
1590 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1591 if (!ok)
1592 {
1593 if (vect_print_dump_info (REPORT_DETAILS))
1594 fprintf (vect_dump, "bad operation or unsupported loop bound.");
1595 return false;
1596 }
1597
1598 return true;
1599 }
1600
1601 /* Function vect_analyze_loop.
1602
1603 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1604 for it. The different analyses will record information in the
1605 loop_vec_info struct. */
1606 loop_vec_info
1607 vect_analyze_loop (struct loop *loop)
1608 {
1609 loop_vec_info loop_vinfo;
1610 unsigned int vector_sizes;
1611
1612 /* Autodetect first vector size we try. */
1613 current_vector_size = 0;
1614 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1615
1616 if (vect_print_dump_info (REPORT_DETAILS))
1617 fprintf (vect_dump, "===== analyze_loop_nest =====");
1618
1619 if (loop_outer (loop)
1620 && loop_vec_info_for_loop (loop_outer (loop))
1621 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1622 {
1623 if (vect_print_dump_info (REPORT_DETAILS))
1624 fprintf (vect_dump, "outer-loop already vectorized.");
1625 return NULL;
1626 }
1627
1628 while (1)
1629 {
1630 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1631 loop_vinfo = vect_analyze_loop_form (loop);
1632 if (!loop_vinfo)
1633 {
1634 if (vect_print_dump_info (REPORT_DETAILS))
1635 fprintf (vect_dump, "bad loop form.");
1636 return NULL;
1637 }
1638
1639 if (vect_analyze_loop_2 (loop_vinfo))
1640 {
1641 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1642
1643 return loop_vinfo;
1644 }
1645
1646 destroy_loop_vec_info (loop_vinfo, true);
1647
1648 vector_sizes &= ~current_vector_size;
1649 if (vector_sizes == 0
1650 || current_vector_size == 0)
1651 return NULL;
1652
1653 /* Try the next biggest vector size. */
1654 current_vector_size = 1 << floor_log2 (vector_sizes);
1655 if (vect_print_dump_info (REPORT_DETAILS))
1656 fprintf (vect_dump, "***** Re-trying analysis with "
1657 "vector size %d\n", current_vector_size);
1658 }
1659 }
1660
1661
1662 /* Function reduction_code_for_scalar_code
1663
1664 Input:
1665 CODE - tree_code of a reduction operation.
1666
1667 Output:
1668 REDUC_CODE - the corresponding tree-code to be used to reduce the
1669 vector of partial results into a single scalar result (which
1670 will also reside in a vector) or ERROR_MARK if the operation is
1671 a supported reduction operation, but does not have such a tree-code.
1672
1673 Return FALSE if CODE currently cannot be vectorized as reduction. */
1674
1675 static bool
1676 reduction_code_for_scalar_code (enum tree_code code,
1677 enum tree_code *reduc_code)
1678 {
1679 switch (code)
1680 {
1681 case MAX_EXPR:
1682 *reduc_code = REDUC_MAX_EXPR;
1683 return true;
1684
1685 case MIN_EXPR:
1686 *reduc_code = REDUC_MIN_EXPR;
1687 return true;
1688
1689 case PLUS_EXPR:
1690 *reduc_code = REDUC_PLUS_EXPR;
1691 return true;
1692
1693 case MULT_EXPR:
1694 case MINUS_EXPR:
1695 case BIT_IOR_EXPR:
1696 case BIT_XOR_EXPR:
1697 case BIT_AND_EXPR:
1698 *reduc_code = ERROR_MARK;
1699 return true;
1700
1701 default:
1702 return false;
1703 }
1704 }
1705
1706
1707 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1708 STMT is printed with a message MSG. */
1709
1710 static void
1711 report_vect_op (gimple stmt, const char *msg)
1712 {
1713 fprintf (vect_dump, "%s", msg);
1714 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1715 }
1716
1717
1718 /* Detect SLP reduction of the form:
1719
1720 #a1 = phi <a5, a0>
1721 a2 = operation (a1)
1722 a3 = operation (a2)
1723 a4 = operation (a3)
1724 a5 = operation (a4)
1725
1726 #a = phi <a5>
1727
1728 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1729 FIRST_STMT is the first reduction stmt in the chain
1730 (a2 = operation (a1)).
1731
1732 Return TRUE if a reduction chain was detected. */
1733
1734 static bool
1735 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1736 {
1737 struct loop *loop = (gimple_bb (phi))->loop_father;
1738 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1739 enum tree_code code;
1740 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1741 stmt_vec_info use_stmt_info, current_stmt_info;
1742 tree lhs;
1743 imm_use_iterator imm_iter;
1744 use_operand_p use_p;
1745 int nloop_uses, size = 0, n_out_of_loop_uses;
1746 bool found = false;
1747
1748 if (loop != vect_loop)
1749 return false;
1750
1751 lhs = PHI_RESULT (phi);
1752 code = gimple_assign_rhs_code (first_stmt);
1753 while (1)
1754 {
1755 nloop_uses = 0;
1756 n_out_of_loop_uses = 0;
1757 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1758 {
1759 gimple use_stmt = USE_STMT (use_p);
1760 if (is_gimple_debug (use_stmt))
1761 continue;
1762
1763 use_stmt = USE_STMT (use_p);
1764
1765 /* Check if we got back to the reduction phi. */
1766 if (use_stmt == phi)
1767 {
1768 loop_use_stmt = use_stmt;
1769 found = true;
1770 break;
1771 }
1772
1773 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1774 {
1775 if (vinfo_for_stmt (use_stmt)
1776 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1777 {
1778 loop_use_stmt = use_stmt;
1779 nloop_uses++;
1780 }
1781 }
1782 else
1783 n_out_of_loop_uses++;
1784
1785 /* There can be either a single use in the loop or two uses in
1786 phi nodes. */
1787 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1788 return false;
1789 }
1790
1791 if (found)
1792 break;
1793
1794 /* We reached a statement with no loop uses. */
1795 if (nloop_uses == 0)
1796 return false;
1797
1798 /* This is a loop exit phi, and we haven't reached the reduction phi. */
1799 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1800 return false;
1801
1802 if (!is_gimple_assign (loop_use_stmt)
1803 || code != gimple_assign_rhs_code (loop_use_stmt)
1804 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1805 return false;
1806
1807 /* Insert USE_STMT into reduction chain. */
1808 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1809 if (current_stmt)
1810 {
1811 current_stmt_info = vinfo_for_stmt (current_stmt);
1812 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1813 GROUP_FIRST_ELEMENT (use_stmt_info)
1814 = GROUP_FIRST_ELEMENT (current_stmt_info);
1815 }
1816 else
1817 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1818
1819 lhs = gimple_assign_lhs (loop_use_stmt);
1820 current_stmt = loop_use_stmt;
1821 size++;
1822 }
1823
1824 if (!found || loop_use_stmt != phi || size < 2)
1825 return false;
1826
1827 /* Swap the operands, if needed, to make the reduction operand be the second
1828 operand. */
1829 lhs = PHI_RESULT (phi);
1830 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1831 while (next_stmt)
1832 {
1833 if (gimple_assign_rhs2 (next_stmt) == lhs)
1834 {
1835 tree op = gimple_assign_rhs1 (next_stmt);
1836 gimple def_stmt = NULL;
1837
1838 if (TREE_CODE (op) == SSA_NAME)
1839 def_stmt = SSA_NAME_DEF_STMT (op);
1840
1841 /* Check that the other def is either defined in the loop
1842 ("vect_internal_def"), or it's an induction (defined by a
1843 loop-header phi-node). */
1844 if (def_stmt
1845 && gimple_bb (def_stmt)
1846 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1847 && (is_gimple_assign (def_stmt)
1848 || is_gimple_call (def_stmt)
1849 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1850 == vect_induction_def
1851 || (gimple_code (def_stmt) == GIMPLE_PHI
1852 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1853 == vect_internal_def
1854 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1855 {
1856 lhs = gimple_assign_lhs (next_stmt);
1857 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1858 continue;
1859 }
1860
1861 return false;
1862 }
1863 else
1864 {
1865 tree op = gimple_assign_rhs2 (next_stmt);
1866 gimple def_stmt = NULL;
1867
1868 if (TREE_CODE (op) == SSA_NAME)
1869 def_stmt = SSA_NAME_DEF_STMT (op);
1870
1871 /* Check that the other def is either defined in the loop
1872 ("vect_internal_def"), or it's an induction (defined by a
1873 loop-header phi-node). */
1874 if (def_stmt
1875 && gimple_bb (def_stmt)
1876 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1877 && (is_gimple_assign (def_stmt)
1878 || is_gimple_call (def_stmt)
1879 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1880 == vect_induction_def
1881 || (gimple_code (def_stmt) == GIMPLE_PHI
1882 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1883 == vect_internal_def
1884 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1885 {
1886 if (vect_print_dump_info (REPORT_DETAILS))
1887 {
1888 fprintf (vect_dump, "swapping oprnds: ");
1889 print_gimple_stmt (vect_dump, next_stmt, 0, TDF_SLIM);
1890 }
1891
1892 swap_tree_operands (next_stmt,
1893 gimple_assign_rhs1_ptr (next_stmt),
1894 gimple_assign_rhs2_ptr (next_stmt));
1895 mark_symbols_for_renaming (next_stmt);
1896 }
1897 else
1898 return false;
1899 }
1900
1901 lhs = gimple_assign_lhs (next_stmt);
1902 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1903 }
1904
1905 /* Save the chain for further analysis in SLP detection. */
1906 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1907 VEC_safe_push (gimple, heap, LOOP_VINFO_REDUCTION_CHAINS (loop_info), first);
1908 GROUP_SIZE (vinfo_for_stmt (first)) = size;
1909
1910 return true;
1911 }
1912
1913
1914 /* Function vect_is_simple_reduction_1
1915
1916 (1) Detect a cross-iteration def-use cycle that represents a simple
1917 reduction computation. We look for the following pattern:
1918
1919 loop_header:
1920 a1 = phi < a0, a2 >
1921 a3 = ...
1922 a2 = operation (a3, a1)
1923
1924 such that:
1925 1. operation is commutative and associative and it is safe to
1926 change the order of the computation (if CHECK_REDUCTION is true)
1927 2. no uses for a2 in the loop (a2 is used out of the loop)
1928 3. no uses of a1 in the loop besides the reduction operation
1929 4. no uses of a1 outside the loop.
1930
1931 Conditions 1,4 are tested here.
1932 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1933
1934 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1935 nested cycles, if CHECK_REDUCTION is false.
1936
1937 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1938 reductions:
1939
1940 a1 = phi < a0, a2 >
1941 inner loop (def of a3)
1942 a2 = phi < a3 >
1943
1944 If MODIFY is true it also tries to rework the code in-place to enable
1945 detection of more reduction patterns. For the time being we rewrite
1946 "res -= RHS" into "res += -RHS" when it seems worthwhile.
1947 */
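
/* An illustrative source form of case (3), a double reduction (the
   variable names are made up for this sketch):

     s = 0;
     for (j = 0; j < m; j++)      <-- outer loop, being vectorized
       for (i = 0; i < n; i++)    <-- inner loop
         s += a[j][i];

   Here the outer-loop header phi for s corresponds to a1 above, the
   inner-loop exit phi to a2, and the inner-loop addition defines a3.  */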
1948
1949 static gimple
1950 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
1951 bool check_reduction, bool *double_reduc,
1952 bool modify)
1953 {
1954 struct loop *loop = (gimple_bb (phi))->loop_father;
1955 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1956 edge latch_e = loop_latch_edge (loop);
1957 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1958 gimple def_stmt, def1 = NULL, def2 = NULL;
1959 enum tree_code orig_code, code;
1960 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1961 tree type;
1962 int nloop_uses;
1963 tree name;
1964 imm_use_iterator imm_iter;
1965 use_operand_p use_p;
1966 bool phi_def;
1967
1968 *double_reduc = false;
1969
1970 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1971 otherwise, we assume outer loop vectorization. */
1972 gcc_assert ((check_reduction && loop == vect_loop)
1973 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1974
1975 name = PHI_RESULT (phi);
1976 nloop_uses = 0;
1977 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1978 {
1979 gimple use_stmt = USE_STMT (use_p);
1980 if (is_gimple_debug (use_stmt))
1981 continue;
1982
1983 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1984 {
1985 if (vect_print_dump_info (REPORT_DETAILS))
1986 fprintf (vect_dump, "intermediate value used outside loop.");
1987
1988 return NULL;
1989 }
1990
1991 if (vinfo_for_stmt (use_stmt)
1992 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1993 nloop_uses++;
1994 if (nloop_uses > 1)
1995 {
1996 if (vect_print_dump_info (REPORT_DETAILS))
1997 fprintf (vect_dump, "reduction used in loop.");
1998 return NULL;
1999 }
2000 }
2001
2002 if (TREE_CODE (loop_arg) != SSA_NAME)
2003 {
2004 if (vect_print_dump_info (REPORT_DETAILS))
2005 {
2006 fprintf (vect_dump, "reduction: not ssa_name: ");
2007 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
2008 }
2009 return NULL;
2010 }
2011
2012 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2013 if (!def_stmt)
2014 {
2015 if (vect_print_dump_info (REPORT_DETAILS))
2016 fprintf (vect_dump, "reduction: no def_stmt.");
2017 return NULL;
2018 }
2019
2020 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2021 {
2022 if (vect_print_dump_info (REPORT_DETAILS))
2023 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
2024 return NULL;
2025 }
2026
2027 if (is_gimple_assign (def_stmt))
2028 {
2029 name = gimple_assign_lhs (def_stmt);
2030 phi_def = false;
2031 }
2032 else
2033 {
2034 name = PHI_RESULT (def_stmt);
2035 phi_def = true;
2036 }
2037
2038 nloop_uses = 0;
2039 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2040 {
2041 gimple use_stmt = USE_STMT (use_p);
2042 if (is_gimple_debug (use_stmt))
2043 continue;
2044 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2045 && vinfo_for_stmt (use_stmt)
2046 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2047 nloop_uses++;
2048 if (nloop_uses > 1)
2049 {
2050 if (vect_print_dump_info (REPORT_DETAILS))
2051 fprintf (vect_dump, "reduction used in loop.");
2052 return NULL;
2053 }
2054 }
2055
2056 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2057 defined in the inner loop. */
2058 if (phi_def)
2059 {
2060 op1 = PHI_ARG_DEF (def_stmt, 0);
2061
2062 if (gimple_phi_num_args (def_stmt) != 1
2063 || TREE_CODE (op1) != SSA_NAME)
2064 {
2065 if (vect_print_dump_info (REPORT_DETAILS))
2066 fprintf (vect_dump, "unsupported phi node definition.");
2067
2068 return NULL;
2069 }
2070
2071 def1 = SSA_NAME_DEF_STMT (op1);
2072 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2073 && loop->inner
2074 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2075 && is_gimple_assign (def1))
2076 {
2077 if (vect_print_dump_info (REPORT_DETAILS))
2078 report_vect_op (def_stmt, "detected double reduction: ");
2079
2080 *double_reduc = true;
2081 return def_stmt;
2082 }
2083
2084 return NULL;
2085 }
2086
2087 code = orig_code = gimple_assign_rhs_code (def_stmt);
2088
2089 /* We can handle "res -= x[i]", which is non-associative, by
2090 simply rewriting it into "res += -x[i]". Avoid changing the
2091 gimple instruction for the first simple tests and only do this
2092 if we're allowed to change code at all. */
2093 if (code == MINUS_EXPR
2094 && modify
2095 && (op1 = gimple_assign_rhs1 (def_stmt))
2096 && TREE_CODE (op1) == SSA_NAME
2097 && SSA_NAME_DEF_STMT (op1) == phi)
2098 code = PLUS_EXPR;
2099
2100 if (check_reduction
2101 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2102 {
2103 if (vect_print_dump_info (REPORT_DETAILS))
2104 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
2105 return NULL;
2106 }
2107
2108 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2109 {
2110 if (code != COND_EXPR)
2111 {
2112 if (vect_print_dump_info (REPORT_DETAILS))
2113 report_vect_op (def_stmt, "reduction: not binary operation: ");
2114
2115 return NULL;
2116 }
2117
2118 op3 = gimple_assign_rhs1 (def_stmt);
2119 if (COMPARISON_CLASS_P (op3))
2120 {
2121 op4 = TREE_OPERAND (op3, 1);
2122 op3 = TREE_OPERAND (op3, 0);
2123 }
2124
2125 op1 = gimple_assign_rhs2 (def_stmt);
2126 op2 = gimple_assign_rhs3 (def_stmt);
2127
2128 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2129 {
2130 if (vect_print_dump_info (REPORT_DETAILS))
2131 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2132
2133 return NULL;
2134 }
2135 }
2136 else
2137 {
2138 op1 = gimple_assign_rhs1 (def_stmt);
2139 op2 = gimple_assign_rhs2 (def_stmt);
2140
2141 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2142 {
2143 if (vect_print_dump_info (REPORT_DETAILS))
2144 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
2145
2146 return NULL;
2147 }
2148 }
2149
2150 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2151 if ((TREE_CODE (op1) == SSA_NAME
2152 && !types_compatible_p (type, TREE_TYPE (op1)))
2153 || (TREE_CODE (op2) == SSA_NAME
2154 && !types_compatible_p (type, TREE_TYPE (op2)))
2155 || (op3 && TREE_CODE (op3) == SSA_NAME
2156 && !types_compatible_p (type, TREE_TYPE (op3)))
2157 || (op4 && TREE_CODE (op4) == SSA_NAME
2158 && !types_compatible_p (type, TREE_TYPE (op4))))
2159 {
2160 if (vect_print_dump_info (REPORT_DETAILS))
2161 {
2162 fprintf (vect_dump, "reduction: multiple types: operation type: ");
2163 print_generic_expr (vect_dump, type, TDF_SLIM);
2164 fprintf (vect_dump, ", operands types: ");
2165 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
2166 fprintf (vect_dump, ",");
2167 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
2168 if (op3)
2169 {
2170 fprintf (vect_dump, ",");
2171 print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
2172 }
2173
2174 if (op4)
2175 {
2176 fprintf (vect_dump, ",");
2177 print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
2178 }
2179 }
2180
2181 return NULL;
2182 }
2183
2184 /* Check that it's ok to change the order of the computation.
2185 Generally, when vectorizing a reduction we change the order of the
2186 computation. This may change the behavior of the program in some
2187 cases, so we need to check that this is ok. One exception is when
2188 vectorizing an outer-loop: the inner-loop is executed sequentially,
2189 and therefore vectorizing reductions in the inner-loop during
2190 outer-loop vectorization is safe. */
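
/* For example (a sketch with made-up numbers): with a 4-wide vector sum
   the scalar order a0+a1+a2+a3+... becomes
     (a0+a4+...) + (a1+a5+...) + (a2+a6+...) + (a3+a7+...)
   which can round differently for floating point, can saturate at a
   different point for saturating fixed-point types, and can move or
   remove a signed-overflow trap when -ftrapv is in effect; hence the
   three bail-outs below when CHECK_REDUCTION is set.  */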
2191
2192 /* CHECKME: check for !flag_finite_math_only too? */
2193 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2194 && check_reduction)
2195 {
2196 /* Changing the order of operations changes the semantics. */
2197 if (vect_print_dump_info (REPORT_DETAILS))
2198 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
2199 return NULL;
2200 }
2201 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2202 && check_reduction)
2203 {
2204 /* Changing the order of operations changes the semantics. */
2205 if (vect_print_dump_info (REPORT_DETAILS))
2206 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
2207 return NULL;
2208 }
2209 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2210 {
2211 /* Changing the order of operations changes the semantics. */
2212 if (vect_print_dump_info (REPORT_DETAILS))
2213 report_vect_op (def_stmt,
2214 "reduction: unsafe fixed-point math optimization: ");
2215 return NULL;
2216 }
2217
2218 /* If we detected "res -= x[i]" earlier, rewrite it into
2219 "res += -x[i]" now. If this turns out to be useless reassoc
2220 will clean it up again. */
2221 if (orig_code == MINUS_EXPR)
2222 {
2223 tree rhs = gimple_assign_rhs2 (def_stmt);
2224 tree negrhs = make_ssa_name (SSA_NAME_VAR (rhs), NULL);
2225 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2226 rhs, NULL);
2227 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2228 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2229 loop_info, NULL));
2230 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2231 gimple_assign_set_rhs2 (def_stmt, negrhs);
2232 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2233 update_stmt (def_stmt);
2234 }
2235
2236 /* Reduction is safe. We're dealing with one of the following:
2237 1) integer arithmetic and no trapv
2238 2) floating point arithmetic, and special flags permit this optimization
2239 3) nested cycle (i.e., outer loop vectorization). */
2240 if (TREE_CODE (op1) == SSA_NAME)
2241 def1 = SSA_NAME_DEF_STMT (op1);
2242
2243 if (TREE_CODE (op2) == SSA_NAME)
2244 def2 = SSA_NAME_DEF_STMT (op2);
2245
2246 if (code != COND_EXPR
2247 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2248 {
2249 if (vect_print_dump_info (REPORT_DETAILS))
2250 report_vect_op (def_stmt, "reduction: no defs for operands: ");
2251 return NULL;
2252 }
2253
2254 /* Check that one def is the reduction def, defined by PHI,
2255 the other def is either defined in the loop ("vect_internal_def"),
2256 or it's an induction (defined by a loop-header phi-node). */
2257
2258 if (def2 && def2 == phi
2259 && (code == COND_EXPR
2260 || !def1 || gimple_nop_p (def1)
2261 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2262 && (is_gimple_assign (def1)
2263 || is_gimple_call (def1)
2264 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2265 == vect_induction_def
2266 || (gimple_code (def1) == GIMPLE_PHI
2267 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2268 == vect_internal_def
2269 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2270 {
2271 if (vect_print_dump_info (REPORT_DETAILS))
2272 report_vect_op (def_stmt, "detected reduction: ");
2273 return def_stmt;
2274 }
2275
2276 if (def1 && def1 == phi
2277 && (code == COND_EXPR
2278 || !def2 || gimple_nop_p (def2)
2279 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2280 && (is_gimple_assign (def2)
2281 || is_gimple_call (def2)
2282 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2283 == vect_induction_def
2284 || (gimple_code (def2) == GIMPLE_PHI
2285 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2286 == vect_internal_def
2287 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2288 {
2289 if (check_reduction)
2290 {
2291 /* Swap operands (just for simplicity - so that the rest of the code
2292 can assume that the reduction variable is always the last (second)
2293 argument). */
2294 if (vect_print_dump_info (REPORT_DETAILS))
2295 report_vect_op (def_stmt,
2296 "detected reduction: need to swap operands: ");
2297
2298 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2299 gimple_assign_rhs2_ptr (def_stmt));
2300 }
2301 else
2302 {
2303 if (vect_print_dump_info (REPORT_DETAILS))
2304 report_vect_op (def_stmt, "detected reduction: ");
2305 }
2306
2307 return def_stmt;
2308 }
2309
2310 /* Try to find SLP reduction chain. */
2311 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2312 {
2313 if (vect_print_dump_info (REPORT_DETAILS))
2314 report_vect_op (def_stmt, "reduction: detected reduction chain: ");
2315
2316 return def_stmt;
2317 }
2318
2319 if (vect_print_dump_info (REPORT_DETAILS))
2320 report_vect_op (def_stmt, "reduction: unknown pattern: ");
2321
2322 return NULL;
2323 }
2324
2325 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2326 in-place. Arguments as there. */
2327
2328 static gimple
2329 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2330 bool check_reduction, bool *double_reduc)
2331 {
2332 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2333 double_reduc, false);
2334 }
2335
2336 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2337 in-place if it enables detection of more reductions. Arguments
2338 as there. */
2339
2340 gimple
2341 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2342 bool check_reduction, bool *double_reduc)
2343 {
2344 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2345 double_reduc, true);
2346 }
2347
2348 /* Calculate the cost of one scalar iteration of the loop. */
2349 int
2350 vect_get_single_scalar_iteraion_cost (loop_vec_info loop_vinfo)
2351 {
2352 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2353 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2354 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2355 int innerloop_iters, i, stmt_cost;
2356
2357 /* Count statements in scalar loop. Using this as scalar cost for a single
2358 iteration for now.
2359
2360 TODO: Add outer loop support.
2361
2362 TODO: Consider assigning different costs to different scalar
2363 statements. */
2364
2365 /* FORNOW. */
2366 innerloop_iters = 1;
2367 if (loop->inner)
2368 innerloop_iters = 50; /* FIXME */
2369
2370 for (i = 0; i < nbbs; i++)
2371 {
2372 gimple_stmt_iterator si;
2373 basic_block bb = bbs[i];
2374
2375 if (bb->loop_father == loop->inner)
2376 factor = innerloop_iters;
2377 else
2378 factor = 1;
2379
2380 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2381 {
2382 gimple stmt = gsi_stmt (si);
2383 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2384
2385 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2386 continue;
2387
2388 /* Skip stmts that are not vectorized inside the loop. */
2389 if (stmt_info
2390 && !STMT_VINFO_RELEVANT_P (stmt_info)
2391 && (!STMT_VINFO_LIVE_P (stmt_info)
2392 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2393 continue;
2394
2395 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2396 {
2397 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2398 stmt_cost = vect_get_cost (scalar_load);
2399 else
2400 stmt_cost = vect_get_cost (scalar_store);
2401 }
2402 else
2403 stmt_cost = vect_get_cost (scalar_stmt);
2404
2405 scalar_single_iter_cost += stmt_cost * factor;
2406 }
2407 }
2408 return scalar_single_iter_cost;
2409 }
2410
2411 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
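/* For example (illustrative numbers only): with VF = 4, 3 prologue
   iterations, a known trip count of 10 and a scalar iteration cost of 2,
   the epilogue gets (10 - 3) % 4 = 3 iterations and the function returns
   (3 + 3) * 2 = 12, with no guard cost since the counts are known at
   compile time.  */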
2412 int
2413 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2414 int *peel_iters_epilogue,
2415 int scalar_single_iter_cost)
2416 {
2417 int peel_guard_costs = 0;
2418 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2419
2420 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2421 {
2422 *peel_iters_epilogue = vf/2;
2423 if (vect_print_dump_info (REPORT_COST))
2424 fprintf (vect_dump, "cost model: "
2425 "epilogue peel iters set to vf/2 because "
2426 "loop iterations are unknown .");
2427
2428 /* If peeled iterations are known but the number of scalar loop
2429 iterations is unknown, count a taken branch per peeled loop. */
2430 peel_guard_costs = 2 * vect_get_cost (cond_branch_taken);
2431 }
2432 else
2433 {
2434 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2435 peel_iters_prologue = niters < peel_iters_prologue ?
2436 niters : peel_iters_prologue;
2437 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2438 /* If we need to peel for gaps, but no peeling is required, we have to
2439 peel VF iterations. */
2440 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2441 *peel_iters_epilogue = vf;
2442 }
2443
2444 return (peel_iters_prologue * scalar_single_iter_cost)
2445 + (*peel_iters_epilogue * scalar_single_iter_cost)
2446 + peel_guard_costs;
2447 }
2448
2449 /* Function vect_estimate_min_profitable_iters
2450
2451 Return the number of iterations required for the vector version of the
2452 loop to be profitable relative to the cost of the scalar version of the
2453 loop.
2454
2455 TODO: Take profile info into account before making vectorization
2456 decisions, if available. */
2457
2458 int
2459 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
2460 {
2461 int i;
2462 int min_profitable_iters;
2463 int peel_iters_prologue;
2464 int peel_iters_epilogue;
2465 int vec_inside_cost = 0;
2466 int vec_outside_cost = 0;
2467 int scalar_single_iter_cost = 0;
2468 int scalar_outside_cost = 0;
2469 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2470 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2471 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2472 int nbbs = loop->num_nodes;
2473 int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2474 int peel_guard_costs = 0;
2475 int innerloop_iters = 0, factor;
2476 VEC (slp_instance, heap) *slp_instances;
2477 slp_instance instance;
2478
2479 /* Cost model disabled. */
2480 if (!flag_vect_cost_model)
2481 {
2482 if (vect_print_dump_info (REPORT_COST))
2483 fprintf (vect_dump, "cost model disabled.");
2484 return 0;
2485 }
2486
2487 /* Requires loop versioning tests to handle misalignment. */
2488 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2489 {
2490 /* FIXME: Make cost depend on complexity of individual check. */
2491 vec_outside_cost +=
2492 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
2493 if (vect_print_dump_info (REPORT_COST))
2494 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2495 "versioning to treat misalignment.\n");
2496 }
2497
2498 /* Requires loop versioning with alias checks. */
2499 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2500 {
2501 /* FIXME: Make cost depend on complexity of individual check. */
2502 vec_outside_cost +=
2503 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
2504 if (vect_print_dump_info (REPORT_COST))
2505 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
2506 "versioning aliasing.\n");
2507 }
2508
2509 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2510 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2511 vec_outside_cost += vect_get_cost (cond_branch_taken);
2512
2513 /* Count statements in scalar loop. Using this as scalar cost for a single
2514 iteration for now.
2515
2516 TODO: Add outer loop support.
2517
2518 TODO: Consider assigning different costs to different scalar
2519 statements. */
2520
2521 /* FORNOW. */
2522 if (loop->inner)
2523 innerloop_iters = 50; /* FIXME */
2524
2525 for (i = 0; i < nbbs; i++)
2526 {
2527 gimple_stmt_iterator si;
2528 basic_block bb = bbs[i];
2529
2530 if (bb->loop_father == loop->inner)
2531 factor = innerloop_iters;
2532 else
2533 factor = 1;
2534
2535 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2536 {
2537 gimple stmt = gsi_stmt (si);
2538 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2539 /* Skip stmts that are not vectorized inside the loop. */
2540 if (!STMT_VINFO_RELEVANT_P (stmt_info)
2541 && (!STMT_VINFO_LIVE_P (stmt_info)
2542 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
2543 continue;
2544 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
2545 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
2546 some of the "outside" costs are generated inside the outer-loop. */
2547 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
2548 }
2549 }
2550
2551 scalar_single_iter_cost = vect_get_single_scalar_iteraion_cost (loop_vinfo);
2552
2553 /* Add additional cost for the peeled instructions in prologue and epilogue
2554 loop.
2555
2556 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2557 at compile time, we assume it's vf/2 (the worst would be vf-1).
2558
2559 TODO: Build an expression that represents peel_iters for prologue and
2560 epilogue to be used in a run-time test. */
2561
2562 if (npeel < 0)
2563 {
2564 peel_iters_prologue = vf/2;
2565 if (vect_print_dump_info (REPORT_COST))
2566 fprintf (vect_dump, "cost model: "
2567 "prologue peel iters set to vf/2.");
2568
2569 /* If peeling for alignment is unknown, loop bound of main loop becomes
2570 unknown. */
2571 peel_iters_epilogue = vf/2;
2572 if (vect_print_dump_info (REPORT_COST))
2573 fprintf (vect_dump, "cost model: "
2574 "epilogue peel iters set to vf/2 because "
2575 "peeling for alignment is unknown .");
2576
2577 /* If peeled iterations are unknown, count a taken branch and a not taken
2578 branch per peeled loop. Even if scalar loop iterations are known,
2579 vector iterations are not known since peeled prologue iterations are
2580 not known. Hence guards remain the same. */
2581 peel_guard_costs += 2 * (vect_get_cost (cond_branch_taken)
2582 + vect_get_cost (cond_branch_not_taken));
2583 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2584 + (peel_iters_epilogue * scalar_single_iter_cost)
2585 + peel_guard_costs;
2586 }
2587 else
2588 {
2589 peel_iters_prologue = npeel;
2590 vec_outside_cost += vect_get_known_peeling_cost (loop_vinfo,
2591 peel_iters_prologue, &peel_iters_epilogue,
2592 scalar_single_iter_cost);
2593 }
2594
2595 /* FORNOW: The scalar outside cost is incremented in one of the
2596 following ways:
2597
2598 1. The vectorizer checks for alignment and aliasing and generates
2599 a condition that allows dynamic vectorization. A cost model
2600 check is ANDED with the versioning condition. Hence scalar code
2601 path now has the added cost of the versioning check.
2602
2603 if (cost > th & versioning_check)
2604 jmp to vector code
2605
2606 Hence run-time scalar is incremented by not-taken branch cost.
2607
2608 2. The vectorizer then checks if a prologue is required. If the
2609 cost model check was not done before during versioning, it has to
2610 be done before the prologue check.
2611
2612 if (cost <= th)
2613 prologue = scalar_iters
2614 if (prologue == 0)
2615 jmp to vector code
2616 else
2617 execute prologue
2618 if (prologue == num_iters)
2619 go to exit
2620
2621 Hence the run-time scalar cost is incremented by a taken branch,
2622 plus a not-taken branch, plus a taken branch cost.
2623
2624 3. The vectorizer then checks if an epilogue is required. If the
2625 cost model check was not done before during prologue check, it
2626 has to be done with the epilogue check.
2627
2628 if (prologue == 0)
2629 jmp to vector code
2630 else
2631 execute prologue
2632 if (prologue == num_iters)
2633 go to exit
2634 vector code:
2635 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2636 jmp to epilogue
2637
2638 Hence the run-time scalar cost should be incremented by 2 taken
2639 branches.
2640
2641 TODO: The back end may reorder the BBs differently and reverse
2642 conditions/branch directions. Change the estimates below to
2643 something more reasonable. */
2644
2645 /* If the number of iterations is known and we do not do versioning, we can
2646 decide whether to vectorize at compile time. Hence the scalar version
2647 does not carry cost model guard costs. */
2648 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2649 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2650 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2651 {
2652 /* Cost model check occurs at versioning. */
2653 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2654 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2655 scalar_outside_cost += vect_get_cost (cond_branch_not_taken);
2656 else
2657 {
2658 /* Cost model check occurs at prologue generation. */
2659 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2660 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken)
2661 + vect_get_cost (cond_branch_not_taken);
2662 /* Cost model check occurs at epilogue generation. */
2663 else
2664 scalar_outside_cost += 2 * vect_get_cost (cond_branch_taken);
2665 }
2666 }
2667
2668 /* Add SLP costs. */
2669 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2670 FOR_EACH_VEC_ELT (slp_instance, slp_instances, i, instance)
2671 {
2672 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2673 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2674 }
2675
2676 /* Calculate number of iterations required to make the vector version
2677 profitable, relative to the loop bodies only. The following condition
2678 must hold true:
2679 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2680 where
2681 SIC = scalar iteration cost, VIC = vector iteration cost,
2682 VOC = vector outside cost, VF = vectorization factor,
2683 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2684 SOC = scalar outside cost for run time cost model check. */
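
  /* For example (illustrative numbers only): with SIC = 1, VIC = 2, VF = 4,
     VOC = 10, SOC = 0 and no peeling, the condition becomes
     niters > 2 * niters / 4 + 10, i.e. niters > 20, so the computation
     below yields about 20; the result is then forced to be at least VF
     and decremented by one because the runtime guard uses "<=".  */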
2685
2686 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2687 {
2688 if (vec_outside_cost <= 0)
2689 min_profitable_iters = 1;
2690 else
2691 {
2692 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2693 - vec_inside_cost * peel_iters_prologue
2694 - vec_inside_cost * peel_iters_epilogue)
2695 / ((scalar_single_iter_cost * vf)
2696 - vec_inside_cost);
2697
2698 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2699 <= ((vec_inside_cost * min_profitable_iters)
2700 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2701 min_profitable_iters++;
2702 }
2703 }
2704 /* vector version will never be profitable. */
2705 else
2706 {
2707 if (vect_print_dump_info (REPORT_COST))
2708 fprintf (vect_dump, "cost model: the vector iteration cost = %d "
2709 "divided by the scalar iteration cost = %d "
2710 "is greater or equal to the vectorization factor = %d.",
2711 vec_inside_cost, scalar_single_iter_cost, vf);
2712 return -1;
2713 }
2714
2715 if (vect_print_dump_info (REPORT_COST))
2716 {
2717 fprintf (vect_dump, "Cost model analysis: \n");
2718 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2719 vec_inside_cost);
2720 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2721 vec_outside_cost);
2722 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2723 scalar_single_iter_cost);
2724 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2725 fprintf (vect_dump, " prologue iterations: %d\n",
2726 peel_iters_prologue);
2727 fprintf (vect_dump, " epilogue iterations: %d\n",
2728 peel_iters_epilogue);
2729 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2730 min_profitable_iters);
2731 }
2732
2733 min_profitable_iters =
2734 min_profitable_iters < vf ? vf : min_profitable_iters;
2735
2736 /* Because the condition we create is:
2737 if (niters <= min_profitable_iters)
2738 then skip the vectorized loop. */
2739 min_profitable_iters--;
2740
2741 if (vect_print_dump_info (REPORT_COST))
2742 fprintf (vect_dump, " Profitability threshold = %d\n",
2743 min_profitable_iters);
2744
2745 return min_profitable_iters;
2746 }
2747
2748
2749 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2750 functions. Design better to avoid maintenance issues. */
2751
2752 /* Function vect_model_reduction_cost.
2753
2754 Models cost for a reduction operation, including the vector ops
2755 generated within the strip-mine loop, the initial definition before
2756 the loop, and the epilogue code that must be generated. */
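
/* For example (a sketch, not a measured cost): reducing a 4-element
   vector sum when no REDUC_* handler is available costs either
   2 * log2(4) = 4 vector shift/add statements plus one element extract
   (the whole-vector-shift scheme) or 4 extracts plus 3 scalar adds
   (the extract scheme); the two branches below model exactly this
   choice.  */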
2757
2758 static bool
2759 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2760 int ncopies)
2761 {
2762 int outer_cost = 0;
2763 enum tree_code code;
2764 optab optab;
2765 tree vectype;
2766 gimple stmt, orig_stmt;
2767 tree reduction_op;
2768 enum machine_mode mode;
2769 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2770 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2771
2772
2773 /* Cost of reduction op inside loop. */
2774 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2775 += ncopies * vect_get_cost (vector_stmt);
2776
2777 stmt = STMT_VINFO_STMT (stmt_info);
2778
2779 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2780 {
2781 case GIMPLE_SINGLE_RHS:
2782 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2783 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2784 break;
2785 case GIMPLE_UNARY_RHS:
2786 reduction_op = gimple_assign_rhs1 (stmt);
2787 break;
2788 case GIMPLE_BINARY_RHS:
2789 reduction_op = gimple_assign_rhs2 (stmt);
2790 break;
2791 case GIMPLE_TERNARY_RHS:
2792 reduction_op = gimple_assign_rhs3 (stmt);
2793 break;
2794 default:
2795 gcc_unreachable ();
2796 }
2797
2798 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2799 if (!vectype)
2800 {
2801 if (vect_print_dump_info (REPORT_COST))
2802 {
2803 fprintf (vect_dump, "unsupported data-type ");
2804 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2805 }
2806 return false;
2807 }
2808
2809 mode = TYPE_MODE (vectype);
2810 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2811
2812 if (!orig_stmt)
2813 orig_stmt = STMT_VINFO_STMT (stmt_info);
2814
2815 code = gimple_assign_rhs_code (orig_stmt);
2816
2817 /* Add in cost for initial definition. */
2818 outer_cost += vect_get_cost (scalar_to_vec);
2819
2820 /* Determine cost of epilogue code.
2821
2822 We have a reduction operator that will reduce the vector in one statement.
2823 Also requires scalar extract. */
2824
2825 if (!nested_in_vect_loop_p (loop, orig_stmt))
2826 {
2827 if (reduc_code != ERROR_MARK)
2828 outer_cost += vect_get_cost (vector_stmt)
2829 + vect_get_cost (vec_to_scalar);
2830 else
2831 {
2832 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2833 tree bitsize =
2834 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2835 int element_bitsize = tree_low_cst (bitsize, 1);
2836 int nelements = vec_size_in_bits / element_bitsize;
2837
2838 optab = optab_for_tree_code (code, vectype, optab_default);
2839
2840 /* We have a whole vector shift available. */
2841 if (VECTOR_MODE_P (mode)
2842 && optab_handler (optab, mode) != CODE_FOR_nothing
2843 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
2844 /* Final reduction via vector shifts and the reduction operator. Also
2845 requires scalar extract. */
2846 outer_cost += ((exact_log2 (nelements) * 2)
2847 * vect_get_cost (vector_stmt)
2848 + vect_get_cost (vec_to_scalar));
2849 else
2850 /* Use extracts and reduction op for final reduction. For N elements,
2851 we have N extracts and N-1 reduction ops. */
2852 outer_cost += ((nelements + nelements - 1)
2853 * vect_get_cost (vector_stmt));
2854 }
2855 }
2856
2857 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2858
2859 if (vect_print_dump_info (REPORT_COST))
2860 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2861 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2862 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2863
2864 return true;
2865 }
2866
2867
2868 /* Function vect_model_induction_cost.
2869
2870 Models cost for induction operations. */
2871
2872 static void
2873 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2874 {
2875 /* loop cost for vec_loop. */
2876 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info)
2877 = ncopies * vect_get_cost (vector_stmt);
2878 /* prologue cost for vec_init and vec_step. */
2879 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)
2880 = 2 * vect_get_cost (scalar_to_vec);
2881
2882 if (vect_print_dump_info (REPORT_COST))
2883 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2884 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2885 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2886 }
2887
2888
2889 /* Function get_initial_def_for_induction
2890
2891 Input:
2892 STMT - a stmt that performs an induction operation in the loop.
2893 IV_PHI - the initial value of the induction variable
2894
2895 Output:
2896 Return a vector variable, initialized with the first VF values of
2897 the induction variable. E.g., for an iv with IV_PHI='X' and
2898 evolution S, for a vector of 4 units, we want to return:
2899 [X, X + S, X + 2*S, X + 3*S]. */
2900
2901 static tree
2902 get_initial_def_for_induction (gimple iv_phi)
2903 {
2904 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2905 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2906 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2907 tree scalar_type;
2908 tree vectype;
2909 int nunits;
2910 edge pe = loop_preheader_edge (loop);
2911 struct loop *iv_loop;
2912 basic_block new_bb;
2913 tree vec, vec_init, vec_step, t;
2914 tree access_fn;
2915 tree new_var;
2916 tree new_name;
2917 gimple init_stmt, induction_phi, new_stmt;
2918 tree induc_def, vec_def, vec_dest;
2919 tree init_expr, step_expr;
2920 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2921 int i;
2922 bool ok;
2923 int ncopies;
2924 tree expr;
2925 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2926 bool nested_in_vect_loop = false;
2927 gimple_seq stmts = NULL;
2928 imm_use_iterator imm_iter;
2929 use_operand_p use_p;
2930 gimple exit_phi;
2931 edge latch_e;
2932 tree loop_arg;
2933 gimple_stmt_iterator si;
2934 basic_block bb = gimple_bb (iv_phi);
2935 tree stepvectype;
2936 tree resvectype;
2937
2938 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2939 if (nested_in_vect_loop_p (loop, iv_phi))
2940 {
2941 nested_in_vect_loop = true;
2942 iv_loop = loop->inner;
2943 }
2944 else
2945 iv_loop = loop;
2946 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2947
2948 latch_e = loop_latch_edge (iv_loop);
2949 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2950
2951 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2952 gcc_assert (access_fn);
2953 STRIP_NOPS (access_fn);
2954 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2955 &init_expr, &step_expr);
2956 gcc_assert (ok);
2957 pe = loop_preheader_edge (iv_loop);
2958
2959 scalar_type = TREE_TYPE (init_expr);
2960 vectype = get_vectype_for_scalar_type (scalar_type);
2961 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
2962 gcc_assert (vectype);
2963 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2964 ncopies = vf / nunits;
2965
2966 gcc_assert (phi_info);
2967 gcc_assert (ncopies >= 1);
2968
2969 /* Find the first insertion point in the BB. */
2970 si = gsi_after_labels (bb);
2971
2972 /* Create the vector that holds the initial_value of the induction. */
2973 if (nested_in_vect_loop)
2974 {
2975 /* iv_loop is nested in the loop to be vectorized. init_expr has already
2976 been created during vectorization of previous stmts. We obtain it
2977 from the STMT_VINFO_VEC_STMT of the defining stmt. */
2978 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2979 loop_preheader_edge (iv_loop));
2980 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2981 }
2982 else
2983 {
2984 /* iv_loop is the loop to be vectorized. Create:
2985 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2986 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2987 add_referenced_var (new_var);
2988
2989 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2990 if (stmts)
2991 {
2992 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2993 gcc_assert (!new_bb);
2994 }
2995
2996 t = NULL_TREE;
2997 t = tree_cons (NULL_TREE, new_name, t);
2998 for (i = 1; i < nunits; i++)
2999 {
3000 /* Create: new_name_i = new_name + step_expr */
3001 enum tree_code code = POINTER_TYPE_P (scalar_type)
3002 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3003 init_stmt = gimple_build_assign_with_ops (code, new_var,
3004 new_name, step_expr);
3005 new_name = make_ssa_name (new_var, init_stmt);
3006 gimple_assign_set_lhs (init_stmt, new_name);
3007
3008 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3009 gcc_assert (!new_bb);
3010
3011 if (vect_print_dump_info (REPORT_DETAILS))
3012 {
3013 fprintf (vect_dump, "created new init_stmt: ");
3014 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
3015 }
3016 t = tree_cons (NULL_TREE, new_name, t);
3017 }
3018 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3019 vec = build_constructor_from_list (vectype, nreverse (t));
3020 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
3021 }
3022
3023
3024 /* Create the vector that holds the step of the induction. */
3025 if (nested_in_vect_loop)
3026 /* iv_loop is nested in the loop to be vectorized. Generate:
3027 vec_step = [S, S, S, S] */
3028 new_name = step_expr;
3029 else
3030 {
3031 /* iv_loop is the loop to be vectorized. Generate:
3032 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3033 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3034 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3035 expr, step_expr);
3036 }
3037
3038 t = unshare_expr (new_name);
3039 gcc_assert (CONSTANT_CLASS_P (new_name));
3040 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3041 gcc_assert (stepvectype);
3042 vec = build_vector_from_val (stepvectype, t);
3043 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3044
3045
3046 /* Create the following def-use cycle:
3047 loop prolog:
3048 vec_init = ...
3049 vec_step = ...
3050 loop:
3051 vec_iv = PHI <vec_init, vec_loop>
3052 ...
3053 STMT
3054 ...
3055 vec_loop = vec_iv + vec_step; */
3056
3057 /* Create the induction-phi that defines the induction-operand. */
3058 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3059 add_referenced_var (vec_dest);
3060 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3061 set_vinfo_for_stmt (induction_phi,
3062 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3063 induc_def = PHI_RESULT (induction_phi);
3064
3065 /* Create the iv update inside the loop */
3066 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3067 induc_def, vec_step);
3068 vec_def = make_ssa_name (vec_dest, new_stmt);
3069 gimple_assign_set_lhs (new_stmt, vec_def);
3070 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3071 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3072 NULL));
3073
3074 /* Set the arguments of the phi node: */
3075 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3076 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3077 UNKNOWN_LOCATION);
3078
3079
3080 /* In case the vectorization factor (VF) is bigger than the number
3081 of elements that we can fit in a vectype (nunits), we have to generate
3082 more than one vector stmt, i.e., we need to "unroll" the
3083 vector stmt by a factor VF/nunits. For more details see documentation
3084 in vectorizable_operation. */
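
  /* E.g. (an illustrative sketch): with VF = 8 and a 4-element vector
     type, ncopies = 2; vec_init is [X, X+S, X+2*S, X+3*S], the loop-carried
     step created above is [8*S, 8*S, 8*S, 8*S], and the second copy built
     below is the first plus [4*S, 4*S, 4*S, 4*S], i.e.
     [X+4*S, X+5*S, X+6*S, X+7*S].  */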
3085
3086 if (ncopies > 1)
3087 {
3088 stmt_vec_info prev_stmt_vinfo;
3089 /* FORNOW. This restriction should be relaxed. */
3090 gcc_assert (!nested_in_vect_loop);
3091
3092 /* Create the vector that holds the step of the induction. */
3093 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3094 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3095 expr, step_expr);
3096 t = unshare_expr (new_name);
3097 gcc_assert (CONSTANT_CLASS_P (new_name));
3098 vec = build_vector_from_val (stepvectype, t);
3099 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
3100
3101 vec_def = induc_def;
3102 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3103 for (i = 1; i < ncopies; i++)
3104 {
3105 /* vec_i = vec_prev + vec_step */
3106 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3107 vec_def, vec_step);
3108 vec_def = make_ssa_name (vec_dest, new_stmt);
3109 gimple_assign_set_lhs (new_stmt, vec_def);
3110
3111 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3112 if (!useless_type_conversion_p (resvectype, vectype))
3113 {
3114 new_stmt = gimple_build_assign_with_ops
3115 (VIEW_CONVERT_EXPR,
3116 vect_get_new_vect_var (resvectype, vect_simple_var,
3117 "vec_iv_"),
3118 build1 (VIEW_CONVERT_EXPR, resvectype,
3119 gimple_assign_lhs (new_stmt)), NULL_TREE);
3120 gimple_assign_set_lhs (new_stmt,
3121 make_ssa_name
3122 (gimple_assign_lhs (new_stmt), new_stmt));
3123 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3124 }
3125 set_vinfo_for_stmt (new_stmt,
3126 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3127 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3128 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3129 }
3130 }
3131
3132 if (nested_in_vect_loop)
3133 {
3134 /* Find the loop-closed exit-phi of the induction, and record
3135 the final vector of induction results: */
3136 exit_phi = NULL;
3137 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3138 {
3139 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3140 {
3141 exit_phi = USE_STMT (use_p);
3142 break;
3143 }
3144 }
3145 if (exit_phi)
3146 {
3147 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3148 /* FORNOW. Currently not supporting the case that an inner-loop induction
3149 is not used in the outer-loop (i.e. it is used only outside the outer-loop). */
3150 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3151 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3152
3153 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3154 if (vect_print_dump_info (REPORT_DETAILS))
3155 {
3156 fprintf (vect_dump, "vector of inductions after inner-loop:");
3157 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
3158 }
3159 }
3160 }
3161
3162
3163 if (vect_print_dump_info (REPORT_DETAILS))
3164 {
3165 fprintf (vect_dump, "transform induction: created def-use cycle: ");
3166 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
3167 fprintf (vect_dump, "\n");
3168 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
3169 }
3170
3171 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3172 if (!useless_type_conversion_p (resvectype, vectype))
3173 {
3174 new_stmt = gimple_build_assign_with_ops
3175 (VIEW_CONVERT_EXPR,
3176 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3177 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3178 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3179 gimple_assign_set_lhs (new_stmt, induc_def);
3180 si = gsi_start_bb (bb);
3181 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3182 set_vinfo_for_stmt (new_stmt,
3183 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3184 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3185 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3186 }
3187
3188 return induc_def;
3189 }
3190
3191
3192 /* Function get_initial_def_for_reduction
3193
3194 Input:
3195 STMT - a stmt that performs a reduction operation in the loop.
3196 INIT_VAL - the initial value of the reduction variable
3197
3198 Output:
3199 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3200 of the reduction (used for adjusting the epilog - see below).
3201 Return a vector variable, initialized according to the operation that STMT
3202 performs. This vector will be used as the initial value of the
3203 vector of partial results.
3204
3205 Option1 (adjust in epilog): Initialize the vector as follows:
3206 add/bit or/xor: [0,0,...,0,0]
3207 mult/bit and: [1,1,...,1,1]
3208 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3209 and when necessary (e.g. add/mult case) let the caller know
3210 that it needs to adjust the result by init_val.
3211
3212 Option2: Initialize the vector as follows:
3213 add/bit or/xor: [init_val,0,0,...,0]
3214 mult/bit and: [init_val,1,1,...,1]
3215 min/max/cond_expr: [init_val,init_val,...,init_val]
3216 and no adjustments are needed.
3217
3218 For example, for the following code:
3219
3220 s = init_val;
3221 for (i=0;i<n;i++)
3222 s = s + a[i];
3223
3224 STMT is 's = s + a[i]', and the reduction variable is 's'.
3225 For a vector of 4 units, we want to return either [0,0,0,init_val],
3226 or [0,0,0,0] and let the caller know that it needs to adjust
3227 the result at the end by 'init_val'.
3228
3229 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3230 is not NULL, because this way the initialization vector is simpler (same
3231 element in all entries), and Option2 otherwise.
3232
3233 A cost model should help decide between these two schemes. */
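
/* A small concrete instance (illustrative only): for
     s = 3;  for (i = 0; i < n; i++)  s += a[i];
   with a 4-element vector, Option1 returns [0,0,0,0] and sets
   ADJUSTMENT_DEF to 3 (added back after the epilogue reduction), while
   Option2 returns [3,0,0,0] and needs no adjustment.  */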
3234
3235 tree
3236 get_initial_def_for_reduction (gimple stmt, tree init_val,
3237 tree *adjustment_def)
3238 {
3239 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3240 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3241 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3242 tree scalar_type = TREE_TYPE (init_val);
3243 tree vectype = get_vectype_for_scalar_type (scalar_type);
3244 int nunits;
3245 enum tree_code code = gimple_assign_rhs_code (stmt);
3246 tree def_for_init;
3247 tree init_def;
3248 tree t = NULL_TREE;
3249 int i;
3250 bool nested_in_vect_loop = false;
3251 tree init_value;
3252 REAL_VALUE_TYPE real_init_val = dconst0;
3253 int int_init_val = 0;
3254 gimple def_stmt = NULL;
3255
3256 gcc_assert (vectype);
3257 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3258
3259 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3260 || SCALAR_FLOAT_TYPE_P (scalar_type));
3261
3262 if (nested_in_vect_loop_p (loop, stmt))
3263 nested_in_vect_loop = true;
3264 else
3265 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3266
3267 /* In case of double reduction we only create a vector variable to be put
3268 in the reduction phi node. The actual statement creation is done in
3269 vect_create_epilog_for_reduction. */
3270 if (adjustment_def && nested_in_vect_loop
3271 && TREE_CODE (init_val) == SSA_NAME
3272 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3273 && gimple_code (def_stmt) == GIMPLE_PHI
3274 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3275 && vinfo_for_stmt (def_stmt)
3276 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3277 == vect_double_reduction_def)
3278 {
3279 *adjustment_def = NULL;
3280 return vect_create_destination_var (init_val, vectype);
3281 }
3282
3283 if (TREE_CONSTANT (init_val))
3284 {
3285 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3286 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3287 else
3288 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3289 }
3290 else
3291 init_value = init_val;
3292
3293 switch (code)
3294 {
3295 case WIDEN_SUM_EXPR:
3296 case DOT_PROD_EXPR:
3297 case PLUS_EXPR:
3298 case MINUS_EXPR:
3299 case BIT_IOR_EXPR:
3300 case BIT_XOR_EXPR:
3301 case MULT_EXPR:
3302 case BIT_AND_EXPR:
3303 /* ADJUSTMENT_DEF is NULL when called from
3304 vect_create_epilog_for_reduction to vectorize double reduction. */
3305 if (adjustment_def)
3306 {
3307 if (nested_in_vect_loop)
3308 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3309 NULL);
3310 else
3311 *adjustment_def = init_val;
3312 }
3313
3314 if (code == MULT_EXPR)
3315 {
3316 real_init_val = dconst1;
3317 int_init_val = 1;
3318 }
3319
3320 if (code == BIT_AND_EXPR)
3321 int_init_val = -1;
3322
3323 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3324 def_for_init = build_real (scalar_type, real_init_val);
3325 else
3326 def_for_init = build_int_cst (scalar_type, int_init_val);
3327
3328 /* Create a vector of '0' or '1' in all elements except the first. */
3329 for (i = nunits - 2; i >= 0; --i)
3330 t = tree_cons (NULL_TREE, def_for_init, t);
3331
3332 /* Option1: the first element is '0' or '1' as well. */
3333 if (adjustment_def)
3334 {
3335 t = tree_cons (NULL_TREE, def_for_init, t);
3336 init_def = build_vector (vectype, t);
3337 break;
3338 }
3339
3340 /* Option2: the first element is INIT_VAL. */
3341 t = tree_cons (NULL_TREE, init_value, t);
3342 if (TREE_CONSTANT (init_val))
3343 init_def = build_vector (vectype, t);
3344 else
3345 init_def = build_constructor_from_list (vectype, t);
3346
3347 break;
3348
3349 case MIN_EXPR:
3350 case MAX_EXPR:
3351 case COND_EXPR:
3352 if (adjustment_def)
3353 {
3354 *adjustment_def = NULL_TREE;
3355 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3356 break;
3357 }
3358
3359 init_def = build_vector_from_val (vectype, init_value);
3360 break;
3361
3362 default:
3363 gcc_unreachable ();
3364 }
3365
3366 return init_def;
3367 }
3368
3369
3370 /* Function vect_create_epilog_for_reduction
3371
3372 Create code at the loop-epilog to finalize the result of a reduction
3373 computation.
3374
3375 VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of vector
3376 reduction statements.
3377 STMT is the scalar reduction stmt that is being vectorized.
3378 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3379 number of elements that we can fit in a vectype (nunits). In this case
3380 we have to generate more than one vector stmt - i.e - we need to "unroll"
3381 the vector stmt by a factor VF/nunits. For more details see documentation
3382 in vectorizable_operation.
3383 REDUC_CODE is the tree-code for the epilog reduction.
3384 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3385 computation.
3386 REDUC_INDEX is the index of the operand in the right hand side of the
3387 statement that is defined by REDUCTION_PHI.
3388 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3389 SLP_NODE is an SLP node containing a group of reduction statements. The
3390 first one in this group is STMT.
3391
3392 This function:
3393 1. Creates the reduction def-use cycles: sets the arguments for
3394 REDUCTION_PHIS:
3395 The loop-entry argument is the vectorized initial-value of the reduction.
3396 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3397 sums.
3398 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3399 by applying the operation specified by REDUC_CODE if available, or by
3400 other means (whole-vector shifts or a scalar loop).
3401 The function also creates a new phi node at the loop exit to preserve
3402 loop-closed form, as illustrated below.
3403
3404 The flow at the entry to this function:
3405
3406 loop:
3407 vec_def = phi <null, null> # REDUCTION_PHI
3408 VECT_DEF = vector_stmt # vectorized form of STMT
3409 s_loop = scalar_stmt # (scalar) STMT
3410 loop_exit:
3411 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3412 use <s_out0>
3413 use <s_out0>
3414
3415 The above is transformed by this function into:
3416
3417 loop:
3418 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3419 VECT_DEF = vector_stmt # vectorized form of STMT
3420 s_loop = scalar_stmt # (scalar) STMT
3421 loop_exit:
3422 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3423 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3424 v_out2 = reduce <v_out1>
3425 s_out3 = extract_field <v_out2, 0>
3426 s_out4 = adjust_result <s_out3>
3427 use <s_out4>
3428 use <s_out4>
3429 */
3430
3431 static void
3432 vect_create_epilog_for_reduction (VEC (tree, heap) *vect_defs, gimple stmt,
3433 int ncopies, enum tree_code reduc_code,
3434 VEC (gimple, heap) *reduction_phis,
3435 int reduc_index, bool double_reduc,
3436 slp_tree slp_node)
3437 {
3438 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3439 stmt_vec_info prev_phi_info;
3440 tree vectype;
3441 enum machine_mode mode;
3442 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3443 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3444 basic_block exit_bb;
3445 tree scalar_dest;
3446 tree scalar_type;
3447 gimple new_phi = NULL, phi;
3448 gimple_stmt_iterator exit_gsi;
3449 tree vec_dest;
3450 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3451 gimple epilog_stmt = NULL;
3452 enum tree_code code = gimple_assign_rhs_code (stmt);
3453 gimple exit_phi;
3454 tree bitsize, bitpos;
3455 tree adjustment_def = NULL;
3456 tree vec_initial_def = NULL;
3457 tree reduction_op, expr, def;
3458 tree orig_name, scalar_result;
3459 imm_use_iterator imm_iter, phi_imm_iter;
3460 use_operand_p use_p, phi_use_p;
3461 bool extract_scalar_result = false;
3462 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3463 bool nested_in_vect_loop = false;
3464 VEC (gimple, heap) *new_phis = NULL;
3465 VEC (gimple, heap) *inner_phis = NULL;
3466 enum vect_def_type dt = vect_unknown_def_type;
3467 int j, i;
3468 VEC (tree, heap) *scalar_results = NULL;
3469 unsigned int group_size = 1, k, ratio;
3470 VEC (tree, heap) *vec_initial_defs = NULL;
3471 VEC (gimple, heap) *phis;
3472 bool slp_reduc = false;
3473 tree new_phi_result;
3474 gimple inner_phi = NULL;
3475
3476 if (slp_node)
3477 group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (slp_node));
3478
3479 if (nested_in_vect_loop_p (loop, stmt))
3480 {
3481 outer_loop = loop;
3482 loop = loop->inner;
3483 nested_in_vect_loop = true;
3484 gcc_assert (!slp_node);
3485 }
3486
3487 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3488 {
3489 case GIMPLE_SINGLE_RHS:
3490 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3491 == ternary_op);
3492 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3493 break;
3494 case GIMPLE_UNARY_RHS:
3495 reduction_op = gimple_assign_rhs1 (stmt);
3496 break;
3497 case GIMPLE_BINARY_RHS:
3498 reduction_op = reduc_index ?
3499 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3500 break;
3501 case GIMPLE_TERNARY_RHS:
3502 reduction_op = gimple_op (stmt, reduc_index + 1);
3503 break;
3504 default:
3505 gcc_unreachable ();
3506 }
3507
3508 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3509 gcc_assert (vectype);
3510 mode = TYPE_MODE (vectype);
3511
3512 /* 1. Create the reduction def-use cycle:
3513 Set the arguments of REDUCTION_PHIS, i.e., transform
3514
3515 loop:
3516 vec_def = phi <null, null> # REDUCTION_PHI
3517 VECT_DEF = vector_stmt # vectorized form of STMT
3518 ...
3519
3520 into:
3521
3522 loop:
3523 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3524 VECT_DEF = vector_stmt # vectorized form of STMT
3525 ...
3526
3527 (in case of SLP, do it for all the phis). */
3528
3529 /* Get the loop-entry arguments. */
3530 if (slp_node)
3531 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3532 NULL, slp_node, reduc_index);
3533 else
3534 {
3535 vec_initial_defs = VEC_alloc (tree, heap, 1);
3536 /* For the case of reduction, vect_get_vec_def_for_operand returns
3537         the scalar def before the loop that defines the initial value
3538 of the reduction variable. */
3539 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3540 &adjustment_def);
3541 VEC_quick_push (tree, vec_initial_defs, vec_initial_def);
3542 }
3543
3544 /* Set phi nodes arguments. */
3545 FOR_EACH_VEC_ELT (gimple, reduction_phis, i, phi)
3546 {
3547 tree vec_init_def = VEC_index (tree, vec_initial_defs, i);
3548 tree def = VEC_index (tree, vect_defs, i);
3549 for (j = 0; j < ncopies; j++)
3550 {
3551 /* Set the loop-entry arg of the reduction-phi. */
3552 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3553 UNKNOWN_LOCATION);
3554
3555 /* Set the loop-latch arg for the reduction-phi. */
3556 if (j > 0)
3557 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3558
3559 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3560
3561 if (vect_print_dump_info (REPORT_DETAILS))
3562 {
3563 fprintf (vect_dump, "transform reduction: created def-use"
3564 " cycle: ");
3565 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
3566 fprintf (vect_dump, "\n");
3567 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0,
3568 TDF_SLIM);
3569 }
3570
3571 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3572 }
3573 }
3574
3575 VEC_free (tree, heap, vec_initial_defs);
3576
3577 /* 2. Create epilog code.
3578 The reduction epilog code operates across the elements of the vector
3579 of partial results computed by the vectorized loop.
3580 The reduction epilog code consists of:
3581
3582 step 1: compute the scalar result in a vector (v_out2)
3583 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3584 step 3: adjust the scalar result (s_out3) if needed.
3585
3586      Step 1 can be accomplished using one of the following three schemes:
3587 (scheme 1) using reduc_code, if available.
3588 (scheme 2) using whole-vector shifts, if available.
3589 (scheme 3) using a scalar loop. In this case steps 1+2 above are
3590 combined.
3591
3592 The overall epilog code looks like this:
3593
3594 s_out0 = phi <s_loop> # original EXIT_PHI
3595 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3596 v_out2 = reduce <v_out1> # step 1
3597 s_out3 = extract_field <v_out2, 0> # step 2
3598 s_out4 = adjust_result <s_out3> # step 3
3599
3600 (step 3 is optional, and steps 1 and 2 may be combined).
3601 Lastly, the uses of s_out0 are replaced by s_out4. */
3602
3603
3604 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3605 v_out1 = phi <VECT_DEF>
3606 Store them in NEW_PHIS. */
3607
3608 exit_bb = single_exit (loop)->dest;
3609 prev_phi_info = NULL;
3610 new_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3611 FOR_EACH_VEC_ELT (tree, vect_defs, i, def)
3612 {
3613 for (j = 0; j < ncopies; j++)
3614 {
3615 phi = create_phi_node (SSA_NAME_VAR (def), exit_bb);
3616 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3617 if (j == 0)
3618 VEC_quick_push (gimple, new_phis, phi);
3619 else
3620 {
3621 def = vect_get_vec_def_for_stmt_copy (dt, def);
3622 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3623 }
3624
3625 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3626 prev_phi_info = vinfo_for_stmt (phi);
3627 }
3628 }
3629
3630 /* The epilogue is created for the outer-loop, i.e., for the loop being
3631 vectorized. Create exit phis for the outer loop. */
3632 if (double_reduc)
3633 {
3634 loop = outer_loop;
3635 exit_bb = single_exit (loop)->dest;
3636 inner_phis = VEC_alloc (gimple, heap, VEC_length (tree, vect_defs));
3637 FOR_EACH_VEC_ELT (gimple, new_phis, i, phi)
3638 {
3639 gimple outer_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (phi)),
3640 exit_bb);
3641 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3642 PHI_RESULT (phi));
3643 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3644 loop_vinfo, NULL));
3645 VEC_quick_push (gimple, inner_phis, phi);
3646 VEC_replace (gimple, new_phis, i, outer_phi);
3647 prev_phi_info = vinfo_for_stmt (outer_phi);
3648 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3649 {
3650 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3651 outer_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (phi)),
3652 exit_bb);
3653 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3654 PHI_RESULT (phi));
3655 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3656 loop_vinfo, NULL));
3657 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3658 prev_phi_info = vinfo_for_stmt (outer_phi);
3659 }
3660 }
3661 }
3662
3663 exit_gsi = gsi_after_labels (exit_bb);
3664
3665 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3666 (i.e. when reduc_code is not available) and in the final adjustment
3667 code (if needed). Also get the original scalar reduction variable as
3668 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3669 represents a reduction pattern), the tree-code and scalar-def are
3670 taken from the original stmt that the pattern-stmt (STMT) replaces.
3671 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3672 are taken from STMT. */
3673
3674 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3675 if (!orig_stmt)
3676 {
3677 /* Regular reduction */
3678 orig_stmt = stmt;
3679 }
3680 else
3681 {
3682 /* Reduction pattern */
3683 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3684 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3685 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3686 }
3687
3688 code = gimple_assign_rhs_code (orig_stmt);
3689 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3690 partial results are added and not subtracted. */
3691 if (code == MINUS_EXPR)
3692 code = PLUS_EXPR;
3693
3694 scalar_dest = gimple_assign_lhs (orig_stmt);
3695 scalar_type = TREE_TYPE (scalar_dest);
3696 scalar_results = VEC_alloc (tree, heap, group_size);
3697 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3698 bitsize = TYPE_SIZE (scalar_type);
3699
3700 /* In case this is a reduction in an inner-loop while vectorizing an outer
3701 loop - we don't need to extract a single scalar result at the end of the
3702 inner-loop (unless it is double reduction, i.e., the use of reduction is
3703 outside the outer-loop). The final vector of partial results will be used
3704 in the vectorized outer-loop, or reduced to a scalar result at the end of
3705 the outer-loop. */
3706 if (nested_in_vect_loop && !double_reduc)
3707 goto vect_finalize_reduction;
3708
3709 /* SLP reduction without reduction chain, e.g.,
3710 # a1 = phi <a2, a0>
3711 # b1 = phi <b2, b0>
3712 a2 = operation (a1)
3713 b2 = operation (b1) */
3714 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3715
3716 /* In case of reduction chain, e.g.,
3717 # a1 = phi <a3, a0>
3718 a2 = operation (a1)
3719 a3 = operation (a2),
3720
3721 we may end up with more than one vector result. Here we reduce them to
3722 one vector. */
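  /* Illustration (a sketch, assuming two partial-result vectors of four ints
     and a PLUS reduction): the code below first combines them lane-wise,

       int v0[4], v1[4], combined[4];
       for (int k = 0; k < 4; k++)
         combined[k] = v0[k] + v1[k];

     and only the combined vector is handed to the epilog reduction.  */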
3723 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3724 {
3725 tree first_vect = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3726 tree tmp;
3727 gimple new_vec_stmt = NULL;
3728
3729 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3730 for (k = 1; k < VEC_length (gimple, new_phis); k++)
3731 {
3732 gimple next_phi = VEC_index (gimple, new_phis, k);
3733 tree second_vect = PHI_RESULT (next_phi);
3734
3735 tmp = build2 (code, vectype, first_vect, second_vect);
3736 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3737 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3738 gimple_assign_set_lhs (new_vec_stmt, first_vect);
3739 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3740 }
3741
3742 new_phi_result = first_vect;
3743 if (new_vec_stmt)
3744 {
3745 VEC_truncate (gimple, new_phis, 0);
3746 VEC_safe_push (gimple, heap, new_phis, new_vec_stmt);
3747 }
3748 }
3749 else
3750 new_phi_result = PHI_RESULT (VEC_index (gimple, new_phis, 0));
3751
3752 /* 2.3 Create the reduction code, using one of the three schemes described
3753 above. In SLP we simply need to extract all the elements from the
3754 vector (without reducing them), so we use scalar shifts. */
3755 if (reduc_code != ERROR_MARK && !slp_reduc)
3756 {
3757 tree tmp;
3758
3759 /*** Case 1: Create:
3760 v_out2 = reduc_expr <v_out1> */
3761
3762 if (vect_print_dump_info (REPORT_DETAILS))
3763 fprintf (vect_dump, "Reduce using direct vector reduction.");
3764
3765 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3766 tmp = build1 (reduc_code, vectype, new_phi_result);
3767 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3768 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3769 gimple_assign_set_lhs (epilog_stmt, new_temp);
3770 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3771
3772 extract_scalar_result = true;
3773 }
3774 else
3775 {
3776 enum tree_code shift_code = ERROR_MARK;
3777 bool have_whole_vector_shift = true;
3778 int bit_offset;
3779 int element_bitsize = tree_low_cst (bitsize, 1);
3780 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3781 tree vec_temp;
3782
3783 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3784 shift_code = VEC_RSHIFT_EXPR;
3785 else
3786 have_whole_vector_shift = false;
3787
3788 /* Regardless of whether we have a whole vector shift, if we're
3789 emulating the operation via tree-vect-generic, we don't want
3790 to use it. Only the first round of the reduction is likely
3791 to still be profitable via emulation. */
3792 /* ??? It might be better to emit a reduction tree code here, so that
3793 tree-vect-generic can expand the first round via bit tricks. */
3794 if (!VECTOR_MODE_P (mode))
3795 have_whole_vector_shift = false;
3796 else
3797 {
3798 optab optab = optab_for_tree_code (code, vectype, optab_default);
3799 if (optab_handler (optab, mode) == CODE_FOR_nothing)
3800 have_whole_vector_shift = false;
3801 }
3802
3803 if (have_whole_vector_shift && !slp_reduc)
3804 {
3805 /*** Case 2: Create:
3806 for (offset = VS/2; offset >= element_size; offset/=2)
3807 {
3808 Create: va' = vec_shift <va, offset>
3809 Create: va = vop <va, va'>
3810 } */
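          /* Hedged illustration of the shift scheme in scalar terms (assuming
             an 8-element vector and a PLUS reduction; zero lanes are shifted
             in, so out-of-range lanes can simply be skipped):

               int va[8];                          // partial sums
               for (int off = 4; off >= 1; off /= 2)
                 for (int k = 0; k + off < 8; k++)
                   va[k] += va[k + off];
               // va[0] now holds the sum of all eight lanes

             i.e. log2(8) = 3 shift/add pairs instead of 7 scalar adds.  */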
3811
3812 if (vect_print_dump_info (REPORT_DETAILS))
3813 fprintf (vect_dump, "Reduce using vector shifts");
3814
3815 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3816 new_temp = new_phi_result;
3817 for (bit_offset = vec_size_in_bits/2;
3818 bit_offset >= element_bitsize;
3819 bit_offset /= 2)
3820 {
3821 tree bitpos = size_int (bit_offset);
3822
3823 epilog_stmt = gimple_build_assign_with_ops (shift_code,
3824 vec_dest, new_temp, bitpos);
3825 new_name = make_ssa_name (vec_dest, epilog_stmt);
3826 gimple_assign_set_lhs (epilog_stmt, new_name);
3827 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3828
3829 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3830 new_name, new_temp);
3831 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3832 gimple_assign_set_lhs (epilog_stmt, new_temp);
3833 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3834 }
3835
3836 extract_scalar_result = true;
3837 }
3838 else
3839 {
3840 tree rhs;
3841
3842 /*** Case 3: Create:
3843 s = extract_field <v_out2, 0>
3844 for (offset = element_size;
3845 offset < vector_size;
3846 offset += element_size;)
3847 {
3848 Create: s' = extract_field <v_out2, offset>
3849 Create: s = op <s, s'> // For non SLP cases
3850 } */
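          /* Hedged scalar illustration (one partial-result vector of four
             ints, PLUS reduction, non-SLP case):

               int v[4];                  // the vector of partial sums
               int s = v[0];              // s = extract_field <v_out2, 0>
               for (int off = 1; off < 4; off++)
                 s = s + v[off];          // s = op <s, s'>

             In the SLP case the additions are skipped and the extracted
             elements are only collected in SCALAR_RESULTS.  */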
3851
3852 if (vect_print_dump_info (REPORT_DETAILS))
3853 fprintf (vect_dump, "Reduce using scalar code. ");
3854
3855 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3856 FOR_EACH_VEC_ELT (gimple, new_phis, i, new_phi)
3857 {
3858 if (gimple_code (new_phi) == GIMPLE_PHI)
3859 vec_temp = PHI_RESULT (new_phi);
3860 else
3861 vec_temp = gimple_assign_lhs (new_phi);
3862 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3863 bitsize_zero_node);
3864 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3865 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3866 gimple_assign_set_lhs (epilog_stmt, new_temp);
3867 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3868
3869 /* In SLP we don't need to apply reduction operation, so we just
3870 collect s' values in SCALAR_RESULTS. */
3871 if (slp_reduc)
3872 VEC_safe_push (tree, heap, scalar_results, new_temp);
3873
3874 for (bit_offset = element_bitsize;
3875 bit_offset < vec_size_in_bits;
3876 bit_offset += element_bitsize)
3877 {
3878 tree bitpos = bitsize_int (bit_offset);
3879 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
3880 bitsize, bitpos);
3881
3882 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3883 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3884 gimple_assign_set_lhs (epilog_stmt, new_name);
3885 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3886
3887 if (slp_reduc)
3888 {
3889 /* In SLP we don't need to apply reduction operation, so
3890 we just collect s' values in SCALAR_RESULTS. */
3891 new_temp = new_name;
3892 VEC_safe_push (tree, heap, scalar_results, new_name);
3893 }
3894 else
3895 {
3896 epilog_stmt = gimple_build_assign_with_ops (code,
3897 new_scalar_dest, new_name, new_temp);
3898 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3899 gimple_assign_set_lhs (epilog_stmt, new_temp);
3900 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3901 }
3902 }
3903 }
3904
3905           /* The only case where we need to reduce scalar results in SLP is
3906              unrolling. If the size of SCALAR_RESULTS is greater than
3907              GROUP_SIZE, we reduce them by combining elements modulo
3908              GROUP_SIZE. */
3909 if (slp_reduc)
3910 {
3911 tree res, first_res, new_res;
3912 gimple new_stmt;
3913
3914 /* Reduce multiple scalar results in case of SLP unrolling. */
3915 for (j = group_size; VEC_iterate (tree, scalar_results, j, res);
3916 j++)
3917 {
3918 first_res = VEC_index (tree, scalar_results, j % group_size);
3919 new_stmt = gimple_build_assign_with_ops (code,
3920 new_scalar_dest, first_res, res);
3921 new_res = make_ssa_name (new_scalar_dest, new_stmt);
3922 gimple_assign_set_lhs (new_stmt, new_res);
3923 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
3924 VEC_replace (tree, scalar_results, j % group_size, new_res);
3925 }
3926 }
3927 else
3928 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
3929 VEC_safe_push (tree, heap, scalar_results, new_temp);
3930
3931 extract_scalar_result = false;
3932 }
3933 }
3934
3935 /* 2.4 Extract the final scalar result. Create:
3936 s_out3 = extract_field <v_out2, bitpos> */
3937
3938 if (extract_scalar_result)
3939 {
3940 tree rhs;
3941
3942 if (vect_print_dump_info (REPORT_DETAILS))
3943 fprintf (vect_dump, "extract scalar result");
3944
3945 if (BYTES_BIG_ENDIAN)
3946 bitpos = size_binop (MULT_EXPR,
3947 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3948 TYPE_SIZE (scalar_type));
3949 else
3950 bitpos = bitsize_zero_node;
3951
3952 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3953 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3954 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3955 gimple_assign_set_lhs (epilog_stmt, new_temp);
3956 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3957 VEC_safe_push (tree, heap, scalar_results, new_temp);
3958 }
3959
3960 vect_finalize_reduction:
3961
3962 if (double_reduc)
3963 loop = loop->inner;
3964
3965 /* 2.5 Adjust the final result by the initial value of the reduction
3966 variable. (When such adjustment is not needed, then
3967 'adjustment_def' is zero). For example, if code is PLUS we create:
3968 new_temp = loop_exit_def + adjustment_def */
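  /* For illustration (a sketch, not taken from this function): if the scalar
     source was

       int sum = 5;
       for (i = 0; i < N; i++)
         sum += a[i];

     and the vector loop was started from an all-zero accumulator, then
     ADJUSTMENT_DEF is 5 and the epilog computes  sum = reduced_lanes + 5.  */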
3969
3970 if (adjustment_def)
3971 {
3972 gcc_assert (!slp_reduc);
3973 if (nested_in_vect_loop)
3974 {
3975 new_phi = VEC_index (gimple, new_phis, 0);
3976 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3977 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3978 new_dest = vect_create_destination_var (scalar_dest, vectype);
3979 }
3980 else
3981 {
3982 new_temp = VEC_index (tree, scalar_results, 0);
3983 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3984 expr = build2 (code, scalar_type, new_temp, adjustment_def);
3985 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3986 }
3987
3988 epilog_stmt = gimple_build_assign (new_dest, expr);
3989 new_temp = make_ssa_name (new_dest, epilog_stmt);
3990 gimple_assign_set_lhs (epilog_stmt, new_temp);
3991 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3992 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3993 if (nested_in_vect_loop)
3994 {
3995 set_vinfo_for_stmt (epilog_stmt,
3996 new_stmt_vec_info (epilog_stmt, loop_vinfo,
3997 NULL));
3998 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3999 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4000
4001 if (!double_reduc)
4002 VEC_quick_push (tree, scalar_results, new_temp);
4003 else
4004 VEC_replace (tree, scalar_results, 0, new_temp);
4005 }
4006 else
4007 VEC_replace (tree, scalar_results, 0, new_temp);
4008
4009 VEC_replace (gimple, new_phis, 0, epilog_stmt);
4010 }
4011
4012 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4013 phis with new adjusted scalar results, i.e., replace use <s_out0>
4014 with use <s_out4>.
4015
4016 Transform:
4017 loop_exit:
4018 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4019 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4020 v_out2 = reduce <v_out1>
4021 s_out3 = extract_field <v_out2, 0>
4022 s_out4 = adjust_result <s_out3>
4023 use <s_out0>
4024 use <s_out0>
4025
4026 into:
4027
4028 loop_exit:
4029 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4030 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4031 v_out2 = reduce <v_out1>
4032 s_out3 = extract_field <v_out2, 0>
4033 s_out4 = adjust_result <s_out3>
4034 use <s_out4>
4035 use <s_out4> */
4036
4037
4038   /* In an SLP reduction chain we reduce vector results into one vector if
4039      necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
4040 the last stmt in the reduction chain, since we are looking for the loop
4041 exit phi node. */
4042 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4043 {
4044 scalar_dest = gimple_assign_lhs (VEC_index (gimple,
4045 SLP_TREE_SCALAR_STMTS (slp_node),
4046 group_size - 1));
4047 group_size = 1;
4048 }
4049
4050 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4051      case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4052 need to match SCALAR_RESULTS with corresponding statements. The first
4053 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4054 the first vector stmt, etc.
4055 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
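  /* Worked example (illustrative only): with GROUP_SIZE = 4 and two new
     vector stmts, RATIO = 2, so scalar results 0 and 1 are matched with the
     first vector stmt and scalar results 2 and 3 with the second.  */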
4056 if (group_size > VEC_length (gimple, new_phis))
4057 {
4058 ratio = group_size / VEC_length (gimple, new_phis);
4059 gcc_assert (!(group_size % VEC_length (gimple, new_phis)));
4060 }
4061 else
4062 ratio = 1;
4063
4064 for (k = 0; k < group_size; k++)
4065 {
4066 if (k % ratio == 0)
4067 {
4068 epilog_stmt = VEC_index (gimple, new_phis, k / ratio);
4069 reduction_phi = VEC_index (gimple, reduction_phis, k / ratio);
4070 if (double_reduc)
4071 inner_phi = VEC_index (gimple, inner_phis, k / ratio);
4072 }
4073
4074 if (slp_reduc)
4075 {
4076 gimple current_stmt = VEC_index (gimple,
4077 SLP_TREE_SCALAR_STMTS (slp_node), k);
4078
4079 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4080 /* SLP statements can't participate in patterns. */
4081 gcc_assert (!orig_stmt);
4082 scalar_dest = gimple_assign_lhs (current_stmt);
4083 }
4084
4085 phis = VEC_alloc (gimple, heap, 3);
4086 /* Find the loop-closed-use at the loop exit of the original scalar
4087 result. (The reduction result is expected to have two immediate uses -
4088 one at the latch block, and one at the loop exit). */
4089 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4090 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4091 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4092
4093 /* We expect to have found an exit_phi because of loop-closed-ssa
4094 form. */
4095 gcc_assert (!VEC_empty (gimple, phis));
4096
4097 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4098 {
4099 if (outer_loop)
4100 {
4101 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4102 gimple vect_phi;
4103
4104 /* FORNOW. Currently not supporting the case that an inner-loop
4105 reduction is not used in the outer-loop (but only outside the
4106 outer-loop), unless it is double reduction. */
4107 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4108 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4109 || double_reduc);
4110
4111 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4112 if (!double_reduc
4113 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4114 != vect_double_reduction_def)
4115 continue;
4116
4117 /* Handle double reduction:
4118
4119 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4120 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4121 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4122 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4123
4124 At that point the regular reduction (stmt2 and stmt3) is
4125 already vectorized, as well as the exit phi node, stmt4.
4126 Here we vectorize the phi node of double reduction, stmt1, and
4127 update all relevant statements. */
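              /* Source-level sketch of a double reduction (illustrative,
                 assuming a plain sum over a two-dimensional array):

                   int s = 0;
                   for (j = 0; j < M; j++)      // outer loop: stmt1, stmt4
                     for (i = 0; i < N; i++)    // inner loop: stmt2, stmt3
                       s += a[j][i];
               */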
4128
4129 /* Go through all the uses of s2 to find double reduction phi
4130 node, i.e., stmt1 above. */
4131 orig_name = PHI_RESULT (exit_phi);
4132 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4133 {
4134 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4135 stmt_vec_info new_phi_vinfo;
4136 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4137 basic_block bb = gimple_bb (use_stmt);
4138 gimple use;
4139
4140 /* Check that USE_STMT is really double reduction phi
4141 node. */
4142 if (gimple_code (use_stmt) != GIMPLE_PHI
4143 || gimple_phi_num_args (use_stmt) != 2
4144 || !use_stmt_vinfo
4145 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4146 != vect_double_reduction_def
4147 || bb->loop_father != outer_loop)
4148 continue;
4149
4150 /* Create vector phi node for double reduction:
4151 vs1 = phi <vs0, vs2>
4152 vs1 was created previously in this function by a call to
4153 vect_get_vec_def_for_operand and is stored in
4154 vec_initial_def;
4155 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4156 vs0 is created here. */
4157
4158 /* Create vector phi node. */
4159 vect_phi = create_phi_node (vec_initial_def, bb);
4160 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4161 loop_vec_info_for_loop (outer_loop), NULL);
4162 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4163
4164 /* Create vs0 - initial def of the double reduction phi. */
4165 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4166 loop_preheader_edge (outer_loop));
4167 init_def = get_initial_def_for_reduction (stmt,
4168 preheader_arg, NULL);
4169 vect_phi_init = vect_init_vector (use_stmt, init_def,
4170 vectype, NULL);
4171
4172 /* Update phi node arguments with vs0 and vs2. */
4173 add_phi_arg (vect_phi, vect_phi_init,
4174 loop_preheader_edge (outer_loop),
4175 UNKNOWN_LOCATION);
4176 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4177 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4178 if (vect_print_dump_info (REPORT_DETAILS))
4179 {
4180 fprintf (vect_dump, "created double reduction phi "
4181 "node: ");
4182 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
4183 }
4184
4185 vect_phi_res = PHI_RESULT (vect_phi);
4186
4187 /* Replace the use, i.e., set the correct vs1 in the regular
4188 reduction phi node. FORNOW, NCOPIES is always 1, so the
4189 loop is redundant. */
4190 use = reduction_phi;
4191 for (j = 0; j < ncopies; j++)
4192 {
4193 edge pr_edge = loop_preheader_edge (loop);
4194 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4195 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4196 }
4197 }
4198 }
4199 }
4200
4201 VEC_free (gimple, heap, phis);
4202 if (nested_in_vect_loop)
4203 {
4204 if (double_reduc)
4205 loop = outer_loop;
4206 else
4207 continue;
4208 }
4209
4210 phis = VEC_alloc (gimple, heap, 3);
4211 /* Find the loop-closed-use at the loop exit of the original scalar
4212 result. (The reduction result is expected to have two immediate uses,
4213 one at the latch block, and one at the loop exit). For double
4214 reductions we are looking for exit phis of the outer loop. */
4215 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4216 {
4217 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4218 VEC_safe_push (gimple, heap, phis, USE_STMT (use_p));
4219 else
4220 {
4221 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4222 {
4223 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4224
4225 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4226 {
4227 if (!flow_bb_inside_loop_p (loop,
4228 gimple_bb (USE_STMT (phi_use_p))))
4229 VEC_safe_push (gimple, heap, phis,
4230 USE_STMT (phi_use_p));
4231 }
4232 }
4233 }
4234 }
4235
4236 FOR_EACH_VEC_ELT (gimple, phis, i, exit_phi)
4237 {
4238 /* Replace the uses: */
4239 orig_name = PHI_RESULT (exit_phi);
4240 scalar_result = VEC_index (tree, scalar_results, k);
4241 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4242 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4243 SET_USE (use_p, scalar_result);
4244 }
4245
4246 VEC_free (gimple, heap, phis);
4247 }
4248
4249 VEC_free (tree, heap, scalar_results);
4250 VEC_free (gimple, heap, new_phis);
4251 }
4252
4253
4254 /* Function vectorizable_reduction.
4255
4256 Check if STMT performs a reduction operation that can be vectorized.
4257 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4258 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4259 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4260
4261 This function also handles reduction idioms (patterns) that have been
4262 recognized in advance during vect_pattern_recog. In this case, STMT may be
4263 of this form:
4264 X = pattern_expr (arg0, arg1, ..., X)
4265    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4266 sequence that had been detected and replaced by the pattern-stmt (STMT).
4267
4268 In some cases of reduction patterns, the type of the reduction variable X is
4269 different than the type of the other arguments of STMT.
4270 In such cases, the vectype that is used when transforming STMT into a vector
4271 stmt is different than the vectype that is used to determine the
4272 vectorization factor, because it consists of a different number of elements
4273 than the actual number of elements that are being operated upon in parallel.
4274
4275 For example, consider an accumulation of shorts into an int accumulator.
4276 On some targets it's possible to vectorize this pattern operating on 8
4277 shorts at a time (hence, the vectype for purposes of determining the
4278 vectorization factor should be V8HI); on the other hand, the vectype that
4279 is used to create the vector form is actually V4SI (the type of the result).
4280
4281 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4282    indicates the actual level of parallelism (V8HI in the example), so
4283    that the right vectorization factor is derived. This vectype
4284 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4285 be used to create the vectorized stmt. The right vectype for the vectorized
4286 stmt is obtained from the type of the result X:
4287 get_vectype_for_scalar_type (TREE_TYPE (X))
4288
4289 This means that, contrary to "regular" reductions (or "regular" stmts in
4290 general), the following equation:
4291 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4292 does *NOT* necessarily hold for reduction patterns. */
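/* Illustration only: the "accumulation of shorts into an int" case described
   above corresponds to source such as

     short x[N];
     int acc = 0;
     for (i = 0; i < N; i++)
       acc += x[i];

   where the vectorization factor is derived from V8HI (eight shorts per
   vector), while the vectorized stmt itself produces V4SI values, the type
   of the accumulator.  */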
4293
4294 bool
4295 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4296 gimple *vec_stmt, slp_tree slp_node)
4297 {
4298 tree vec_dest;
4299 tree scalar_dest;
4300 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4301 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4302 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4303 tree vectype_in = NULL_TREE;
4304 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4305 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4306 enum tree_code code, orig_code, epilog_reduc_code;
4307 enum machine_mode vec_mode;
4308 int op_type;
4309 optab optab, reduc_optab;
4310 tree new_temp = NULL_TREE;
4311 tree def;
4312 gimple def_stmt;
4313 enum vect_def_type dt;
4314 gimple new_phi = NULL;
4315 tree scalar_type;
4316 bool is_simple_use;
4317 gimple orig_stmt;
4318 stmt_vec_info orig_stmt_info;
4319 tree expr = NULL_TREE;
4320 int i;
4321 int ncopies;
4322 int epilog_copies;
4323 stmt_vec_info prev_stmt_info, prev_phi_info;
4324 bool single_defuse_cycle = false;
4325 tree reduc_def = NULL_TREE;
4326 gimple new_stmt = NULL;
4327 int j;
4328 tree ops[3];
4329 bool nested_cycle = false, found_nested_cycle_def = false;
4330 gimple reduc_def_stmt = NULL;
4331 /* The default is that the reduction variable is the last in statement. */
4332 int reduc_index = 2;
4333 bool double_reduc = false, dummy;
4334 basic_block def_bb;
4335 struct loop * def_stmt_loop, *outer_loop = NULL;
4336 tree def_arg;
4337 gimple def_arg_stmt;
4338 VEC (tree, heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL, *vect_defs = NULL;
4339 VEC (gimple, heap) *phis = NULL;
4340 int vec_num;
4341 tree def0, def1, tem, op0, op1 = NULL_TREE;
4342
4343 /* In case of reduction chain we switch to the first stmt in the chain, but
4344 we don't update STMT_INFO, since only the last stmt is marked as reduction
4345 and has reduction properties. */
4346 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4347 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4348
4349 if (nested_in_vect_loop_p (loop, stmt))
4350 {
4351 outer_loop = loop;
4352 loop = loop->inner;
4353 nested_cycle = true;
4354 }
4355
4356 /* 1. Is vectorizable reduction? */
4357 /* Not supportable if the reduction variable is used in the loop, unless
4358 it's a reduction chain. */
4359 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4360 && !GROUP_FIRST_ELEMENT (stmt_info))
4361 return false;
4362
4363   /* Reductions that are not used even in an enclosing outer-loop
4364 are expected to be "live" (used out of the loop). */
4365 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4366 && !STMT_VINFO_LIVE_P (stmt_info))
4367 return false;
4368
4369 /* Make sure it was already recognized as a reduction computation. */
4370 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4371 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4372 return false;
4373
4374 /* 2. Has this been recognized as a reduction pattern?
4375
4376 Check if STMT represents a pattern that has been recognized
4377 in earlier analysis stages. For stmts that represent a pattern,
4378 the STMT_VINFO_RELATED_STMT field records the last stmt in
4379 the original sequence that constitutes the pattern. */
4380
4381 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4382 if (orig_stmt)
4383 {
4384 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4385 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
4386 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4387 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4388 }
4389
4390 /* 3. Check the operands of the operation. The first operands are defined
4391 inside the loop body. The last operand is the reduction variable,
4392 which is defined by the loop-header-phi. */
4393
4394 gcc_assert (is_gimple_assign (stmt));
4395
4396 /* Flatten RHS. */
4397 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4398 {
4399 case GIMPLE_SINGLE_RHS:
4400 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4401 if (op_type == ternary_op)
4402 {
4403 tree rhs = gimple_assign_rhs1 (stmt);
4404 ops[0] = TREE_OPERAND (rhs, 0);
4405 ops[1] = TREE_OPERAND (rhs, 1);
4406 ops[2] = TREE_OPERAND (rhs, 2);
4407 code = TREE_CODE (rhs);
4408 }
4409 else
4410 return false;
4411 break;
4412
4413 case GIMPLE_BINARY_RHS:
4414 code = gimple_assign_rhs_code (stmt);
4415 op_type = TREE_CODE_LENGTH (code);
4416 gcc_assert (op_type == binary_op);
4417 ops[0] = gimple_assign_rhs1 (stmt);
4418 ops[1] = gimple_assign_rhs2 (stmt);
4419 break;
4420
4421 case GIMPLE_TERNARY_RHS:
4422 code = gimple_assign_rhs_code (stmt);
4423 op_type = TREE_CODE_LENGTH (code);
4424 gcc_assert (op_type == ternary_op);
4425 ops[0] = gimple_assign_rhs1 (stmt);
4426 ops[1] = gimple_assign_rhs2 (stmt);
4427 ops[2] = gimple_assign_rhs3 (stmt);
4428 break;
4429
4430 case GIMPLE_UNARY_RHS:
4431 return false;
4432
4433 default:
4434 gcc_unreachable ();
4435 }
4436
4437 if (code == COND_EXPR && slp_node)
4438 return false;
4439
4440 scalar_dest = gimple_assign_lhs (stmt);
4441 scalar_type = TREE_TYPE (scalar_dest);
4442 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4443 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4444 return false;
4445
4446 /* Do not try to vectorize bit-precision reductions. */
4447 if ((TYPE_PRECISION (scalar_type)
4448 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4449 return false;
4450
4451 /* All uses but the last are expected to be defined in the loop.
4452 The last use is the reduction variable. In case of nested cycle this
4453 assumption is not true: we use reduc_index to record the index of the
4454 reduction variable. */
4455 for (i = 0; i < op_type-1; i++)
4456 {
4457 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4458 if (i == 0 && code == COND_EXPR)
4459 continue;
4460
4461 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL,
4462 &def_stmt, &def, &dt, &tem);
4463 if (!vectype_in)
4464 vectype_in = tem;
4465 gcc_assert (is_simple_use);
4466
4467 if (dt != vect_internal_def
4468 && dt != vect_external_def
4469 && dt != vect_constant_def
4470 && dt != vect_induction_def
4471 && !(dt == vect_nested_cycle && nested_cycle))
4472 return false;
4473
4474 if (dt == vect_nested_cycle)
4475 {
4476 found_nested_cycle_def = true;
4477 reduc_def_stmt = def_stmt;
4478 reduc_index = i;
4479 }
4480 }
4481
4482 is_simple_use = vect_is_simple_use_1 (ops[i], loop_vinfo, NULL, &def_stmt,
4483 &def, &dt, &tem);
4484 if (!vectype_in)
4485 vectype_in = tem;
4486 gcc_assert (is_simple_use);
4487 gcc_assert (dt == vect_reduction_def
4488 || dt == vect_nested_cycle
4489 || ((dt == vect_internal_def || dt == vect_external_def
4490 || dt == vect_constant_def || dt == vect_induction_def)
4491 && nested_cycle && found_nested_cycle_def));
4492 if (!found_nested_cycle_def)
4493 reduc_def_stmt = def_stmt;
4494
4495 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4496 if (orig_stmt)
4497 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4498 reduc_def_stmt,
4499 !nested_cycle,
4500 &dummy));
4501 else
4502 {
4503 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4504 !nested_cycle, &dummy);
4505 /* We changed STMT to be the first stmt in reduction chain, hence we
4506 check that in this case the first element in the chain is STMT. */
4507 gcc_assert (stmt == tmp
4508 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4509 }
4510
4511 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4512 return false;
4513
4514 if (slp_node || PURE_SLP_STMT (stmt_info))
4515 ncopies = 1;
4516 else
4517 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4518 / TYPE_VECTOR_SUBPARTS (vectype_in));
4519
4520 gcc_assert (ncopies >= 1);
4521
4522 vec_mode = TYPE_MODE (vectype_in);
4523
4524 if (code == COND_EXPR)
4525 {
4526 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4527 {
4528 if (vect_print_dump_info (REPORT_DETAILS))
4529 fprintf (vect_dump, "unsupported condition in reduction");
4530
4531 return false;
4532 }
4533 }
4534 else
4535 {
4536 /* 4. Supportable by target? */
4537
4538 /* 4.1. check support for the operation in the loop */
4539 optab = optab_for_tree_code (code, vectype_in, optab_default);
4540 if (!optab)
4541 {
4542 if (vect_print_dump_info (REPORT_DETAILS))
4543 fprintf (vect_dump, "no optab.");
4544
4545 return false;
4546 }
4547
4548 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4549 {
4550 if (vect_print_dump_info (REPORT_DETAILS))
4551 fprintf (vect_dump, "op not supported by target.");
4552
4553 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4554 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4555 < vect_min_worthwhile_factor (code))
4556 return false;
4557
4558 if (vect_print_dump_info (REPORT_DETAILS))
4559 fprintf (vect_dump, "proceeding using word mode.");
4560 }
4561
4562 /* Worthwhile without SIMD support? */
4563 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4564 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4565 < vect_min_worthwhile_factor (code))
4566 {
4567 if (vect_print_dump_info (REPORT_DETAILS))
4568 fprintf (vect_dump, "not worthwhile without SIMD support.");
4569
4570 return false;
4571 }
4572 }
4573
4574 /* 4.2. Check support for the epilog operation.
4575
4576 If STMT represents a reduction pattern, then the type of the
4577 reduction variable may be different than the type of the rest
4578 of the arguments. For example, consider the case of accumulation
4579 of shorts into an int accumulator; The original code:
4580 S1: int_a = (int) short_a;
4581 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
4582
4583 was replaced with:
4584 STMT: int_acc = widen_sum <short_a, int_acc>
4585
4586 This means that:
4587 1. The tree-code that is used to create the vector operation in the
4588 epilog code (that reduces the partial results) is not the
4589 tree-code of STMT, but is rather the tree-code of the original
4590 stmt from the pattern that STMT is replacing. I.e, in the example
4591 above we want to use 'widen_sum' in the loop, but 'plus' in the
4592 epilog.
4593 2. The type (mode) we use to check available target support
4594 for the vector operation to be created in the *epilog*, is
4595 determined by the type of the reduction variable (in the example
4596          above we'd check this: optab_handler (plus_optab, vect_int_mode)).
4597 However the type (mode) we use to check available target support
4598 for the vector operation to be created *inside the loop*, is
4599 determined by the type of the other arguments to STMT (in the
4600 example we'd check this: optab_handler (widen_sum_optab,
4601 vect_short_mode)).
4602
4603 This is contrary to "regular" reductions, in which the types of all
4604 the arguments are the same as the type of the reduction variable.
4605 For "regular" reductions we can therefore use the same vector type
4606 (and also the same tree-code) when generating the epilog code and
4607 when generating the code inside the loop. */
4608
4609 if (orig_stmt)
4610 {
4611 /* This is a reduction pattern: get the vectype from the type of the
4612 reduction variable, and get the tree-code from orig_stmt. */
4613 orig_code = gimple_assign_rhs_code (orig_stmt);
4614 gcc_assert (vectype_out);
4615 vec_mode = TYPE_MODE (vectype_out);
4616 }
4617 else
4618 {
4619       /* Regular reduction: the same vectype and tree-code as used for
4620          the vector code inside the loop can also be used for the epilog code. */
4621 orig_code = code;
4622 }
4623
4624 if (nested_cycle)
4625 {
4626 def_bb = gimple_bb (reduc_def_stmt);
4627 def_stmt_loop = def_bb->loop_father;
4628 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4629 loop_preheader_edge (def_stmt_loop));
4630 if (TREE_CODE (def_arg) == SSA_NAME
4631 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4632 && gimple_code (def_arg_stmt) == GIMPLE_PHI
4633 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4634 && vinfo_for_stmt (def_arg_stmt)
4635 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4636 == vect_double_reduction_def)
4637 double_reduc = true;
4638 }
4639
4640 epilog_reduc_code = ERROR_MARK;
4641 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4642 {
4643 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4644 optab_default);
4645 if (!reduc_optab)
4646 {
4647 if (vect_print_dump_info (REPORT_DETAILS))
4648 fprintf (vect_dump, "no optab for reduction.");
4649
4650 epilog_reduc_code = ERROR_MARK;
4651 }
4652
4653 if (reduc_optab
4654 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4655 {
4656 if (vect_print_dump_info (REPORT_DETAILS))
4657 fprintf (vect_dump, "reduc op not supported by target.");
4658
4659 epilog_reduc_code = ERROR_MARK;
4660 }
4661 }
4662 else
4663 {
4664 if (!nested_cycle || double_reduc)
4665 {
4666 if (vect_print_dump_info (REPORT_DETAILS))
4667 fprintf (vect_dump, "no reduc code for scalar code.");
4668
4669 return false;
4670 }
4671 }
4672
4673 if (double_reduc && ncopies > 1)
4674 {
4675 if (vect_print_dump_info (REPORT_DETAILS))
4676 fprintf (vect_dump, "multiple types in double reduction");
4677
4678 return false;
4679 }
4680
4681   /* In case of widening multiplication by a constant, we update the type
4682 of the constant to be the type of the other operand. We check that the
4683 constant fits the type in the pattern recognition pass. */
4684 if (code == DOT_PROD_EXPR
4685 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4686 {
4687 if (TREE_CODE (ops[0]) == INTEGER_CST)
4688 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4689 else if (TREE_CODE (ops[1]) == INTEGER_CST)
4690 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4691 else
4692 {
4693 if (vect_print_dump_info (REPORT_DETAILS))
4694 fprintf (vect_dump, "invalid types in dot-prod");
4695
4696 return false;
4697 }
4698 }
4699
4700 if (!vec_stmt) /* transformation not required. */
4701 {
4702 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4703 return false;
4704 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4705 return true;
4706 }
4707
4708 /** Transform. **/
4709
4710 if (vect_print_dump_info (REPORT_DETAILS))
4711 fprintf (vect_dump, "transform reduction.");
4712
4713 /* FORNOW: Multiple types are not supported for condition. */
4714 if (code == COND_EXPR)
4715 gcc_assert (ncopies == 1);
4716
4717 /* Create the destination vector */
4718 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
4719
4720 /* In case the vectorization factor (VF) is bigger than the number
4721 of elements that we can fit in a vectype (nunits), we have to generate
4722 more than one vector stmt - i.e - we need to "unroll" the
4723 vector stmt by a factor VF/nunits. For more details see documentation
4724 in vectorizable_operation. */
4725
4726 /* If the reduction is used in an outer loop we need to generate
4727 VF intermediate results, like so (e.g. for ncopies=2):
4728 r0 = phi (init, r0)
4729 r1 = phi (init, r1)
4730 r0 = x0 + r0;
4731 r1 = x1 + r1;
4732 (i.e. we generate VF results in 2 registers).
4733 In this case we have a separate def-use cycle for each copy, and therefore
4734 for each copy we get the vector def for the reduction variable from the
4735 respective phi node created for this copy.
4736
4737 Otherwise (the reduction is unused in the loop nest), we can combine
4738 together intermediate results, like so (e.g. for ncopies=2):
4739 r = phi (init, r)
4740 r = x0 + r;
4741 r = x1 + r;
4742 (i.e. we generate VF/2 results in a single register).
4743 In this case for each copy we get the vector def for the reduction variable
4744 from the vectorized reduction operation generated in the previous iteration.
4745 */
4746
4747 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
4748 {
4749 single_defuse_cycle = true;
4750 epilog_copies = 1;
4751 }
4752 else
4753 epilog_copies = ncopies;
4754
4755 prev_stmt_info = NULL;
4756 prev_phi_info = NULL;
4757 if (slp_node)
4758 {
4759 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4760 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
4761 == TYPE_VECTOR_SUBPARTS (vectype_in));
4762 }
4763 else
4764 {
4765 vec_num = 1;
4766 vec_oprnds0 = VEC_alloc (tree, heap, 1);
4767 if (op_type == ternary_op)
4768 vec_oprnds1 = VEC_alloc (tree, heap, 1);
4769 }
4770
4771 phis = VEC_alloc (gimple, heap, vec_num);
4772 vect_defs = VEC_alloc (tree, heap, vec_num);
4773 if (!slp_node)
4774 VEC_quick_push (tree, vect_defs, NULL_TREE);
4775
4776 for (j = 0; j < ncopies; j++)
4777 {
4778 if (j == 0 || !single_defuse_cycle)
4779 {
4780 for (i = 0; i < vec_num; i++)
4781 {
4782 /* Create the reduction-phi that defines the reduction
4783 operand. */
4784 new_phi = create_phi_node (vec_dest, loop->header);
4785 set_vinfo_for_stmt (new_phi,
4786 new_stmt_vec_info (new_phi, loop_vinfo,
4787 NULL));
4788 if (j == 0 || slp_node)
4789 VEC_quick_push (gimple, phis, new_phi);
4790 }
4791 }
4792
4793 if (code == COND_EXPR)
4794 {
4795 gcc_assert (!slp_node);
4796 vectorizable_condition (stmt, gsi, vec_stmt,
4797 PHI_RESULT (VEC_index (gimple, phis, 0)),
4798 reduc_index, NULL);
4799 /* Multiple types are not supported for condition. */
4800 break;
4801 }
4802
4803 /* Handle uses. */
4804 if (j == 0)
4805 {
4806 op0 = ops[!reduc_index];
4807 if (op_type == ternary_op)
4808 {
4809 if (reduc_index == 0)
4810 op1 = ops[2];
4811 else
4812 op1 = ops[1];
4813 }
4814
4815 if (slp_node)
4816 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
4817 slp_node, -1);
4818 else
4819 {
4820 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
4821 stmt, NULL);
4822 VEC_quick_push (tree, vec_oprnds0, loop_vec_def0);
4823 if (op_type == ternary_op)
4824 {
4825 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
4826 NULL);
4827 VEC_quick_push (tree, vec_oprnds1, loop_vec_def1);
4828 }
4829 }
4830 }
4831 else
4832 {
4833 if (!slp_node)
4834 {
4835 enum vect_def_type dt;
4836 gimple dummy_stmt;
4837 tree dummy;
4838
4839 vect_is_simple_use (ops[!reduc_index], loop_vinfo, NULL,
4840 &dummy_stmt, &dummy, &dt);
4841 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
4842 loop_vec_def0);
4843 VEC_replace (tree, vec_oprnds0, 0, loop_vec_def0);
4844 if (op_type == ternary_op)
4845 {
4846 vect_is_simple_use (op1, loop_vinfo, NULL, &dummy_stmt,
4847 &dummy, &dt);
4848 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
4849 loop_vec_def1);
4850 VEC_replace (tree, vec_oprnds1, 0, loop_vec_def1);
4851 }
4852 }
4853
4854 if (single_defuse_cycle)
4855 reduc_def = gimple_assign_lhs (new_stmt);
4856
4857 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
4858 }
4859
4860 FOR_EACH_VEC_ELT (tree, vec_oprnds0, i, def0)
4861 {
4862 if (slp_node)
4863 reduc_def = PHI_RESULT (VEC_index (gimple, phis, i));
4864 else
4865 {
4866 if (!single_defuse_cycle || j == 0)
4867 reduc_def = PHI_RESULT (new_phi);
4868 }
4869
4870 def1 = ((op_type == ternary_op)
4871 ? VEC_index (tree, vec_oprnds1, i) : NULL);
4872 if (op_type == binary_op)
4873 {
4874 if (reduc_index == 0)
4875 expr = build2 (code, vectype_out, reduc_def, def0);
4876 else
4877 expr = build2 (code, vectype_out, def0, reduc_def);
4878 }
4879 else
4880 {
4881 if (reduc_index == 0)
4882 expr = build3 (code, vectype_out, reduc_def, def0, def1);
4883 else
4884 {
4885 if (reduc_index == 1)
4886 expr = build3 (code, vectype_out, def0, reduc_def, def1);
4887 else
4888 expr = build3 (code, vectype_out, def0, def1, reduc_def);
4889 }
4890 }
4891
4892 new_stmt = gimple_build_assign (vec_dest, expr);
4893 new_temp = make_ssa_name (vec_dest, new_stmt);
4894 gimple_assign_set_lhs (new_stmt, new_temp);
4895 vect_finish_stmt_generation (stmt, new_stmt, gsi);
4896
4897 if (slp_node)
4898 {
4899 VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt);
4900 VEC_quick_push (tree, vect_defs, new_temp);
4901 }
4902 else
4903 VEC_replace (tree, vect_defs, 0, new_temp);
4904 }
4905
4906 if (slp_node)
4907 continue;
4908
4909 if (j == 0)
4910 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
4911 else
4912 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
4913
4914 prev_stmt_info = vinfo_for_stmt (new_stmt);
4915 prev_phi_info = vinfo_for_stmt (new_phi);
4916 }
4917
4918 /* Finalize the reduction-phi (set its arguments) and create the
4919 epilog reduction code. */
4920 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
4921 {
4922 new_temp = gimple_assign_lhs (*vec_stmt);
4923 VEC_replace (tree, vect_defs, 0, new_temp);
4924 }
4925
4926 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
4927 epilog_reduc_code, phis, reduc_index,
4928 double_reduc, slp_node);
4929
4930 VEC_free (gimple, heap, phis);
4931 VEC_free (tree, heap, vec_oprnds0);
4932 if (vec_oprnds1)
4933 VEC_free (tree, heap, vec_oprnds1);
4934
4935 return true;
4936 }
4937
4938 /* Function vect_min_worthwhile_factor.
4939
4940 For a loop where we could vectorize the operation indicated by CODE,
4941 return the minimum vectorization factor that makes it worthwhile
4942 to use generic vectors. */
4943 int
4944 vect_min_worthwhile_factor (enum tree_code code)
4945 {
4946 switch (code)
4947 {
4948 case PLUS_EXPR:
4949 case MINUS_EXPR:
4950 case NEGATE_EXPR:
4951 return 4;
4952
4953 case BIT_AND_EXPR:
4954 case BIT_IOR_EXPR:
4955 case BIT_XOR_EXPR:
4956 case BIT_NOT_EXPR:
4957 return 2;
4958
4959 default:
4960 return INT_MAX;
4961 }
4962 }
4963
4964
4965 /* Function vectorizable_induction
4966
4967 Check if PHI performs an induction computation that can be vectorized.
4968 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
4969 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
4970 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
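/* Illustrative sketch (not part of this function): for an induction such as

     for (i = 0; i < N; i++)
       {
         a[i] = j;
         j += 2;
       }

   with a 4-element vector type, the vectorized phi is seeded with
   {j0, j0+2, j0+4, j0+6} and each subsequent vector iteration adds the step
   vector {8, 8, 8, 8}.  */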
4971
4972 bool
4973 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4974 gimple *vec_stmt)
4975 {
4976 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
4977 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4978 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4979 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4980 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4981 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4982 tree vec_def;
4983
4984 gcc_assert (ncopies >= 1);
4985 /* FORNOW. This restriction should be relaxed. */
4986 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4987 {
4988 if (vect_print_dump_info (REPORT_DETAILS))
4989 fprintf (vect_dump, "multiple types in nested loop.");
4990 return false;
4991 }
4992
4993 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4994 return false;
4995
4996 /* FORNOW: SLP not supported. */
4997 if (STMT_SLP_TYPE (stmt_info))
4998 return false;
4999
5000 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5001
5002 if (gimple_code (phi) != GIMPLE_PHI)
5003 return false;
5004
5005 if (!vec_stmt) /* transformation not required. */
5006 {
5007 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5008 if (vect_print_dump_info (REPORT_DETAILS))
5009 fprintf (vect_dump, "=== vectorizable_induction ===");
5010 vect_model_induction_cost (stmt_info, ncopies);
5011 return true;
5012 }
5013
5014 /** Transform. **/
5015
5016 if (vect_print_dump_info (REPORT_DETAILS))
5017 fprintf (vect_dump, "transform induction phi.");
5018
5019 vec_def = get_initial_def_for_induction (phi);
5020 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5021 return true;
5022 }
5023
5024 /* Function vectorizable_live_operation.
5025
5026 STMT computes a value that is used outside the loop. Check if
5027 it can be supported. */
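/* Illustration (a sketch): a stmt is "live" when its result is used after
   the loop, e.g.

     for (i = 0; i < N; i++)
       {
         last = c * 2;        // operands are loop-invariant
         a[i] = last;
       }
     use (last);

   Only the invariant-operand case is handled below: the scalar stmt is then
   simply left in place and its final value is used.  */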
5028
5029 bool
5030 vectorizable_live_operation (gimple stmt,
5031 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5032 gimple *vec_stmt ATTRIBUTE_UNUSED)
5033 {
5034 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5035 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5036 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5037 int i;
5038 int op_type;
5039 tree op;
5040 tree def;
5041 gimple def_stmt;
5042 enum vect_def_type dt;
5043 enum tree_code code;
5044 enum gimple_rhs_class rhs_class;
5045
5046 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5047
5048 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5049 return false;
5050
5051 if (!is_gimple_assign (stmt))
5052 return false;
5053
5054 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5055 return false;
5056
5057 /* FORNOW. CHECKME. */
5058 if (nested_in_vect_loop_p (loop, stmt))
5059 return false;
5060
5061 code = gimple_assign_rhs_code (stmt);
5062 op_type = TREE_CODE_LENGTH (code);
5063 rhs_class = get_gimple_rhs_class (code);
5064 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5065 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5066
5067 /* FORNOW: support only if all uses are invariant. This means
5068 that the scalar operations can remain in place, unvectorized.
5069 The original last scalar value that they compute will be used. */
5070
5071 for (i = 0; i < op_type; i++)
5072 {
5073 if (rhs_class == GIMPLE_SINGLE_RHS)
5074 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5075 else
5076 op = gimple_op (stmt, i + 1);
5077 if (op
5078 && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
5079 {
5080 if (vect_print_dump_info (REPORT_DETAILS))
5081 fprintf (vect_dump, "use not simple.");
5082 return false;
5083 }
5084
5085 if (dt != vect_external_def && dt != vect_constant_def)
5086 return false;
5087 }
5088
5089 /* No transformation is required for the cases we currently support. */
5090 return true;
5091 }
5092
5093 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5094
5095 static void
5096 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5097 {
5098 ssa_op_iter op_iter;
5099 imm_use_iterator imm_iter;
5100 def_operand_p def_p;
5101 gimple ustmt;
5102
5103 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5104 {
5105 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5106 {
5107 basic_block bb;
5108
5109 if (!is_gimple_debug (ustmt))
5110 continue;
5111
5112 bb = gimple_bb (ustmt);
5113
5114 if (!flow_bb_inside_loop_p (loop, bb))
5115 {
5116 if (gimple_debug_bind_p (ustmt))
5117 {
5118 if (vect_print_dump_info (REPORT_DETAILS))
5119 fprintf (vect_dump, "killing debug use");
5120
5121 gimple_debug_bind_reset_value (ustmt);
5122 update_stmt (ustmt);
5123 }
5124 else
5125 gcc_unreachable ();
5126 }
5127 }
5128 }
5129 }
5130
5131 /* Function vect_transform_loop.
5132
5133 The analysis phase has determined that the loop is vectorizable.
5134 Vectorize the loop - create vectorized stmts to replace the scalar
5135 stmts in the loop, and update the loop exit condition. */
5136
5137 void
5138 vect_transform_loop (loop_vec_info loop_vinfo)
5139 {
5140 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5141 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5142 int nbbs = loop->num_nodes;
5143 gimple_stmt_iterator si;
5144 int i;
5145 tree ratio = NULL;
5146 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5147 bool strided_store;
5148 bool slp_scheduled = false;
5149 unsigned int nunits;
5150 tree cond_expr = NULL_TREE;
5151 gimple_seq cond_expr_stmt_list = NULL;
5152 bool do_peeling_for_loop_bound;
5153 gimple stmt, pattern_stmt, pattern_def_stmt;
5154 bool transform_pattern_stmt = false, pattern_def = false;
5155
5156 if (vect_print_dump_info (REPORT_DETAILS))
5157 fprintf (vect_dump, "=== vect_transform_loop ===");
5158
5159 /* Peel the loop if there are data refs with unknown alignment.
5160 Peeling can force at most one such data ref to become aligned. */
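/* Illustration only (names and numbers are hypothetical): assuming an
   alignment boundary of 4 int elements and a single access a[i+1] whose
   start is misaligned by one element, peeling 3 scalar iterations makes
   the remaining accesses start on an aligned address:

     for (i = 0; i < 3; i++)          scalar prolog (peeled iterations)
       a[i+1] = b[i+1] + c[i+1];
     for (; i < n; i++)               this loop is then vectorized;
       a[i+1] = b[i+1] + c[i+1];      a[i+1] now starts aligned  */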
5161
5162 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5163 vect_do_peeling_for_alignment (loop_vinfo);
5164
5165 do_peeling_for_loop_bound
5166 = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5167 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5168 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5169 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
5170
5171 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5172 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5173 vect_loop_versioning (loop_vinfo,
5174 !do_peeling_for_loop_bound,
5175 &cond_expr, &cond_expr_stmt_list);
5176
5177 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5178 compile time constant), or it is a constant that is not a multiple of the
5179 vectorization factor, then an epilog loop needs to be created.
5180 We therefore duplicate the loop: the original loop will be vectorized,
5181 and will compute the first (n/VF) iterations. The second copy of the loop
5182 will remain scalar and will compute the remaining (n%VF) iterations.
5183 (VF is the vectorization factor). */
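/* As a rough source-level sketch of the result (illustration only; the
   array names and the form of the vector statements are hypothetical):

     for (i = 0; i < (n / VF) * VF; i += VF)
       ...vector stmts covering iterations i .. i+VF-1...
     for (; i < n; i++)
       a[i] = b[i] + c[i];

   The first loop is the vectorized copy; the second is the scalar epilog
   that executes the remaining n % VF iterations.  */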
5184
5185 if (do_peeling_for_loop_bound)
5186 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5187 cond_expr, cond_expr_stmt_list);
5188 else
5189 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5190 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
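/* For instance (numbers purely illustrative): with a known NITERS of 20
   and a vectorization factor of 4, no epilog is needed and RATIO is simply
   20 / 4 == 5; with NITERS of 22, the peeling path above is taken and
   2 iterations are left for the scalar epilog.  */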
5191
5192 /* 1) Make sure the loop header has exactly two entries
5193 2) Make sure we have a preheader basic block. */
5194
5195 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5196
5197 split_edge (loop_preheader_edge (loop));
5198
5199 /* FORNOW: the vectorizer supports only loops whose body consists
5200 of one basic block (header + empty latch). When the vectorizer
5201 supports more involved loop forms, the order in which the BBs are
5202 traversed will need to be reconsidered. */
5203
5204 for (i = 0; i < nbbs; i++)
5205 {
5206 basic_block bb = bbs[i];
5207 stmt_vec_info stmt_info;
5208 gimple phi;
5209
5210 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5211 {
5212 phi = gsi_stmt (si);
5213 if (vect_print_dump_info (REPORT_DETAILS))
5214 {
5215 fprintf (vect_dump, "------>vectorizing phi: ");
5216 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
5217 }
5218 stmt_info = vinfo_for_stmt (phi);
5219 if (!stmt_info)
5220 continue;
5221
5222 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5223 vect_loop_kill_debug_uses (loop, phi);
5224
5225 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5226 && !STMT_VINFO_LIVE_P (stmt_info))
5227 continue;
5228
5229 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5230 != (unsigned HOST_WIDE_INT) vectorization_factor)
5231 && vect_print_dump_info (REPORT_DETAILS))
5232 fprintf (vect_dump, "multiple-types.");
5233
5234 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5235 {
5236 if (vect_print_dump_info (REPORT_DETAILS))
5237 fprintf (vect_dump, "transform phi.");
5238 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5239 }
5240 }
5241
5242 pattern_stmt = NULL;
5243 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5244 {
5245 bool is_store;
5246
5247 if (transform_pattern_stmt)
5248 {
5249 stmt = pattern_stmt;
5250 transform_pattern_stmt = false;
5251 }
5252 else
5253 stmt = gsi_stmt (si);
5254
5255 if (vect_print_dump_info (REPORT_DETAILS))
5256 {
5257 fprintf (vect_dump, "------>vectorizing statement: ");
5258 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
5259 }
5260
5261 stmt_info = vinfo_for_stmt (stmt);
5262
5263 /* vector stmts created in the outer-loop during vectorization of
5264 stmts in an inner-loop may not have a stmt_info, and do not
5265 need to be vectorized. */
5266 if (!stmt_info)
5267 {
5268 gsi_next (&si);
5269 continue;
5270 }
5271
5272 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5273 vect_loop_kill_debug_uses (loop, stmt);
5274
5275 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5276 && !STMT_VINFO_LIVE_P (stmt_info))
5277 {
5278 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5279 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5280 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5281 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5282 {
5283 stmt = pattern_stmt;
5284 stmt_info = vinfo_for_stmt (stmt);
5285 }
5286 else
5287 {
5288 gsi_next (&si);
5289 continue;
5290 }
5291 }
5292 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5293 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5294 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5295 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5296 transform_pattern_stmt = true;
5297
5298 /* If the pattern statement has a def stmt, vectorize it too. */
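/* For instance (a purely hypothetical illustration): a recognized pattern
   may carry an auxiliary def stmt, such as an intermediate type conversion
   of one of its operands, e.g.

     patt_tmp = (interm_type) a;       the pattern def stmt
     patt_res = patt_tmp << c;         the pattern stmt itself

   in which case the def stmt is picked up and vectorized here before the
   pattern stmt.  */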
5299 if (is_pattern_stmt_p (stmt_info)
5300 && (pattern_def_stmt = STMT_VINFO_PATTERN_DEF_STMT (stmt_info))
5301 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_def_stmt))
5302 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_def_stmt))))
5303 {
5304 if (pattern_def)
5305 pattern_def = false;
5306 else
5307 {
5308 if (vect_print_dump_info (REPORT_DETAILS))
5309 {
5310 fprintf (vect_dump, "==> vectorizing pattern def"
5311 " stmt: ");
5312 print_gimple_stmt (vect_dump, pattern_def_stmt, 0,
5313 TDF_SLIM);
5314 }
5315
5316 pattern_def = true;
5317 stmt = pattern_def_stmt;
5318 stmt_info = vinfo_for_stmt (stmt);
5319 }
5320 }
5321
5322 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5323 nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5324 STMT_VINFO_VECTYPE (stmt_info));
5325 if (!STMT_SLP_TYPE (stmt_info)
5326 && nunits != (unsigned int) vectorization_factor
5327 && vect_print_dump_info (REPORT_DETAILS))
5328 /* For SLP, VF is set according to the unrolling factor, and not to
5329 the vector size, hence for SLP this message does not apply. */
5330 fprintf (vect_dump, "multiple-types.");
5331
5332 /* SLP. Schedule all the SLP instances when the first SLP stmt is
5333 reached. */
5334 if (STMT_SLP_TYPE (stmt_info))
5335 {
5336 if (!slp_scheduled)
5337 {
5338 slp_scheduled = true;
5339
5340 if (vect_print_dump_info (REPORT_DETAILS))
5341 fprintf (vect_dump, "=== scheduling SLP instances ===");
5342
5343 vect_schedule_slp (loop_vinfo, NULL);
5344 }
5345
5346 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
5347 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5348 {
5349 if (!transform_pattern_stmt && !pattern_def)
5350 gsi_next (&si);
5351 continue;
5352 }
5353 }
5354
5355 /* -------- vectorize statement ------------ */
5356 if (vect_print_dump_info (REPORT_DETAILS))
5357 fprintf (vect_dump, "transform statement.");
5358
5359 strided_store = false;
5360 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
5361 if (is_store)
5362 {
5363 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
5364 {
5365 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5366 interleaving chain was completed - free all the stores in
5367 the chain. */
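/* For instance (illustration only): for an interleaved store group such as
   a[2*i] = x;  a[2*i+1] = y;  all members of the group are emitted together
   once the chain is complete, so every original scalar store in the chain
   can be removed here.  */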
5368 gsi_next (&si);
5369 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5370 continue;
5371 }
5372 else
5373 {
5374 /* Free the attached stmt_vec_info and remove the stmt. */
5375 free_stmt_vec_info (gsi_stmt (si));
5376 gsi_remove (&si, true);
5377 continue;
5378 }
5379 }
5380
5381 if (!transform_pattern_stmt && !pattern_def)
5382 gsi_next (&si);
5383 } /* stmts in BB */
5384 } /* BBs in loop */
5385
5386 slpeel_make_loop_iterate_ntimes (loop, ratio);
5387
5388 /* The memory tags and pointers in vectorized statements need to
5389 have their SSA forms updated. FIXME, why can't this be delayed
5390 until all the loops have been transformed? */
5391 update_ssa (TODO_update_ssa);
5392
5393 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5394 fprintf (vect_dump, "LOOP VECTORIZED.");
5395 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
5396 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
5397 }