gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software
   3    Foundation, Inc.
   4    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   5    Ira Rosen <irar@il.ibm.com>
   6
   7 This file is part of GCC.
   8
   9 GCC is free software; you can redistribute it and/or modify it under
  10 the terms of the GNU General Public License as published by the Free
  11 Software Foundation; either version 3, or (at your option) any later
  12 version.
  13
  14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  16 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  17 for more details.
  18
  19 You should have received a copy of the GNU General Public License
  20 along with GCC; see the file COPYING3.  If not see
  21 <http://www.gnu.org/licenses/>.  */
  22
  23 #include "config.h"
  24 #include "system.h"
  25 #include "coretypes.h"
  26 #include "tm.h"
  27 #include "ggc.h"
  28 #include "tree.h"
  29 #include "basic-block.h"
  30 #include "diagnostic.h"
  31 #include "tree-flow.h"
  32 #include "tree-dump.h"
  33 #include "cfgloop.h"
  34 #include "cfglayout.h"
  35 #include "expr.h"
  36 #include "recog.h"
  37 #include "optabs.h"
  38 #include "params.h"
  39 #include "toplev.h"
  40 #include "tree-chrec.h"
  41 #include "tree-scalar-evolution.h"
  42 #include "tree-vectorizer.h"
  43
  44 /* Loop Vectorization Pass.
  45
  46    This pass tries to vectorize loops.
  47
  48    For example, the vectorizer transforms the following simple loop:
  49
  50         short a[N]; short b[N]; short c[N]; int i;
  51
  52         for (i=0; i<N; i++){
  53           a[i] = b[i] + c[i];
  54         }
  55
  56    as if it was manually vectorized by rewriting the source code into:
  57
  58         typedef int __attribute__((mode(V8HI))) v8hi;
  59         short a[N];  short b[N]; short c[N];   int i;
  60         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  61         v8hi va, vb, vc;
  62
  63         for (i=0; i<N/8; i++){
  64           vb = pb[i];
  65           vc = pc[i];
  66           va = vb + vc;
  67           pa[i] = va;
  68         }
  69
  70         The main entry to this pass is vectorize_loops(), in which
  71    the vectorizer applies a set of analyses on a given set of loops,
  72    followed by the actual vectorization transformation for the loops that
  73    had successfully passed the analysis phase.
  74         Throughout this pass we make a distinction between two types of
  75    data: scalars (which are represented by SSA_NAMES), and memory references
  76    ("data-refs"). These two types of data require different handling both
  77    during analysis and transformation. The types of data-refs that the
  78    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  79    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  80    accesses are required to have a simple (consecutive) access pattern.
  81
  82    Analysis phase:
  83    ===============
  84         The driver for the analysis phase is vect_analyze_loop().
  85    It applies a set of analyses, some of which rely on the scalar evolution
  86    analyzer (scev) developed by Sebastian Pop.
  87
  88         During the analysis phase the vectorizer records some information
  89    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  90    loop, as well as general information about the loop as a whole, which is
  91    recorded in a "loop_vec_info" struct attached to each loop.
  92
  93    Transformation phase:
  94    =====================
  95         The loop transformation phase scans all the stmts in the loop, and
  96    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
  97    the loop that needs to be vectorized. It inserts the vector code sequence
  98    just before the scalar stmt S, and records a pointer to the vector code
  99    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 100    attached to S). This pointer will be used for the vectorization of following
 101    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 102    otherwise, we rely on dead code elimination for removing it.
 103
 104         For example, say stmt S1 was vectorized into stmt VS1:
 105
 106    VS1: vb = px[i];
 107    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 108    S2:  a = b;
 109
 110    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 111    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 112    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
 113    resulting sequence would be:
 114
 115    VS1: vb = px[i];
 116    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 117    VS2: va = vb;
 118    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 119
 120         Operands that are not SSA_NAMEs, are data-refs that appear in
 121    load/store operations (like 'x[i]' in S1), and are handled differently.
 122
 123    Target modeling:
 124    =================
 125         Currently the only target specific information that is used is the
 126    size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can
 127    support different sizes of vectors, for now will need to specify one value
 128    for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future.
 129
 130         Since we only vectorize operations which vector form can be
 131    expressed using existing tree codes, to verify that an operation is
 132    supported, the vectorizer checks the relevant optab at the relevant
 133    machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If
 134    the value found is CODE_FOR_nothing, then there's no target support, and
 135    we can't vectorize the stmt.
 136
 137    For additional information on this project see:
 138    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 139 */
 140
 141 /* Function vect_determine_vectorization_factor
 142
 143    Determine the vectorization factor (VF). VF is the number of data elements
 144    that are operated upon in parallel in a single iteration of the vectorized
 145    loop. For example, when vectorizing a loop that operates on 4byte elements,
 146    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 147    elements can fit in a single vector register.
 148
 149    We currently support vectorization of loops in which all types operated upon
 150    are of the same size. Therefore this function currently sets VF according to
 151    the size of the types operated upon, and fails if there are multiple sizes
 152    in the loop.
 153
 154    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 155    original loop:
 156         for (i=0; i<N; i++){
 157           a[i] = b[i] + c[i];
 158         }
 159
 160    vectorized loop:
 161         for (i=0; i<N; i+=VF){
 162           a[i:VF] = b[i:VF] + c[i:VF];
 163         }
 164 */
 165
 166 static bool
 167 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 168 {
 169   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 170   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 171   int nbbs = loop->num_nodes;
 172   gimple_stmt_iterator si;
 173   unsigned int vectorization_factor = 0;
 174   tree scalar_type;
 175   gimple phi;
 176   tree vectype;
 177   unsigned int nunits;
 178   stmt_vec_info stmt_info;
 179   int i;
 180   HOST_WIDE_INT dummy;
 181
 182   if (vect_print_dump_info (REPORT_DETAILS))
 183     fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");
 184
 185   for (i = 0; i < nbbs; i++)
 186     {
 187       basic_block bb = bbs[i];
 188
 189       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 190         {
 191           phi = gsi_stmt (si);
 192           stmt_info = vinfo_for_stmt (phi);
 193           if (vect_print_dump_info (REPORT_DETAILS))
 194             {
 195               fprintf (vect_dump, "==> examining phi: ");
 196               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 197             }
 198
 199           gcc_assert (stmt_info);
 200
 201           if (STMT_VINFO_RELEVANT_P (stmt_info))
 202             {
 203               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 204               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 205
 206               if (vect_print_dump_info (REPORT_DETAILS))
 207                 {
 208                   fprintf (vect_dump, "get vectype for scalar type:  ");
 209                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 210                 }
 211
 212               vectype = get_vectype_for_scalar_type (scalar_type);
 213               if (!vectype)
 214                 {
 215                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 216                     {
 217                       fprintf (vect_dump,
 218                                "not vectorized: unsupported data-type ");
 219                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 220                     }
 221                   return false;
 222                 }
 223               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 224
 225               if (vect_print_dump_info (REPORT_DETAILS))
 226                 {
 227                   fprintf (vect_dump, "vectype: ");
 228                   print_generic_expr (vect_dump, vectype, TDF_SLIM);
 229                 }
 230
 231               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 232               if (vect_print_dump_info (REPORT_DETAILS))
 233                 fprintf (vect_dump, "nunits = %d", nunits);
 234
 235               if (!vectorization_factor
 236                   || (nunits > vectorization_factor))
 237                 vectorization_factor = nunits;
 238             }
 239         }
 240
 241       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 242         {
 243           gimple stmt = gsi_stmt (si);
 244           stmt_info = vinfo_for_stmt (stmt);
 245
 246           if (vect_print_dump_info (REPORT_DETAILS))
 247             {
 248               fprintf (vect_dump, "==> examining statement: ");
 249               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 250             }
 251
 252           gcc_assert (stmt_info);
 253
 254           /* skip stmts which do not need to be vectorized.  */
 255           if (!STMT_VINFO_RELEVANT_P (stmt_info)
 256               && !STMT_VINFO_LIVE_P (stmt_info))
 257             {
 258               if (vect_print_dump_info (REPORT_DETAILS))
 259                 fprintf (vect_dump, "skip.");
 260               continue;
 261             }
 262
 263           if (gimple_get_lhs (stmt) == NULL_TREE)
 264             {
 265               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 266                 {
 267                   fprintf (vect_dump, "not vectorized: irregular stmt.");
 268                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 269                 }
 270               return false;
 271             }
 272
 273           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 274             {
 275               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 276                 {
 277                   fprintf (vect_dump, "not vectorized: vector stmt in loop:");
 278                   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
 279                 }
 280               return false;
 281             }
 282
 283           if (STMT_VINFO_VECTYPE (stmt_info))
 284             {
 285               /* The only case when a vectype had been already set is for stmts
 286                  that contain a dataref, or for "pattern-stmts" (stmts generated
 287                  by the vectorizer to represent/replace a certain idiom).  */
 288               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 289                           || is_pattern_stmt_p (stmt_info));
 290               vectype = STMT_VINFO_VECTYPE (stmt_info);
 291             }
 292           else
 293             {
 294               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
 295                           && !is_pattern_stmt_p (stmt_info));
 296
 297               scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 298                                                            &dummy);
 299               if (vect_print_dump_info (REPORT_DETAILS))
 300                 {
 301                   fprintf (vect_dump, "get vectype for scalar type:  ");
 302                   print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 303                 }
 304
 305               vectype = get_vectype_for_scalar_type (scalar_type);
 306               if (!vectype)
 307                 {
 308                   if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 309                     {
 310                       fprintf (vect_dump,
 311                                "not vectorized: unsupported data-type ");
 312                       print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
 313                     }
 314                   return false;
 315                 }
 316               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 317             }
 318
 319           if (vect_print_dump_info (REPORT_DETAILS))
 320             {
 321               fprintf (vect_dump, "vectype: ");
 322               print_generic_expr (vect_dump, vectype, TDF_SLIM);
 323             }
 324
 325           nunits = TYPE_VECTOR_SUBPARTS (vectype);
 326           if (vect_print_dump_info (REPORT_DETAILS))
 327             fprintf (vect_dump, "nunits = %d", nunits);
 328
 329           if (!vectorization_factor
 330               || (nunits > vectorization_factor))
 331             vectorization_factor = nunits;
 332
 333         }
 334     }
 335
 336   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 337   if (vect_print_dump_info (REPORT_DETAILS))
 338     fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
 339   if (vectorization_factor <= 1)
 340     {
 341       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
 342         fprintf (vect_dump, "not vectorized: unsupported data-type");
 343       return false;
 344     }
 345   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 346
 347   return true;
 348 }
 349
 350
 351 /* Function vect_is_simple_iv_evolution.
 352
 353    FORNOW: A simple evolution of an induction variables in the loop is
 354    considered a polynomial evolution with constant step.  */
 355
 356 static bool
 357 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 358                              tree * step)
 359 {
 360   tree init_expr;
 361   tree step_expr;
 362   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 363
 364   /* When there is no evolution in this loop, the evolution function
 365      is not "simple".  */
 366   if (evolution_part == NULL_TREE)
 367     return false;
 368
 369   /* When the evolution is a polynomial of degree >= 2
 370      the evolution function is not "simple".  */
 371   if (tree_is_chrec (evolution_part))
 372     return false;
 373
 374   step_expr = evolution_part;
 375   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 376
 377   if (vect_print_dump_info (REPORT_DETAILS))
 378     {
 379       fprintf (vect_dump, "step: ");
 380       print_generic_expr (vect_dump, step_expr, TDF_SLIM);
 381       fprintf (vect_dump, ",  init: ");
 382       print_generic_expr (vect_dump, init_expr, TDF_SLIM);
 383     }
 384
 385   *init = init_expr;
 386   *step = step_expr;
 387
 388   if (TREE_CODE (step_expr) != INTEGER_CST)
 389     {
 390       if (vect_print_dump_info (REPORT_DETAILS))
 391         fprintf (vect_dump, "step unknown.");
 392       return false;
 393     }
 394
 395   return true;
 396 }
 397
 398 /* Function vect_analyze_scalar_cycles_1.
 399
 400    Examine the cross iteration def-use cycles of scalar variables
 401    in LOOP. LOOP_VINFO represents the loop that is now being
 402    considered for vectorization (can be LOOP, or an outer-loop
 403    enclosing LOOP).  */
 404
 405 static void
 406 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 407 {
 408   basic_block bb = loop->header;
 409   tree dumy;
 410   VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
 411   gimple_stmt_iterator gsi;
 412   bool double_reduc;
 413
 414   if (vect_print_dump_info (REPORT_DETAILS))
 415     fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");
 416
 417   /* First - identify all inductions. Reduction detection assumes that all the
 418      inductions have been identified, therefore, this order must not be
 419      changed.  */
 420   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 421     {
 422       gimple phi = gsi_stmt (gsi);
 423       tree access_fn = NULL;
 424       tree def = PHI_RESULT (phi);
 425       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 426
 427       if (vect_print_dump_info (REPORT_DETAILS))
 428         {
 429           fprintf (vect_dump, "Analyze phi: ");
 430           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 431         }
 432
 433       /* Skip virtual phi's. The data dependences that are associated with
 434          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 435       if (!is_gimple_reg (SSA_NAME_VAR (def)))
 436         continue;
 437
 438       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 439
 440       /* Analyze the evolution function.  */
 441       access_fn = analyze_scalar_evolution (loop, def);
 442       if (access_fn && vect_print_dump_info (REPORT_DETAILS))
 443         {
 444           fprintf (vect_dump, "Access function of PHI: ");
 445           print_generic_expr (vect_dump, access_fn, TDF_SLIM);
 446         }
 447
 448       if (!access_fn
 449           || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
 450         {
 451           VEC_safe_push (gimple, heap, worklist, phi);
 452           continue;
 453         }
 454
 455       if (vect_print_dump_info (REPORT_DETAILS))
 456         fprintf (vect_dump, "Detected induction.");
 457       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 458     }
 459
 460
 461   /* Second - identify all reductions and nested cycles.  */
 462   while (VEC_length (gimple, worklist) > 0)
 463     {
 464       gimple phi = VEC_pop (gimple, worklist);
 465       tree def = PHI_RESULT (phi);
 466       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 467       gimple reduc_stmt;
 468       bool nested_cycle;
 469
 470       if (vect_print_dump_info (REPORT_DETAILS))
 471         {
 472           fprintf (vect_dump, "Analyze phi: ");
 473           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
 474         }
 475
 476       gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
 477       gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 478
 479       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 480       reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi, !nested_cycle,
 481                                              &double_reduc);
 482       if (reduc_stmt)
 483         {
 484           if (double_reduc)
 485             {
 486               if (vect_print_dump_info (REPORT_DETAILS))
 487                 fprintf (vect_dump, "Detected double reduction.");
 488
 489               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 490               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 491                                                     vect_double_reduction_def;
 492             }
 493           else
 494             {
 495               if (nested_cycle)
 496                 {
 497                   if (vect_print_dump_info (REPORT_DETAILS))
 498                     fprintf (vect_dump, "Detected vectorizable nested cycle.");
 499
 500                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 501                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 502                                                              vect_nested_cycle;
 503                 }
 504               else
 505                 {
 506                   if (vect_print_dump_info (REPORT_DETAILS))
 507                     fprintf (vect_dump, "Detected reduction.");
 508
 509                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 510                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 511                                                            vect_reduction_def;
 512                 }
 513             }
 514         }
 515       else
 516         if (vect_print_dump_info (REPORT_DETAILS))
 517           fprintf (vect_dump, "Unknown def-use cycle pattern.");
 518     }
 519
 520   VEC_free (gimple, heap, worklist);
 521 }
 522
 523
 524 /* Function vect_analyze_scalar_cycles.
 525
 526    Examine the cross iteration def-use cycles of scalar variables, by
 527    analyzing the loop-header PHIs of scalar variables; Classify each
 528    cycle as one of the following: invariant, induction, reduction, unknown.
 529    We do that for the loop represented by LOOP_VINFO, and also to its
 530    inner-loop, if exists.
 531    Examples for scalar cycles:
 532
 533    Example1: reduction:
 534
 535               loop1:
 536               for (i=0; i<N; i++)
 537                  sum += a[i];
 538
 539    Example2: induction:
 540
 541               loop2:
 542               for (i=0; i<N; i++)
 543                  a[i] = i;  */
 544
 545 static void
 546 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 547 {
 548   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 549
 550   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 551
 552   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 553      Reductions in such inner-loop therefore have different properties than
 554      the reductions in the nest that gets vectorized:
 555      1. When vectorized, they are executed in the same order as in the original
 556         scalar loop, so we can't change the order of computation when
 557         vectorizing them.
 558      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 559         current checks are too strict.  */
 560
 561   if (loop->inner)
 562     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 563 }
 564
 565 /* Function vect_get_loop_niters.
 566
 567    Determine how many iterations the loop is executed.
 568    If an expression that represents the number of iterations
 569    can be constructed, place it in NUMBER_OF_ITERATIONS.
 570    Return the loop exit condition.  */
 571
 572 static gimple
 573 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 574 {
 575   tree niters;
 576
 577   if (vect_print_dump_info (REPORT_DETAILS))
 578     fprintf (vect_dump, "=== get_loop_niters ===");
 579
 580   niters = number_of_exit_cond_executions (loop);
 581
 582   if (niters != NULL_TREE
 583       && niters != chrec_dont_know)
 584     {
 585       *number_of_iterations = niters;
 586
 587       if (vect_print_dump_info (REPORT_DETAILS))
 588         {
 589           fprintf (vect_dump, "==> get_loop_niters:" );
 590           print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
 591         }
 592     }
 593
 594   return get_loop_exit_condition (loop);
 595 }
 596
 597
 598 /* Function bb_in_loop_p
 599
 600    Used as predicate for dfs order traversal of the loop bbs.  */
 601
 602 static bool
 603 bb_in_loop_p (const_basic_block bb, const void *data)
 604 {
 605   const struct loop *const loop = (const struct loop *)data;
 606   if (flow_bb_inside_loop_p (loop, bb))
 607     return true;
 608   return false;
 609 }
 610
 611
 612 /* Function new_loop_vec_info.
 613
 614    Create and initialize a new loop_vec_info struct for LOOP, as well as
 615    stmt_vec_info structs for all the stmts in LOOP.  */
 616
 617 static loop_vec_info
 618 new_loop_vec_info (struct loop *loop)
 619 {
 620   loop_vec_info res;
 621   basic_block *bbs;
 622   gimple_stmt_iterator si;
 623   unsigned int i, nbbs;
 624
 625   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 626   LOOP_VINFO_LOOP (res) = loop;
 627
 628   bbs = get_loop_body (loop);
 629
 630   /* Create/Update stmt_info for all stmts in the loop.  */
 631   for (i = 0; i < loop->num_nodes; i++)
 632     {
 633       basic_block bb = bbs[i];
 634
 635       /* BBs in a nested inner-loop will have been already processed (because
 636          we will have called vect_analyze_loop_form for any nested inner-loop).
 637          Therefore, for stmts in an inner-loop we just want to update the
 638          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 639          loop_info of the outer-loop we are currently considering to vectorize
 640          (instead of the loop_info of the inner-loop).
 641          For stmts in other BBs we need to create a stmt_info from scratch.  */
 642       if (bb->loop_father != loop)
 643         {
 644           /* Inner-loop bb.  */
 645           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 646           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 647             {
 648               gimple phi = gsi_stmt (si);
 649               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 650               loop_vec_info inner_loop_vinfo =
 651                 STMT_VINFO_LOOP_VINFO (stmt_info);
 652               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 653               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 654             }
 655           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 656            {
 657               gimple stmt = gsi_stmt (si);
 658               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 659               loop_vec_info inner_loop_vinfo =
 660                  STMT_VINFO_LOOP_VINFO (stmt_info);
 661               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 662               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 663            }
 664         }
 665       else
 666         {
 667           /* bb in current nest.  */
 668           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 669             {
 670               gimple phi = gsi_stmt (si);
 671               gimple_set_uid (phi, 0);
 672               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 673             }
 674
 675           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 676             {
 677               gimple stmt = gsi_stmt (si);
 678               gimple_set_uid (stmt, 0);
 679               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 680             }
 681         }
 682     }
 683
 684   /* CHECKME: We want to visit all BBs before their successors (except for
 685      latch blocks, for which this assertion wouldn't hold).  In the simple
 686      case of the loop forms we allow, a dfs order of the BBs would the same
 687      as reversed postorder traversal, so we are safe.  */
 688
 689    free (bbs);
 690    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 691    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 692                               bbs, loop->num_nodes, loop);
 693    gcc_assert (nbbs == loop->num_nodes);
 694
 695   LOOP_VINFO_BBS (res) = bbs;
 696   LOOP_VINFO_NITERS (res) = NULL;
 697   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 698   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 699   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 700   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
 701   LOOP_VINFO_VECT_FACTOR (res) = 0;
 702   LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
 703   LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
 704   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 705   LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
 706     VEC_alloc (gimple, heap,
 707                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 708   LOOP_VINFO_MAY_ALIAS_DDRS (res) =
 709     VEC_alloc (ddr_p, heap,
 710                PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 711   LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
 712   LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
 713   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 714
 715   return res;
 716 }
 717
 718
 719 /* Function destroy_loop_vec_info.
 720
 721    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 722    stmts in the loop.  */
 723
 724 void
 725 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 726 {
 727   struct loop *loop;
 728   basic_block *bbs;
 729   int nbbs;
 730   gimple_stmt_iterator si;
 731   int j;
 732   VEC (slp_instance, heap) *slp_instances;
 733   slp_instance instance;
 734
 735   if (!loop_vinfo)
 736     return;
 737
 738   loop = LOOP_VINFO_LOOP (loop_vinfo);
 739
 740   bbs = LOOP_VINFO_BBS (loop_vinfo);
 741   nbbs = loop->num_nodes;
 742
 743   if (!clean_stmts)
 744     {
 745       free (LOOP_VINFO_BBS (loop_vinfo));
 746       free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 747       free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 748       VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 749
 750       free (loop_vinfo);
 751       loop->aux = NULL;
 752       return;
 753     }
 754
 755   for (j = 0; j < nbbs; j++)
 756     {
 757       basic_block bb = bbs[j];
 758       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 759         free_stmt_vec_info (gsi_stmt (si));
 760
 761       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 762         {
 763           gimple stmt = gsi_stmt (si);
 764           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 765
 766           if (stmt_info)
 767             {
 768               /* Check if this is a "pattern stmt" (introduced by the
 769                  vectorizer during the pattern recognition pass).  */
 770               bool remove_stmt_p = false;
 771               gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
 772               if (orig_stmt)
 773                 {
 774                   stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
 775                   if (orig_stmt_info
 776                       && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
 777                     remove_stmt_p = true;
 778                 }
 779
 780               /* Free stmt_vec_info.  */
 781               free_stmt_vec_info (stmt);
 782
 783               /* Remove dead "pattern stmts".  */
 784               if (remove_stmt_p)
 785                 gsi_remove (&si, true);
 786             }
 787           gsi_next (&si);
 788         }
 789     }
 790
 791   free (LOOP_VINFO_BBS (loop_vinfo));
 792   free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 793   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 794   VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
 795   VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
 796   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
 797   for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
 798     vect_free_slp_instance (instance);
 799
 800   VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
 801   VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));
 802
 803   free (loop_vinfo);
 804   loop->aux = NULL;
 805 }
 806
 807
 808 /* Function vect_analyze_loop_1.
 809
 810    Apply a set of analyses on LOOP, and create a loop_vec_info struct
 811    for it. The different analyses will record information in the
 812    loop_vec_info struct.  This is a subset of the analyses applied in
 813    vect_analyze_loop, to be applied on an inner-loop nested in the loop
 814    that is now considered for (outer-loop) vectorization.  */
 815
 816 static loop_vec_info
 817 vect_analyze_loop_1 (struct loop *loop)
 818 {
 819   loop_vec_info loop_vinfo;
 820
 821   if (vect_print_dump_info (REPORT_DETAILS))
 822     fprintf (vect_dump, "===== analyze_loop_nest_1 =====");
 823
 824   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
 825
 826   loop_vinfo = vect_analyze_loop_form (loop);
 827   if (!loop_vinfo)
 828     {
 829       if (vect_print_dump_info (REPORT_DETAILS))
 830         fprintf (vect_dump, "bad inner-loop form.");
 831       return NULL;
 832     }
 833
 834   return loop_vinfo;
 835 }
 836
 837
 838 /* Function vect_analyze_loop_form.
 839
 840    Verify that certain CFG restrictions hold, including:
 841    - the loop has a pre-header
 842    - the loop has a single entry and exit
 843    - the loop exit condition is simple enough, and the number of iterations
 844      can be analyzed (a countable loop).  */
 845
 846 loop_vec_info
 847 vect_analyze_loop_form (struct loop *loop)
 848 {
 849   loop_vec_info loop_vinfo;
 850   gimple loop_cond;
 851   tree number_of_iterations = NULL;
 852   loop_vec_info inner_loop_vinfo = NULL;
 853
 854   if (vect_print_dump_info (REPORT_DETAILS))
 855     fprintf (vect_dump, "=== vect_analyze_loop_form ===");
 856
 857   /* Different restrictions apply when we are considering an inner-most loop,
 858      vs. an outer (nested) loop.
 859      (FORNOW. May want to relax some of these restrictions in the future).  */
 860
 861   if (!loop->inner)
 862     {
 863       /* Inner-most loop.  We currently require that the number of BBs is
 864          exactly 2 (the header and latch).  Vectorizable inner-most loops
 865          look like this:
 866
 867                         (pre-header)
 868                            |
 869                           header <--------+
 870                            | |            |
 871                            | +--> latch --+
 872                            |
 873                         (exit-bb)  */
 874
 875       if (loop->num_nodes != 2)
 876         {
 877           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 878             fprintf (vect_dump, "not vectorized: control flow in loop.");
 879           return NULL;
 880         }
 881
 882       if (empty_block_p (loop->header))
 883     {
 884           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 885             fprintf (vect_dump, "not vectorized: empty loop.");
 886       return NULL;
 887     }
 888     }
 889   else
 890     {
 891       struct loop *innerloop = loop->inner;
 892       edge backedge, entryedge;
 893
 894       /* Nested loop. We currently require that the loop is doubly-nested,
 895          contains a single inner loop, and the number of BBs is exactly 5.
 896          Vectorizable outer-loops look like this:
 897
 898                         (pre-header)
 899                            |
 900                           header <---+
 901                            |         |
 902                           inner-loop |
 903                            |         |
 904                           tail ------+
 905                            |
 906                         (exit-bb)
 907
 908          The inner-loop has the properties expected of inner-most loops
 909          as described above.  */
 910
 911       if ((loop->inner)->inner || (loop->inner)->next)
 912         {
 913           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 914             fprintf (vect_dump, "not vectorized: multiple nested loops.");
 915           return NULL;
 916         }
 917
 918       /* Analyze the inner-loop.  */
 919       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
 920       if (!inner_loop_vinfo)
 921         {
 922           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 923             fprintf (vect_dump, "not vectorized: Bad inner loop.");
 924           return NULL;
 925         }
 926
 927       if (!expr_invariant_in_loop_p (loop,
 928                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
 929         {
 930           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 931             fprintf (vect_dump,
 932                      "not vectorized: inner-loop count not invariant.");
 933           destroy_loop_vec_info (inner_loop_vinfo, true);
 934           return NULL;
 935         }
 936
 937       if (loop->num_nodes != 5)
 938         {
 939           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 940             fprintf (vect_dump, "not vectorized: control flow in loop.");
 941           destroy_loop_vec_info (inner_loop_vinfo, true);
 942           return NULL;
 943         }
 944
 945       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
 946       backedge = EDGE_PRED (innerloop->header, 1);
 947       entryedge = EDGE_PRED (innerloop->header, 0);
 948       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
 949         {
 950           backedge = EDGE_PRED (innerloop->header, 0);
 951           entryedge = EDGE_PRED (innerloop->header, 1);
 952         }
 953
 954       if (entryedge->src != loop->header
 955           || !single_exit (innerloop)
 956           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
 957         {
 958           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 959             fprintf (vect_dump, "not vectorized: unsupported outerloop form.");
 960           destroy_loop_vec_info (inner_loop_vinfo, true);
 961           return NULL;
 962         }
 963
 964       if (vect_print_dump_info (REPORT_DETAILS))
 965         fprintf (vect_dump, "Considering outer-loop vectorization.");
 966     }
 967
 968   if (!single_exit (loop)
 969       || EDGE_COUNT (loop->header->preds) != 2)
 970     {
 971       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 972         {
 973           if (!single_exit (loop))
 974             fprintf (vect_dump, "not vectorized: multiple exits.");
 975           else if (EDGE_COUNT (loop->header->preds) != 2)
 976             fprintf (vect_dump, "not vectorized: too many incoming edges.");
 977         }
 978       if (inner_loop_vinfo)
 979         destroy_loop_vec_info (inner_loop_vinfo, true);
 980       return NULL;
 981     }
 982
 983   /* We assume that the loop exit condition is at the end of the loop. i.e,
 984      that the loop is represented as a do-while (with a proper if-guard
 985      before the loop if needed), where the loop header contains all the
 986      executable statements, and the latch is empty.  */
 987   if (!empty_block_p (loop->latch)
 988         || phi_nodes (loop->latch))
 989     {
 990       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
 991         fprintf (vect_dump, "not vectorized: unexpected loop form.");
 992       if (inner_loop_vinfo)
 993         destroy_loop_vec_info (inner_loop_vinfo, true);
 994       return NULL;
 995     }
 996
 997   /* Make sure there exists a single-predecessor exit bb:  */
 998   if (!single_pred_p (single_exit (loop)->dest))
 999     {
1000       edge e = single_exit (loop);
1001       if (!(e->flags & EDGE_ABNORMAL))
1002         {
1003           split_loop_exit_edge (e);
1004           if (vect_print_dump_info (REPORT_DETAILS))
1005             fprintf (vect_dump, "split exit edge.");
1006         }
1007       else
1008         {
1009           if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1010             fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
1011           if (inner_loop_vinfo)
1012             destroy_loop_vec_info (inner_loop_vinfo, true);
1013           return NULL;
1014         }
1015     }
1016
1017   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1018   if (!loop_cond)
1019     {
1020       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1021         fprintf (vect_dump, "not vectorized: complicated exit condition.");
1022       if (inner_loop_vinfo)
1023         destroy_loop_vec_info (inner_loop_vinfo, true);
1024       return NULL;
1025     }
1026
1027   if (!number_of_iterations)
1028     {
1029       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1030         fprintf (vect_dump,
1031                  "not vectorized: number of iterations cannot be computed.");
1032       if (inner_loop_vinfo)
1033         destroy_loop_vec_info (inner_loop_vinfo, true);
1034       return NULL;
1035     }
1036
1037   if (chrec_contains_undetermined (number_of_iterations))
1038     {
1039       if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
1040         fprintf (vect_dump, "Infinite number of iterations.");
1041       if (inner_loop_vinfo)
1042         destroy_loop_vec_info (inner_loop_vinfo, true);
1043       return NULL;
1044     }
1045
1046   if (!NITERS_KNOWN_P (number_of_iterations))
1047     {
1048       if (vect_print_dump_info (REPORT_DETAILS))
1049         {
1050           fprintf (vect_dump, "Symbolic number of iterations is ");
1051           print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
1052         }
1053     }
1054   else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1055     {
1056       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1057         fprintf (vect_dump, "not vectorized: number of iterations = 0.");
1058       if (inner_loop_vinfo)
1059         destroy_loop_vec_info (inner_loop_vinfo, false);
1060       return NULL;
1061     }
1062
1063   loop_vinfo = new_loop_vec_info (loop);
1064   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1065   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1066
1067   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1068
1069   /* CHECKME: May want to keep it around it in the future.  */
1070   if (inner_loop_vinfo)
1071     destroy_loop_vec_info (inner_loop_vinfo, false);
1072
1073   gcc_assert (!loop->aux);
1074   loop->aux = loop_vinfo;
1075   return loop_vinfo;
1076 }
1077
1078
1079 /* Function vect_analyze_loop_operations.
1080
1081    Scan the loop stmts and make sure they are all vectorizable.  */
1082
1083 static bool
1084 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1085 {
1086   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1087   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1088   int nbbs = loop->num_nodes;
1089   gimple_stmt_iterator si;
1090   unsigned int vectorization_factor = 0;
1091   int i;
1092   gimple phi;
1093   stmt_vec_info stmt_info;
1094   bool need_to_vectorize = false;
1095   int min_profitable_iters;
1096   int min_scalar_loop_bound;
1097   unsigned int th;
1098   bool only_slp_in_loop = true, ok;
1099
1100   if (vect_print_dump_info (REPORT_DETAILS))
1101     fprintf (vect_dump, "=== vect_analyze_loop_operations ===");
1102
1103   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1104   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1105
1106   for (i = 0; i < nbbs; i++)
1107     {
1108       basic_block bb = bbs[i];
1109
1110       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1111         {
1112           phi = gsi_stmt (si);
1113           ok = true;
1114
1115           stmt_info = vinfo_for_stmt (phi);
1116           if (vect_print_dump_info (REPORT_DETAILS))
1117             {
1118               fprintf (vect_dump, "examining phi: ");
1119               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1120             }
1121
1122           if (! is_loop_header_bb_p (bb))
1123             {
1124               /* inner-loop loop-closed exit phi in outer-loop vectorization
1125                  (i.e. a phi in the tail of the outer-loop).
1126                  FORNOW: we currently don't support the case that these phis
1127                  are not used in the outerloop (unless it is double reduction,
1128                  i.e., this phi is vect_reduction_def), cause this case
1129                  requires to actually do something here.  */
1130               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1131                    || STMT_VINFO_LIVE_P (stmt_info))
1132                   && STMT_VINFO_DEF_TYPE (stmt_info)
1133                      != vect_double_reduction_def)
1134                 {
1135                   if (vect_print_dump_info (REPORT_DETAILS))
1136                     fprintf (vect_dump,
1137                              "Unsupported loop-closed phi in outer-loop.");
1138                   return false;
1139                 }
1140               continue;
1141             }
1142
1143           gcc_assert (stmt_info);
1144
1145           if (STMT_VINFO_LIVE_P (stmt_info))
1146             {
1147               /* FORNOW: not yet supported.  */
1148               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1149                 fprintf (vect_dump, "not vectorized: value used after loop.");
1150               return false;
1151             }
1152
1153           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1154               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1155             {
1156               /* A scalar-dependence cycle that we don't support.  */
1157               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1158                 fprintf (vect_dump, "not vectorized: scalar dependence cycle.");
1159               return false;
1160             }
1161
1162           if (STMT_VINFO_RELEVANT_P (stmt_info))
1163             {
1164               need_to_vectorize = true;
1165               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1166                 ok = vectorizable_induction (phi, NULL, NULL);
1167             }
1168
1169           if (!ok)
1170             {
1171               if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1172                 {
1173                   fprintf (vect_dump,
1174                            "not vectorized: relevant phi not supported: ");
1175                   print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
1176                 }
1177               return false;
1178             }
1179         }
1180
1181       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1182         {
1183           gimple stmt = gsi_stmt (si);
1184           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1185
1186           gcc_assert (stmt_info);
1187
1188           if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1189             return false;
1190
1191           if (STMT_VINFO_RELEVANT_P (stmt_info) && !PURE_SLP_STMT (stmt_info))
1192             /* STMT needs both SLP and loop-based vectorization.  */
1193             only_slp_in_loop = false;
1194         }
1195     } /* bbs */
1196
1197   /* All operations in the loop are either irrelevant (deal with loop
1198      control, or dead), or only used outside the loop and can be moved
1199      out of the loop (e.g. invariants, inductions).  The loop can be
1200      optimized away by scalar optimizations.  We're better off not
1201      touching this loop.  */
1202   if (!need_to_vectorize)
1203     {
1204       if (vect_print_dump_info (REPORT_DETAILS))
1205         fprintf (vect_dump,
1206                  "All the computation can be taken out of the loop.");
1207       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1208         fprintf (vect_dump,
1209                  "not vectorized: redundant loop. no profit to vectorize.");
1210       return false;
1211     }
1212
1213   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1214      vectorization factor of the loop is the unrolling factor required by the
1215      SLP instances.  If that unrolling factor is 1, we say, that we perform
1216      pure SLP on loop - cross iteration parallelism is not exploited.  */
1217   if (only_slp_in_loop)
1218     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1219   else
1220     vectorization_factor = least_common_multiple (vectorization_factor,
1221                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1222
1223   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1224
1225   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1226       && vect_print_dump_info (REPORT_DETAILS))
1227     fprintf (vect_dump,
1228         "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
1229         vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));
1230
1231   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1232       && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1233     {
1234       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1235         fprintf (vect_dump, "not vectorized: iteration count too small.");
1236       if (vect_print_dump_info (REPORT_DETAILS))
1237         fprintf (vect_dump,"not vectorized: iteration count smaller than "
1238                  "vectorization factor.");
1239       return false;
1240     }
1241
1242   /* Analyze cost. Decide if worth while to vectorize.  */
1243
1244   /* Once VF is set, SLP costs should be updated since the number of created
1245      vector stmts depends on VF.  */
1246   vect_update_slp_costs_according_to_vf (loop_vinfo);
1247
1248   min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
1249   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1250
1251   if (min_profitable_iters < 0)
1252     {
1253       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1254         fprintf (vect_dump, "not vectorized: vectorization not profitable.");
1255       if (vect_print_dump_info (REPORT_DETAILS))
1256         fprintf (vect_dump, "not vectorized: vector version will never be "
1257                  "profitable.");
1258       return false;
1259     }
1260
1261   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1262                             * vectorization_factor) - 1);
1263
1264   /* Use the cost model only if it is more conservative than user specified
1265      threshold.  */
1266
1267   th = (unsigned) min_scalar_loop_bound;
1268   if (min_profitable_iters
1269       && (!min_scalar_loop_bound
1270           || min_profitable_iters > min_scalar_loop_bound))
1271     th = (unsigned) min_profitable_iters;
1272
1273   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1274       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1275     {
1276       if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1277         fprintf (vect_dump, "not vectorized: vectorization not "
1278                  "profitable.");
1279       if (vect_print_dump_info (REPORT_DETAILS))
1280         fprintf (vect_dump, "not vectorized: iteration count smaller than "
1281                  "user specified loop bound parameter or minimum "
1282                  "profitable iterations (whichever is more conservative).");
1283       return false;
1284     }
1285
1286   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1287       || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1288       || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1289     {
1290       if (vect_print_dump_info (REPORT_DETAILS))
1291         fprintf (vect_dump, "epilog loop required.");
1292       if (!vect_can_advance_ivs_p (loop_vinfo))
1293         {
1294           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1295             fprintf (vect_dump,
1296                      "not vectorized: can't create epilog loop 1.");
1297           return false;
1298         }
1299       if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1300         {
1301           if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
1302             fprintf (vect_dump,
1303                      "not vectorized: can't create epilog loop 2.");
1304           return false;
1305         }
1306     }
1307
1308   return true;
1309 }
1310
1311
1312 /* Function vect_analyze_loop.
1313
1314    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1315    for it. The different analyses will record information in the
1316    loop_vec_info struct.  */
1317 loop_vec_info
1318 vect_analyze_loop (struct loop *loop)
1319 {
1320   bool ok;
1321   loop_vec_info loop_vinfo;
1322
1323   if (vect_print_dump_info (REPORT_DETAILS))
1324     fprintf (vect_dump, "===== analyze_loop_nest =====");
1325
1326   if (loop_outer (loop)
1327       && loop_vec_info_for_loop (loop_outer (loop))
1328       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1329     {
1330       if (vect_print_dump_info (REPORT_DETAILS))
1331         fprintf (vect_dump, "outer-loop already vectorized.");
1332       return NULL;
1333     }
1334
1335   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1336
1337   loop_vinfo = vect_analyze_loop_form (loop);
1338   if (!loop_vinfo)
1339     {
1340       if (vect_print_dump_info (REPORT_DETAILS))
1341         fprintf (vect_dump, "bad loop form.");
1342       return NULL;
1343     }
1344
1345   /* Find all data references in the loop (which correspond to vdefs/vuses)
1346      and analyze their evolution in the loop.
1347
1348      FORNOW: Handle only simple, array references, which
1349      alignment can be forced, and aligned pointer-references.  */
1350
1351   ok = vect_analyze_data_refs (loop_vinfo, NULL);
1352   if (!ok)
1353     {
1354       if (vect_print_dump_info (REPORT_DETAILS))
1355         fprintf (vect_dump, "bad data references.");
1356       destroy_loop_vec_info (loop_vinfo, true);
1357       return NULL;
1358     }
1359
1360   /* Classify all cross-iteration scalar data-flow cycles.
1361      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1362
1363   vect_analyze_scalar_cycles (loop_vinfo);
1364
1365   vect_pattern_recog (loop_vinfo);
1366
1367   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1368
1369   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1370   if (!ok)
1371     {
1372       if (vect_print_dump_info (REPORT_DETAILS))
1373         fprintf (vect_dump, "unexpected pattern.");
1374       destroy_loop_vec_info (loop_vinfo, true);
1375       return NULL;
1376     }
1377
1378   /* Analyze the alignment of the data-refs in the loop.
1379      Fail if a data reference is found that cannot be vectorized.  */
1380
1381   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1382   if (!ok)
1383     {
1384       if (vect_print_dump_info (REPORT_DETAILS))
1385         fprintf (vect_dump, "bad data alignment.");
1386       destroy_loop_vec_info (loop_vinfo, true);
1387       return NULL;
1388     }
1389
1390   ok = vect_determine_vectorization_factor (loop_vinfo);
1391   if (!ok)
1392     {
1393       if (vect_print_dump_info (REPORT_DETAILS))
1394         fprintf (vect_dump, "can't determine vectorization factor.");
1395       destroy_loop_vec_info (loop_vinfo, true);
1396       return NULL;
1397     }
1398
1399   /* Analyze data dependences between the data-refs in the loop.
1400      FORNOW: fail at the first data dependence that we encounter.  */
1401
1402   ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL);
1403   if (!ok)
1404     {
1405       if (vect_print_dump_info (REPORT_DETAILS))
1406         fprintf (vect_dump, "bad data dependence.");
1407       destroy_loop_vec_info (loop_vinfo, true);
1408       return NULL;
1409     }
1410
1411   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1412      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1413
1414   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1415   if (!ok)
1416     {
1417       if (vect_print_dump_info (REPORT_DETAILS))
1418         fprintf (vect_dump, "bad data access.");
1419       destroy_loop_vec_info (loop_vinfo, true);
1420       return NULL;
1421     }
1422
1423   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1424      It is important to call pruning after vect_analyze_data_ref_accesses,
1425      since we use grouping information gathered by interleaving analysis.  */
1426   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1427   if (!ok)
1428     {
1429       if (vect_print_dump_info (REPORT_DETAILS))
1430         fprintf (vect_dump, "too long list of versioning for alias "
1431                             "run-time tests.");
1432       destroy_loop_vec_info (loop_vinfo, true);
1433       return NULL;
1434     }
1435
1436   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1437   ok = vect_analyze_slp (loop_vinfo, NULL);
1438   if (ok)
1439     {
1440       /* Decide which possible SLP instances to SLP.  */
1441       vect_make_slp_decision (loop_vinfo);
1442
1443       /* Find stmts that need to be both vectorized and SLPed.  */
1444       vect_detect_hybrid_slp (loop_vinfo);
1445     }
1446
1447   /* This pass will decide on using loop versioning and/or loop peeling in
1448      order to enhance the alignment of data references in the loop.  */
1449
1450   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1451   if (!ok)
1452     {
1453       if (vect_print_dump_info (REPORT_DETAILS))
1454         fprintf (vect_dump, "bad data alignment.");
1455       destroy_loop_vec_info (loop_vinfo, true);
1456       return NULL;
1457     }
1458
1459   /* Scan all the operations in the loop and make sure they are
1460      vectorizable.  */
1461
1462   ok = vect_analyze_loop_operations (loop_vinfo);
1463   if (!ok)
1464     {
1465       if (vect_print_dump_info (REPORT_DETAILS))
1466         fprintf (vect_dump, "bad operation or unsupported loop bound.");
1467       destroy_loop_vec_info (loop_vinfo, true);
1468       return NULL;
1469     }
1470
1471   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1472
1473   return loop_vinfo;
1474 }
1475
1476
1477 /* Function reduction_code_for_scalar_code
1478
1479    Input:
1480    CODE - tree_code of a reduction operations.
1481
1482    Output:
1483    REDUC_CODE - the corresponding tree-code to be used to reduce the
1484       vector of partial results into a single scalar result (which
1485       will also reside in a vector) or ERROR_MARK if the operation is
1486       a supported reduction operation, but does not have such tree-code.
1487
1488    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1489
1490 static bool
1491 reduction_code_for_scalar_code (enum tree_code code,
1492                                 enum tree_code *reduc_code)
1493 {
1494   switch (code)
1495     {
1496       case MAX_EXPR:
1497         *reduc_code = REDUC_MAX_EXPR;
1498         return true;
1499
1500       case MIN_EXPR:
1501         *reduc_code = REDUC_MIN_EXPR;
1502         return true;
1503
1504       case PLUS_EXPR:
1505         *reduc_code = REDUC_PLUS_EXPR;
1506         return true;
1507
1508       case MULT_EXPR:
1509       case MINUS_EXPR:
1510       case BIT_IOR_EXPR:
1511       case BIT_XOR_EXPR:
1512       case BIT_AND_EXPR:
1513         *reduc_code = ERROR_MARK;
1514         return true;
1515
1516       default:
1517        return false;
1518     }
1519 }
1520
1521
1522 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1523    STMT is printed with a message MSG. */
1524
1525 static void
1526 report_vect_op (gimple stmt, const char *msg)
1527 {
1528   fprintf (vect_dump, "%s", msg);
1529   print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
1530 }
1531
1532
1533 /* Function vect_is_simple_reduction
1534
1535    (1) Detect a cross-iteration def-use cycle that represents a simple
1536    reduction computation. We look for the following pattern:
1537
1538    loop_header:
1539      a1 = phi < a0, a2 >
1540      a3 = ...
1541      a2 = operation (a3, a1)
1542
1543    such that:
1544    1. operation is commutative and associative and it is safe to
1545       change the order of the computation (if CHECK_REDUCTION is true)
1546    2. no uses for a2 in the loop (a2 is used out of the loop)
1547    3. no uses of a1 in the loop besides the reduction operation.
1548
1549    Condition 1 is tested here.
1550    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
1551
1552    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
1553    nested cycles, if CHECK_REDUCTION is false.
1554
1555    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
1556    reductions:
1557
1558      a1 = phi < a0, a2 >
1559      inner loop (def of a3)
1560      a2 = phi < a3 >
1561 */
1562
1563 gimple
1564 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
1565                           bool check_reduction, bool *double_reduc)
1566 {
1567   struct loop *loop = (gimple_bb (phi))->loop_father;
1568   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1569   edge latch_e = loop_latch_edge (loop);
1570   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1571   gimple def_stmt, def1, def2;
1572   enum tree_code code;
1573   tree op1, op2;
1574   tree type;
1575   int nloop_uses;
1576   tree name;
1577   imm_use_iterator imm_iter;
1578   use_operand_p use_p;
1579   bool phi_def;
1580
1581   *double_reduc = false;
1582
1583   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1584      otherwise, we assume outer loop vectorization.  */
1585   gcc_assert ((check_reduction && loop == vect_loop)
1586               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1587
1588   name = PHI_RESULT (phi);
1589   nloop_uses = 0;
1590   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1591     {
1592       gimple use_stmt = USE_STMT (use_p);
1593       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1594           && vinfo_for_stmt (use_stmt)
1595           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1596         nloop_uses++;
1597       if (nloop_uses > 1)
1598         {
1599           if (vect_print_dump_info (REPORT_DETAILS))
1600             fprintf (vect_dump, "reduction used in loop.");
1601           return NULL;
1602         }
1603     }
1604
1605   if (TREE_CODE (loop_arg) != SSA_NAME)
1606     {
1607       if (vect_print_dump_info (REPORT_DETAILS))
1608         {
1609           fprintf (vect_dump, "reduction: not ssa_name: ");
1610           print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1611         }
1612       return NULL;
1613     }
1614
1615   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1616   if (!def_stmt)
1617     {
1618       if (vect_print_dump_info (REPORT_DETAILS))
1619         fprintf (vect_dump, "reduction: no def_stmt.");
1620       return NULL;
1621     }
1622
1623   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1624     {
1625       if (vect_print_dump_info (REPORT_DETAILS))
1626         print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1627       return NULL;
1628     }
1629
1630   if (is_gimple_assign (def_stmt))
1631     {
1632       name = gimple_assign_lhs (def_stmt);
1633       phi_def = false;
1634     }
1635   else
1636     {
1637       name = PHI_RESULT (def_stmt);
1638       phi_def = true;
1639     }
1640
1641   nloop_uses = 0;
1642   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1643     {
1644       gimple use_stmt = USE_STMT (use_p);
1645       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1646           && vinfo_for_stmt (use_stmt)
1647           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1648         nloop_uses++;
1649       if (nloop_uses > 1)
1650         {
1651           if (vect_print_dump_info (REPORT_DETAILS))
1652             fprintf (vect_dump, "reduction used in loop.");
1653           return NULL;
1654         }
1655     }
1656
1657   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1658      defined in the inner loop.  */
1659   if (phi_def)
1660     {
1661       op1 = PHI_ARG_DEF (def_stmt, 0);
1662
1663       if (gimple_phi_num_args (def_stmt) != 1
1664           || TREE_CODE (op1) != SSA_NAME)
1665         {
1666           if (vect_print_dump_info (REPORT_DETAILS))
1667             fprintf (vect_dump, "unsupported phi node definition.");
1668
1669           return NULL;
1670         }
1671
1672       def1 = SSA_NAME_DEF_STMT (op1);
1673       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1674           && loop->inner
1675           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1676           && is_gimple_assign (def1))
1677         {
1678           if (vect_print_dump_info (REPORT_DETAILS))
1679             report_vect_op (def_stmt, "detected double reduction: ");
1680
1681           *double_reduc = true;
1682           return def_stmt;
1683         }
1684
1685       return NULL;
1686     }
1687
1688   code = gimple_assign_rhs_code (def_stmt);
1689
1690   if (check_reduction
1691       && (!commutative_tree_code (code) || !associative_tree_code (code)))
1692     {
1693       if (vect_print_dump_info (REPORT_DETAILS))
1694         report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1695       return NULL;
1696     }
1697
1698   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1699     {
1700       if (vect_print_dump_info (REPORT_DETAILS))
1701         report_vect_op (def_stmt, "reduction: not binary operation: ");
1702       return NULL;
1703     }
1704
1705   op1 = gimple_assign_rhs1 (def_stmt);
1706   op2 = gimple_assign_rhs2 (def_stmt);
1707   if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1708     {
1709       if (vect_print_dump_info (REPORT_DETAILS))
1710         report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1711       return NULL;
1712     }
1713
1714   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1715   if (TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op1))
1716       || TYPE_MAIN_VARIANT (type) != TYPE_MAIN_VARIANT (TREE_TYPE (op2)))
1717     {
1718       if (vect_print_dump_info (REPORT_DETAILS))
1719         {
1720           fprintf (vect_dump, "reduction: multiple types: operation type: ");
1721           print_generic_expr (vect_dump, type, TDF_SLIM);
1722           fprintf (vect_dump, ", operands types: ");
1723           print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1724           fprintf (vect_dump, ",");
1725           print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1726         }
1727       return NULL;
1728     }
1729
1730   /* Check that it's ok to change the order of the computation.
1731      Generally, when vectorizing a reduction we change the order of the
1732      computation.  This may change the behavior of the program in some
1733      cases, so we need to check that this is ok.  One exception is when
1734      vectorizing an outer-loop: the inner-loop is executed sequentially,
1735      and therefore vectorizing reductions in the inner-loop during
1736      outer-loop vectorization is safe.  */
1737
1738   /* CHECKME: check for !flag_finite_math_only too?  */
1739   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1740       && check_reduction)
1741     {
1742       /* Changing the order of operations changes the semantics.  */
1743       if (vect_print_dump_info (REPORT_DETAILS))
1744         report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1745       return NULL;
1746     }
1747   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1748            && check_reduction)
1749     {
1750       /* Changing the order of operations changes the semantics.  */
1751       if (vect_print_dump_info (REPORT_DETAILS))
1752         report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1753       return NULL;
1754     }
1755   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1756     {
1757       /* Changing the order of operations changes the semantics.  */
1758       if (vect_print_dump_info (REPORT_DETAILS))
1759         report_vect_op (def_stmt,
1760                         "reduction: unsafe fixed-point math optimization: ");
1761       return NULL;
1762     }
1763
1764   /* Reduction is safe. We're dealing with one of the following:
1765      1) integer arithmetic and no trapv
1766      2) floating point arithmetic, and special flags permit this optimization
1767      3) nested cycle (i.e., outer loop vectorization).  */
1768   def1 = SSA_NAME_DEF_STMT (op1);
1769   def2 = SSA_NAME_DEF_STMT (op2);
1770   if (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2))
1771     {
1772       if (vect_print_dump_info (REPORT_DETAILS))
1773         report_vect_op (def_stmt, "reduction: no defs for operands: ");
1774       return NULL;
1775     }
1776
1777   /* Check that one def is the reduction def, defined by PHI,
1778      the other def is either defined in the loop ("vect_internal_def"),
1779      or it's an induction (defined by a loop-header phi-node).  */
1780
1781   if (def2 == phi
1782       && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1783       && (is_gimple_assign (def1)
1784           || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) == vect_induction_def
1785           || (gimple_code (def1) == GIMPLE_PHI
1786               && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1787                   == vect_internal_def
1788               && !is_loop_header_bb_p (gimple_bb (def1)))))
1789     {
1790       if (vect_print_dump_info (REPORT_DETAILS))
1791         report_vect_op (def_stmt, "detected reduction: ");
1792       return def_stmt;
1793     }
1794   else if (def1 == phi
1795            && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1796            && (is_gimple_assign (def2)
1797                || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1798                     == vect_induction_def
1799                || (gimple_code (def2) == GIMPLE_PHI
1800                    && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1801                     == vect_internal_def
1802                    && !is_loop_header_bb_p (gimple_bb (def2)))))
1803     {
1804       if (check_reduction)
1805         {
1806           /* Swap operands (just for simplicity - so that the rest of the code
1807              can assume that the reduction variable is always the last (second)
1808              argument).  */
1809           if (vect_print_dump_info (REPORT_DETAILS))
1810             report_vect_op (def_stmt,
1811                             "detected reduction: need to swap operands: ");
1812
1813           swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
1814                               gimple_assign_rhs2_ptr (def_stmt));
1815         }
1816       else
1817         {
1818           if (vect_print_dump_info (REPORT_DETAILS))
1819             report_vect_op (def_stmt, "detected reduction: ");
1820         }
1821
1822       return def_stmt;
1823     }
1824   else
1825     {
1826       if (vect_print_dump_info (REPORT_DETAILS))
1827         report_vect_op (def_stmt, "reduction: unknown pattern: ");
1828
1829       return NULL;
1830     }
1831 }
1832
1833
1834 /* Function vect_estimate_min_profitable_iters
1835
1836    Return the number of iterations required for the vector version of the
1837    loop to be profitable relative to the cost of the scalar version of the
1838    loop.
1839
1840    TODO: Take profile info into account before making vectorization
1841    decisions, if available.  */
1842
1843 int
1844 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
1845 {
1846   int i;
1847   int min_profitable_iters;
1848   int peel_iters_prologue;
1849   int peel_iters_epilogue;
1850   int vec_inside_cost = 0;
1851   int vec_outside_cost = 0;
1852   int scalar_single_iter_cost = 0;
1853   int scalar_outside_cost = 0;
1854   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1855   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1856   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1857   int nbbs = loop->num_nodes;
1858   int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
1859   int peel_guard_costs = 0;
1860   int innerloop_iters = 0, factor;
1861   VEC (slp_instance, heap) *slp_instances;
1862   slp_instance instance;
1863
1864   /* Cost model disabled.  */
1865   if (!flag_vect_cost_model)
1866     {
1867       if (vect_print_dump_info (REPORT_COST))
1868         fprintf (vect_dump, "cost model disabled.");
1869       return 0;
1870     }
1871
1872   /* Requires loop versioning tests to handle misalignment.  */
1873   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1874     {
1875       /*  FIXME: Make cost depend on complexity of individual check.  */
1876       vec_outside_cost +=
1877         VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1878       if (vect_print_dump_info (REPORT_COST))
1879         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
1880                  "versioning to treat misalignment.\n");
1881     }
1882
1883   /* Requires loop versioning with alias checks.  */
1884   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1885     {
1886       /*  FIXME: Make cost depend on complexity of individual check.  */
1887       vec_outside_cost +=
1888         VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
1889       if (vect_print_dump_info (REPORT_COST))
1890         fprintf (vect_dump, "cost model: Adding cost of checks for loop "
1891                  "versioning aliasing.\n");
1892     }
1893
1894   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
1895       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1896     vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
1897
1898   /* Count statements in scalar loop.  Using this as scalar cost for a single
1899      iteration for now.
1900
1901      TODO: Add outer loop support.
1902
1903      TODO: Consider assigning different costs to different scalar
1904      statements.  */
1905
1906   /* FORNOW.  */
1907   if (loop->inner)
1908     innerloop_iters = 50; /* FIXME */
1909
1910   for (i = 0; i < nbbs; i++)
1911     {
1912       gimple_stmt_iterator si;
1913       basic_block bb = bbs[i];
1914
1915       if (bb->loop_father == loop->inner)
1916         factor = innerloop_iters;
1917       else
1918         factor = 1;
1919
1920       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1921         {
1922           gimple stmt = gsi_stmt (si);
1923           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1924           /* Skip stmts that are not vectorized inside the loop.  */
1925           if (!STMT_VINFO_RELEVANT_P (stmt_info)
1926               && (!STMT_VINFO_LIVE_P (stmt_info)
1927                   || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
1928             continue;
1929           scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
1930           vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
1931           /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
1932              some of the "outside" costs are generated inside the outer-loop.  */
1933           vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
1934         }
1935     }
1936
1937   /* Add additional cost for the peeled instructions in prologue and epilogue
1938      loop.
1939
1940      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
1941      at compile-time - we assume it's vf/2 (the worst would be vf-1).
1942
1943      TODO: Build an expression that represents peel_iters for prologue and
1944      epilogue to be used in a run-time test.  */
1945
1946   if (byte_misalign < 0)
1947     {
1948       peel_iters_prologue = vf/2;
1949       if (vect_print_dump_info (REPORT_COST))
1950         fprintf (vect_dump, "cost model: "
1951                  "prologue peel iters set to vf/2.");
1952
1953       /* If peeling for alignment is unknown, loop bound of main loop becomes
1954          unknown.  */
1955       peel_iters_epilogue = vf/2;
1956       if (vect_print_dump_info (REPORT_COST))
1957         fprintf (vect_dump, "cost model: "
1958                  "epilogue peel iters set to vf/2 because "
1959                  "peeling for alignment is unknown .");
1960
1961       /* If peeled iterations are unknown, count a taken branch and a not taken
1962          branch per peeled loop. Even if scalar loop iterations are known,
1963          vector iterations are not known since peeled prologue iterations are
1964          not known. Hence guards remain the same.  */
1965       peel_guard_costs +=  2 * (TARG_COND_TAKEN_BRANCH_COST
1966                               + TARG_COND_NOT_TAKEN_BRANCH_COST);
1967     }
1968   else
1969     {
1970       if (byte_misalign)
1971         {
1972           struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1973           int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
1974           tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
1975           int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1976
1977           peel_iters_prologue = nelements - (byte_misalign / element_size);
1978         }
1979       else
1980         peel_iters_prologue = 0;
1981
1982       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1983         {
1984           peel_iters_epilogue = vf/2;
1985           if (vect_print_dump_info (REPORT_COST))
1986             fprintf (vect_dump, "cost model: "
1987                      "epilogue peel iters set to vf/2 because "
1988                      "loop iterations are unknown .");
1989
1990           /* If peeled iterations are known but number of scalar loop
1991              iterations are unknown, count a taken branch per peeled loop.  */
1992           peel_guard_costs +=  2 * TARG_COND_TAKEN_BRANCH_COST;
1993
1994         }
1995       else
1996         {
1997           int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
1998           peel_iters_prologue = niters < peel_iters_prologue ?
1999                                         niters : peel_iters_prologue;
2000           peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2001         }
2002     }
2003
2004   vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2005                       + (peel_iters_epilogue * scalar_single_iter_cost)
2006                       + peel_guard_costs;
2007
2008   /* FORNOW: The scalar outside cost is incremented in one of the
2009      following ways:
2010
2011      1. The vectorizer checks for alignment and aliasing and generates
2012      a condition that allows dynamic vectorization.  A cost model
2013      check is ANDED with the versioning condition.  Hence scalar code
2014      path now has the added cost of the versioning check.
2015
2016        if (cost > th & versioning_check)
2017          jmp to vector code
2018
2019      Hence run-time scalar is incremented by not-taken branch cost.
2020
2021      2. The vectorizer then checks if a prologue is required.  If the
2022      cost model check was not done before during versioning, it has to
2023      be done before the prologue check.
2024
2025        if (cost <= th)
2026          prologue = scalar_iters
2027        if (prologue == 0)
2028          jmp to vector code
2029        else
2030          execute prologue
2031        if (prologue == num_iters)
2032          go to exit
2033
2034      Hence the run-time scalar cost is incremented by a taken branch,
2035      plus a not-taken branch, plus a taken branch cost.
2036
2037      3. The vectorizer then checks if an epilogue is required.  If the
2038      cost model check was not done before during prologue check, it
2039      has to be done with the epilogue check.
2040
2041        if (prologue == 0)
2042          jmp to vector code
2043        else
2044          execute prologue
2045        if (prologue == num_iters)
2046          go to exit
2047        vector code:
2048          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2049            jmp to epilogue
2050
2051      Hence the run-time scalar cost should be incremented by 2 taken
2052      branches.
2053
2054      TODO: The back end may reorder the BBS's differently and reverse
2055      conditions/branch directions.  Change the estimates below to
2056      something more reasonable.  */
2057
2058   /* If the number of iterations is known and we do not do versioning, we can
2059      decide whether to vectorize at compile time. Hence the scalar version
2060      do not carry cost model guard costs.  */
2061   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2062       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2063       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2064     {
2065       /* Cost model check occurs at versioning.  */
2066       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2067           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2068         scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
2069       else
2070         {
2071           /* Cost model check occurs at prologue generation.  */
2072           if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2073             scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
2074               + TARG_COND_NOT_TAKEN_BRANCH_COST;
2075           /* Cost model check occurs at epilogue generation.  */
2076           else
2077             scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
2078         }
2079     }
2080
2081   /* Add SLP costs.  */
2082   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2083   for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
2084     {
2085       vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2086       vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2087     }
2088
2089   /* Calculate number of iterations required to make the vector version
2090      profitable, relative to the loop bodies only. The following condition
2091      must hold true:
2092      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2093      where
2094      SIC = scalar iteration cost, VIC = vector iteration cost,
2095      VOC = vector outside cost, VF = vectorization factor,
2096      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2097      SOC = scalar outside cost for run time cost model check.  */
2098
2099   if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2100     {
2101       if (vec_outside_cost <= 0)
2102         min_profitable_iters = 1;
2103       else
2104         {
2105           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2106                                   - vec_inside_cost * peel_iters_prologue
2107                                   - vec_inside_cost * peel_iters_epilogue)
2108                                  / ((scalar_single_iter_cost * vf)
2109                                     - vec_inside_cost);
2110
2111           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2112               <= ((vec_inside_cost * min_profitable_iters)
2113                   + ((vec_outside_cost - scalar_outside_cost) * vf)))
2114             min_profitable_iters++;
2115         }
2116     }
2117   /* vector version will never be profitable.  */
2118   else
2119     {
2120       if (vect_print_dump_info (REPORT_COST))
2121         fprintf (vect_dump, "cost model: vector iteration cost = %d "
2122                  "is divisible by scalar iteration cost = %d by a factor "
2123                  "greater than or equal to the vectorization factor = %d .",
2124                  vec_inside_cost, scalar_single_iter_cost, vf);
2125       return -1;
2126     }
2127
2128   if (vect_print_dump_info (REPORT_COST))
2129     {
2130       fprintf (vect_dump, "Cost model analysis: \n");
2131       fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
2132                vec_inside_cost);
2133       fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
2134                vec_outside_cost);
2135       fprintf (vect_dump, "  Scalar iteration cost: %d\n",
2136                scalar_single_iter_cost);
2137       fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
2138       fprintf (vect_dump, "  prologue iterations: %d\n",
2139                peel_iters_prologue);
2140       fprintf (vect_dump, "  epilogue iterations: %d\n",
2141                peel_iters_epilogue);
2142       fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
2143                min_profitable_iters);
2144     }
2145
2146   min_profitable_iters =
2147         min_profitable_iters < vf ? vf : min_profitable_iters;
2148
2149   /* Because the condition we create is:
2150      if (niters <= min_profitable_iters)
2151        then skip the vectorized loop.  */
2152   min_profitable_iters--;
2153
2154   if (vect_print_dump_info (REPORT_COST))
2155     fprintf (vect_dump, "  Profitability threshold = %d\n",
2156              min_profitable_iters);
2157
2158   return min_profitable_iters;
2159 }
2160
2161
2162 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2163    functions. Design better to avoid maintenance issues.  */
2164
2165 /* Function vect_model_reduction_cost.
2166
2167    Models cost for a reduction operation, including the vector ops
2168    generated within the strip-mine loop, the initial definition before
2169    the loop, and the epilogue code that must be generated.  */
2170
2171 static bool
2172 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2173                            int ncopies)
2174 {
2175   int outer_cost = 0;
2176   enum tree_code code;
2177   optab optab;
2178   tree vectype;
2179   gimple stmt, orig_stmt;
2180   tree reduction_op;
2181   enum machine_mode mode;
2182   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2183   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2184
2185
2186   /* Cost of reduction op inside loop.  */
2187   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
2188
2189   stmt = STMT_VINFO_STMT (stmt_info);
2190
2191   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2192     {
2193     case GIMPLE_SINGLE_RHS:
2194       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2195       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2196       break;
2197     case GIMPLE_UNARY_RHS:
2198       reduction_op = gimple_assign_rhs1 (stmt);
2199       break;
2200     case GIMPLE_BINARY_RHS:
2201       reduction_op = gimple_assign_rhs2 (stmt);
2202       break;
2203     default:
2204       gcc_unreachable ();
2205     }
2206
2207   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2208   if (!vectype)
2209     {
2210       if (vect_print_dump_info (REPORT_COST))
2211         {
2212           fprintf (vect_dump, "unsupported data-type ");
2213           print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2214         }
2215       return false;
2216    }
2217
2218   mode = TYPE_MODE (vectype);
2219   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2220
2221   if (!orig_stmt)
2222     orig_stmt = STMT_VINFO_STMT (stmt_info);
2223
2224   code = gimple_assign_rhs_code (orig_stmt);
2225
2226   /* Add in cost for initial definition.  */
2227   outer_cost += TARG_SCALAR_TO_VEC_COST;
2228
2229   /* Determine cost of epilogue code.
2230
2231      We have a reduction operator that will reduce the vector in one statement.
2232      Also requires scalar extract.  */
2233
2234   if (!nested_in_vect_loop_p (loop, orig_stmt))
2235     {
2236       if (reduc_code != ERROR_MARK)
2237         outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
2238       else
2239         {
2240           int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2241           tree bitsize =
2242             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2243           int element_bitsize = tree_low_cst (bitsize, 1);
2244           int nelements = vec_size_in_bits / element_bitsize;
2245
2246           optab = optab_for_tree_code (code, vectype, optab_default);
2247
2248           /* We have a whole vector shift available.  */
2249           if (VECTOR_MODE_P (mode)
2250               && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
2251               && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2252             /* Final reduction via vector shifts and the reduction operator. Also
2253                requires scalar extract.  */
2254             outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
2255                                 + TARG_VEC_TO_SCALAR_COST);
2256           else
2257             /* Use extracts and reduction op for final reduction.  For N elements,
2258                we have N extracts and N-1 reduction ops.  */
2259             outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
2260         }
2261     }
2262
2263   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2264
2265   if (vect_print_dump_info (REPORT_COST))
2266     fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2267              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2268              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2269
2270   return true;
2271 }
2272
2273
2274 /* Function vect_model_induction_cost.
2275
2276    Models cost for induction operations.  */
2277
2278 static void
2279 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2280 {
2281   /* loop cost for vec_loop.  */
2282   STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
2283   /* prologue cost for vec_init and vec_step.  */
2284   STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
2285
2286   if (vect_print_dump_info (REPORT_COST))
2287     fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2288              "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2289              STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2290 }
2291
2292
2293 /* Function get_initial_def_for_induction
2294
2295    Input:
2296    STMT - a stmt that performs an induction operation in the loop.
2297    IV_PHI - the initial value of the induction variable
2298
2299    Output:
2300    Return a vector variable, initialized with the first VF values of
2301    the induction variable. E.g., for an iv with IV_PHI='X' and
2302    evolution S, for a vector of 4 units, we want to return:
2303    [X, X + S, X + 2*S, X + 3*S].  */
2304
2305 static tree
2306 get_initial_def_for_induction (gimple iv_phi)
2307 {
2308   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2309   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2310   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2311   tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
2312   tree vectype;
2313   int nunits;
2314   edge pe = loop_preheader_edge (loop);
2315   struct loop *iv_loop;
2316   basic_block new_bb;
2317   tree vec, vec_init, vec_step, t;
2318   tree access_fn;
2319   tree new_var;
2320   tree new_name;
2321   gimple init_stmt, induction_phi, new_stmt;
2322   tree induc_def, vec_def, vec_dest;
2323   tree init_expr, step_expr;
2324   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2325   int i;
2326   bool ok;
2327   int ncopies;
2328   tree expr;
2329   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2330   bool nested_in_vect_loop = false;
2331   gimple_seq stmts = NULL;
2332   imm_use_iterator imm_iter;
2333   use_operand_p use_p;
2334   gimple exit_phi;
2335   edge latch_e;
2336   tree loop_arg;
2337   gimple_stmt_iterator si;
2338   basic_block bb = gimple_bb (iv_phi);
2339   tree stepvectype;
2340
2341   vectype = get_vectype_for_scalar_type (scalar_type);
2342   gcc_assert (vectype);
2343   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2344   ncopies = vf / nunits;
2345
2346   gcc_assert (phi_info);
2347   gcc_assert (ncopies >= 1);
2348
2349   /* Find the first insertion point in the BB.  */
2350   si = gsi_after_labels (bb);
2351
2352   if (INTEGRAL_TYPE_P (scalar_type))
2353     step_expr = build_int_cst (scalar_type, 0);
2354   else if (POINTER_TYPE_P (scalar_type))
2355     step_expr = build_int_cst (sizetype, 0);
2356   else
2357     step_expr = build_real (scalar_type, dconst0);
2358
2359   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
2360   if (nested_in_vect_loop_p (loop, iv_phi))
2361     {
2362       nested_in_vect_loop = true;
2363       iv_loop = loop->inner;
2364     }
2365   else
2366     iv_loop = loop;
2367   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2368
2369   latch_e = loop_latch_edge (iv_loop);
2370   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2371
2372   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2373   gcc_assert (access_fn);
2374   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2375                                     &init_expr, &step_expr);
2376   gcc_assert (ok);
2377   pe = loop_preheader_edge (iv_loop);
2378
2379   /* Create the vector that holds the initial_value of the induction.  */
2380   if (nested_in_vect_loop)
2381     {
2382       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
2383          been created during vectorization of previous stmts; We obtain it from
2384          the STMT_VINFO_VEC_STMT of the defining stmt. */
2385       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2386                                            loop_preheader_edge (iv_loop));
2387       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2388     }
2389   else
2390     {
2391       /* iv_loop is the loop to be vectorized. Create:
2392          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
2393       new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2394       add_referenced_var (new_var);
2395
2396       new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2397       if (stmts)
2398         {
2399           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2400           gcc_assert (!new_bb);
2401         }
2402
2403       t = NULL_TREE;
2404       t = tree_cons (NULL_TREE, init_expr, t);
2405       for (i = 1; i < nunits; i++)
2406         {
2407           /* Create: new_name_i = new_name + step_expr  */
2408           enum tree_code code = POINTER_TYPE_P (scalar_type)
2409                                 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2410           init_stmt = gimple_build_assign_with_ops (code, new_var,
2411                                                     new_name, step_expr);
2412           new_name = make_ssa_name (new_var, init_stmt);
2413           gimple_assign_set_lhs (init_stmt, new_name);
2414
2415           new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2416           gcc_assert (!new_bb);
2417
2418           if (vect_print_dump_info (REPORT_DETAILS))
2419             {
2420               fprintf (vect_dump, "created new init_stmt: ");
2421               print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2422             }
2423           t = tree_cons (NULL_TREE, new_name, t);
2424         }
2425       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
2426       vec = build_constructor_from_list (vectype, nreverse (t));
2427       vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2428     }
2429
2430
2431   /* Create the vector that holds the step of the induction.  */
2432   if (nested_in_vect_loop)
2433     /* iv_loop is nested in the loop to be vectorized. Generate:
2434        vec_step = [S, S, S, S]  */
2435     new_name = step_expr;
2436   else
2437     {
2438       /* iv_loop is the loop to be vectorized. Generate:
2439           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
2440       expr = build_int_cst (TREE_TYPE (step_expr), vf);
2441       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2442                               expr, step_expr);
2443     }
2444
2445   t = NULL_TREE;
2446   for (i = 0; i < nunits; i++)
2447     t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2448   gcc_assert (CONSTANT_CLASS_P (new_name));
2449   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2450   gcc_assert (stepvectype);
2451   vec = build_vector (stepvectype, t);
2452   vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2453
2454
2455   /* Create the following def-use cycle:
2456      loop prolog:
2457          vec_init = ...
2458          vec_step = ...
2459      loop:
2460          vec_iv = PHI <vec_init, vec_loop>
2461          ...
2462          STMT
2463          ...
2464          vec_loop = vec_iv + vec_step;  */
2465
2466   /* Create the induction-phi that defines the induction-operand.  */
2467   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2468   add_referenced_var (vec_dest);
2469   induction_phi = create_phi_node (vec_dest, iv_loop->header);
2470   set_vinfo_for_stmt (induction_phi,
2471                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2472   induc_def = PHI_RESULT (induction_phi);
2473
2474   /* Create the iv update inside the loop  */
2475   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2476                                            induc_def, vec_step);
2477   vec_def = make_ssa_name (vec_dest, new_stmt);
2478   gimple_assign_set_lhs (new_stmt, vec_def);
2479   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2480   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2481                                                    NULL));
2482
2483   /* Set the arguments of the phi node:  */
2484   add_phi_arg (induction_phi, vec_init, pe);
2485   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop));
2486
2487
2488   /* In case that vectorization factor (VF) is bigger than the number
2489      of elements that we can fit in a vectype (nunits), we have to generate
2490      more than one vector stmt - i.e - we need to "unroll" the
2491      vector stmt by a factor VF/nunits.  For more details see documentation
2492      in vectorizable_operation.  */
2493
2494   if (ncopies > 1)
2495     {
2496       stmt_vec_info prev_stmt_vinfo;
2497       /* FORNOW. This restriction should be relaxed.  */
2498       gcc_assert (!nested_in_vect_loop);
2499
2500       /* Create the vector that holds the step of the induction.  */
2501       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2502       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2503                               expr, step_expr);
2504       t = NULL_TREE;
2505       for (i = 0; i < nunits; i++)
2506         t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2507       gcc_assert (CONSTANT_CLASS_P (new_name));
2508       vec = build_vector (stepvectype, t);
2509       vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2510
2511       vec_def = induc_def;
2512       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2513       for (i = 1; i < ncopies; i++)
2514         {
2515           /* vec_i = vec_prev + vec_step  */
2516           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2517                                                    vec_def, vec_step);
2518           vec_def = make_ssa_name (vec_dest, new_stmt);
2519           gimple_assign_set_lhs (new_stmt, vec_def);
2520
2521           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2522           set_vinfo_for_stmt (new_stmt,
2523                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2524           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2525           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2526         }
2527     }
2528
2529   if (nested_in_vect_loop)
2530     {
2531       /* Find the loop-closed exit-phi of the induction, and record
2532          the final vector of induction results:  */
2533       exit_phi = NULL;
2534       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2535         {
2536           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2537             {
2538               exit_phi = USE_STMT (use_p);
2539               break;
2540             }
2541         }
2542       if (exit_phi)
2543         {
2544           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2545           /* FORNOW. Currently not supporting the case that an inner-loop induction
2546              is not used in the outer-loop (i.e. only outside the outer-loop).  */
2547           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2548                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
2549
2550           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2551           if (vect_print_dump_info (REPORT_DETAILS))
2552             {
2553               fprintf (vect_dump, "vector of inductions after inner-loop:");
2554               print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2555             }
2556         }
2557     }
2558
2559
2560   if (vect_print_dump_info (REPORT_DETAILS))
2561     {
2562       fprintf (vect_dump, "transform induction: created def-use cycle: ");
2563       print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2564       fprintf (vect_dump, "\n");
2565       print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2566     }
2567
2568   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2569   return induc_def;
2570 }
2571
2572
2573 /* Function get_initial_def_for_reduction
2574
2575    Input:
2576    STMT - a stmt that performs a reduction operation in the loop.
2577    INIT_VAL - the initial value of the reduction variable
2578
2579    Output:
2580    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2581         of the reduction (used for adjusting the epilog - see below).
2582    Return a vector variable, initialized according to the operation that STMT
2583         performs. This vector will be used as the initial value of the
2584         vector of partial results.
2585
2586    Option1 (adjust in epilog): Initialize the vector as follows:
2587      add/bit or/xor: [0,0,...,0,0]
2588      mult/bit and:   [1,1,...,1,1]
2589      min/max:        [init_val,init_val,..,init_val,init_val]
2590    and when necessary (e.g. add/mult case) let the caller know
2591    that it needs to adjust the result by init_val.
2592
2593    Option2: Initialize the vector as follows:
2594      add/bit or/xor: [init_val,0,0,...,0]
2595      mult/bit and:   [init_val,1,1,...,1]
2596      min/max:        [init_val,init_val,...,init_val]
2597    and no adjustments are needed.
2598
2599    For example, for the following code:
2600
2601    s = init_val;
2602    for (i=0;i<n;i++)
2603      s = s + a[i];
2604
2605    STMT is 's = s + a[i]', and the reduction variable is 's'.
2606    For a vector of 4 units, we want to return either [0,0,0,init_val],
2607    or [0,0,0,0] and let the caller know that it needs to adjust
2608    the result at the end by 'init_val'.
2609
2610    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
2611    initialization vector is simpler (same element in all entries), if
2612    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
2613
2614    A cost model should help decide between these two schemes.  */
2615
2616 tree
2617 get_initial_def_for_reduction (gimple stmt, tree init_val,
2618                                tree *adjustment_def)
2619 {
2620   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2621   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2622   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2623   tree scalar_type = TREE_TYPE (init_val);
2624   tree vectype = get_vectype_for_scalar_type (scalar_type);
2625   int nunits;
2626   enum tree_code code = gimple_assign_rhs_code (stmt);
2627   tree def_for_init;
2628   tree init_def;
2629   tree t = NULL_TREE;
2630   int i;
2631   bool nested_in_vect_loop = false;
2632   tree init_value;
2633   REAL_VALUE_TYPE real_init_val = dconst0;
2634   int int_init_val = 0;
2635
2636   gcc_assert (vectype);
2637   nunits = TYPE_VECTOR_SUBPARTS (vectype);
2638
2639   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2640               || SCALAR_FLOAT_TYPE_P (scalar_type));
2641
2642   if (nested_in_vect_loop_p (loop, stmt))
2643     nested_in_vect_loop = true;
2644   else
2645     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2646
2647   /* In case of double reduction we only create a vector variable to be put
2648      in the reduction phi node. The actual statement creation is done in
2649      vect_create_epilog_for_reduction.  */
2650   if (TREE_CODE (init_val) == SSA_NAME
2651       && vinfo_for_stmt (SSA_NAME_DEF_STMT (init_val))
2652       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SSA_NAME_DEF_STMT (init_val)))
2653           == vect_double_reduction_def)
2654     {
2655       *adjustment_def = NULL;
2656       return vect_create_destination_var (init_val, vectype);
2657     }
2658
2659   if (TREE_CONSTANT (init_val))
2660     {
2661       if (SCALAR_FLOAT_TYPE_P (scalar_type))
2662         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2663       else
2664         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
2665     }
2666   else
2667     init_value = init_val;
2668
2669   switch (code)
2670     {
2671       case WIDEN_SUM_EXPR:
2672       case DOT_PROD_EXPR:
2673       case PLUS_EXPR:
2674       case MINUS_EXPR:
2675       case BIT_IOR_EXPR:
2676       case BIT_XOR_EXPR:
2677       case MULT_EXPR:
2678       case BIT_AND_EXPR:
2679         /* ADJUSMENT_DEF is NULL when called from
2680            vect_create_epilog_for_reduction to vectorize double reduction.  */
2681         if (adjustment_def)
2682           {
2683             if (nested_in_vect_loop)
2684               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
2685                                                               NULL);
2686             else
2687               *adjustment_def = init_val;
2688           }
2689
2690         if (code == MULT_EXPR || code == BIT_AND_EXPR)
2691           {
2692             real_init_val = dconst1;
2693             int_init_val = 1;
2694           }
2695
2696         if (SCALAR_FLOAT_TYPE_P (scalar_type))
2697           def_for_init = build_real (scalar_type, real_init_val);
2698         else
2699           def_for_init = build_int_cst (scalar_type, int_init_val);
2700
2701         /* Create a vector of '0' or '1' except the first element.  */
2702         for (i = nunits - 2; i >= 0; --i)
2703           t = tree_cons (NULL_TREE, def_for_init, t);
2704
2705         /* Option1: the first element is '0' or '1' as well.  */
2706         if (adjustment_def)
2707           {
2708             t = tree_cons (NULL_TREE, def_for_init, t);
2709             init_def = build_vector (vectype, t);
2710             break;
2711           }
2712
2713         /* Option2: the first element is INIT_VAL.  */
2714         t = tree_cons (NULL_TREE, init_value, t);
2715         if (TREE_CONSTANT (init_val))
2716           init_def = build_vector (vectype, t);
2717         else
2718           init_def = build_constructor_from_list (vectype, t);
2719
2720         break;
2721
2722       case MIN_EXPR:
2723       case MAX_EXPR:
2724         if (adjustment_def)
2725           {
2726             *adjustment_def = NULL_TREE;
2727             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2728             break;
2729           }
2730
2731         for (i = nunits - 1; i >= 0; --i)
2732           t = tree_cons (NULL_TREE, init_value, t);
2733
2734         if (TREE_CONSTANT (init_val))
2735           init_def = build_vector (vectype, t);
2736         else
2737           init_def = build_constructor_from_list (vectype, t);
2738
2739         break;
2740
2741       default:
2742         gcc_unreachable ();
2743     }
2744
2745   return init_def;
2746 }
2747
2748
2749 /* Function vect_create_epilog_for_reduction
2750
2751    Create code at the loop-epilog to finalize the result of a reduction
2752    computation.
2753
2754    VECT_DEF is a vector of partial results.
2755    REDUC_CODE is the tree-code for the epilog reduction.
2756    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2757      number of elements that we can fit in a vectype (nunits). In this case
2758      we have to generate more than one vector stmt - i.e - we need to "unroll"
2759      the vector stmt by a factor VF/nunits.  For more details see documentation
2760      in vectorizable_operation.
2761    STMT is the scalar reduction stmt that is being vectorized.
2762    REDUCTION_PHI is the phi-node that carries the reduction computation.
2763    REDUC_INDEX is the index of the operand in the right hand side of the
2764      statement that is defined by REDUCTION_PHI.
2765    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
2766
2767    This function:
2768    1. Creates the reduction def-use cycle: sets the arguments for
2769       REDUCTION_PHI:
2770       The loop-entry argument is the vectorized initial-value of the reduction.
2771       The loop-latch argument is VECT_DEF - the vector of partial sums.
2772    2. "Reduces" the vector of partial results VECT_DEF into a single result,
2773       by applying the operation specified by REDUC_CODE if available, or by
2774       other means (whole-vector shifts or a scalar loop).
2775       The function also creates a new phi node at the loop exit to preserve
2776       loop-closed form, as illustrated below.
2777
2778      The flow at the entry to this function:
2779
2780         loop:
2781           vec_def = phi <null, null>            # REDUCTION_PHI
2782           VECT_DEF = vector_stmt                # vectorized form of STMT
2783           s_loop = scalar_stmt                  # (scalar) STMT
2784         loop_exit:
2785           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
2786           use <s_out0>
2787           use <s_out0>
2788
2789      The above is transformed by this function into:
2790
2791         loop:
2792           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
2793           VECT_DEF = vector_stmt                # vectorized form of STMT
2794           s_loop = scalar_stmt                  # (scalar) STMT
2795         loop_exit:
2796           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
2797           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
2798           v_out2 = reduce <v_out1>
2799           s_out3 = extract_field <v_out2, 0>
2800           s_out4 = adjust_result <s_out3>
2801           use <s_out4>
2802           use <s_out4>
2803 */
2804
2805 static void
2806 vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2807                                   int ncopies,
2808                                   enum tree_code reduc_code,
2809                                   gimple reduction_phi,
2810                                   int reduc_index,
2811                                   bool double_reduc)
2812 {
2813   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2814   stmt_vec_info prev_phi_info;
2815   tree vectype;
2816   enum machine_mode mode;
2817   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2818   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
2819   basic_block exit_bb;
2820   tree scalar_dest;
2821   tree scalar_type;
2822   gimple new_phi = NULL, phi;
2823   gimple_stmt_iterator exit_gsi;
2824   tree vec_dest;
2825   tree new_temp = NULL_TREE;
2826   tree new_name;
2827   gimple epilog_stmt = NULL;
2828   tree new_scalar_dest, new_dest;
2829   gimple exit_phi;
2830   tree bitsize, bitpos, bytesize;
2831   enum tree_code code = gimple_assign_rhs_code (stmt);
2832   tree adjustment_def;
2833   tree vec_initial_def, def;
2834   tree orig_name;
2835   imm_use_iterator imm_iter;
2836   use_operand_p use_p;
2837   bool extract_scalar_result = false;
2838   tree reduction_op, expr;
2839   gimple orig_stmt;
2840   gimple use_stmt;
2841   bool nested_in_vect_loop = false;
2842   VEC(gimple,heap) *phis = NULL;
2843   enum vect_def_type dt = vect_unknown_def_type;
2844   int j, i;
2845
2846   if (nested_in_vect_loop_p (loop, stmt))
2847     {
2848       outer_loop = loop;
2849       loop = loop->inner;
2850       nested_in_vect_loop = true;
2851     }
2852
2853   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2854     {
2855     case GIMPLE_SINGLE_RHS:
2856       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
2857                                        == ternary_op);
2858       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
2859       break;
2860     case GIMPLE_UNARY_RHS:
2861       reduction_op = gimple_assign_rhs1 (stmt);
2862       break;
2863     case GIMPLE_BINARY_RHS:
2864       reduction_op = reduc_index ?
2865                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
2866       break;
2867     default:
2868       gcc_unreachable ();
2869     }
2870
2871   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2872   gcc_assert (vectype);
2873   mode = TYPE_MODE (vectype);
2874
2875   /*** 1. Create the reduction def-use cycle  ***/
2876
2877   /* For the case of reduction, vect_get_vec_def_for_operand returns
2878      the scalar def before the loop, that defines the initial value
2879      of the reduction variable.  */
2880   vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2881                                                   &adjustment_def);
2882
2883   phi = reduction_phi;
2884   def = vect_def;
2885   for (j = 0; j < ncopies; j++)
2886     {
2887       /* 1.1 set the loop-entry arg of the reduction-phi:  */
2888       add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop));
2889
2890       /* 1.2 set the loop-latch arg for the reduction-phi:  */
2891       if (j > 0)
2892         def = vect_get_vec_def_for_stmt_copy (dt, def);
2893       add_phi_arg (phi, def, loop_latch_edge (loop));
2894
2895       if (vect_print_dump_info (REPORT_DETAILS))
2896         {
2897           fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2898           print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2899           fprintf (vect_dump, "\n");
2900           print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2901         }
2902
2903       phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
2904     }
2905
2906   /*** 2. Create epilog code
2907           The reduction epilog code operates across the elements of the vector
2908           of partial results computed by the vectorized loop.
2909           The reduction epilog code consists of:
2910           step 1: compute the scalar result in a vector (v_out2)
2911           step 2: extract the scalar result (s_out3) from the vector (v_out2)
2912           step 3: adjust the scalar result (s_out3) if needed.
2913
2914           Step 1 can be accomplished using one the following three schemes:
2915           (scheme 1) using reduc_code, if available.
2916           (scheme 2) using whole-vector shifts, if available.
2917           (scheme 3) using a scalar loop. In this case steps 1+2 above are
2918                      combined.
2919
2920           The overall epilog code looks like this:
2921
2922           s_out0 = phi <s_loop>         # original EXIT_PHI
2923           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
2924           v_out2 = reduce <v_out1>              # step 1
2925           s_out3 = extract_field <v_out2, 0>    # step 2
2926           s_out4 = adjust_result <s_out3>       # step 3
2927
2928           (step 3 is optional, and steps 1 and 2 may be combined).
2929           Lastly, the uses of s_out0 are replaced by s_out4.
2930
2931           ***/
2932
2933   /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2934         v_out1 = phi <v_loop>  */
2935
2936   exit_bb = single_exit (loop)->dest;
2937   def = vect_def;
2938   prev_phi_info = NULL;
2939   for (j = 0; j < ncopies; j++)
2940     {
2941       phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
2942       set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
2943       if (j == 0)
2944         new_phi = phi;
2945       else
2946         {
2947           def = vect_get_vec_def_for_stmt_copy (dt, def);
2948           STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
2949         }
2950       SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
2951       prev_phi_info = vinfo_for_stmt (phi);
2952     }
2953
2954   exit_gsi = gsi_after_labels (exit_bb);
2955
2956   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
2957          (i.e. when reduc_code is not available) and in the final adjustment
2958          code (if needed).  Also get the original scalar reduction variable as
2959          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
2960          represents a reduction pattern), the tree-code and scalar-def are
2961          taken from the original stmt that the pattern-stmt (STMT) replaces.
2962          Otherwise (it is a regular reduction) - the tree-code and scalar-def
2963          are taken from STMT.  */
2964
2965   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2966   if (!orig_stmt)
2967     {
2968       /* Regular reduction  */
2969       orig_stmt = stmt;
2970     }
2971   else
2972     {
2973       /* Reduction pattern  */
2974       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
2975       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
2976       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
2977     }
2978
2979   code = gimple_assign_rhs_code (orig_stmt);
2980   scalar_dest = gimple_assign_lhs (orig_stmt);
2981   scalar_type = TREE_TYPE (scalar_dest);
2982   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
2983   bitsize = TYPE_SIZE (scalar_type);
2984   bytesize = TYPE_SIZE_UNIT (scalar_type);
2985
2986   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
2987      partial results are added and not subtracted.  */
2988   if (code == MINUS_EXPR)
2989     code = PLUS_EXPR;
2990
2991   /* In case this is a reduction in an inner-loop while vectorizing an outer
2992      loop - we don't need to extract a single scalar result at the end of the
2993      inner-loop (unless it is double reduction, i.e., the use of reduction is
2994      outside the outer-loop). The final vector of partial results will be used
2995      in the vectorized outer-loop, or reduced to a scalar result at the end of
2996      the outer-loop.  */
2997   if (nested_in_vect_loop && !double_reduc)
2998     goto vect_finalize_reduction;
2999
3000   /* The epilogue is created for the outer-loop, i.e., for the loop being
3001      vectorized.  */
3002   if (double_reduc)
3003     loop = outer_loop;
3004
3005   /* FORNOW */
3006   gcc_assert (ncopies == 1);
3007
3008   /* 2.3 Create the reduction code, using one of the three schemes described
3009          above.  */
3010
3011   if (reduc_code != ERROR_MARK)
3012     {
3013       tree tmp;
3014
3015       /*** Case 1:  Create:
3016            v_out2 = reduc_expr <v_out1>  */
3017
3018       if (vect_print_dump_info (REPORT_DETAILS))
3019         fprintf (vect_dump, "Reduce using direct vector reduction.");
3020
3021       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3022       tmp = build1 (reduc_code, vectype,  PHI_RESULT (new_phi));
3023       epilog_stmt = gimple_build_assign (vec_dest, tmp);
3024       new_temp = make_ssa_name (vec_dest, epilog_stmt);
3025       gimple_assign_set_lhs (epilog_stmt, new_temp);
3026       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3027
3028       extract_scalar_result = true;
3029     }
3030   else
3031     {
3032       enum tree_code shift_code = ERROR_MARK;
3033       bool have_whole_vector_shift = true;
3034       int bit_offset;
3035       int element_bitsize = tree_low_cst (bitsize, 1);
3036       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3037       tree vec_temp;
3038
3039       if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
3040         shift_code = VEC_RSHIFT_EXPR;
3041       else
3042         have_whole_vector_shift = false;
3043
3044       /* Regardless of whether we have a whole vector shift, if we're
3045          emulating the operation via tree-vect-generic, we don't want
3046          to use it.  Only the first round of the reduction is likely
3047          to still be profitable via emulation.  */
3048       /* ??? It might be better to emit a reduction tree code here, so that
3049          tree-vect-generic can expand the first round via bit tricks.  */
3050       if (!VECTOR_MODE_P (mode))
3051         have_whole_vector_shift = false;
3052       else
3053         {
3054           optab optab = optab_for_tree_code (code, vectype, optab_default);
3055           if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
3056             have_whole_vector_shift = false;
3057         }
3058
3059       if (have_whole_vector_shift)
3060         {
3061           /*** Case 2: Create:
3062              for (offset = VS/2; offset >= element_size; offset/=2)
3063                 {
3064                   Create:  va' = vec_shift <va, offset>
3065                   Create:  va = vop <va, va'>
3066                 }  */
3067
3068           if (vect_print_dump_info (REPORT_DETAILS))
3069             fprintf (vect_dump, "Reduce using vector shifts");
3070
3071           vec_dest = vect_create_destination_var (scalar_dest, vectype);
3072           new_temp = PHI_RESULT (new_phi);
3073
3074           for (bit_offset = vec_size_in_bits/2;
3075                bit_offset >= element_bitsize;
3076                bit_offset /= 2)
3077             {
3078               tree bitpos = size_int (bit_offset);
3079
3080               epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
3081                                                           new_temp, bitpos);
3082               new_name = make_ssa_name (vec_dest, epilog_stmt);
3083               gimple_assign_set_lhs (epilog_stmt, new_name);
3084               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3085
3086               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3087                                                           new_name, new_temp);
3088               new_temp = make_ssa_name (vec_dest, epilog_stmt);
3089               gimple_assign_set_lhs (epilog_stmt, new_temp);
3090               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3091             }
3092
3093           extract_scalar_result = true;
3094         }
3095       else
3096         {
3097           tree rhs;
3098
3099           /*** Case 3: Create:
3100              s = extract_field <v_out2, 0>
3101              for (offset = element_size;
3102                   offset < vector_size;
3103                   offset += element_size;)
3104                {
3105                  Create:  s' = extract_field <v_out2, offset>
3106                  Create:  s = op <s, s'>
3107                }  */
3108
3109           if (vect_print_dump_info (REPORT_DETAILS))
3110             fprintf (vect_dump, "Reduce using scalar code. ");
3111
3112           vec_temp = PHI_RESULT (new_phi);
3113           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3114           rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3115                          bitsize_zero_node);
3116           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3117           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3118           gimple_assign_set_lhs (epilog_stmt, new_temp);
3119           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3120
3121           for (bit_offset = element_bitsize;
3122                bit_offset < vec_size_in_bits;
3123                bit_offset += element_bitsize)
3124             {
3125               tree bitpos = bitsize_int (bit_offset);
3126               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3127                                  bitpos);
3128
3129               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3130               new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3131               gimple_assign_set_lhs (epilog_stmt, new_name);
3132               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3133
3134               epilog_stmt = gimple_build_assign_with_ops (code,
3135                                                           new_scalar_dest,
3136                                                           new_name, new_temp);
3137               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3138               gimple_assign_set_lhs (epilog_stmt, new_temp);
3139               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3140             }
3141
3142           extract_scalar_result = false;
3143         }
3144     }
3145
3146   /* 2.4  Extract the final scalar result.  Create:
3147          s_out3 = extract_field <v_out2, bitpos>  */
3148
3149   if (extract_scalar_result)
3150     {
3151       tree rhs;
3152
3153       gcc_assert (!nested_in_vect_loop || double_reduc);
3154       if (vect_print_dump_info (REPORT_DETAILS))
3155         fprintf (vect_dump, "extract scalar result");
3156
3157       if (BYTES_BIG_ENDIAN)
3158         bitpos = size_binop (MULT_EXPR,
3159                        bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3160                        TYPE_SIZE (scalar_type));
3161       else
3162         bitpos = bitsize_zero_node;
3163
3164       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3165       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3166       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3167       gimple_assign_set_lhs (epilog_stmt, new_temp);
3168       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3169     }
3170
3171 vect_finalize_reduction:
3172
3173   if (double_reduc)
3174     loop = loop->inner;
3175
3176   /* 2.5 Adjust the final result by the initial value of the reduction
3177          variable. (When such adjustment is not needed, then
3178          'adjustment_def' is zero).  For example, if code is PLUS we create:
3179          new_temp = loop_exit_def + adjustment_def  */
3180
3181   if (adjustment_def)
3182     {
3183       if (nested_in_vect_loop)
3184         {
3185           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3186           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3187           new_dest = vect_create_destination_var (scalar_dest, vectype);
3188         }
3189       else
3190         {
3191           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3192           expr = build2 (code, scalar_type, new_temp, adjustment_def);
3193           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3194         }
3195
3196       epilog_stmt = gimple_build_assign (new_dest, expr);
3197       new_temp = make_ssa_name (new_dest, epilog_stmt);
3198       gimple_assign_set_lhs (epilog_stmt, new_temp);
3199       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3200       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3201     }
3202
3203
3204   /* 2.6  Handle the loop-exit phi  */
3205
3206   /* Replace uses of s_out0 with uses of s_out3:
3207      Find the loop-closed-use at the loop exit of the original scalar result.
3208      (The reduction result is expected to have two immediate uses - one at the
3209      latch block, and one at the loop exit).  */
3210   phis = VEC_alloc (gimple, heap, 10);
3211   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3212     {
3213       if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3214         {
3215           exit_phi = USE_STMT (use_p);
3216           VEC_quick_push (gimple, phis, exit_phi);
3217         }
3218     }
3219
3220   /* We expect to have found an exit_phi because of loop-closed-ssa form.  */
3221   gcc_assert (!VEC_empty (gimple, phis));
3222
3223   for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
3224     {
3225       if (nested_in_vect_loop)
3226         {
3227           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3228           gimple vect_phi;
3229
3230           /* FORNOW. Currently not supporting the case that an inner-loop
3231              reduction is not used in the outer-loop (but only outside the
3232              outer-loop), unless it is double reduction.  */
3233           gcc_assert ((STMT_VINFO_RELEVANT_P (stmt_vinfo)
3234                       && !STMT_VINFO_LIVE_P (stmt_vinfo)) || double_reduc);
3235
3236           epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
3237           STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
3238           set_vinfo_for_stmt (epilog_stmt,
3239                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
3240                                                  NULL));
3241           if (adjustment_def)
3242             STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3243                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3244
3245           if (!double_reduc
3246               || STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_double_reduction_def)
3247             continue;
3248
3249           /* Handle double reduction:
3250
3251              stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
3252              stmt2:   s3 = phi <s1, s4> - (regular) reduction phi (inner loop)
3253              stmt3:   s4 = use (s3)     - (regular) reduction stmt (inner loop)
3254              stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
3255
3256              At that point the regular reduction (stmt2 and stmt3) is already
3257              vectorized, as well as the exit phi node, stmt4.
3258              Here we vectorize the phi node of double reduction, stmt1, and
3259              update all relevant statements.  */
3260
3261           /* Go through all the uses of s2 to find double reduction phi node,
3262              i.e., stmt1 above.  */
3263           orig_name = PHI_RESULT (exit_phi);
3264           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3265             {
3266               stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3267               stmt_vec_info new_phi_vinfo;
3268               tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3269               basic_block bb = gimple_bb (use_stmt);
3270               gimple use;
3271
3272               /* Check that USE_STMT is really double reduction phi node.  */
3273               if (gimple_code (use_stmt) != GIMPLE_PHI
3274                   || gimple_phi_num_args (use_stmt) != 2
3275                   || !use_stmt_vinfo
3276                   || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3277                       != vect_double_reduction_def
3278                   || bb->loop_father != outer_loop)
3279                 continue;
3280
3281               /* Create vector phi node for double reduction:
3282                  vs1 = phi <vs0, vs2>
3283                  vs1 was created previously in this function by a call to
3284                  vect_get_vec_def_for_operand and is stored in vec_initial_def;
3285                  vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3286                  vs0 is created here.  */
3287
3288               /* Create vector phi node.  */
3289               vect_phi = create_phi_node (vec_initial_def, bb);
3290               new_phi_vinfo = new_stmt_vec_info (vect_phi,
3291                                     loop_vec_info_for_loop (outer_loop), NULL);
3292               set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3293
3294               /* Create vs0 - initial def of the double reduction phi.  */
3295               preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3296                                              loop_preheader_edge (outer_loop));
3297               init_def = get_initial_def_for_reduction (stmt, preheader_arg,
3298                                                         NULL);
3299               vect_phi_init = vect_init_vector (use_stmt, init_def, vectype,
3300                                                 NULL);
3301
3302               /* Update phi node arguments with vs0 and vs2.  */
3303               add_phi_arg (vect_phi, vect_phi_init,
3304                            loop_preheader_edge (outer_loop));
3305               add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3306                            loop_latch_edge (outer_loop));
3307               if (vect_print_dump_info (REPORT_DETAILS))
3308                 {
3309                   fprintf (vect_dump, "created double reduction phi node: ");
3310                   print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3311                 }
3312
3313               vect_phi_res = PHI_RESULT (vect_phi);
3314
3315               /* Replace the use, i.e., set the correct vs1 in the regular
3316                  reduction phi node. FORNOW, NCOPIES is always 1, so the loop
3317                  is redundant.  */
3318               use = reduction_phi;
3319               for (j = 0; j < ncopies; j++)
3320                 {
3321                   edge pr_edge = loop_preheader_edge (loop);
3322                   SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3323                   use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3324                 }
3325             }
3326         }
3327
3328       /* Replace the uses:  */
3329       orig_name = PHI_RESULT (exit_phi);
3330       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3331         FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3332           SET_USE (use_p, new_temp);
3333     }
3334
3335   VEC_free (gimple, heap, phis);
3336 }
3337
3338
3339 /* Function vectorizable_reduction.
3340
3341    Check if STMT performs a reduction operation that can be vectorized.
3342    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3343    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3344    Return FALSE if not a vectorizable STMT, TRUE otherwise.
3345
3346    This function also handles reduction idioms (patterns) that have been
3347    recognized in advance during vect_pattern_recog. In this case, STMT may be
3348    of this form:
3349      X = pattern_expr (arg0, arg1, ..., X)
3350    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
3351    sequence that had been detected and replaced by the pattern-stmt (STMT).
3352
3353    In some cases of reduction patterns, the type of the reduction variable X is
3354    different than the type of the other arguments of STMT.
3355    In such cases, the vectype that is used when transforming STMT into a vector
3356    stmt is different than the vectype that is used to determine the
3357    vectorization factor, because it consists of a different number of elements
3358    than the actual number of elements that are being operated upon in parallel.
3359
3360    For example, consider an accumulation of shorts into an int accumulator.
3361    On some targets it's possible to vectorize this pattern operating on 8
3362    shorts at a time (hence, the vectype for purposes of determining the
3363    vectorization factor should be V8HI); on the other hand, the vectype that
3364    is used to create the vector form is actually V4SI (the type of the result).
3365
3366    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3367    indicates what is the actual level of parallelism (V8HI in the example), so
3368    that the right vectorization factor would be derived. This vectype
3369    corresponds to the type of arguments to the reduction stmt, and should *NOT*
3370    be used to create the vectorized stmt. The right vectype for the vectorized
3371    stmt is obtained from the type of the result X:
3372         get_vectype_for_scalar_type (TREE_TYPE (X))
3373
3374    This means that, contrary to "regular" reductions (or "regular" stmts in
3375    general), the following equation:
3376       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3377    does *NOT* necessarily hold for reduction patterns.  */
3378
3379 bool
3380 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3381                         gimple *vec_stmt)
3382 {
3383   tree vec_dest;
3384   tree scalar_dest;
3385   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3386   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3387   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3388   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3389   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3390   enum tree_code code, orig_code, epilog_reduc_code;
3391   enum machine_mode vec_mode;
3392   int op_type;
3393   optab optab, reduc_optab;
3394   tree new_temp = NULL_TREE;
3395   tree def;
3396   gimple def_stmt;
3397   enum vect_def_type dt;
3398   gimple new_phi = NULL;
3399   tree scalar_type;
3400   bool is_simple_use;
3401   gimple orig_stmt;
3402   stmt_vec_info orig_stmt_info;
3403   tree expr = NULL_TREE;
3404   int i;
3405   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3406   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3407   int epilog_copies;
3408   stmt_vec_info prev_stmt_info, prev_phi_info;
3409   gimple first_phi = NULL;
3410   bool single_defuse_cycle = false;
3411   tree reduc_def;
3412   gimple new_stmt = NULL;
3413   int j;
3414   tree ops[3];
3415   bool nested_cycle = false, found_nested_cycle_def = false;
3416   gimple reduc_def_stmt = NULL;
3417   /* The default is that the reduction variable is the last in statement.  */
3418   int reduc_index = 2;
3419   bool double_reduc = false, dummy;
3420   basic_block def_bb;
3421   struct loop * def_stmt_loop;
3422   tree def_arg;
3423
3424   if (nested_in_vect_loop_p (loop, stmt))
3425     {
3426       loop = loop->inner;
3427       nested_cycle = true;
3428     }
3429
3430   gcc_assert (ncopies >= 1);
3431
3432   /* FORNOW: SLP not supported.  */
3433   if (STMT_SLP_TYPE (stmt_info))
3434     return false;
3435
3436   /* 1. Is vectorizable reduction?  */
3437   /* Not supportable if the reduction variable is used in the loop.  */
3438   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3439     return false;
3440
3441   /* Reductions that are not used even in an enclosing outer-loop,
3442      are expected to be "live" (used out of the loop).  */
3443   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3444       && !STMT_VINFO_LIVE_P (stmt_info))
3445     return false;
3446
3447   /* Make sure it was already recognized as a reduction computation.  */
3448   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3449       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3450     return false;
3451
3452   /* 2. Has this been recognized as a reduction pattern?
3453
3454      Check if STMT represents a pattern that has been recognized
3455      in earlier analysis stages.  For stmts that represent a pattern,
3456      the STMT_VINFO_RELATED_STMT field records the last stmt in
3457      the original sequence that constitutes the pattern.  */
3458
3459   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3460   if (orig_stmt)
3461     {
3462       orig_stmt_info = vinfo_for_stmt (orig_stmt);
3463       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3464       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3465       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3466     }
3467
3468   /* 3. Check the operands of the operation. The first operands are defined
3469         inside the loop body. The last operand is the reduction variable,
3470         which is defined by the loop-header-phi.  */
3471
3472   gcc_assert (is_gimple_assign (stmt));
3473
3474   /* Flatten RHS */
3475   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3476     {
3477     case GIMPLE_SINGLE_RHS:
3478       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3479       if (op_type == ternary_op)
3480         {
3481           tree rhs = gimple_assign_rhs1 (stmt);
3482           ops[0] = TREE_OPERAND (rhs, 0);
3483           ops[1] = TREE_OPERAND (rhs, 1);
3484           ops[2] = TREE_OPERAND (rhs, 2);
3485           code = TREE_CODE (rhs);
3486         }
3487       else
3488         return false;
3489       break;
3490
3491     case GIMPLE_BINARY_RHS:
3492       code = gimple_assign_rhs_code (stmt);
3493       op_type = TREE_CODE_LENGTH (code);
3494       gcc_assert (op_type == binary_op);
3495       ops[0] = gimple_assign_rhs1 (stmt);
3496       ops[1] = gimple_assign_rhs2 (stmt);
3497       break;
3498
3499     case GIMPLE_UNARY_RHS:
3500       return false;
3501
3502     default:
3503       gcc_unreachable ();
3504     }
3505
3506   scalar_dest = gimple_assign_lhs (stmt);
3507   scalar_type = TREE_TYPE (scalar_dest);
3508   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
3509       && !SCALAR_FLOAT_TYPE_P (scalar_type))
3510     return false;
3511
3512   /* All uses but the last are expected to be defined in the loop.
3513      The last use is the reduction variable. In case of nested cycle this
3514      assumption is not true: we use reduc_index to record the index of the
3515      reduction variable.  */
3516   for (i = 0; i < op_type-1; i++)
3517     {
3518       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3519                                           &def, &dt);
3520       gcc_assert (is_simple_use);
3521       if (dt != vect_internal_def
3522           && dt != vect_external_def
3523           && dt != vect_constant_def
3524           && dt != vect_induction_def
3525           && dt != vect_nested_cycle)
3526         return false;
3527
3528       if (dt == vect_nested_cycle)
3529         {
3530           found_nested_cycle_def = true;
3531           reduc_def_stmt = def_stmt;
3532           reduc_index = i;
3533         }
3534     }
3535
3536   is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3537                                       &def, &dt);
3538   gcc_assert (is_simple_use);
3539   gcc_assert (dt == vect_reduction_def
3540               || dt == vect_nested_cycle
3541               || ((dt == vect_internal_def || dt == vect_external_def
3542                    || dt == vect_constant_def || dt == vect_induction_def)
3543                    && nested_cycle && found_nested_cycle_def));
3544   if (!found_nested_cycle_def)
3545     reduc_def_stmt = def_stmt;
3546
3547   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
3548   if (orig_stmt)
3549     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
3550                                                        reduc_def_stmt,
3551                                                        !nested_cycle,
3552                                                        &dummy));
3553   else
3554     gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
3555                                                   !nested_cycle, &dummy));
3556
3557   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
3558     return false;
3559
3560   /* 4. Supportable by target?  */
3561
3562   /* 4.1. check support for the operation in the loop  */
3563   optab = optab_for_tree_code (code, vectype, optab_default);
3564   if (!optab)
3565     {
3566       if (vect_print_dump_info (REPORT_DETAILS))
3567         fprintf (vect_dump, "no optab.");
3568       return false;
3569     }
3570   vec_mode = TYPE_MODE (vectype);
3571   if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
3572     {
3573       if (vect_print_dump_info (REPORT_DETAILS))
3574         fprintf (vect_dump, "op not supported by target.");
3575       if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3576           || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3577              < vect_min_worthwhile_factor (code))
3578         return false;
3579       if (vect_print_dump_info (REPORT_DETAILS))
3580         fprintf (vect_dump, "proceeding using word mode.");
3581     }
3582
3583   /* Worthwhile without SIMD support?  */
3584   if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3585       && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3586          < vect_min_worthwhile_factor (code))
3587     {
3588       if (vect_print_dump_info (REPORT_DETAILS))
3589         fprintf (vect_dump, "not worthwhile without SIMD support.");
3590       return false;
3591     }
3592
3593   /* 4.2. Check support for the epilog operation.
3594
3595           If STMT represents a reduction pattern, then the type of the
3596           reduction variable may be different than the type of the rest
3597           of the arguments.  For example, consider the case of accumulation
3598           of shorts into an int accumulator; The original code:
3599                         S1: int_a = (int) short_a;
3600           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
3601
3602           was replaced with:
3603                         STMT: int_acc = widen_sum <short_a, int_acc>
3604
3605           This means that:
3606           1. The tree-code that is used to create the vector operation in the
3607              epilog code (that reduces the partial results) is not the
3608              tree-code of STMT, but is rather the tree-code of the original
3609              stmt from the pattern that STMT is replacing. I.e, in the example
3610              above we want to use 'widen_sum' in the loop, but 'plus' in the
3611              epilog.
3612           2. The type (mode) we use to check available target support
3613              for the vector operation to be created in the *epilog*, is
3614              determined by the type of the reduction variable (in the example
3615              above we'd check this: plus_optab[vect_int_mode]).
3616              However the type (mode) we use to check available target support
3617              for the vector operation to be created *inside the loop*, is
3618              determined by the type of the other arguments to STMT (in the
3619              example we'd check this: widen_sum_optab[vect_short_mode]).
3620
3621           This is contrary to "regular" reductions, in which the types of all
3622           the arguments are the same as the type of the reduction variable.
3623           For "regular" reductions we can therefore use the same vector type
3624           (and also the same tree-code) when generating the epilog code and
3625           when generating the code inside the loop.  */
3626
3627   if (orig_stmt)
3628     {
3629       /* This is a reduction pattern: get the vectype from the type of the
3630          reduction variable, and get the tree-code from orig_stmt.  */
3631       orig_code = gimple_assign_rhs_code (orig_stmt);
3632       vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
3633       if (!vectype)
3634         {
3635           if (vect_print_dump_info (REPORT_DETAILS))
3636             {
3637               fprintf (vect_dump, "unsupported data-type ");
3638               print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
3639             }
3640           return false;
3641         }
3642
3643       vec_mode = TYPE_MODE (vectype);
3644     }
3645   else
3646     {
3647       /* Regular reduction: use the same vectype and tree-code as used for
3648          the vector code inside the loop can be used for the epilog code. */
3649       orig_code = code;
3650     }
3651
3652   if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
3653     return false;
3654
3655   reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype,
3656                                      optab_default);
3657   if (!reduc_optab)
3658     {
3659       if (vect_print_dump_info (REPORT_DETAILS))
3660         fprintf (vect_dump, "no optab for reduction.");
3661       epilog_reduc_code = ERROR_MARK;
3662     }
3663
3664   if (reduc_optab
3665       && optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing)
3666     {
3667       if (vect_print_dump_info (REPORT_DETAILS))
3668         fprintf (vect_dump, "reduc op not supported by target.");
3669       epilog_reduc_code = ERROR_MARK;
3670     }
3671
3672   def_bb = gimple_bb (reduc_def_stmt);
3673   def_stmt_loop = def_bb->loop_father;
3674   def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
3675                                    loop_preheader_edge (def_stmt_loop));
3676   if (TREE_CODE (def_arg) == SSA_NAME
3677       && vinfo_for_stmt (SSA_NAME_DEF_STMT (def_arg))
3678       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (SSA_NAME_DEF_STMT (def_arg)))
3679           == vect_double_reduction_def)
3680     double_reduc = true;
3681
3682   if (double_reduc && ncopies > 1)
3683     {
3684       if (vect_print_dump_info (REPORT_DETAILS))
3685         fprintf (vect_dump, "multiple types in double reduction");
3686
3687       return false;
3688     }
3689
3690   if (!vec_stmt) /* transformation not required.  */
3691     {
3692       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3693       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
3694         return false;
3695       return true;
3696     }
3697
3698   /** Transform.  **/
3699
3700   if (vect_print_dump_info (REPORT_DETAILS))
3701     fprintf (vect_dump, "transform reduction.");
3702
3703   /* Create the destination vector  */
3704   vec_dest = vect_create_destination_var (scalar_dest, vectype);
3705
3706   /* In case the vectorization factor (VF) is bigger than the number
3707      of elements that we can fit in a vectype (nunits), we have to generate
3708      more than one vector stmt - i.e - we need to "unroll" the
3709      vector stmt by a factor VF/nunits.  For more details see documentation
3710      in vectorizable_operation.  */
3711
3712   /* If the reduction is used in an outer loop we need to generate
3713      VF intermediate results, like so (e.g. for ncopies=2):
3714         r0 = phi (init, r0)
3715         r1 = phi (init, r1)
3716         r0 = x0 + r0;
3717         r1 = x1 + r1;
3718     (i.e. we generate VF results in 2 registers).
3719     In this case we have a separate def-use cycle for each copy, and therefore
3720     for each copy we get the vector def for the reduction variable from the
3721     respective phi node created for this copy.
3722
3723     Otherwise (the reduction is unused in the loop nest), we can combine
3724     together intermediate results, like so (e.g. for ncopies=2):
3725         r = phi (init, r)
3726         r = x0 + r;
3727         r = x1 + r;
3728    (i.e. we generate VF/2 results in a single register).
3729    In this case for each copy we get the vector def for the reduction variable
3730    from the vectorized reduction operation generated in the previous iteration.
3731   */
3732
3733   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
3734     {
3735       single_defuse_cycle = true;
3736       epilog_copies = 1;
3737     }
3738   else
3739     epilog_copies = ncopies;
3740
3741   prev_stmt_info = NULL;
3742   prev_phi_info = NULL;
3743   for (j = 0; j < ncopies; j++)
3744     {
3745       if (j == 0 || !single_defuse_cycle)
3746         {
3747           /* Create the reduction-phi that defines the reduction-operand.  */
3748           new_phi = create_phi_node (vec_dest, loop->header);
3749           set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo,
3750                                                           NULL));
3751         }
3752
3753       /* Handle uses.  */
3754       if (j == 0)
3755         {
3756           loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
3757                                                         stmt, NULL);
3758           if (op_type == ternary_op)
3759             {
3760               if (reduc_index == 0)
3761                 loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
3762                                                               NULL);
3763               else
3764                 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
3765                                                               NULL);
3766             }
3767
3768           /* Get the vector def for the reduction variable from the phi
3769              node.  */
3770           reduc_def = PHI_RESULT (new_phi);
3771           first_phi = new_phi;
3772         }
3773       else
3774         {
3775           enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
3776           loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
3777           if (op_type == ternary_op)
3778             loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
3779
3780           if (single_defuse_cycle)
3781             reduc_def = gimple_assign_lhs (new_stmt);
3782           else
3783             reduc_def = PHI_RESULT (new_phi);
3784
3785           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
3786         }
3787
3788
3789       /* Arguments are ready. create the new vector stmt.  */
3790       if (op_type == binary_op)
3791         {
3792           if (reduc_index == 0)
3793             expr = build2 (code, vectype, reduc_def, loop_vec_def0);
3794           else
3795             expr = build2 (code, vectype, loop_vec_def0, reduc_def);
3796         }
3797       else
3798         {
3799           if (reduc_index == 0)
3800             expr = build3 (code, vectype, reduc_def, loop_vec_def0,
3801                            loop_vec_def1);
3802           else
3803             {
3804               if (reduc_index == 1)
3805                 expr = build3 (code, vectype, loop_vec_def0, reduc_def,
3806                                loop_vec_def1);
3807               else
3808                 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
3809                                reduc_def);
3810             }
3811         }
3812
3813       new_stmt = gimple_build_assign (vec_dest, expr);
3814       new_temp = make_ssa_name (vec_dest, new_stmt);
3815       gimple_assign_set_lhs (new_stmt, new_temp);
3816       vect_finish_stmt_generation (stmt, new_stmt, gsi);
3817
3818       if (j == 0)
3819         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3820       else
3821         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3822       prev_stmt_info = vinfo_for_stmt (new_stmt);
3823       prev_phi_info = vinfo_for_stmt (new_phi);
3824     }
3825
3826   /* Finalize the reduction-phi (set its arguments) and create the
3827      epilog reduction code.  */
3828   if (!single_defuse_cycle)
3829     new_temp = gimple_assign_lhs (*vec_stmt);
3830
3831   vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
3832                                     epilog_reduc_code, first_phi, reduc_index,
3833                                     double_reduc);
3834   return true;
3835 }
3836
3837 /* Function vect_min_worthwhile_factor.
3838
3839    For a loop where we could vectorize the operation indicated by CODE,
3840    return the minimum vectorization factor that makes it worthwhile
3841    to use generic vectors.  */
3842 int
3843 vect_min_worthwhile_factor (enum tree_code code)
3844 {
3845   switch (code)
3846     {
3847     case PLUS_EXPR:
3848     case MINUS_EXPR:
3849     case NEGATE_EXPR:
3850       return 4;
3851
3852     case BIT_AND_EXPR:
3853     case BIT_IOR_EXPR:
3854     case BIT_XOR_EXPR:
3855     case BIT_NOT_EXPR:
3856       return 2;
3857
3858     default:
3859       return INT_MAX;
3860     }
3861 }
3862
3863
3864 /* Function vectorizable_induction
3865
3866    Check if PHI performs an induction computation that can be vectorized.
3867    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3868    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3869    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
3870
3871 bool
3872 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3873                         gimple *vec_stmt)
3874 {
3875   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3876   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3877   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3878   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3879   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3880   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3881   tree vec_def;
3882
3883   gcc_assert (ncopies >= 1);
3884   /* FORNOW. This restriction should be relaxed.  */
3885   if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
3886     {
3887       if (vect_print_dump_info (REPORT_DETAILS))
3888         fprintf (vect_dump, "multiple types in nested loop.");
3889       return false;
3890     }
3891
3892   if (!STMT_VINFO_RELEVANT_P (stmt_info))
3893     return false;
3894
3895   /* FORNOW: SLP not supported.  */
3896   if (STMT_SLP_TYPE (stmt_info))
3897     return false;
3898
3899   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
3900
3901   if (gimple_code (phi) != GIMPLE_PHI)
3902     return false;
3903
3904   if (!vec_stmt) /* transformation not required.  */
3905     {
3906       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
3907       if (vect_print_dump_info (REPORT_DETAILS))
3908         fprintf (vect_dump, "=== vectorizable_induction ===");
3909       vect_model_induction_cost (stmt_info, ncopies);
3910       return true;
3911     }
3912
3913   /** Transform.  **/
3914
3915   if (vect_print_dump_info (REPORT_DETAILS))
3916     fprintf (vect_dump, "transform induction phi.");
3917
3918   vec_def = get_initial_def_for_induction (phi);
3919   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
3920   return true;
3921 }
3922
3923 /* Function vectorizable_live_operation.
3924
3925    STMT computes a value that is used outside the loop. Check if
3926    it can be supported.  */
3927
3928 bool
3929 vectorizable_live_operation (gimple stmt,
3930                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3931                              gimple *vec_stmt ATTRIBUTE_UNUSED)
3932 {
3933   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3934   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3935   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3936   int i;
3937   int op_type;
3938   tree op;
3939   tree def;
3940   gimple def_stmt;
3941   enum vect_def_type dt;
3942   enum tree_code code;
3943   enum gimple_rhs_class rhs_class;
3944
3945   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
3946
3947   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
3948     return false;
3949
3950   if (!is_gimple_assign (stmt))
3951     return false;
3952
3953   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
3954     return false;
3955
3956   /* FORNOW. CHECKME. */
3957   if (nested_in_vect_loop_p (loop, stmt))
3958     return false;
3959
3960   code = gimple_assign_rhs_code (stmt);
3961   op_type = TREE_CODE_LENGTH (code);
3962   rhs_class = get_gimple_rhs_class (code);
3963   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
3964   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
3965
3966   /* FORNOW: support only if all uses are invariant. This means
3967      that the scalar operations can remain in place, unvectorized.
3968      The original last scalar value that they compute will be used.  */
3969
3970   for (i = 0; i < op_type; i++)
3971     {
3972       if (rhs_class == GIMPLE_SINGLE_RHS)
3973         op = TREE_OPERAND (gimple_op (stmt, 1), i);
3974       else
3975         op = gimple_op (stmt, i + 1);
3976       if (op
3977           && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
3978         {
3979           if (vect_print_dump_info (REPORT_DETAILS))
3980             fprintf (vect_dump, "use not simple.");
3981           return false;
3982         }
3983
3984       if (dt != vect_external_def && dt != vect_constant_def)
3985         return false;
3986     }
3987
3988   /* No transformation is required for the cases we currently support.  */
3989   return true;
3990 }
3991
3992 /* Function vect_transform_loop.
3993
3994    The analysis phase has determined that the loop is vectorizable.
3995    Vectorize the loop - created vectorized stmts to replace the scalar
3996    stmts in the loop, and update the loop exit condition.  */
3997
3998 void
3999 vect_transform_loop (loop_vec_info loop_vinfo)
4000 {
4001   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4002   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4003   int nbbs = loop->num_nodes;
4004   gimple_stmt_iterator si;
4005   int i;
4006   tree ratio = NULL;
4007   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4008   bool strided_store;
4009   bool slp_scheduled = false;
4010   unsigned int nunits;
4011   tree cond_expr = NULL_TREE;
4012   gimple_seq cond_expr_stmt_list = NULL;
4013   bool do_peeling_for_loop_bound;
4014
4015   if (vect_print_dump_info (REPORT_DETAILS))
4016     fprintf (vect_dump, "=== vec_transform_loop ===");
4017
4018   /* Peel the loop if there are data refs with unknown alignment.
4019      Only one data ref with unknown store is allowed.  */
4020
4021   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4022     vect_do_peeling_for_alignment (loop_vinfo);
4023
4024   do_peeling_for_loop_bound
4025     = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4026        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4027            && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4028
4029   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4030       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4031     vect_loop_versioning (loop_vinfo,
4032                           !do_peeling_for_loop_bound,
4033                           &cond_expr, &cond_expr_stmt_list);
4034
4035   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4036      compile time constant), or it is a constant that doesn't divide by the
4037      vectorization factor, then an epilog loop needs to be created.
4038      We therefore duplicate the loop: the original loop will be vectorized,
4039      and will compute the first (n/VF) iterations. The second copy of the loop
4040      will remain scalar and will compute the remaining (n%VF) iterations.
4041      (VF is the vectorization factor).  */
4042
4043   if (do_peeling_for_loop_bound)
4044     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4045                                     cond_expr, cond_expr_stmt_list);
4046   else
4047     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4048                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4049
4050   /* 1) Make sure the loop header has exactly two entries
4051      2) Make sure we have a preheader basic block.  */
4052
4053   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4054
4055   split_edge (loop_preheader_edge (loop));
4056
4057   /* FORNOW: the vectorizer supports only loops which body consist
4058      of one basic block (header + empty latch). When the vectorizer will
4059      support more involved loop forms, the order by which the BBs are
4060      traversed need to be reconsidered.  */
4061
4062   for (i = 0; i < nbbs; i++)
4063     {
4064       basic_block bb = bbs[i];
4065       stmt_vec_info stmt_info;
4066       gimple phi;
4067
4068       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4069         {
4070           phi = gsi_stmt (si);
4071           if (vect_print_dump_info (REPORT_DETAILS))
4072             {
4073               fprintf (vect_dump, "------>vectorizing phi: ");
4074               print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4075             }
4076           stmt_info = vinfo_for_stmt (phi);
4077           if (!stmt_info)
4078             continue;
4079
4080           if (!STMT_VINFO_RELEVANT_P (stmt_info)
4081               && !STMT_VINFO_LIVE_P (stmt_info))
4082             continue;
4083
4084           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4085                 != (unsigned HOST_WIDE_INT) vectorization_factor)
4086               && vect_print_dump_info (REPORT_DETAILS))
4087             fprintf (vect_dump, "multiple-types.");
4088
4089           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4090             {
4091               if (vect_print_dump_info (REPORT_DETAILS))
4092                 fprintf (vect_dump, "transform phi.");
4093               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4094             }
4095         }
4096
4097       for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4098         {
4099           gimple stmt = gsi_stmt (si);
4100           bool is_store;
4101
4102           if (vect_print_dump_info (REPORT_DETAILS))
4103             {
4104               fprintf (vect_dump, "------>vectorizing statement: ");
4105               print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4106             }
4107
4108           stmt_info = vinfo_for_stmt (stmt);
4109
4110           /* vector stmts created in the outer-loop during vectorization of
4111              stmts in an inner-loop may not have a stmt_info, and do not
4112              need to be vectorized.  */
4113           if (!stmt_info)
4114             {
4115               gsi_next (&si);
4116               continue;
4117             }
4118
4119           if (!STMT_VINFO_RELEVANT_P (stmt_info)
4120               && !STMT_VINFO_LIVE_P (stmt_info))
4121             {
4122               gsi_next (&si);
4123               continue;
4124             }
4125
4126           gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4127           nunits =
4128             (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4129           if (!STMT_SLP_TYPE (stmt_info)
4130               && nunits != (unsigned int) vectorization_factor
4131               && vect_print_dump_info (REPORT_DETAILS))
4132             /* For SLP VF is set according to unrolling factor, and not to
4133                vector size, hence for SLP this print is not valid.  */
4134             fprintf (vect_dump, "multiple-types.");
4135
4136           /* SLP. Schedule all the SLP instances when the first SLP stmt is
4137              reached.  */
4138           if (STMT_SLP_TYPE (stmt_info))
4139             {
4140               if (!slp_scheduled)
4141                 {
4142                   slp_scheduled = true;
4143
4144                   if (vect_print_dump_info (REPORT_DETAILS))
4145                     fprintf (vect_dump, "=== scheduling SLP instances ===");
4146
4147                   vect_schedule_slp (loop_vinfo, NULL);
4148                 }
4149
4150               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
4151               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4152                 {
4153                   gsi_next (&si);
4154                   continue;
4155                 }
4156             }
4157
4158           /* -------- vectorize statement ------------ */
4159           if (vect_print_dump_info (REPORT_DETAILS))
4160             fprintf (vect_dump, "transform statement.");
4161
4162           strided_store = false;
4163           is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4164           if (is_store)
4165             {
4166               if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4167                 {
4168                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4169                      interleaving chain was completed - free all the stores in
4170                      the chain.  */
4171                   vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4172                   gsi_remove (&si, true);
4173                   continue;
4174                 }
4175               else
4176                 {
4177                   /* Free the attached stmt_vec_info and remove the stmt.  */
4178                   free_stmt_vec_info (stmt);
4179                   gsi_remove (&si, true);
4180                   continue;
4181                 }
4182             }
4183           gsi_next (&si);
4184         }                       /* stmts in BB */
4185     }                           /* BBs in loop */
4186
4187   slpeel_make_loop_iterate_ntimes (loop, ratio);
4188
4189   /* The memory tags and pointers in vectorized statements need to
4190      have their SSA forms updated.  FIXME, why can't this be delayed
4191      until all the loops have been transformed?  */
4192   update_ssa (TODO_update_ssa);
4193
4194   if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4195     fprintf (vect_dump, "LOOP VECTORIZED.");
4196   if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4197     fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
4198 }