/* Loop Vectorization
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software
   Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "cfgloop.h"
#include "cfglayout.h"
#include "expr.h"
#include "recog.h"
#include "optabs.h"
#include "params.h"
#include "toplev.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"

/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it were manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMEs), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target-specific information that is used is the
   size of the vector (in bytes) - "UNITS_PER_SIMD_WORD".  Targets that can
   support different vector sizes will, for now, need to specify one value
   for "UNITS_PER_SIMD_WORD".  More flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)->insn_code).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
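
/* Note (an illustrative sketch, not code generated verbatim by this pass):
   when N is not a multiple of the vectorization factor, the vectorized
   loop is followed by a scalar epilogue loop that handles the remaining
   iterations.  For the example above, the result is roughly equivalent to:

        for (i=0; i<N/8; i++){        <-- vectorized loop
          pa[i] = pb[i] + pc[i];
        }
        for (i=(N/8)*8; i<N; i++){    <-- scalar epilogue loop
          a[i] = b[i] + c[i];
        }
*/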

/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data
   elements that are operated upon in parallel in a single iteration of the
   vectorized loop.  For example, when vectorizing a loop that operates on
   4-byte elements, on a target with a vector size (VS) of 16 bytes, the VF
   is set to 4, since 4 elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated
   upon are of the same size.  Therefore this function currently sets VF
   according to the size of the types operated upon, and fails if there are
   multiple sizes in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/
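
/* For instance (illustrative numbers, assuming a 16-byte vector size):

        short b[N];   ->  vectype V8HI, nunits = 8
        int   c[N];   ->  vectype V4SI, nunits = 4

   A loop operating only on the 'short' accesses would get VF = 8; a loop
   mixing both element sizes is currently not vectorized.  */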

static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  tree scalar_type;
  gimple phi;
  tree vectype;
  unsigned int nunits;
  stmt_vec_info stmt_info;
  int i;
  HOST_WIDE_INT dummy;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_determine_vectorization_factor ===");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type: ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "vectype: ");
                  print_generic_expr (vect_dump, vectype, TDF_SLIM);
                }

              nunits = TYPE_VECTOR_SUBPARTS (vectype);
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "nunits = %d", nunits);

              if (!vectorization_factor
                  || (nunits > vectorization_factor))
                vectorization_factor = nunits;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (stmt);

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "==> examining statement: ");
              print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
            }

          gcc_assert (stmt_info);

          /* Skip stmts which do not need to be vectorized.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && !STMT_VINFO_LIVE_P (stmt_info))
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "skip.");
              continue;
            }

          if (gimple_get_lhs (stmt) == NULL_TREE)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: irregular stmt.");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump, "not vectorized: vector stmt in loop:");
                  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
                }
              return false;
            }

          if (STMT_VINFO_VECTYPE (stmt_info))
            {
              /* The only case in which a vectype has already been set is for
                 stmts that contain a data-ref, or for "pattern-stmts" (stmts
                 generated by the vectorizer to represent/replace a certain
                 idiom).  */
              gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
                          || is_pattern_stmt_p (stmt_info));
              vectype = STMT_VINFO_VECTYPE (stmt_info);
            }
          else
            {
              gcc_assert (!STMT_VINFO_DATA_REF (stmt_info)
                          && !is_pattern_stmt_p (stmt_info));

              scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
                                                           &dummy);
              if (vect_print_dump_info (REPORT_DETAILS))
                {
                  fprintf (vect_dump, "get vectype for scalar type: ");
                  print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                    {
                      fprintf (vect_dump,
                               "not vectorized: unsupported data-type ");
                      print_generic_expr (vect_dump, scalar_type, TDF_SLIM);
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;
            }

          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "vectype: ");
              print_generic_expr (vect_dump, vectype, TDF_SLIM);
            }

          nunits = TYPE_VECTOR_SUBPARTS (vectype);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "nunits = %d", nunits);

          if (!vectorization_factor
              || (nunits > vectorization_factor))
            vectorization_factor = nunits;

        }
    }

  /* TODO: Analyze cost.  Decide whether it is worthwhile to vectorize.  */
  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "vectorization factor = %d", vectorization_factor);
  if (vectorization_factor <= 1)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: unsupported data-type");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  return true;
}


/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution with a constant step.  */
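
/* For example (illustrative), for the IV 'i' in

        for (i=0; i<N; i++)
          a[i] = ...;

   scev computes the access function {0, +, 1}_1 - in chrec notation:
   initial value 0, step 1, in loop number 1 - which is "simple".  An
   evolution whose step is itself a chrec, e.g. {0, +, {1, +, 2}_1}_1
   (a polynomial of degree 2), is rejected below.  */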

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree *init,
                             tree *step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn,
                                                           loop_nb));

  if (vect_print_dump_info (REPORT_DETAILS))
    {
      fprintf (vect_dump, "step: ");
      print_generic_expr (vect_dump, step_expr, TDF_SLIM);
      fprintf (vect_dump, ", init: ");
      print_generic_expr (vect_dump, init_expr, TDF_SLIM);
    }

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "step unknown.");
      return false;
    }

  return true;
}

/* Function vect_analyze_scalar_cycles_1.

   Examine the cross-iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
{
  basic_block bb = loop->header;
  tree dummy;
  VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64);
  gimple_stmt_iterator gsi;
  bool double_reduc;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_scalar_cycles ===");

  /* First - identify all inductions.  Reduction detection assumes that all
     the inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      /* Skip virtual phis.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (!is_gimple_reg (SSA_NAME_VAR (def)))
        continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn && vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Access function of PHI: ");
          print_generic_expr (vect_dump, access_fn, TDF_SLIM);
        }

      if (!access_fn
          || !vect_is_simple_iv_evolution (loop->num, access_fn,
                                           &dummy, &dummy))
        {
          VEC_safe_push (gimple, heap, worklist, phi);
          continue;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Detected induction.");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (VEC_length (gimple, worklist) > 0)
    {
      gimple phi = VEC_pop (gimple, worklist);
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
      gimple reduc_stmt;
      bool nested_cycle;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Analyze phi: ");
          print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
        }

      gcc_assert (is_gimple_reg (SSA_NAME_VAR (def)));
      gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
      reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi, !nested_cycle,
                                             &double_reduc);
      if (reduc_stmt)
        {
          if (double_reduc)
            {
              if (vect_print_dump_info (REPORT_DETAILS))
                fprintf (vect_dump, "Detected double reduction.");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                vect_double_reduction_def;
            }
          else
            {
              if (nested_cycle)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump,
                             "Detected vectorizable nested cycle.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                    vect_nested_cycle;
                }
              else
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump, "Detected reduction.");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                    vect_reduction_def;
                }
            }
        }
      else
        if (vect_print_dump_info (REPORT_DETAILS))
          fprintf (vect_dump, "Unknown def-use cycle pattern.");
    }

  VEC_free (gimple, heap, worklist);
}


/* Function vect_analyze_scalar_cycles.

   Examine the cross-iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.
   Examples for scalar cycles:

   Example1: reduction:

              loop1:
              for (i=0; i<N; i++)
                 sum += a[i];

   Example2: induction:

              loop2:
              for (i=0; i<N; i++)
                 a[i] = i;  */

static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such an inner-loop therefore have different properties
     from the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the
        original scalar loop, so we can't change the order of computation
        when vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}

/* Function vect_get_loop_niters.

   Determine how many iterations the loop executes.
   If an expression that represents the number of iterations
   can be constructed, place it in NUMBER_OF_ITERATIONS.
   Return the loop exit condition.  */

static gimple
vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
{
  tree niters;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== get_loop_niters ===");

  niters = number_of_exit_cond_executions (loop);

  if (niters != NULL_TREE
      && niters != chrec_dont_know)
    {
      *number_of_iterations = niters;

      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "==> get_loop_niters:");
          print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM);
        }
    }

  return get_loop_exit_condition (loop);
}


/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const struct loop *const loop = (const struct loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}


/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */

static loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  gimple_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);

  /* Create/Update stmt_info for all stmts in the loop.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      /* BBs in a nested inner-loop will have been already processed (because
         we will have called vect_analyze_loop_form for any nested
         inner-loop).  Therefore, for stmts in an inner-loop we just want to
         update the STMT_VINFO_LOOP_VINFO field of their stmt_info to point
         to the new loop_info of the outer-loop we are currently considering
         to vectorize (instead of the loop_info of the inner-loop).
         For stmts in other BBs we need to create a stmt_info from
         scratch.  */
      if (bb->loop_father != loop)
        {
          /* Inner-loop bb.  */
          gcc_assert (loop->inner && bb->loop_father == loop->inner);
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (phi);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
        }
      else
        {
          /* bb in current nest.  */
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              gimple_set_uid (phi, 0);
              set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
            }

          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              gimple_set_uid (stmt, 0);
              set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
            }
        }
    }

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as a reversed postorder traversal, so we are safe.  */

  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
                             bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10);
  LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10);
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res) =
    VEC_alloc (gimple, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
  LOOP_VINFO_MAY_ALIAS_DDRS (res) =
    VEC_alloc (ddr_p, heap,
               PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
  LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10);
  LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10);
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;

  return res;
}


/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all
   the stmts in the loop.  */

void
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
{
  struct loop *loop;
  basic_block *bbs;
  int nbbs;
  gimple_stmt_iterator si;
  int j;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  if (!loop_vinfo)
    return;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  bbs = LOOP_VINFO_BBS (loop_vinfo);
  nbbs = loop->num_nodes;

  if (!clean_stmts)
    {
      free (LOOP_VINFO_BBS (loop_vinfo));
      free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
      free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
      VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));

      free (loop_vinfo);
      loop->aux = NULL;
      return;
    }

  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        free_stmt_vec_info (gsi_stmt (si));

      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          if (stmt_info)
            {
              /* Check if this is a "pattern stmt" (introduced by the
                 vectorizer during the pattern recognition pass).  */
              bool remove_stmt_p = false;
              gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
              if (orig_stmt)
                {
                  stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt);
                  if (orig_stmt_info
                      && STMT_VINFO_IN_PATTERN_P (orig_stmt_info))
                    remove_stmt_p = true;
                }

              /* Free stmt_vec_info.  */
              free_stmt_vec_info (stmt);

              /* Remove dead "pattern stmts".  */
              if (remove_stmt_p)
                gsi_remove (&si, true);
            }
          gsi_next (&si);
        }
    }

  free (LOOP_VINFO_BBS (loop_vinfo));
  free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
  free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
  VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++)
    vect_free_slp_instance (instance);

  VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
  VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo));

  free (loop_vinfo);
  loop->aux = NULL;
}


/* Function vect_analyze_loop_1.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  This is a subset of the analyses applied in
   vect_analyze_loop, to be applied on an inner-loop nested in the loop
   that is now considered for (outer-loop) vectorization.  */

static loop_vec_info
vect_analyze_loop_1 (struct loop *loop)
{
  loop_vec_info loop_vinfo;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest_1 =====");

  /* Check the CFG characteristics of the loop (nesting, entry/exit,
     etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad inner-loop form.");
      return NULL;
    }

  return loop_vinfo;
}


/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough, and the number of iterations
     can be analyzed (a countable loop).  */
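
/* For instance (illustrative):

        for (i=0; i<n; i++)     <-- OK: single exit, countable (niters = n)
          ...

        while (*p++ != 0)       <-- rejected: the number of iterations
          ...                       cannot be computed in advance
*/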

loop_vec_info
vect_analyze_loop_form (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  gimple loop_cond;
  tree number_of_iterations = NULL;
  loop_vec_info inner_loop_vinfo = NULL;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_form ===");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW.  May want to relax some of these restrictions in the
     future).  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and latch).  Vectorizable inner-most loops
         look like this:

                        (pre-header)
                           |
                          header <--------+
                           | |            |
                           | +--> latch --+
                           |
                        (exit-bb)  */

      if (loop->num_nodes != 2)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          return NULL;
        }

      if (empty_block_p (loop->header))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: empty loop.");
          return NULL;
        }
    }
  else
    {
      struct loop *innerloop = loop->inner;
      edge backedge, entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
         contains a single inner loop, and the number of BBs is exactly 5.
         Vectorizable outer-loops look like this:

                        (pre-header)
                           |
                          header <---+
                           |         |
                          inner-loop |
                           |         |
                          tail ------+
                           |
                        (exit-bb)

         The inner-loop has the properties expected of inner-most loops
         as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: multiple nested loops.");
          return NULL;
        }

      /* Analyze the inner-loop.  */
      inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
      if (!inner_loop_vinfo)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: Bad inner loop.");
          return NULL;
        }

      if (!expr_invariant_in_loop_p (loop,
                                     LOOP_VINFO_NITERS (inner_loop_vinfo)))
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump,
                     "not vectorized: inner-loop count not invariant.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (loop->num_nodes != 5)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: control flow in loop.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
      backedge = EDGE_PRED (innerloop->header, 1);
      entryedge = EDGE_PRED (innerloop->header, 0);
      if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
        {
          backedge = EDGE_PRED (innerloop->header, 0);
          entryedge = EDGE_PRED (innerloop->header, 1);
        }

      if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump,
                     "not vectorized: unsupported outerloop form.");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "Considering outer-loop vectorization.");
    }

  if (!single_exit (loop)
      || EDGE_COUNT (loop->header->preds) != 2)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        {
          if (!single_exit (loop))
            fprintf (vect_dump, "not vectorized: multiple exits.");
          else if (EDGE_COUNT (loop->header->preds) != 2)
            fprintf (vect_dump, "not vectorized: too many incoming edges.");
        }
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || phi_nodes (loop->latch))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: unexpected loop form.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* Make sure there exists a single-predecessor exit bb.  */
  if (!single_pred_p (single_exit (loop)->dest))
    {
      edge e = single_exit (loop);
      if (!(e->flags & EDGE_ABNORMAL))
        {
          split_loop_exit_edge (e);
          if (vect_print_dump_info (REPORT_DETAILS))
            fprintf (vect_dump, "split exit edge.");
        }
      else
        {
          if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
            fprintf (vect_dump, "not vectorized: abnormal loop exit edge.");
          if (inner_loop_vinfo)
            destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
    }

  loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
  if (!loop_cond)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "not vectorized: complicated exit condition.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!number_of_iterations)
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump,
                 "not vectorized: number of iterations cannot be computed.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (chrec_contains_undetermined (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS))
        fprintf (vect_dump, "Infinite number of iterations.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!NITERS_KNOWN_P (number_of_iterations))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        {
          fprintf (vect_dump, "Symbolic number of iterations is ");
          print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS);
        }
    }
  else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: number of iterations = 0.");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, false);
      return NULL;
    }

  loop_vinfo = new_loop_vec_info (loop);
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;

  STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;

  /* CHECKME: May want to keep it around in the future.  */
  if (inner_loop_vinfo)
    destroy_loop_vec_info (inner_loop_vinfo, false);

  gcc_assert (!loop->aux);
  loop->aux = loop_vinfo;
  return loop_vinfo;
}


/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */

static bool
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  int i;
  gimple phi;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  int min_profitable_iters;
  int min_scalar_loop_bound;
  unsigned int th;
  bool only_slp_in_loop = true, ok;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "=== vect_analyze_loop_operations ===");

  gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          ok = true;

          stmt_info = vinfo_for_stmt (phi);
          if (vect_print_dump_info (REPORT_DETAILS))
            {
              fprintf (vect_dump, "examining phi: ");
              print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
            }

          if (! is_loop_header_bb_p (bb))
            {
              /* An inner-loop loop-closed exit phi in outer-loop
                 vectorization (i.e. a phi in the tail of the outer-loop).
                 FORNOW: we currently don't support the case that these phis
                 are not used in the outer-loop (unless it is a double
                 reduction, i.e., this phi is vect_double_reduction_def),
                 because this case requires us to actually do something
                 here.  */
              if ((!STMT_VINFO_RELEVANT_P (stmt_info)
                   || STMT_VINFO_LIVE_P (stmt_info))
                  && STMT_VINFO_DEF_TYPE (stmt_info)
                     != vect_double_reduction_def)
                {
                  if (vect_print_dump_info (REPORT_DETAILS))
                    fprintf (vect_dump,
                             "Unsupported loop-closed phi in outer-loop.");
                  return false;
                }
              continue;
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_LIVE_P (stmt_info))
            {
              /* FORNOW: not yet supported.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump,
                         "not vectorized: value used after loop.");
              return false;
            }

          if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
            {
              /* A scalar-dependence cycle that we don't support.  */
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                fprintf (vect_dump,
                         "not vectorized: scalar dependence cycle.");
              return false;
            }

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
                ok = vectorizable_induction (phi, NULL, NULL);
            }

          if (!ok)
            {
              if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
                {
                  fprintf (vect_dump,
                           "not vectorized: relevant phi not supported: ");
                  print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
                }
              return false;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

          gcc_assert (stmt_info);

          if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
            return false;

          if (STMT_VINFO_RELEVANT_P (stmt_info) && !PURE_SLP_STMT (stmt_info))
            /* STMT needs both SLP and loop-based vectorization.  */
            only_slp_in_loop = false;
        }
    } /* bbs */

  /* All operations in the loop are either irrelevant (they deal with loop
     control, or are dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump,
                 "All the computation can be taken out of the loop.");
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump,
                 "not vectorized: redundant loop. no profit to vectorize.");
      return false;
    }

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
     vectorization factor of the loop is the unrolling factor required by
     the SLP instances.  If that unrolling factor is 1, we say that we
     perform pure SLP on the loop - cross-iteration parallelism is not
     exploited.  */
  if (only_slp_in_loop)
    vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
  else
    vectorization_factor = least_common_multiple (vectorization_factor,
                             LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump,
             "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC,
             vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: iteration count too small.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "vectorization factor.");
      return false;
    }

  /* Analyze cost.  Decide whether it is worthwhile to vectorize.  */

  /* Once VF is set, SLP costs should be updated since the number of created
     vector stmts depends on VF.  */
  vect_update_slp_costs_according_to_vf (loop_vinfo);

  min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo);
  LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;

  if (min_profitable_iters < 0)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump,
                 "not vectorized: vectorization not profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: vector version will never be "
                 "profitable.");
      return false;
    }

  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
                            * vectorization_factor) - 1);

  /* Use the cost model only if it is more conservative than the
     user-specified threshold.  */

  th = (unsigned) min_scalar_loop_bound;
  if (min_profitable_iters
      && (!min_scalar_loop_bound
          || min_profitable_iters > min_scalar_loop_bound))
    th = (unsigned) min_profitable_iters;

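  /* Worked example (illustrative numbers): with --param
     min-vect-loop-bound=2 and a vectorization factor of 4,
     min_scalar_loop_bound = 2 * 4 - 1 = 7.  If the cost model computed
     min_profitable_iters = 10, TH becomes 10, so a loop known to run at
     most 10 iterations is rejected below.  */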
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
    {
      if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
        fprintf (vect_dump, "not vectorized: vectorization not "
                 "profitable.");
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "not vectorized: iteration count smaller than "
                 "user specified loop bound parameter or minimum "
                 "profitable iterations (whichever is more conservative).");
      return false;
    }

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
      || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "epilog loop required.");
      if (!vect_can_advance_ivs_p (loop_vinfo))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 1.");
          return false;
        }
      if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
        {
          if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS))
            fprintf (vect_dump,
                     "not vectorized: can't create epilog loop 2.");
          return false;
        }
    }

  return true;
}


/* Function vect_analyze_loop.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  */

loop_vec_info
vect_analyze_loop (struct loop *loop)
{
  bool ok;
  loop_vec_info loop_vinfo;

  if (vect_print_dump_info (REPORT_DETAILS))
    fprintf (vect_dump, "===== analyze_loop_nest =====");

  if (loop_outer (loop)
      && loop_vec_info_for_loop (loop_outer (loop))
      && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop
                                      (loop_outer (loop))))
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "outer-loop already vectorized.");
      return NULL;
    }

  /* Check the CFG characteristics of the loop (nesting, entry/exit,
     etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad loop form.");
      return NULL;
    }

  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.

     FORNOW: Handle only simple array references whose alignment can be
     forced, and aligned pointer references.  */

  ok = vect_analyze_data_refs (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data references.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Classify all cross-iteration scalar data-flow cycles.
     Cross-iteration cycles caused by virtual phis are analyzed
     separately.  */

  vect_analyze_scalar_cycles (loop_vinfo);

  vect_pattern_recog (loop_vinfo);

  /* Data-flow analysis to detect stmts that do not need to be
     vectorized.  */

  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "unexpected pattern.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze the alignment of the data-refs in the loop.
     Fail if a data reference is found that cannot be vectorized.  */

  ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  ok = vect_determine_vectorization_factor (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "can't determine vectorization factor.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze data dependences between the data-refs in the loop.
     FORNOW: fail at the first data dependence that we encounter.  */

  ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data dependence.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Analyze the access patterns of the data-refs in the loop (consecutive,
     complex, etc.).  FORNOW: Only handle consecutive access patterns.  */

  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data access.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
     It is important to call pruning after vect_analyze_data_ref_accesses,
     since we use grouping information gathered by interleaving analysis.  */
  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "too long list of versioning for alias "
                 "run-time tests.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Check the SLP opportunities in the loop, analyze and build SLP
     trees.  */
  ok = vect_analyze_slp (loop_vinfo, NULL);
  if (ok)
    {
      /* Decide which possible SLP instances to SLP.  */
      vect_make_slp_decision (loop_vinfo);

      /* Find stmts that need to be both vectorized and SLPed.  */
      vect_detect_hybrid_slp (loop_vinfo);
    }

  /* This pass will decide on using loop versioning and/or loop peeling in
     order to enhance the alignment of data references in the loop.  */

  ok = vect_enhance_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad data alignment.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  /* Scan all the operations in the loop and make sure they are
     vectorizable.  */

  ok = vect_analyze_loop_operations (loop_vinfo);
  if (!ok)
    {
      if (vect_print_dump_info (REPORT_DETAILS))
        fprintf (vect_dump, "bad operation or unsupported loop bound.");
      destroy_loop_vec_info (loop_vinfo, true);
      return NULL;
    }

  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;

  return loop_vinfo;
}


/* Function reduction_code_for_scalar_code

   Input:
   CODE - tree_code of a reduction operation.

   Output:
   REDUC_CODE - the corresponding tree-code to be used to reduce the
      vector of partial results into a single scalar result (which
      will also reside in a vector), or ERROR_MARK if the operation is
      a supported reduction operation but does not have such a tree-code.

   Return FALSE if CODE currently cannot be vectorized as a reduction.  */
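
/* For example (illustrative): for the reduction 'sum += a[i]', CODE is
   PLUS_EXPR and REDUC_CODE is REDUC_PLUS_EXPR.  The vectorized loop adds
   vectors element-wise into a vector of partial sums <s0, s1, s2, s3>, and
   the loop epilogue applies REDUC_PLUS_EXPR once to fold that vector into
   the single scalar s0 + s1 + s2 + s3.  */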

static bool
reduction_code_for_scalar_code (enum tree_code code,
                                enum tree_code *reduc_code)
{
  switch (code)
    {
    case MAX_EXPR:
      *reduc_code = REDUC_MAX_EXPR;
      return true;

    case MIN_EXPR:
      *reduc_code = REDUC_MIN_EXPR;
      return true;

    case PLUS_EXPR:
      *reduc_code = REDUC_PLUS_EXPR;
      return true;

    case MULT_EXPR:
    case MINUS_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
    case BIT_AND_EXPR:
      *reduc_code = ERROR_MARK;
      return true;

    default:
      return false;
    }
}


/* Error reporting helper for vect_is_simple_reduction below.  GIMPLE
   statement STMT is printed with a message MSG.  */

static void
report_vect_op (gimple stmt, const char *msg)
{
  fprintf (vect_dump, "%s", msg);
  print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
}


/* Function vect_is_simple_reduction

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

   loop_header:
     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   such that:
   1. operation is commutative and associative and it is safe to
      change the order of the computation (if CHECK_REDUCTION is true)
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation.

   Condition 1 is tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles, if CHECK_REDUCTION is false.

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   reductions:

     a1 = phi < a0, a2 >
     inner loop (def of a3)
     a2 = phi < a3 >
*/
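
/* For example (illustrative), for

        sum = 0;
        for (i=0; i<N; i++)
          sum = sum + a[i];

   the loop-header phi for 'sum' matches pattern (1) above:

        a1 = phi <a0, a2>       <-- sum (loop-carried value)
        a3 = a[i]
        a2 = a3 + a1

   and this function returns the defining stmt 'a2 = a3 + a1'.  */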
1562
1563 gimple
1564 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
1565 bool check_reduction, bool *double_reduc)
1566 {
1567 struct loop *loop = (gimple_bb (phi))->loop_father;
1568 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1569 edge latch_e = loop_latch_edge (loop);
1570 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
1571 gimple def_stmt, def1 = NULL, def2 = NULL;
1572 enum tree_code code;
1573 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
1574 tree type;
1575 int nloop_uses;
1576 tree name;
1577 imm_use_iterator imm_iter;
1578 use_operand_p use_p;
1579 bool phi_def;
1580
1581 *double_reduc = false;
1582
1583 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
1584 otherwise, we assume outer loop vectorization. */
1585 gcc_assert ((check_reduction && loop == vect_loop)
1586 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
1587
1588 name = PHI_RESULT (phi);
1589 nloop_uses = 0;
1590 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1591 {
1592 gimple use_stmt = USE_STMT (use_p);
1593 if (is_gimple_debug (use_stmt))
1594 continue;
1595 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1596 && vinfo_for_stmt (use_stmt)
1597 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1598 nloop_uses++;
1599 if (nloop_uses > 1)
1600 {
1601 if (vect_print_dump_info (REPORT_DETAILS))
1602 fprintf (vect_dump, "reduction used in loop.");
1603 return NULL;
1604 }
1605 }
1606
1607 if (TREE_CODE (loop_arg) != SSA_NAME)
1608 {
1609 if (vect_print_dump_info (REPORT_DETAILS))
1610 {
1611 fprintf (vect_dump, "reduction: not ssa_name: ");
1612 print_generic_expr (vect_dump, loop_arg, TDF_SLIM);
1613 }
1614 return NULL;
1615 }
1616
1617 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
1618 if (!def_stmt)
1619 {
1620 if (vect_print_dump_info (REPORT_DETAILS))
1621 fprintf (vect_dump, "reduction: no def_stmt.");
1622 return NULL;
1623 }
1624
1625 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
1626 {
1627 if (vect_print_dump_info (REPORT_DETAILS))
1628 print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM);
1629 return NULL;
1630 }
1631
1632 if (is_gimple_assign (def_stmt))
1633 {
1634 name = gimple_assign_lhs (def_stmt);
1635 phi_def = false;
1636 }
1637 else
1638 {
1639 name = PHI_RESULT (def_stmt);
1640 phi_def = true;
1641 }
1642
1643 nloop_uses = 0;
1644 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
1645 {
1646 gimple use_stmt = USE_STMT (use_p);
1647 if (is_gimple_debug (use_stmt))
1648 continue;
1649 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
1650 && vinfo_for_stmt (use_stmt)
1651 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
1652 nloop_uses++;
1653 if (nloop_uses > 1)
1654 {
1655 if (vect_print_dump_info (REPORT_DETAILS))
1656 fprintf (vect_dump, "reduction used in loop.");
1657 return NULL;
1658 }
1659 }
1660
1661 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
1662 defined in the inner loop. */
1663 if (phi_def)
1664 {
1665 op1 = PHI_ARG_DEF (def_stmt, 0);
1666
1667 if (gimple_phi_num_args (def_stmt) != 1
1668 || TREE_CODE (op1) != SSA_NAME)
1669 {
1670 if (vect_print_dump_info (REPORT_DETAILS))
1671 fprintf (vect_dump, "unsupported phi node definition.");
1672
1673 return NULL;
1674 }
1675
1676 def1 = SSA_NAME_DEF_STMT (op1);
1677 if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1678 && loop->inner
1679 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
1680 && is_gimple_assign (def1))
1681 {
1682 if (vect_print_dump_info (REPORT_DETAILS))
1683 report_vect_op (def_stmt, "detected double reduction: ");
1684
1685 *double_reduc = true;
1686 return def_stmt;
1687 }
1688
1689 return NULL;
1690 }
1691
1692 code = gimple_assign_rhs_code (def_stmt);
1693
1694 if (check_reduction
1695 && (!commutative_tree_code (code) || !associative_tree_code (code)))
1696 {
1697 if (vect_print_dump_info (REPORT_DETAILS))
1698 report_vect_op (def_stmt, "reduction: not commutative/associative: ");
1699 return NULL;
1700 }
1701
1702 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
1703 {
1704 if (code != COND_EXPR)
1705 {
1706 if (vect_print_dump_info (REPORT_DETAILS))
1707 report_vect_op (def_stmt, "reduction: not binary operation: ");
1708
1709 return NULL;
1710 }
1711
1712 op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
1713 if (COMPARISON_CLASS_P (op3))
1714 {
1715 op4 = TREE_OPERAND (op3, 1);
1716 op3 = TREE_OPERAND (op3, 0);
1717 }
1718
1719 op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1);
1720 op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2);
1721
1722 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
1723 {
1724 if (vect_print_dump_info (REPORT_DETAILS))
1725 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1726
1727 return NULL;
1728 }
1729 }
1730 else
1731 {
1732 op1 = gimple_assign_rhs1 (def_stmt);
1733 op2 = gimple_assign_rhs2 (def_stmt);
1734
1735 if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
1736 {
1737 if (vect_print_dump_info (REPORT_DETAILS))
1738 report_vect_op (def_stmt, "reduction: uses not ssa_names: ");
1739
1740 return NULL;
1741 }
1742 }
1743
1744 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
1745 if ((TREE_CODE (op1) == SSA_NAME
1746 && !types_compatible_p (type,TREE_TYPE (op1)))
1747 || (TREE_CODE (op2) == SSA_NAME
1748 && !types_compatible_p (type, TREE_TYPE (op2)))
1749 || (op3 && TREE_CODE (op3) == SSA_NAME
1750 && !types_compatible_p (type, TREE_TYPE (op3)))
1751 || (op4 && TREE_CODE (op4) == SSA_NAME
1752 && !types_compatible_p (type, TREE_TYPE (op4))))
1753 {
1754 if (vect_print_dump_info (REPORT_DETAILS))
1755 {
1756 fprintf (vect_dump, "reduction: multiple types: operation type: ");
1757 print_generic_expr (vect_dump, type, TDF_SLIM);
1758 fprintf (vect_dump, ", operands types: ");
1759 print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM);
1760 fprintf (vect_dump, ",");
1761 print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM);
1762 if (op3)
1763 {
1764 fprintf (vect_dump, ",");
1765 print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM);
1766 }
1767
1768 if (op4)
1769 {
1770 fprintf (vect_dump, ",");
1771 print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM);
1772 }
1773 }
1774
1775 return NULL;
1776 }
1777
1778 /* Check that it's ok to change the order of the computation.
1779 Generally, when vectorizing a reduction we change the order of the
1780 computation. This may change the behavior of the program in some
1781 cases, so we need to check that this is ok. One exception is when
1782 vectorizing an outer-loop: the inner-loop is executed sequentially,
1783 and therefore vectorizing reductions in the inner-loop during
1784 outer-loop vectorization is safe. */
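  /* For example (illustrative only): a 4-wide vectorized sum computes
     four partial sums that are combined after the loop, effectively
     evaluating (a[0]+a[4]+...) + (a[1]+a[5]+...) + ... instead of the
     sequential a[0]+a[1]+a[2]+...  In floating point this association
     can round differently, which is why it is only allowed under
     -fassociative-math; with -ftrapv, a reordered signed sum may trap
     on a different iteration (or not at all).  */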
1785
1786 /* CHECKME: check for !flag_finite_math_only too? */
1787 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
1788 && check_reduction)
1789 {
1790 /* Changing the order of operations changes the semantics. */
1791 if (vect_print_dump_info (REPORT_DETAILS))
1792 report_vect_op (def_stmt, "reduction: unsafe fp math optimization: ");
1793 return NULL;
1794 }
1795 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
1796 && check_reduction)
1797 {
1798 /* Changing the order of operations changes the semantics. */
1799 if (vect_print_dump_info (REPORT_DETAILS))
1800 report_vect_op (def_stmt, "reduction: unsafe int math optimization: ");
1801 return NULL;
1802 }
1803 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
1804 {
1805 /* Changing the order of operations changes the semantics. */
1806 if (vect_print_dump_info (REPORT_DETAILS))
1807 report_vect_op (def_stmt,
1808 "reduction: unsafe fixed-point math optimization: ");
1809 return NULL;
1810 }
1811
1812 /* Reduction is safe. We're dealing with one of the following:
1813 1) integer arithmetic and no trapv
1814 2) floating point arithmetic, and special flags permit this optimization
1815 3) nested cycle (i.e., outer loop vectorization). */
1816 if (TREE_CODE (op1) == SSA_NAME)
1817 def1 = SSA_NAME_DEF_STMT (op1);
1818
1819 if (TREE_CODE (op2) == SSA_NAME)
1820 def2 = SSA_NAME_DEF_STMT (op2);
1821
1822 if (code != COND_EXPR
1823 && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2)))
1824 {
1825 if (vect_print_dump_info (REPORT_DETAILS))
1826 report_vect_op (def_stmt, "reduction: no defs for operands: ");
1827 return NULL;
1828 }
1829
1830 /* Check that one def is the reduction def, defined by PHI,
1831 the other def is either defined in the loop ("vect_internal_def"),
1832 or it's an induction (defined by a loop-header phi-node). */
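  /* A sketch of the expected shape (hypothetical GIMPLE names):

       x_1 = PHI <x_init(preheader), x_2(latch)>   <-- PHI (reduction def)
       ...
       x_2 = x_1 + val_3;                          <-- DEF_STMT

     where val_3 is either defined by an assignment inside the loop or
     is an induction.  */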
1833
1834 if (def2 && def2 == phi
1835 && (code == COND_EXPR
1836 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
1837 && (is_gimple_assign (def1)
1838 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1839 == vect_induction_def
1840 || (gimple_code (def1) == GIMPLE_PHI
1841 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
1842 == vect_internal_def
1843 && !is_loop_header_bb_p (gimple_bb (def1)))))))
1844 {
1845 if (vect_print_dump_info (REPORT_DETAILS))
1846 report_vect_op (def_stmt, "detected reduction: ");
1847 return def_stmt;
1848 }
1849 else if (def1 && def1 == phi
1850 && (code == COND_EXPR
1851 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
1852 && (is_gimple_assign (def2)
1853 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1854 == vect_induction_def
1855 || (gimple_code (def2) == GIMPLE_PHI
1856 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
1857 == vect_internal_def
1858 && !is_loop_header_bb_p (gimple_bb (def2)))))))
1859 {
1860 if (check_reduction)
1861 {
1862 /* Swap operands (just for simplicity - so that the rest of the code
1863 can assume that the reduction variable is always the last (second)
1864 argument). */
1865 if (vect_print_dump_info (REPORT_DETAILS))
1866 report_vect_op (def_stmt,
1867 "detected reduction: need to swap operands: ");
1868
1869 swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
1870 gimple_assign_rhs2_ptr (def_stmt));
1871 }
1872 else
1873 {
1874 if (vect_print_dump_info (REPORT_DETAILS))
1875 report_vect_op (def_stmt, "detected reduction: ");
1876 }
1877
1878 return def_stmt;
1879 }
1880 else
1881 {
1882 if (vect_print_dump_info (REPORT_DETAILS))
1883 report_vect_op (def_stmt, "reduction: unknown pattern: ");
1884
1885 return NULL;
1886 }
1887 }
1888
1889
1890 /* Function vect_estimate_min_profitable_iters
1891
1892 Return the number of iterations required for the vector version of the
1893 loop to be profitable relative to the cost of the scalar version of the
1894 loop.
1895
1896 TODO: Take profile info into account before making vectorization
1897 decisions, if available. */
1898
1899 int
1900 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
1901 {
1902 int i;
1903 int min_profitable_iters;
1904 int peel_iters_prologue;
1905 int peel_iters_epilogue;
1906 int vec_inside_cost = 0;
1907 int vec_outside_cost = 0;
1908 int scalar_single_iter_cost = 0;
1909 int scalar_outside_cost = 0;
1910 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1911 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1912 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1913 int nbbs = loop->num_nodes;
1914 int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
1915 int peel_guard_costs = 0;
1916 int innerloop_iters = 0, factor;
1917 VEC (slp_instance, heap) *slp_instances;
1918 slp_instance instance;
1919
1920 /* Cost model disabled. */
1921 if (!flag_vect_cost_model)
1922 {
1923 if (vect_print_dump_info (REPORT_COST))
1924 fprintf (vect_dump, "cost model disabled.");
1925 return 0;
1926 }
1927
1928 /* Requires loop versioning tests to handle misalignment. */
1929 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1930 {
1931 /* FIXME: Make cost depend on complexity of individual check. */
1932 vec_outside_cost +=
1933 VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
1934 if (vect_print_dump_info (REPORT_COST))
1935 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
1936 "versioning to treat misalignment.\n");
1937 }
1938
1939 /* Requires loop versioning with alias checks. */
1940 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1941 {
1942 /* FIXME: Make cost depend on complexity of individual check. */
1943 vec_outside_cost +=
1944 VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
1945 if (vect_print_dump_info (REPORT_COST))
1946 fprintf (vect_dump, "cost model: Adding cost of checks for loop "
1947 "versioning aliasing.\n");
1948 }
1949
1950 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
1951 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
1952 vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
1953
1954 /* Count statements in scalar loop. Using this as scalar cost for a single
1955 iteration for now.
1956
1957 TODO: Add outer loop support.
1958
1959 TODO: Consider assigning different costs to different scalar
1960 statements. */
1961
1962 /* FORNOW. */
1963 if (loop->inner)
1964 innerloop_iters = 50; /* FIXME */
1965
1966 for (i = 0; i < nbbs; i++)
1967 {
1968 gimple_stmt_iterator si;
1969 basic_block bb = bbs[i];
1970
1971 if (bb->loop_father == loop->inner)
1972 factor = innerloop_iters;
1973 else
1974 factor = 1;
1975
1976 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1977 {
1978 gimple stmt = gsi_stmt (si);
1979 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1980 /* Skip stmts that are not vectorized inside the loop. */
1981 if (!STMT_VINFO_RELEVANT_P (stmt_info)
1982 && (!STMT_VINFO_LIVE_P (stmt_info)
1983 || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
1984 continue;
1985 scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
1986 vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
1987 /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
1988 some of the "outside" costs are generated inside the outer-loop. */
1989 vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
1990 }
1991 }
1992
1993 /* Add additional cost for the peeled instructions in prologue and epilogue
1994 loop.
1995
1996 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
1997 at compile-time - we assume it's vf/2 (the worst would be vf-1).
1998
1999 TODO: Build an expression that represents peel_iters for prologue and
2000 epilogue to be used in a run-time test. */
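  /* Illustration: with vf == 8 and unknown misalignment, vf/2 == 4
     scalar iterations are charged to each of the prologue and the
     epilogue below, rather than the worst case of vf - 1 == 7.  */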
2001
2002 if (byte_misalign < 0)
2003 {
2004 peel_iters_prologue = vf/2;
2005 if (vect_print_dump_info (REPORT_COST))
2006 fprintf (vect_dump, "cost model: "
2007 "prologue peel iters set to vf/2.");
2008
2009 /* If peeling for alignment is unknown, loop bound of main loop becomes
2010 unknown. */
2011 peel_iters_epilogue = vf/2;
2012 if (vect_print_dump_info (REPORT_COST))
2013 fprintf (vect_dump, "cost model: "
2014 "epilogue peel iters set to vf/2 because "
2015 "peeling for alignment is unknown.");
2016
2017 /* If peeled iterations are unknown, count a taken branch and a not taken
2018 branch per peeled loop. Even if scalar loop iterations are known,
2019 vector iterations are not known since peeled prologue iterations are
2020 not known. Hence guards remain the same. */
2021 peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
2022 + TARG_COND_NOT_TAKEN_BRANCH_COST);
2023 }
2024 else
2025 {
2026 if (byte_misalign)
2027 {
2028 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2029 int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
2030 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2031 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
2032
2033 peel_iters_prologue = nelements - (byte_misalign / element_size);
2034 }
2035 else
2036 peel_iters_prologue = 0;
2037
2038 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2039 {
2040 peel_iters_epilogue = vf/2;
2041 if (vect_print_dump_info (REPORT_COST))
2042 fprintf (vect_dump, "cost model: "
2043 "epilogue peel iters set to vf/2 because "
2044 "loop iterations are unknown.");
2045
2046 /* If peeled iterations are known but the number of scalar loop
2047 iterations is unknown, count a taken branch per peeled loop. */
2048 peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;
2049
2050 }
2051 else
2052 {
2053 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2054 peel_iters_prologue = niters < peel_iters_prologue ?
2055 niters : peel_iters_prologue;
2056 peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2057 }
2058 }
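  /* Example for the known-iteration-count case above: with niters == 100,
     vf == 8 and a prologue of 3 peeled iterations, the epilogue gets
     (100 - 3) % 8 == 1 peeled iteration.  */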
2059
2060 vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
2061 + (peel_iters_epilogue * scalar_single_iter_cost)
2062 + peel_guard_costs;
2063
2064 /* FORNOW: The scalar outside cost is incremented in one of the
2065 following ways:
2066
2067 1. The vectorizer checks for alignment and aliasing and generates
2068 a condition that allows dynamic vectorization. A cost model
2069 check is ANDed with the versioning condition. Hence the scalar code
2070 path now has the added cost of the versioning check.
2071
2072 if (cost > th & versioning_check)
2073 jmp to vector code
2074
2075 Hence the run-time scalar cost is incremented by a not-taken branch cost.
2076
2077 2. The vectorizer then checks if a prologue is required. If the
2078 cost model check was not done before during versioning, it has to
2079 be done before the prologue check.
2080
2081 if (cost <= th)
2082 prologue = scalar_iters
2083 if (prologue == 0)
2084 jmp to vector code
2085 else
2086 execute prologue
2087 if (prologue == num_iters)
2088 go to exit
2089
2090 Hence the run-time scalar cost is incremented by a taken branch,
2091 plus a not-taken branch, plus a taken branch cost.
2092
2093 3. The vectorizer then checks if an epilogue is required. If the
2094 cost model check was not done before during prologue check, it
2095 has to be done with the epilogue check.
2096
2097 if (prologue == 0)
2098 jmp to vector code
2099 else
2100 execute prologue
2101 if (prologue == num_iters)
2102 go to exit
2103 vector code:
2104 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2105 jmp to epilogue
2106
2107 Hence the run-time scalar cost should be incremented by 2 taken
2108 branches.
2109
2110 TODO: The back end may reorder the BBs differently and reverse
2111 conditions/branch directions. Change the estimates below to
2112 something more reasonable. */
2113
2114 /* If the number of iterations is known and we do not do versioning, we can
2115 decide whether to vectorize at compile time. Hence the scalar version
2116 does not carry cost model guard costs. */
2117 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2118 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2119 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2120 {
2121 /* Cost model check occurs at versioning. */
2122 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2123 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2124 scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
2125 else
2126 {
2127 /* Cost model check occurs at prologue generation. */
2128 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2129 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
2130 + TARG_COND_NOT_TAKEN_BRANCH_COST;
2131 /* Cost model check occurs at epilogue generation. */
2132 else
2133 scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
2134 }
2135 }
2136
2137 /* Add SLP costs. */
2138 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
2139 for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
2140 {
2141 vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
2142 vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
2143 }
2144
2145 /* Calculate number of iterations required to make the vector version
2146 profitable, relative to the loop bodies only. The following condition
2147 must hold true:
2148 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2149 where
2150 SIC = scalar iteration cost, VIC = vector iteration cost,
2151 VOC = vector outside cost, VF = vectorization factor,
2152 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2153 SOC = scalar outside cost for run time cost model check. */
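  /* Worked example with made-up costs, ignoring peeling (PL_ITERS ==
     EP_ITERS == 0): for SIC == 2, VIC == 3, VOC == 20, SOC == 4 and
     VF == 4, the condition 2 * niters + 4 > 3 * (niters / 4) + 20
     holds for niters > 12.8, so the computation below yields
     min_profitable_iters == 13.  */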
2154
2155 if ((scalar_single_iter_cost * vf) > vec_inside_cost)
2156 {
2157 if (vec_outside_cost <= 0)
2158 min_profitable_iters = 1;
2159 else
2160 {
2161 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2162 - vec_inside_cost * peel_iters_prologue
2163 - vec_inside_cost * peel_iters_epilogue)
2164 / ((scalar_single_iter_cost * vf)
2165 - vec_inside_cost);
2166
2167 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2168 <= ((vec_inside_cost * min_profitable_iters)
2169 + ((vec_outside_cost - scalar_outside_cost) * vf)))
2170 min_profitable_iters++;
2171 }
2172 }
2173 /* The vector version will never be profitable. */
2174 else
2175 {
2176 if (vect_print_dump_info (REPORT_COST))
2177 fprintf (vect_dump, "cost model: vector iteration cost = %d "
2178 "is greater than or equal to the scalar iteration cost = %d "
2179 "multiplied by the vectorization factor = %d.",
2180 vec_inside_cost, scalar_single_iter_cost, vf);
2181 return -1;
2182 }
2183
2184 if (vect_print_dump_info (REPORT_COST))
2185 {
2186 fprintf (vect_dump, "Cost model analysis: \n");
2187 fprintf (vect_dump, " Vector inside of loop cost: %d\n",
2188 vec_inside_cost);
2189 fprintf (vect_dump, " Vector outside of loop cost: %d\n",
2190 vec_outside_cost);
2191 fprintf (vect_dump, " Scalar iteration cost: %d\n",
2192 scalar_single_iter_cost);
2193 fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost);
2194 fprintf (vect_dump, " prologue iterations: %d\n",
2195 peel_iters_prologue);
2196 fprintf (vect_dump, " epilogue iterations: %d\n",
2197 peel_iters_epilogue);
2198 fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n",
2199 min_profitable_iters);
2200 }
2201
2202 min_profitable_iters =
2203 min_profitable_iters < vf ? vf : min_profitable_iters;
2204
2205 /* Because the condition we create is:
2206 if (niters <= min_profitable_iters)
2207 then skip the vectorized loop. */
2208 min_profitable_iters--;
2209
2210 if (vect_print_dump_info (REPORT_COST))
2211 fprintf (vect_dump, " Profitability threshold = %d\n",
2212 min_profitable_iters);
2213
2214 return min_profitable_iters;
2215 }
2216
2217
2218 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2219 functions. Design better to avoid maintenance issues. */
2220
2221 /* Function vect_model_reduction_cost.
2222
2223 Models cost for a reduction operation, including the vector ops
2224 generated within the strip-mine loop, the initial definition before
2225 the loop, and the epilogue code that must be generated. */
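/* For instance (illustrative only): a plain sum reduction with
   ncopies == 1 and a target reduction op available costs
   1 * TARG_VEC_STMT_COST inside the loop, and
   TARG_SCALAR_TO_VEC_COST (initial def) + TARG_VEC_STMT_COST
   (the reduction) + TARG_VEC_TO_SCALAR_COST (the extract) outside.  */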
2226
2227 static bool
2228 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2229 int ncopies)
2230 {
2231 int outer_cost = 0;
2232 enum tree_code code;
2233 optab optab;
2234 tree vectype;
2235 gimple stmt, orig_stmt;
2236 tree reduction_op;
2237 enum machine_mode mode;
2238 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2240
2241
2242 /* Cost of reduction op inside loop. */
2243 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;
2244
2245 stmt = STMT_VINFO_STMT (stmt_info);
2246
2247 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2248 {
2249 case GIMPLE_SINGLE_RHS:
2250 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2251 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2252 break;
2253 case GIMPLE_UNARY_RHS:
2254 reduction_op = gimple_assign_rhs1 (stmt);
2255 break;
2256 case GIMPLE_BINARY_RHS:
2257 reduction_op = gimple_assign_rhs2 (stmt);
2258 break;
2259 default:
2260 gcc_unreachable ();
2261 }
2262
2263 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2264 if (!vectype)
2265 {
2266 if (vect_print_dump_info (REPORT_COST))
2267 {
2268 fprintf (vect_dump, "unsupported data-type ");
2269 print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
2270 }
2271 return false;
2272 }
2273
2274 mode = TYPE_MODE (vectype);
2275 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
2276
2277 if (!orig_stmt)
2278 orig_stmt = STMT_VINFO_STMT (stmt_info);
2279
2280 code = gimple_assign_rhs_code (orig_stmt);
2281
2282 /* Add in cost for initial definition. */
2283 outer_cost += TARG_SCALAR_TO_VEC_COST;
2284
2285 /* Determine cost of epilogue code.
2286
2287 We have a reduction operator that will reduce the vector in one statement.
2288 Also requires scalar extract. */
2289
2290 if (!nested_in_vect_loop_p (loop, orig_stmt))
2291 {
2292 if (reduc_code != ERROR_MARK)
2293 outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
2294 else
2295 {
2296 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
2297 tree bitsize =
2298 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
2299 int element_bitsize = tree_low_cst (bitsize, 1);
2300 int nelements = vec_size_in_bits / element_bitsize;
2301
2302 optab = optab_for_tree_code (code, vectype, optab_default);
2303
2304 /* We have a whole vector shift available. */
2305 if (VECTOR_MODE_P (mode)
2306 && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
2307 && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
2308 /* Final reduction via vector shifts and the reduction operator. Also
2309 requires scalar extract. */
2310 outer_cost += ((exact_log2 (nelements) * 2) * TARG_VEC_STMT_COST
2311 + TARG_VEC_TO_SCALAR_COST);
2312 else
2313 /* Use extracts and reduction op for final reduction. For N elements,
2314 we have N extracts and N-1 reduction ops. */
2315 outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
2316 }
2317 }
2318
2319 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;
2320
2321 if (vect_print_dump_info (REPORT_COST))
2322 fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
2323 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2324 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2325
2326 return true;
2327 }
2328
2329
2330 /* Function vect_model_induction_cost.
2331
2332 Models cost for induction operations. */
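/* E.g. with ncopies == 2 this charges 2 * TARG_VEC_STMT_COST inside the
   loop (one vector add per copy) and 2 * TARG_SCALAR_TO_VEC_COST in the
   prologue (building vec_init and vec_step).  */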
2333
2334 static void
2335 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
2336 {
2337 /* loop cost for vec_loop. */
2338 STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
2339 /* prologue cost for vec_init and vec_step. */
2340 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;
2341
2342 if (vect_print_dump_info (REPORT_COST))
2343 fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
2344 "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
2345 STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
2346 }
2347
2348
2349 /* Function get_initial_def_for_induction
2350
2351 Input:
2352 STMT - a stmt that performs an induction operation in the loop.
2353 IV_PHI - the initial value of the induction variable
2354
2355 Output:
2356 Return a vector variable, initialized with the first VF values of
2357 the induction variable. E.g., for an iv with IV_PHI='X' and
2358 evolution S, for a vector of 4 units, we want to return:
2359 [X, X + S, X + 2*S, X + 3*S]. */
2360
2361 static tree
2362 get_initial_def_for_induction (gimple iv_phi)
2363 {
2364 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
2365 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2366 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2367 tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi));
2368 tree vectype;
2369 int nunits;
2370 edge pe = loop_preheader_edge (loop);
2371 struct loop *iv_loop;
2372 basic_block new_bb;
2373 tree vec, vec_init, vec_step, t;
2374 tree access_fn;
2375 tree new_var;
2376 tree new_name;
2377 gimple init_stmt, induction_phi, new_stmt;
2378 tree induc_def, vec_def, vec_dest;
2379 tree init_expr, step_expr;
2380 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2381 int i;
2382 bool ok;
2383 int ncopies;
2384 tree expr;
2385 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
2386 bool nested_in_vect_loop = false;
2387 gimple_seq stmts = NULL;
2388 imm_use_iterator imm_iter;
2389 use_operand_p use_p;
2390 gimple exit_phi;
2391 edge latch_e;
2392 tree loop_arg;
2393 gimple_stmt_iterator si;
2394 basic_block bb = gimple_bb (iv_phi);
2395 tree stepvectype;
2396
2397 vectype = get_vectype_for_scalar_type (scalar_type);
2398 gcc_assert (vectype);
2399 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2400 ncopies = vf / nunits;
2401
2402 gcc_assert (phi_info);
2403 gcc_assert (ncopies >= 1);
2404
2405 /* Find the first insertion point in the BB. */
2406 si = gsi_after_labels (bb);
2407
2408 if (INTEGRAL_TYPE_P (scalar_type))
2409 step_expr = build_int_cst (scalar_type, 0);
2410 else if (POINTER_TYPE_P (scalar_type))
2411 step_expr = build_int_cst (sizetype, 0);
2412 else
2413 step_expr = build_real (scalar_type, dconst0);
2414
2415 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
2416 if (nested_in_vect_loop_p (loop, iv_phi))
2417 {
2418 nested_in_vect_loop = true;
2419 iv_loop = loop->inner;
2420 }
2421 else
2422 iv_loop = loop;
2423 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
2424
2425 latch_e = loop_latch_edge (iv_loop);
2426 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
2427
2428 access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
2429 gcc_assert (access_fn);
2430 ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
2431 &init_expr, &step_expr);
2432 gcc_assert (ok);
2433 pe = loop_preheader_edge (iv_loop);
2434
2435 /* Create the vector that holds the initial_value of the induction. */
2436 if (nested_in_vect_loop)
2437 {
2438 /* iv_loop is nested in the loop to be vectorized. init_expr has already
2439 been created during vectorization of previous stmts; we obtain it from
2440 the STMT_VINFO_VEC_STMT of the defining stmt. */
2441 tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
2442 loop_preheader_edge (iv_loop));
2443 vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
2444 }
2445 else
2446 {
2447 /* iv_loop is the loop to be vectorized. Create:
2448 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
2449 new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
2450 add_referenced_var (new_var);
2451
2452 new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
2453 if (stmts)
2454 {
2455 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
2456 gcc_assert (!new_bb);
2457 }
2458
2459 t = NULL_TREE;
2460 t = tree_cons (NULL_TREE, init_expr, t);
2461 for (i = 1; i < nunits; i++)
2462 {
2463 /* Create: new_name_i = new_name + step_expr */
2464 enum tree_code code = POINTER_TYPE_P (scalar_type)
2465 ? POINTER_PLUS_EXPR : PLUS_EXPR;
2466 init_stmt = gimple_build_assign_with_ops (code, new_var,
2467 new_name, step_expr);
2468 new_name = make_ssa_name (new_var, init_stmt);
2469 gimple_assign_set_lhs (init_stmt, new_name);
2470
2471 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
2472 gcc_assert (!new_bb);
2473
2474 if (vect_print_dump_info (REPORT_DETAILS))
2475 {
2476 fprintf (vect_dump, "created new init_stmt: ");
2477 print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM);
2478 }
2479 t = tree_cons (NULL_TREE, new_name, t);
2480 }
2481 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
2482 vec = build_constructor_from_list (vectype, nreverse (t));
2483 vec_init = vect_init_vector (iv_phi, vec, vectype, NULL);
2484 }
2485
2486
2487 /* Create the vector that holds the step of the induction. */
2488 if (nested_in_vect_loop)
2489 /* iv_loop is nested in the loop to be vectorized. Generate:
2490 vec_step = [S, S, S, S] */
2491 new_name = step_expr;
2492 else
2493 {
2494 /* iv_loop is the loop to be vectorized. Generate:
2495 vec_step = [VF*S, VF*S, VF*S, VF*S] */
2496 expr = build_int_cst (TREE_TYPE (step_expr), vf);
2497 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2498 expr, step_expr);
2499 }
2500
2501 t = NULL_TREE;
2502 for (i = 0; i < nunits; i++)
2503 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2504 gcc_assert (CONSTANT_CLASS_P (new_name));
2505 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
2506 gcc_assert (stepvectype);
2507 vec = build_vector (stepvectype, t);
2508 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2509
2510
2511 /* Create the following def-use cycle:
2512 loop prolog:
2513 vec_init = ...
2514 vec_step = ...
2515 loop:
2516 vec_iv = PHI <vec_init, vec_loop>
2517 ...
2518 STMT
2519 ...
2520 vec_loop = vec_iv + vec_step; */
2521
2522 /* Create the induction-phi that defines the induction-operand. */
2523 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
2524 add_referenced_var (vec_dest);
2525 induction_phi = create_phi_node (vec_dest, iv_loop->header);
2526 set_vinfo_for_stmt (induction_phi,
2527 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
2528 induc_def = PHI_RESULT (induction_phi);
2529
2530 /* Create the iv update inside the loop */
2531 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2532 induc_def, vec_step);
2533 vec_def = make_ssa_name (vec_dest, new_stmt);
2534 gimple_assign_set_lhs (new_stmt, vec_def);
2535 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2536 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
2537 NULL));
2538
2539 /* Set the arguments of the phi node: */
2540 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
2541 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
2542 UNKNOWN_LOCATION);
2543
2544
2545 /* In case the vectorization factor (VF) is bigger than the number
2546 of elements that we can fit in a vectype (nunits), we have to generate
2547 more than one vector stmt - i.e., we need to "unroll" the
2548 vector stmt by a factor VF/nunits. For more details see documentation
2549 in vectorizable_operation. */
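  /* Sketch: with vf == 8 and nunits == 4, ncopies == 2 and the second
     copy advances the IV by a step vector of nunits * S:
       vec_iv   = [X, X+S, X+2*S, X+3*S]
       vec_iv_2 = vec_iv + [4*S, 4*S, 4*S, 4*S]  */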
2550
2551 if (ncopies > 1)
2552 {
2553 stmt_vec_info prev_stmt_vinfo;
2554 /* FORNOW. This restriction should be relaxed. */
2555 gcc_assert (!nested_in_vect_loop);
2556
2557 /* Create the vector that holds the step of the induction. */
2558 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
2559 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
2560 expr, step_expr);
2561 t = NULL_TREE;
2562 for (i = 0; i < nunits; i++)
2563 t = tree_cons (NULL_TREE, unshare_expr (new_name), t);
2564 gcc_assert (CONSTANT_CLASS_P (new_name));
2565 vec = build_vector (stepvectype, t);
2566 vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL);
2567
2568 vec_def = induc_def;
2569 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
2570 for (i = 1; i < ncopies; i++)
2571 {
2572 /* vec_i = vec_prev + vec_step */
2573 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
2574 vec_def, vec_step);
2575 vec_def = make_ssa_name (vec_dest, new_stmt);
2576 gimple_assign_set_lhs (new_stmt, vec_def);
2577
2578 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
2579 set_vinfo_for_stmt (new_stmt,
2580 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
2581 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
2582 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
2583 }
2584 }
2585
2586 if (nested_in_vect_loop)
2587 {
2588 /* Find the loop-closed exit-phi of the induction, and record
2589 the final vector of induction results: */
2590 exit_phi = NULL;
2591 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
2592 {
2593 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
2594 {
2595 exit_phi = USE_STMT (use_p);
2596 break;
2597 }
2598 }
2599 if (exit_phi)
2600 {
2601 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
2602 /* FORNOW. Currently not supporting the case that an inner-loop induction
2603 is not used in the outer-loop (i.e. only outside the outer-loop). */
2604 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
2605 && !STMT_VINFO_LIVE_P (stmt_vinfo));
2606
2607 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
2608 if (vect_print_dump_info (REPORT_DETAILS))
2609 {
2610 fprintf (vect_dump, "vector of inductions after inner-loop:");
2611 print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM);
2612 }
2613 }
2614 }
2615
2616
2617 if (vect_print_dump_info (REPORT_DETAILS))
2618 {
2619 fprintf (vect_dump, "transform induction: created def-use cycle: ");
2620 print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM);
2621 fprintf (vect_dump, "\n");
2622 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM);
2623 }
2624
2625 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
2626 return induc_def;
2627 }
2628
2629
2630 /* Function get_initial_def_for_reduction
2631
2632 Input:
2633 STMT - a stmt that performs a reduction operation in the loop.
2634 INIT_VAL - the initial value of the reduction variable
2635
2636 Output:
2637 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
2638 of the reduction (used for adjusting the epilog - see below).
2639 Return a vector variable, initialized according to the operation that STMT
2640 performs. This vector will be used as the initial value of the
2641 vector of partial results.
2642
2643 Option1 (adjust in epilog): Initialize the vector as follows:
2644 add/bit or/xor: [0,0,...,0,0]
2645 mult/bit and: [1,1,...,1,1]
2646 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
2647 and when necessary (e.g. add/mult case) let the caller know
2648 that it needs to adjust the result by init_val.
2649
2650 Option2: Initialize the vector as follows:
2651 add/bit or/xor: [init_val,0,0,...,0]
2652 mult/bit and: [init_val,1,1,...,1]
2653 min/max/cond_expr: [init_val,init_val,...,init_val]
2654 and no adjustments are needed.
2655
2656 For example, for the following code:
2657
2658 s = init_val;
2659 for (i=0;i<n;i++)
2660 s = s + a[i];
2661
2662 STMT is 's = s + a[i]', and the reduction variable is 's'.
2663 For a vector of 4 units, we want to return either [0,0,0,init_val],
2664 or [0,0,0,0] and let the caller know that it needs to adjust
2665 the result at the end by 'init_val'.
2666
2667 FORNOW, we use Option1 (adjust in epilog) when ADJUSTMENT_DEF is not
2668 NULL, because its initialization vector is simpler (the same element
2669 in all entries), and Option2 otherwise.
2670
2671 A cost model should help decide between these two schemes. */
2672
2673 tree
2674 get_initial_def_for_reduction (gimple stmt, tree init_val,
2675 tree *adjustment_def)
2676 {
2677 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2678 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
2679 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2680 tree scalar_type = TREE_TYPE (init_val);
2681 tree vectype = get_vectype_for_scalar_type (scalar_type);
2682 int nunits;
2683 enum tree_code code = gimple_assign_rhs_code (stmt);
2684 tree def_for_init;
2685 tree init_def;
2686 tree t = NULL_TREE;
2687 int i;
2688 bool nested_in_vect_loop = false;
2689 tree init_value;
2690 REAL_VALUE_TYPE real_init_val = dconst0;
2691 int int_init_val = 0;
2692 gimple def_stmt = NULL;
2693
2694 gcc_assert (vectype);
2695 nunits = TYPE_VECTOR_SUBPARTS (vectype);
2696
2697 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
2698 || SCALAR_FLOAT_TYPE_P (scalar_type));
2699
2700 if (nested_in_vect_loop_p (loop, stmt))
2701 nested_in_vect_loop = true;
2702 else
2703 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
2704
2705 /* In case of double reduction we only create a vector variable to be put
2706 in the reduction phi node. The actual statement creation is done in
2707 vect_create_epilog_for_reduction. */
2708 if (adjustment_def && nested_in_vect_loop
2709 && TREE_CODE (init_val) == SSA_NAME
2710 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
2711 && gimple_code (def_stmt) == GIMPLE_PHI
2712 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2713 && vinfo_for_stmt (def_stmt)
2714 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2715 == vect_double_reduction_def)
2716 {
2717 *adjustment_def = NULL;
2718 return vect_create_destination_var (init_val, vectype);
2719 }
2720
2721 if (TREE_CONSTANT (init_val))
2722 {
2723 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2724 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
2725 else
2726 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
2727 }
2728 else
2729 init_value = init_val;
2730
2731 switch (code)
2732 {
2733 case WIDEN_SUM_EXPR:
2734 case DOT_PROD_EXPR:
2735 case PLUS_EXPR:
2736 case MINUS_EXPR:
2737 case BIT_IOR_EXPR:
2738 case BIT_XOR_EXPR:
2739 case MULT_EXPR:
2740 case BIT_AND_EXPR:
2741 /* ADJUSTMENT_DEF is NULL when called from
2742 vect_create_epilog_for_reduction to vectorize double reduction. */
2743 if (adjustment_def)
2744 {
2745 if (nested_in_vect_loop)
2746 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
2747 NULL);
2748 else
2749 *adjustment_def = init_val;
2750 }
2751
2752 if (code == MULT_EXPR || code == BIT_AND_EXPR)
2753 {
2754 real_init_val = dconst1;
2755 int_init_val = 1;
2756 }
2757
2758 if (SCALAR_FLOAT_TYPE_P (scalar_type))
2759 def_for_init = build_real (scalar_type, real_init_val);
2760 else
2761 def_for_init = build_int_cst (scalar_type, int_init_val);
2762
2763 /* Create a vector of '0' or '1' in all elements except the first. */
2764 for (i = nunits - 2; i >= 0; --i)
2765 t = tree_cons (NULL_TREE, def_for_init, t);
2766
2767 /* Option1: the first element is '0' or '1' as well. */
2768 if (adjustment_def)
2769 {
2770 t = tree_cons (NULL_TREE, def_for_init, t);
2771 init_def = build_vector (vectype, t);
2772 break;
2773 }
2774
2775 /* Option2: the first element is INIT_VAL. */
2776 t = tree_cons (NULL_TREE, init_value, t);
2777 if (TREE_CONSTANT (init_val))
2778 init_def = build_vector (vectype, t);
2779 else
2780 init_def = build_constructor_from_list (vectype, t);
2781
2782 break;
2783
2784 case MIN_EXPR:
2785 case MAX_EXPR:
2786 case COND_EXPR:
2787 if (adjustment_def)
2788 {
2789 *adjustment_def = NULL_TREE;
2790 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
2791 break;
2792 }
2793
2794 for (i = nunits - 1; i >= 0; --i)
2795 t = tree_cons (NULL_TREE, init_value, t);
2796
2797 if (TREE_CONSTANT (init_val))
2798 init_def = build_vector (vectype, t);
2799 else
2800 init_def = build_constructor_from_list (vectype, t);
2801
2802 break;
2803
2804 default:
2805 gcc_unreachable ();
2806 }
2807
2808 return init_def;
2809 }
2810
2811
2812 /* Function vect_create_epilog_for_reduction
2813
2814 Create code at the loop-epilog to finalize the result of a reduction
2815 computation.
2816
2817 VECT_DEF is a vector of partial results.
2818 REDUC_CODE is the tree-code for the epilog reduction.
2819 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
2820 number of elements that we can fit in a vectype (nunits). In this case
2821 we have to generate more than one vector stmt - i.e., we need to "unroll"
2822 the vector stmt by a factor VF/nunits. For more details see documentation
2823 in vectorizable_operation.
2824 STMT is the scalar reduction stmt that is being vectorized.
2825 REDUCTION_PHI is the phi-node that carries the reduction computation.
2826 REDUC_INDEX is the index of the operand in the right hand side of the
2827 statement that is defined by REDUCTION_PHI.
2828 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
2829
2830 This function:
2831 1. Creates the reduction def-use cycle: sets the arguments for
2832 REDUCTION_PHI:
2833 The loop-entry argument is the vectorized initial-value of the reduction.
2834 The loop-latch argument is VECT_DEF - the vector of partial sums.
2835 2. "Reduces" the vector of partial results VECT_DEF into a single result,
2836 by applying the operation specified by REDUC_CODE if available, or by
2837 other means (whole-vector shifts or a scalar loop).
2838 The function also creates a new phi node at the loop exit to preserve
2839 loop-closed form, as illustrated below.
2840
2841 The flow at the entry to this function:
2842
2843 loop:
2844 vec_def = phi <null, null> # REDUCTION_PHI
2845 VECT_DEF = vector_stmt # vectorized form of STMT
2846 s_loop = scalar_stmt # (scalar) STMT
2847 loop_exit:
2848 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2849 use <s_out0>
2850 use <s_out0>
2851
2852 The above is transformed by this function into:
2853
2854 loop:
2855 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
2856 VECT_DEF = vector_stmt # vectorized form of STMT
2857 s_loop = scalar_stmt # (scalar) STMT
2858 loop_exit:
2859 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
2860 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2861 v_out2 = reduce <v_out1>
2862 s_out3 = extract_field <v_out2, 0>
2863 s_out4 = adjust_result <s_out3>
2864 use <s_out4>
2865 use <s_out4>
2866 */
2867
2868 static void
2869 vect_create_epilog_for_reduction (tree vect_def, gimple stmt,
2870 int ncopies,
2871 enum tree_code reduc_code,
2872 gimple reduction_phi,
2873 int reduc_index,
2874 bool double_reduc)
2875 {
2876 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2877 stmt_vec_info prev_phi_info;
2878 tree vectype;
2879 enum machine_mode mode;
2880 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2881 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
2882 basic_block exit_bb;
2883 tree scalar_dest;
2884 tree scalar_type;
2885 gimple new_phi = NULL, phi;
2886 gimple_stmt_iterator exit_gsi;
2887 tree vec_dest;
2888 tree new_temp = NULL_TREE;
2889 tree new_name;
2890 gimple epilog_stmt = NULL;
2891 tree new_scalar_dest, new_dest;
2892 gimple exit_phi;
2893 tree bitsize, bitpos, bytesize;
2894 enum tree_code code = gimple_assign_rhs_code (stmt);
2895 tree adjustment_def;
2896 tree vec_initial_def, def;
2897 tree orig_name;
2898 imm_use_iterator imm_iter;
2899 use_operand_p use_p;
2900 bool extract_scalar_result = false;
2901 tree reduction_op, expr;
2902 gimple orig_stmt;
2903 gimple use_stmt;
2904 bool nested_in_vect_loop = false;
2905 VEC(gimple,heap) *phis = NULL;
2906 enum vect_def_type dt = vect_unknown_def_type;
2907 int j, i;
2908
2909 if (nested_in_vect_loop_p (loop, stmt))
2910 {
2911 outer_loop = loop;
2912 loop = loop->inner;
2913 nested_in_vect_loop = true;
2914 }
2915
2916 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2917 {
2918 case GIMPLE_SINGLE_RHS:
2919 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
2920 == ternary_op);
2921 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
2922 break;
2923 case GIMPLE_UNARY_RHS:
2924 reduction_op = gimple_assign_rhs1 (stmt);
2925 break;
2926 case GIMPLE_BINARY_RHS:
2927 reduction_op = reduc_index ?
2928 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
2929 break;
2930 default:
2931 gcc_unreachable ();
2932 }
2933
2934 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
2935 gcc_assert (vectype);
2936 mode = TYPE_MODE (vectype);
2937
2938 /*** 1. Create the reduction def-use cycle ***/
2939
2940 /* For the case of reduction, vect_get_vec_def_for_operand returns
2941 the scalar def before the loop, that defines the initial value
2942 of the reduction variable. */
2943 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
2944 &adjustment_def);
2945
2946 phi = reduction_phi;
2947 def = vect_def;
2948 for (j = 0; j < ncopies; j++)
2949 {
2950 /* 1.1 set the loop-entry arg of the reduction-phi: */
2951 add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop),
2952 UNKNOWN_LOCATION);
2953
2954 /* 1.2 set the loop-latch arg for the reduction-phi: */
2955 if (j > 0)
2956 def = vect_get_vec_def_for_stmt_copy (dt, def);
2957 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
2958
2959 if (vect_print_dump_info (REPORT_DETAILS))
2960 {
2961 fprintf (vect_dump, "transform reduction: created def-use cycle: ");
2962 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
2963 fprintf (vect_dump, "\n");
2964 print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM);
2965 }
2966
2967 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
2968 }
2969
2970 /*** 2. Create epilog code
2971 The reduction epilog code operates across the elements of the vector
2972 of partial results computed by the vectorized loop.
2973 The reduction epilog code consists of:
2974 step 1: compute the scalar result in a vector (v_out2)
2975 step 2: extract the scalar result (s_out3) from the vector (v_out2)
2976 step 3: adjust the scalar result (s_out3) if needed.
2977
2978 Step 1 can be accomplished using one of the following three schemes:
2979 (scheme 1) using reduc_code, if available.
2980 (scheme 2) using whole-vector shifts, if available.
2981 (scheme 3) using a scalar loop. In this case steps 1+2 above are
2982 combined.
2983
2984 The overall epilog code looks like this:
2985
2986 s_out0 = phi <s_loop> # original EXIT_PHI
2987 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
2988 v_out2 = reduce <v_out1> # step 1
2989 s_out3 = extract_field <v_out2, 0> # step 2
2990 s_out4 = adjust_result <s_out3> # step 3
2991
2992 (step 3 is optional, and steps 1 and 2 may be combined).
2993 Lastly, the uses of s_out0 are replaced by s_out4.
2994
2995 ***/
2996
2997 /* 2.1 Create new loop-exit-phi to preserve loop-closed form:
2998 v_out1 = phi <v_loop> */
2999
3000 exit_bb = single_exit (loop)->dest;
3001 def = vect_def;
3002 prev_phi_info = NULL;
3003 for (j = 0; j < ncopies; j++)
3004 {
3005 phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb);
3006 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3007 if (j == 0)
3008 new_phi = phi;
3009 else
3010 {
3011 def = vect_get_vec_def_for_stmt_copy (dt, def);
3012 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3013 }
3014 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3015 prev_phi_info = vinfo_for_stmt (phi);
3016 }
3017
3018 exit_gsi = gsi_after_labels (exit_bb);
3019
3020 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3021 (i.e. when reduc_code is not available) and in the final adjustment
3022 code (if needed). Also get the original scalar reduction variable as
3023 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
3024 represents a reduction pattern), the tree-code and scalar-def are
3025 taken from the original stmt that the pattern-stmt (STMT) replaces.
3026 Otherwise (it is a regular reduction) - the tree-code and scalar-def
3027 are taken from STMT. */
3028
3029 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3030 if (!orig_stmt)
3031 {
3032 /* Regular reduction */
3033 orig_stmt = stmt;
3034 }
3035 else
3036 {
3037 /* Reduction pattern */
3038 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3039 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3040 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3041 }
3042
3043 code = gimple_assign_rhs_code (orig_stmt);
3044 scalar_dest = gimple_assign_lhs (orig_stmt);
3045 scalar_type = TREE_TYPE (scalar_dest);
3046 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3047 bitsize = TYPE_SIZE (scalar_type);
3048 bytesize = TYPE_SIZE_UNIT (scalar_type);
3049
3050 /* For MINUS_EXPR the initial vector is [init_val,0,...,0]; therefore,
3051 partial results are added, not subtracted. */
3052 if (code == MINUS_EXPR)
3053 code = PLUS_EXPR;
3054
3055 /* In case this is a reduction in an inner-loop while vectorizing an outer
3056 loop - we don't need to extract a single scalar result at the end of the
3057 inner-loop (unless it is double reduction, i.e., the use of reduction is
3058 outside the outer-loop). The final vector of partial results will be used
3059 in the vectorized outer-loop, or reduced to a scalar result at the end of
3060 the outer-loop. */
3061 if (nested_in_vect_loop && !double_reduc)
3062 goto vect_finalize_reduction;
3063
3064 /* The epilogue is created for the outer-loop, i.e., for the loop being
3065 vectorized. */
3066 if (double_reduc)
3067 loop = outer_loop;
3068
3069 /* FORNOW */
3070 gcc_assert (ncopies == 1);
3071
3072 /* 2.3 Create the reduction code, using one of the three schemes described
3073 above. */
3074
3075 if (reduc_code != ERROR_MARK)
3076 {
3077 tree tmp;
3078
3079 /*** Case 1: Create:
3080 v_out2 = reduc_expr <v_out1> */
3081
3082 if (vect_print_dump_info (REPORT_DETAILS))
3083 fprintf (vect_dump, "Reduce using direct vector reduction.");
3084
3085 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3086 tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi));
3087 epilog_stmt = gimple_build_assign (vec_dest, tmp);
3088 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3089 gimple_assign_set_lhs (epilog_stmt, new_temp);
3090 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3091
3092 extract_scalar_result = true;
3093 }
3094 else
3095 {
3096 enum tree_code shift_code = ERROR_MARK;
3097 bool have_whole_vector_shift = true;
3098 int bit_offset;
3099 int element_bitsize = tree_low_cst (bitsize, 1);
3100 int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3101 tree vec_temp;
3102
3103 if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
3104 shift_code = VEC_RSHIFT_EXPR;
3105 else
3106 have_whole_vector_shift = false;
3107
3108 /* Regardless of whether we have a whole vector shift, if we're
3109 emulating the operation via tree-vect-generic, we don't want
3110 to use it. Only the first round of the reduction is likely
3111 to still be profitable via emulation. */
3112 /* ??? It might be better to emit a reduction tree code here, so that
3113 tree-vect-generic can expand the first round via bit tricks. */
3114 if (!VECTOR_MODE_P (mode))
3115 have_whole_vector_shift = false;
3116 else
3117 {
3118 optab optab = optab_for_tree_code (code, vectype, optab_default);
3119 if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing)
3120 have_whole_vector_shift = false;
3121 }
3122
3123 if (have_whole_vector_shift)
3124 {
3125 /*** Case 2: Create:
3126 for (offset = VS/2; offset >= element_size; offset/=2)
3127 {
3128 Create: va' = vec_shift <va, offset>
3129 Create: va = vop <va, va'>
3130 } */
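          /* Sketch for a 4-element vector of partial sums [a,b,c,d]:
               shift by 2 elements, add:  [a+c, b+d, x, x]
               shift by 1 element,  add:  [a+b+c+d, x, x, x]
             after which the scalar result is extracted from element 0
             (element layout is target-dependent).  */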
3131
3132 if (vect_print_dump_info (REPORT_DETAILS))
3133 fprintf (vect_dump, "Reduce using vector shifts.");
3134
3135 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3136 new_temp = PHI_RESULT (new_phi);
3137
3138 for (bit_offset = vec_size_in_bits/2;
3139 bit_offset >= element_bitsize;
3140 bit_offset /= 2)
3141 {
3142 tree bitpos = size_int (bit_offset);
3143
3144 epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest,
3145 new_temp, bitpos);
3146 new_name = make_ssa_name (vec_dest, epilog_stmt);
3147 gimple_assign_set_lhs (epilog_stmt, new_name);
3148 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3149
3150 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
3151 new_name, new_temp);
3152 new_temp = make_ssa_name (vec_dest, epilog_stmt);
3153 gimple_assign_set_lhs (epilog_stmt, new_temp);
3154 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3155 }
3156
3157 extract_scalar_result = true;
3158 }
3159 else
3160 {
3161 tree rhs;
3162
3163 /*** Case 3: Create:
3164 s = extract_field <v_out2, 0>
3165 for (offset = element_size;
3166 offset < vector_size;
3167 offset += element_size;)
3168 {
3169 Create: s' = extract_field <v_out2, offset>
3170 Create: s = op <s, s'>
3171 } */
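          /* Sketch for a 4-element vector v: this emits 4 extracts and
             3 scalar ops, i.e. s = v[0]; s = op (s, v[1]);
             s = op (s, v[2]); s = op (s, v[3]);  */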
3172
3173 if (vect_print_dump_info (REPORT_DETAILS))
3174 fprintf (vect_dump, "Reduce using scalar code.");
3175
3176 vec_temp = PHI_RESULT (new_phi);
3177 vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3178 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3179 bitsize_zero_node);
3180 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3181 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3182 gimple_assign_set_lhs (epilog_stmt, new_temp);
3183 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3184
3185 for (bit_offset = element_bitsize;
3186 bit_offset < vec_size_in_bits;
3187 bit_offset += element_bitsize)
3188 {
3189 tree bitpos = bitsize_int (bit_offset);
3190 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
3191 bitpos);
3192
3193 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3194 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
3195 gimple_assign_set_lhs (epilog_stmt, new_name);
3196 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3197
3198 epilog_stmt = gimple_build_assign_with_ops (code,
3199 new_scalar_dest,
3200 new_name, new_temp);
3201 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3202 gimple_assign_set_lhs (epilog_stmt, new_temp);
3203 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3204 }
3205
3206 extract_scalar_result = false;
3207 }
3208 }
3209
3210 /* 2.4 Extract the final scalar result. Create:
3211 s_out3 = extract_field <v_out2, bitpos> */
3212
3213 if (extract_scalar_result)
3214 {
3215 tree rhs;
3216
3217 gcc_assert (!nested_in_vect_loop || double_reduc);
3218 if (vect_print_dump_info (REPORT_DETAILS))
3219 fprintf (vect_dump, "extract scalar result");
3220
3221 if (BYTES_BIG_ENDIAN)
3222 bitpos = size_binop (MULT_EXPR,
3223 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
3224 TYPE_SIZE (scalar_type));
3225 else
3226 bitpos = bitsize_zero_node;
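      /* E.g. for a 4 x 32-bit vector result on a big-endian target the
         scalar ends up in the last element, so bitpos == 3 * 32 == 96;
         on little-endian targets it is element 0, bitpos == 0.  */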
3227
3228 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
3229 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
3230 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
3231 gimple_assign_set_lhs (epilog_stmt, new_temp);
3232 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3233 }
3234
3235 vect_finalize_reduction:
3236
3237 if (double_reduc)
3238 loop = loop->inner;
3239
3240 /* 2.5 Adjust the final result by the initial value of the reduction
3241 variable. (When such adjustment is not needed, then
3242 'adjustment_def' is zero). For example, if code is PLUS we create:
3243 new_temp = loop_exit_def + adjustment_def */
3244
3245 if (adjustment_def)
3246 {
3247 if (nested_in_vect_loop)
3248 {
3249 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
3250 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
3251 new_dest = vect_create_destination_var (scalar_dest, vectype);
3252 }
3253 else
3254 {
3255 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
3256 expr = build2 (code, scalar_type, new_temp, adjustment_def);
3257 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
3258 }
3259
3260 epilog_stmt = gimple_build_assign (new_dest, expr);
3261 new_temp = make_ssa_name (new_dest, epilog_stmt);
3262 gimple_assign_set_lhs (epilog_stmt, new_temp);
3263 SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
3264 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
3265 }
3266
3267
3268 /* 2.6 Handle the loop-exit phi */
3269
3270 /* Replace uses of s_out0 with uses of s_out3:
3271 Find the loop-closed-use at the loop exit of the original scalar result.
3272 (The reduction result is expected to have two immediate uses - one at the
3273 latch block, and one at the loop exit). */
3274 phis = VEC_alloc (gimple, heap, 10);
3275 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
3276 {
3277 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
3278 {
3279 exit_phi = USE_STMT (use_p);
3280 VEC_quick_push (gimple, phis, exit_phi);
3281 }
3282 }
3283
3284 /* We expect to have found an exit_phi because of loop-closed-ssa form. */
3285 gcc_assert (!VEC_empty (gimple, phis));
3286
3287 for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++)
3288 {
3289 if (nested_in_vect_loop)
3290 {
3291 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3292 gimple vect_phi;
3293
3294 /* FORNOW. Currently not supporting the case that an inner-loop
3295 reduction is not used in the outer-loop (but only outside the
3296 outer-loop), unless it is double reduction. */
3297 gcc_assert ((STMT_VINFO_RELEVANT_P (stmt_vinfo)
3298 && !STMT_VINFO_LIVE_P (stmt_vinfo)) || double_reduc);
3299
3300 epilog_stmt = adjustment_def ? epilog_stmt : new_phi;
3301 STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt;
3302 set_vinfo_for_stmt (epilog_stmt,
3303 new_stmt_vec_info (epilog_stmt, loop_vinfo,
3304 NULL));
3305 if (adjustment_def)
3306 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
3307 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
3308
3309 if (!double_reduc
3310 || STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_double_reduction_def)
3311 continue;
3312
3313 /* Handle double reduction:
3314
3315 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
3316 stmt2: s3 = phi <s1, s4> - (regular) reduction phi (inner loop)
3317 stmt3: s4 = use (s3) - (regular) reduction stmt (inner loop)
3318 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
3319
3320 At that point the regular reduction (stmt2 and stmt3) is already
3321 vectorized, as well as the exit phi node, stmt4.
3322 Here we vectorize the phi node of double reduction, stmt1, and
3323 update all relevant statements. */
3324
3325 /* Go through all the uses of s2 to find the double reduction phi node,
3326 i.e., stmt1 above. */
3327 orig_name = PHI_RESULT (exit_phi);
3328 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3329 {
3330 stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt);
3331 stmt_vec_info new_phi_vinfo;
3332 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
3333 basic_block bb = gimple_bb (use_stmt);
3334 gimple use;
3335
3336 /* Check that USE_STMT is really a double reduction phi node. */
3337 if (gimple_code (use_stmt) != GIMPLE_PHI
3338 || gimple_phi_num_args (use_stmt) != 2
3339 || !use_stmt_vinfo
3340 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
3341 != vect_double_reduction_def
3342 || bb->loop_father != outer_loop)
3343 continue;
3344
3345 /* Create vector phi node for double reduction:
3346 vs1 = phi <vs0, vs2>
3347 vs1 was created previously in this function by a call to
3348 vect_get_vec_def_for_operand and is stored in vec_initial_def;
3349 vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI;
3350 vs0 is created here. */
3351
3352 /* Create vector phi node. */
3353 vect_phi = create_phi_node (vec_initial_def, bb);
3354 new_phi_vinfo = new_stmt_vec_info (vect_phi,
3355 loop_vec_info_for_loop (outer_loop), NULL);
3356 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
3357
3358 /* Create vs0 - initial def of the double reduction phi. */
3359 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
3360 loop_preheader_edge (outer_loop));
3361 init_def = get_initial_def_for_reduction (stmt, preheader_arg,
3362 NULL);
3363 vect_phi_init = vect_init_vector (use_stmt, init_def, vectype,
3364 NULL);
3365
3366 /* Update phi node arguments with vs0 and vs2. */
3367 add_phi_arg (vect_phi, vect_phi_init,
3368 loop_preheader_edge (outer_loop), UNKNOWN_LOCATION);
3369 add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt),
3370 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
3371 if (vect_print_dump_info (REPORT_DETAILS))
3372 {
3373 fprintf (vect_dump, "created double reduction phi node: ");
3374 print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM);
3375 }
3376
3377 vect_phi_res = PHI_RESULT (vect_phi);
3378
3379 /* Replace the use, i.e., set the correct vs1 in the regular
3380 reduction phi node. FORNOW, NCOPIES is always 1, so the loop
3381 is redundant. */
3382 use = reduction_phi;
3383 for (j = 0; j < ncopies; j++)
3384 {
3385 edge pr_edge = loop_preheader_edge (loop);
3386 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
3387 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
3388 }
3389 }
3390 }
3391
3392 /* Replace the uses: */
3393 orig_name = PHI_RESULT (exit_phi);
3394 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
3395 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3396 SET_USE (use_p, new_temp);
3397 }
3398
3399 VEC_free (gimple, heap, phis);
3400 }
3401
3402
3403 /* Function vectorizable_reduction.
3404
3405 Check if STMT performs a reduction operation that can be vectorized.
3406 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
3407 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
3408 Return FALSE if not a vectorizable STMT, TRUE otherwise.
3409
3410 This function also handles reduction idioms (patterns) that have been
3411 recognized in advance during vect_pattern_recog. In this case, STMT may be
3412 of this form:
3413 X = pattern_expr (arg0, arg1, ..., X)
3414 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
3415 sequence that had been detected and replaced by the pattern-stmt (STMT).
3416
3417 In some cases of reduction patterns, the type of the reduction variable X is
3418 different than the type of the other arguments of STMT.
3419 In such cases, the vectype that is used when transforming STMT into a vector
3420 stmt is different than the vectype that is used to determine the
3421 vectorization factor, because it consists of a different number of elements
3422 than the actual number of elements that are being operated upon in parallel.
3423
3424 For example, consider an accumulation of shorts into an int accumulator.
3425 On some targets it's possible to vectorize this pattern operating on 8
3426 shorts at a time (hence, the vectype for purposes of determining the
3427 vectorization factor should be V8HI); on the other hand, the vectype that
3428 is used to create the vector form is actually V4SI (the type of the result).
3429
3430 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
3431 indicates the actual level of parallelism (V8HI in the example), so
3432 that the right vectorization factor is derived. This vectype
3433 corresponds to the type of arguments to the reduction stmt, and should *NOT*
3434 be used to create the vectorized stmt. The right vectype for the vectorized
3435 stmt is obtained from the type of the result X:
3436 get_vectype_for_scalar_type (TREE_TYPE (X))
3437
3438 This means that, contrary to "regular" reductions (or "regular" stmts in
3439 general), the following equation:
3440 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
3441 does *NOT* necessarily hold for reduction patterns. */
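
/* For example (a sketch of the widening-summation case described
   above; the names are illustrative):

     short a[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];     <-- original stmt, replaced during
                            vect_pattern_recog by the pattern-stmt
                            sum = WIDEN_SUM_EXPR <a[i], sum>

   Here STMT_VINFO_VECTYPE is V8HI (8 shorts determine the
   vectorization factor), whereas the vectorized stmt is created with
   get_vectype_for_scalar_type (TREE_TYPE (sum)) == V4SI.  */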
3442
3443 bool
3444 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
3445 gimple *vec_stmt)
3446 {
3447 tree vec_dest;
3448 tree scalar_dest;
3449 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
3450 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3451 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3452 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3453 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3454 enum tree_code code, orig_code, epilog_reduc_code;
3455 enum machine_mode vec_mode;
3456 int op_type;
3457 optab optab, reduc_optab;
3458 tree new_temp = NULL_TREE;
3459 tree def;
3460 gimple def_stmt;
3461 enum vect_def_type dt;
3462 gimple new_phi = NULL;
3463 tree scalar_type;
3464 bool is_simple_use;
3465 gimple orig_stmt;
3466 stmt_vec_info orig_stmt_info;
3467 tree expr = NULL_TREE;
3468 int i;
3469 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
3470 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
3471 int epilog_copies;
3472 stmt_vec_info prev_stmt_info, prev_phi_info;
3473 gimple first_phi = NULL;
3474 bool single_defuse_cycle = false;
3475 tree reduc_def = NULL_TREE;
3476 gimple new_stmt = NULL;
3477 int j;
3478 tree ops[3];
3479 bool nested_cycle = false, found_nested_cycle_def = false;
3480 gimple reduc_def_stmt = NULL;
3481 /* The default is that the reduction variable is the last operand in the statement. */
3482 int reduc_index = 2;
3483 bool double_reduc = false, dummy;
3484 basic_block def_bb;
3485 struct loop * def_stmt_loop, *outer_loop = NULL;
3486 tree def_arg;
3487 gimple def_arg_stmt;
3488
3489 if (nested_in_vect_loop_p (loop, stmt))
3490 {
3491 outer_loop = loop;
3492 loop = loop->inner;
3493 nested_cycle = true;
3494 }
3495
3496 gcc_assert (ncopies >= 1);
3497
3498 /* FORNOW: SLP not supported. */
3499 if (STMT_SLP_TYPE (stmt_info))
3500 return false;
3501
3502 /* 1. Is vectorizable reduction? */
3503 /* Not supportable if the reduction variable is used in the loop. */
3504 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer)
3505 return false;
3506
3507 /* Reductions that are not used even in an enclosing outer-loop
3508 are expected to be "live" (used out of the loop). */
3509 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
3510 && !STMT_VINFO_LIVE_P (stmt_info))
3511 return false;
3512
3513 /* Make sure it was already recognized as a reduction computation. */
3514 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
3515 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
3516 return false;
3517
3518 /* 2. Has this been recognized as a reduction pattern?
3519
3520 Check if STMT represents a pattern that has been recognized
3521 in earlier analysis stages. For stmts that represent a pattern,
3522 the STMT_VINFO_RELATED_STMT field records the last stmt in
3523 the original sequence that constitutes the pattern. */
3524
3525 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3526 if (orig_stmt)
3527 {
3528 orig_stmt_info = vinfo_for_stmt (orig_stmt);
3529 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt);
3530 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
3531 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
3532 }
3533
3534 /* 3. Check the operands of the operation. The first operands are defined
3535 inside the loop body. The last operand is the reduction variable,
3536 which is defined by the loop-header-phi. */
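
/* E.g. (a sketch of a typical binary reduction):

     s_1 = phi <s_init, s_2>     <-- loop-header-phi
     ...
     s_2 = x_1 + s_1             <-- STMT: ops[0] is defined inside
                                     the loop, ops[1] is the
                                     reduction variable  */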
3537
3538 gcc_assert (is_gimple_assign (stmt));
3539
3540 /* Flatten RHS. */
3541 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3542 {
3543 case GIMPLE_SINGLE_RHS:
3544 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
3545 if (op_type == ternary_op)
3546 {
3547 tree rhs = gimple_assign_rhs1 (stmt);
3548 ops[0] = TREE_OPERAND (rhs, 0);
3549 ops[1] = TREE_OPERAND (rhs, 1);
3550 ops[2] = TREE_OPERAND (rhs, 2);
3551 code = TREE_CODE (rhs);
3552 }
3553 else
3554 return false;
3555 break;
3556
3557 case GIMPLE_BINARY_RHS:
3558 code = gimple_assign_rhs_code (stmt);
3559 op_type = TREE_CODE_LENGTH (code);
3560 gcc_assert (op_type == binary_op);
3561 ops[0] = gimple_assign_rhs1 (stmt);
3562 ops[1] = gimple_assign_rhs2 (stmt);
3563 break;
3564
3565 case GIMPLE_UNARY_RHS:
3566 return false;
3567
3568 default:
3569 gcc_unreachable ();
3570 }
3571
3572 scalar_dest = gimple_assign_lhs (stmt);
3573 scalar_type = TREE_TYPE (scalar_dest);
3574 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
3575 && !SCALAR_FLOAT_TYPE_P (scalar_type))
3576 return false;
3577
3578 /* All uses but the last are expected to be defined in the loop.
3579 The last use is the reduction variable. In case of a nested cycle this
3580 assumption does not hold: we use reduc_index to record the index of the
3581 reduction variable. */
3582 for (i = 0; i < op_type-1; i++)
3583 {
3584 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
3585 if (i == 0 && code == COND_EXPR)
3586 continue;
3587
3588 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3589 &def, &dt);
3590 gcc_assert (is_simple_use);
3591 if (dt != vect_internal_def
3592 && dt != vect_external_def
3593 && dt != vect_constant_def
3594 && dt != vect_induction_def
3595 && !(dt == vect_nested_cycle && nested_cycle))
3596 return false;
3597
3598 if (dt == vect_nested_cycle)
3599 {
3600 found_nested_cycle_def = true;
3601 reduc_def_stmt = def_stmt;
3602 reduc_index = i;
3603 }
3604 }
3605
3606 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt,
3607 &def, &dt);
3608 gcc_assert (is_simple_use);
3609 gcc_assert (dt == vect_reduction_def
3610 || dt == vect_nested_cycle
3611 || ((dt == vect_internal_def || dt == vect_external_def
3612 || dt == vect_constant_def || dt == vect_induction_def)
3613 && nested_cycle && found_nested_cycle_def));
3614 if (!found_nested_cycle_def)
3615 reduc_def_stmt = def_stmt;
3616
3617 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
3618 if (orig_stmt)
3619 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
3620 reduc_def_stmt,
3621 !nested_cycle,
3622 &dummy));
3623 else
3624 gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
3625 !nested_cycle, &dummy));
3626
3627 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
3628 return false;
3629
3630 vec_mode = TYPE_MODE (vectype);
3631
3632 if (code == COND_EXPR)
3633 {
3634 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0))
3635 {
3636 if (vect_print_dump_info (REPORT_DETAILS))
3637 fprintf (vect_dump, "unsupported condition in reduction");
3638
3639 return false;
3640 }
3641 }
3642 else
3643 {
3644 /* 4. Supportable by target? */
3645
3646 /* 4.1. check support for the operation in the loop */
3647 optab = optab_for_tree_code (code, vectype, optab_default);
3648 if (!optab)
3649 {
3650 if (vect_print_dump_info (REPORT_DETAILS))
3651 fprintf (vect_dump, "no optab.");
3652
3653 return false;
3654 }
3655
3656 if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing)
3657 {
3658 if (vect_print_dump_info (REPORT_DETAILS))
3659 fprintf (vect_dump, "op not supported by target.");
3660
3661 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
3662 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3663 < vect_min_worthwhile_factor (code))
3664 return false;
3665
3666 if (vect_print_dump_info (REPORT_DETAILS))
3667 fprintf (vect_dump, "proceeding using word mode.");
3668 }
3669
3670 /* Worthwhile without SIMD support? */
3671 if (!VECTOR_MODE_P (TYPE_MODE (vectype))
3672 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
3673 < vect_min_worthwhile_factor (code))
3674 {
3675 if (vect_print_dump_info (REPORT_DETAILS))
3676 fprintf (vect_dump, "not worthwhile without SIMD support.");
3677
3678 return false;
3679 }
3680 }
3681
3682 /* 4.2. Check support for the epilog operation.
3683
3684 If STMT represents a reduction pattern, then the type of the
3685 reduction variable may be different than the type of the rest
3686 of the arguments. For example, consider the case of accumulation
3687 of shorts into an int accumulator. The original code:
3688 S1: int_a = (int) short_a;
3689 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
3690
3691 was replaced with:
3692 STMT: int_acc = widen_sum <short_a, int_acc>
3693
3694 This means that:
3695 1. The tree-code that is used to create the vector operation in the
3696 epilog code (that reduces the partial results) is not the
3697 tree-code of STMT, but is rather the tree-code of the original
3698 stmt from the pattern that STMT is replacing. I.e., in the example
3699 above we want to use 'widen_sum' in the loop, but 'plus' in the
3700 epilog.
3701 2. The type (mode) we use to check available target support
3702 for the vector operation to be created in the *epilog*, is
3703 determined by the type of the reduction variable (in the example
3704 above we'd check this: plus_optab[vect_int_mode]).
3705 However the type (mode) we use to check available target support
3706 for the vector operation to be created *inside the loop*, is
3707 determined by the type of the other arguments to STMT (in the
3708 example we'd check this: widen_sum_optab[vect_short_mode]).
3709
3710 This is contrary to "regular" reductions, in which the types of all
3711 the arguments are the same as the type of the reduction variable.
3712 For "regular" reductions we can therefore use the same vector type
3713 (and also the same tree-code) when generating the epilog code and
3714 when generating the code inside the loop. */
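
/* Continuing the widen_sum example, the code that is eventually
   generated would look like (a sketch; REDUC_PLUS_EXPR is the code
   reduction_code_for_scalar_code returns for PLUS_EXPR):

     loop:
       vec_iacc = WIDEN_SUM_EXPR <vec_sa, vec_iacc>;
     epilog:
       iacc = REDUC_PLUS_EXPR <vec_iacc>;

   so the in-loop stmt is checked against widen_sum_optab (with the
   mode of the V8HI vectype), while the epilog stmt is checked against
   the reduc_plus optab (with the mode of the V4SI vectype).  */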
3715
3716 if (orig_stmt)
3717 {
3718 /* This is a reduction pattern: get the vectype from the type of the
3719 reduction variable, and get the tree-code from orig_stmt. */
3720 orig_code = gimple_assign_rhs_code (orig_stmt);
3721 vectype = get_vectype_for_scalar_type (TREE_TYPE (def));
3722 if (!vectype)
3723 {
3724 if (vect_print_dump_info (REPORT_DETAILS))
3725 {
3726 fprintf (vect_dump, "unsupported data-type ");
3727 print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM);
3728 }
3729 return false;
3730 }
3731
3732 vec_mode = TYPE_MODE (vectype);
3733 }
3734 else
3735 {
3736 /* Regular reduction: the same vectype and tree-code that are used for
3737 the vector code inside the loop can be used for the epilog code. */
3738 orig_code = code;
3739 }
3740
3741 if (nested_cycle)
3742 {
3743 def_bb = gimple_bb (reduc_def_stmt);
3744 def_stmt_loop = def_bb->loop_father;
3745 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
3746 loop_preheader_edge (def_stmt_loop));
3747 if (TREE_CODE (def_arg) == SSA_NAME
3748 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
3749 && gimple_code (def_arg_stmt) == GIMPLE_PHI
3750 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
3751 && vinfo_for_stmt (def_arg_stmt)
3752 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
3753 == vect_double_reduction_def)
3754 double_reduc = true;
3755 }
3756
3757 epilog_reduc_code = ERROR_MARK;
3758 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
3759 {
3760 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype,
3761 optab_default);
3762 if (!reduc_optab)
3763 {
3764 if (vect_print_dump_info (REPORT_DETAILS))
3765 fprintf (vect_dump, "no optab for reduction.");
3766
3767 epilog_reduc_code = ERROR_MARK;
3768 }
3769
3770 if (reduc_optab
3771 && optab_handler (reduc_optab, vec_mode)->insn_code
3772 == CODE_FOR_nothing)
3773 {
3774 if (vect_print_dump_info (REPORT_DETAILS))
3775 fprintf (vect_dump, "reduc op not supported by target.");
3776
3777 epilog_reduc_code = ERROR_MARK;
3778 }
3779 }
3780 else
3781 {
3782 if (!nested_cycle || double_reduc)
3783 {
3784 if (vect_print_dump_info (REPORT_DETAILS))
3785 fprintf (vect_dump, "no reduc code for scalar code.");
3786
3787 return false;
3788 }
3789 }
3790
3791 if (double_reduc && ncopies > 1)
3792 {
3793 if (vect_print_dump_info (REPORT_DETAILS))
3794 fprintf (vect_dump, "multiple types in double reduction");
3795
3796 return false;
3797 }
3798
3799 if (!vec_stmt) /* transformation not required. */
3800 {
3801 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
3802 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
3803 return false;
3804 return true;
3805 }
3806
3807 /** Transform. **/
3808
3809 if (vect_print_dump_info (REPORT_DETAILS))
3810 fprintf (vect_dump, "transform reduction.");
3811
3812 /* FORNOW: Multiple types are not supported for condition. */
3813 if (code == COND_EXPR)
3814 gcc_assert (ncopies == 1);
3815
3816 /* Create the destination vector. */
3817 vec_dest = vect_create_destination_var (scalar_dest, vectype);
3818
3819 /* In case the vectorization factor (VF) is bigger than the number
3820 of elements that we can fit in a vectype (nunits), we have to generate
3821 more than one vector stmt - i.e., we need to "unroll" the
3822 vector stmt by a factor VF/nunits. For more details see documentation
3823 in vectorizable_operation. */
3824
3825 /* If the reduction is used in an outer loop we need to generate
3826 VF intermediate results, like so (e.g. for ncopies=2):
3827 r0 = phi (init, r0)
3828 r1 = phi (init, r1)
3829 r0 = x0 + r0;
3830 r1 = x1 + r1;
3831 (i.e. we generate VF results in 2 registers).
3832 In this case we have a separate def-use cycle for each copy, and therefore
3833 for each copy we get the vector def for the reduction variable from the
3834 respective phi node created for this copy.
3835
3836 Otherwise (the reduction is unused in the loop nest), we can combine
3837 together intermediate results, like so (e.g. for ncopies=2):
3838 r = phi (init, r)
3839 r = x0 + r;
3840 r = x1 + r;
3841 (i.e. we generate VF/2 results in a single register).
3842 In this case for each copy we get the vector def for the reduction variable
3843 from the vectorized reduction operation generated in the previous iteration.
3844 */
3845
3846 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
3847 {
3848 single_defuse_cycle = true;
3849 epilog_copies = 1;
3850 }
3851 else
3852 epilog_copies = ncopies;
3853
3854 prev_stmt_info = NULL;
3855 prev_phi_info = NULL;
3856 for (j = 0; j < ncopies; j++)
3857 {
3858 if (j == 0 || !single_defuse_cycle)
3859 {
3860 /* Create the reduction-phi that defines the reduction-operand. */
3861 new_phi = create_phi_node (vec_dest, loop->header);
3862 set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo,
3863 NULL));
3864 /* Get the vector def for the reduction variable from the phi
3865 node. */
3866 reduc_def = PHI_RESULT (new_phi);
3867 }
3868
3869 if (code == COND_EXPR)
3870 {
3871 first_phi = new_phi;
3872 vectorizable_condition (stmt, gsi, vec_stmt, reduc_def, reduc_index);
3873 /* Multiple types are not supported for condition. */
3874 break;
3875 }
3876
3877 /* Handle uses. */
3878 if (j == 0)
3879 {
3880 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
3881 stmt, NULL);
3882 if (op_type == ternary_op)
3883 {
3884 if (reduc_index == 0)
3885 loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt,
3886 NULL);
3887 else
3888 loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt,
3889 NULL);
3890 }
3891
3892 /* Get the vector def for the reduction variable from the phi
3893 node. */
3894 first_phi = new_phi;
3895 }
3896 else
3897 {
3898 enum vect_def_type dt = vect_unknown_def_type; /* Dummy */
3899 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0);
3900 if (op_type == ternary_op)
3901 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1);
3902
3903 if (single_defuse_cycle)
3904 reduc_def = gimple_assign_lhs (new_stmt);
3905 else
3906 reduc_def = PHI_RESULT (new_phi);
3907
3908 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
3909 }
3910
3911 /* Arguments are ready. Create the new vector stmt. */
3912 if (op_type == binary_op)
3913 {
3914 if (reduc_index == 0)
3915 expr = build2 (code, vectype, reduc_def, loop_vec_def0);
3916 else
3917 expr = build2 (code, vectype, loop_vec_def0, reduc_def);
3918 }
3919 else
3920 {
3921 if (reduc_index == 0)
3922 expr = build3 (code, vectype, reduc_def, loop_vec_def0,
3923 loop_vec_def1);
3924 else
3925 {
3926 if (reduc_index == 1)
3927 expr = build3 (code, vectype, loop_vec_def0, reduc_def,
3928 loop_vec_def1);
3929 else
3930 expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1,
3931 reduc_def);
3932 }
3933 }
3934
3935 new_stmt = gimple_build_assign (vec_dest, expr);
3936 new_temp = make_ssa_name (vec_dest, new_stmt);
3937 gimple_assign_set_lhs (new_stmt, new_temp);
3938 vect_finish_stmt_generation (stmt, new_stmt, gsi);
3939
3940 if (j == 0)
3941 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
3942 else
3943 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
3944
3945 prev_stmt_info = vinfo_for_stmt (new_stmt);
3946 prev_phi_info = vinfo_for_stmt (new_phi);
3947 }
3948
3949 /* Finalize the reduction-phi (set its arguments) and create the
3950 epilog reduction code. */
3951 if (!single_defuse_cycle || code == COND_EXPR)
3952 new_temp = gimple_assign_lhs (*vec_stmt);
3953
3954 vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies,
3955 epilog_reduc_code, first_phi, reduc_index,
3956 double_reduc);
3957 return true;
3958 }
3959
3960 /* Function vect_min_worthwhile_factor.
3961
3962 For a loop where we could vectorize the operation indicated by CODE,
3963 return the minimum vectorization factor that makes it worthwhile
3964 to use generic vectors. */
3965 int
3966 vect_min_worthwhile_factor (enum tree_code code)
3967 {
3968 switch (code)
3969 {
3970 case PLUS_EXPR:
3971 case MINUS_EXPR:
3972 case NEGATE_EXPR:
3973 return 4;
3974
3975 case BIT_AND_EXPR:
3976 case BIT_IOR_EXPR:
3977 case BIT_XOR_EXPR:
3978 case BIT_NOT_EXPR:
3979 return 2;
3980
3981 default:
3982 return INT_MAX;
3983 }
3984 }
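
/* For example, vectorizable_reduction above uses it like so (copied
   from the check earlier in this file):

     if (!VECTOR_MODE_P (TYPE_MODE (vectype))
         && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
            < vect_min_worthwhile_factor (code))
       return false;

   i.e. without real SIMD support a PLUS_EXPR loop is only considered
   worthwhile from a vectorization factor of 4 upwards, a BIT_AND_EXPR
   loop from a factor of 2 upwards.  */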
3985
3986
3987 /* Function vectorizable_induction.
3988
3989 Check if PHI performs an induction computation that can be vectorized.
3990 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
3991 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
3992 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
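
/* For example (a sketch, for a vectorization factor of 4 and
   ncopies == 1):

     i_1 = phi <0, i_2>          <-- scalar induction phi
     i_2 = i_1 + 1;

   is transformed into:

     vi_1 = phi <{0,1,2,3}, vi_2>
     vi_2 = vi_1 + {4,4,4,4};

   where the initial and step vectors are built by
   get_initial_def_for_induction, called below.  */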
3993
3994 bool
3995 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3996 gimple *vec_stmt)
3997 {
3998 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
3999 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4000 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4001 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4002 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
4003 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
4004 tree vec_def;
4005
4006 gcc_assert (ncopies >= 1);
4007 /* FORNOW. This restriction should be relaxed. */
4008 if (nested_in_vect_loop_p (loop, phi) && ncopies > 1)
4009 {
4010 if (vect_print_dump_info (REPORT_DETAILS))
4011 fprintf (vect_dump, "multiple types in nested loop.");
4012 return false;
4013 }
4014
4015 if (!STMT_VINFO_RELEVANT_P (stmt_info))
4016 return false;
4017
4018 /* FORNOW: SLP not supported. */
4019 if (STMT_SLP_TYPE (stmt_info))
4020 return false;
4021
4022 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
4023
4024 if (gimple_code (phi) != GIMPLE_PHI)
4025 return false;
4026
4027 if (!vec_stmt) /* transformation not required. */
4028 {
4029 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
4030 if (vect_print_dump_info (REPORT_DETAILS))
4031 fprintf (vect_dump, "=== vectorizable_induction ===");
4032 vect_model_induction_cost (stmt_info, ncopies);
4033 return true;
4034 }
4035
4036 /** Transform. **/
4037
4038 if (vect_print_dump_info (REPORT_DETAILS))
4039 fprintf (vect_dump, "transform induction phi.");
4040
4041 vec_def = get_initial_def_for_induction (phi);
4042 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
4043 return true;
4044 }
4045
4046 /* Function vectorizable_live_operation.
4047
4048 STMT computes a value that is used outside the loop. Check if
4049 it can be supported. */
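
/* For example (a sketch):

     for (i = 0; i < N; i++)
       {
         ...
         x_1 = inv_a + inv_b;    <-- live: x_1 is used after the loop
       }
     ... = x_1;

   Since both operands are loop-invariant, the scalar stmt can remain
   in place unvectorized, and the value it computes is what the uses
   after the loop see.  */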
4050
4051 bool
4052 vectorizable_live_operation (gimple stmt,
4053 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
4054 gimple *vec_stmt ATTRIBUTE_UNUSED)
4055 {
4056 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4057 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4058 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4059 int i;
4060 int op_type;
4061 tree op;
4062 tree def;
4063 gimple def_stmt;
4064 enum vect_def_type dt;
4065 enum tree_code code;
4066 enum gimple_rhs_class rhs_class;
4067
4068 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
4069
4070 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
4071 return false;
4072
4073 if (!is_gimple_assign (stmt))
4074 return false;
4075
4076 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
4077 return false;
4078
4079 /* FORNOW. CHECKME. */
4080 if (nested_in_vect_loop_p (loop, stmt))
4081 return false;
4082
4083 code = gimple_assign_rhs_code (stmt);
4084 op_type = TREE_CODE_LENGTH (code);
4085 rhs_class = get_gimple_rhs_class (code);
4086 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
4087 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
4088
4089 /* FORNOW: support only if all uses are invariant. This means
4090 that the scalar operations can remain in place, unvectorized.
4091 The original last scalar value that they compute will be used. */
4092
4093 for (i = 0; i < op_type; i++)
4094 {
4095 if (rhs_class == GIMPLE_SINGLE_RHS)
4096 op = TREE_OPERAND (gimple_op (stmt, 1), i);
4097 else
4098 op = gimple_op (stmt, i + 1);
4099 if (op
4100 && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt))
4101 {
4102 if (vect_print_dump_info (REPORT_DETAILS))
4103 fprintf (vect_dump, "use not simple.");
4104 return false;
4105 }
4106
4107 if (dt != vect_external_def && dt != vect_constant_def)
4108 return false;
4109 }
4110
4111 /* No transformation is required for the cases we currently support. */
4112 return true;
4113 }
4114
4115 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
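
/* E.g. (a sketch): given

     loop:
       x_1 = ...;
     after the loop:
       # DEBUG d => x_1

   the debug bind is outside LOOP, so its value is reset below with
   gimple_debug_bind_reset_value; the variable d then shows as
   unavailable in the debugger instead of showing a wrong value.  */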
4116
4117 static void
4118 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
4119 {
4120 ssa_op_iter op_iter;
4121 imm_use_iterator imm_iter;
4122 def_operand_p def_p;
4123 gimple ustmt;
4124
4125 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
4126 {
4127 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
4128 {
4129 basic_block bb;
4130
4131 if (!is_gimple_debug (ustmt))
4132 continue;
4133
4134 bb = gimple_bb (ustmt);
4135
4136 if (!flow_bb_inside_loop_p (loop, bb))
4137 {
4138 if (gimple_debug_bind_p (ustmt))
4139 {
4140 if (vect_print_dump_info (REPORT_DETAILS))
4141 fprintf (vect_dump, "killing debug use");
4142
4143 gimple_debug_bind_reset_value (ustmt);
4144 update_stmt (ustmt);
4145 }
4146 else
4147 gcc_unreachable ();
4148 }
4149 }
4150 }
4151 }
4152
4153 /* Function vect_transform_loop.
4154
4155 The analysis phase has determined that the loop is vectorizable.
4156 Vectorize the loop - create vectorized stmts to replace the scalar
4157 stmts in the loop, and update the loop exit condition. */
4158
4159 void
4160 vect_transform_loop (loop_vec_info loop_vinfo)
4161 {
4162 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4163 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
4164 int nbbs = loop->num_nodes;
4165 gimple_stmt_iterator si;
4166 int i;
4167 tree ratio = NULL;
4168 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
4169 bool strided_store;
4170 bool slp_scheduled = false;
4171 unsigned int nunits;
4172 tree cond_expr = NULL_TREE;
4173 gimple_seq cond_expr_stmt_list = NULL;
4174 bool do_peeling_for_loop_bound;
4175
4176 if (vect_print_dump_info (REPORT_DETAILS))
4177 fprintf (vect_dump, "=== vect_transform_loop ===");
4178
4179 /* Peel the loop if there are data refs with unknown alignment.
4180 Only one data ref with unknown alignment is allowed. */
4181
4182 if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
4183 vect_do_peeling_for_alignment (loop_vinfo);
4184
4185 do_peeling_for_loop_bound
4186 = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4187 || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4188 && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0));
4189
4190 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
4191 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4192 vect_loop_versioning (loop_vinfo,
4193 !do_peeling_for_loop_bound,
4194 &cond_expr, &cond_expr_stmt_list);
4195
4196 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
4197 compile time constant), or it is a constant that is not divisible by the
4198 vectorization factor, then an epilog loop needs to be created.
4199 We therefore duplicate the loop: the original loop will be vectorized,
4200 and will compute the first (n/VF) iterations. The second copy of the loop
4201 will remain scalar and will compute the remaining (n%VF) iterations.
4202 (VF is the vectorization factor). */
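
/* E.g. (a sketch):

     for (i = 0; i < (n / VF) * VF; i += VF)
       vectorized body;          <-- first copy, vectorized
     for ( ; i < n; i++)
       scalar body;              <-- epilog, remains scalar  */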
4203
4204 if (do_peeling_for_loop_bound)
4205 vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
4206 cond_expr, cond_expr_stmt_list);
4207 else
4208 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
4209 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
4210
4211 /* 1) Make sure the loop header has exactly two entries
4212 2) Make sure we have a preheader basic block. */
4213
4214 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
4215
4216 split_edge (loop_preheader_edge (loop));
4217
4218 /* FORNOW: the vectorizer supports only loops whose body consists
4219 of one basic block (header + empty latch). When the vectorizer
4220 supports more involved loop forms, the order in which the BBs are
4221 traversed will need to be reconsidered. */
4222
4223 for (i = 0; i < nbbs; i++)
4224 {
4225 basic_block bb = bbs[i];
4226 stmt_vec_info stmt_info;
4227 gimple phi;
4228
4229 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
4230 {
4231 phi = gsi_stmt (si);
4232 if (vect_print_dump_info (REPORT_DETAILS))
4233 {
4234 fprintf (vect_dump, "------>vectorizing phi: ");
4235 print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM);
4236 }
4237 stmt_info = vinfo_for_stmt (phi);
4238 if (!stmt_info)
4239 continue;
4240
4241 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4242 && !STMT_VINFO_LIVE_P (stmt_info))
4243 {
4244 if (MAY_HAVE_DEBUG_STMTS)
4245 vect_loop_kill_debug_uses (loop, phi);
4246 continue;
4247 }
4248
4249 if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
4250 != (unsigned HOST_WIDE_INT) vectorization_factor)
4251 && vect_print_dump_info (REPORT_DETAILS))
4252 fprintf (vect_dump, "multiple-types.");
4253
4254 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
4255 {
4256 if (vect_print_dump_info (REPORT_DETAILS))
4257 fprintf (vect_dump, "transform phi.");
4258 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
4259 }
4260 }
4261
4262 for (si = gsi_start_bb (bb); !gsi_end_p (si);)
4263 {
4264 gimple stmt = gsi_stmt (si);
4265 bool is_store;
4266
4267 if (vect_print_dump_info (REPORT_DETAILS))
4268 {
4269 fprintf (vect_dump, "------>vectorizing statement: ");
4270 print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM);
4271 }
4272
4273 stmt_info = vinfo_for_stmt (stmt);
4274
4275 /* vector stmts created in the outer-loop during vectorization of
4276 stmts in an inner-loop may not have a stmt_info, and do not
4277 need to be vectorized. */
4278 if (!stmt_info)
4279 {
4280 gsi_next (&si);
4281 continue;
4282 }
4283
4284 if (!STMT_VINFO_RELEVANT_P (stmt_info)
4285 && !STMT_VINFO_LIVE_P (stmt_info))
4286 {
4287 if (MAY_HAVE_DEBUG_STMTS)
4288 vect_loop_kill_debug_uses (loop, stmt);
4289 gsi_next (&si);
4290 continue;
4291 }
4292
4293 gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
4294 nunits =
4295 (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
4296 if (!STMT_SLP_TYPE (stmt_info)
4297 && nunits != (unsigned int) vectorization_factor
4298 && vect_print_dump_info (REPORT_DETAILS))
4299 /* For SLP, VF is set according to the unrolling factor, and not to
4300 the vector size, hence for SLP this print is not valid. */
4301 fprintf (vect_dump, "multiple-types.");
4302
4303 /* SLP. Schedule all the SLP instances when the first SLP stmt is
4304 reached. */
4305 if (STMT_SLP_TYPE (stmt_info))
4306 {
4307 if (!slp_scheduled)
4308 {
4309 slp_scheduled = true;
4310
4311 if (vect_print_dump_info (REPORT_DETAILS))
4312 fprintf (vect_dump, "=== scheduling SLP instances ===");
4313
4314 vect_schedule_slp (loop_vinfo, NULL);
4315 }
4316
4317 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
4318 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
4319 {
4320 gsi_next (&si);
4321 continue;
4322 }
4323 }
4324
4325 /* -------- vectorize statement ------------ */
4326 if (vect_print_dump_info (REPORT_DETAILS))
4327 fprintf (vect_dump, "transform statement.");
4328
4329 strided_store = false;
4330 is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL);
4331 if (is_store)
4332 {
4333 if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
4334 {
4335 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
4336 interleaving chain was completed - free all the stores in
4337 the chain. */
4338 vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info));
4339 gsi_remove (&si, true);
4340 continue;
4341 }
4342 else
4343 {
4344 /* Free the attached stmt_vec_info and remove the stmt. */
4345 free_stmt_vec_info (stmt);
4346 gsi_remove (&si, true);
4347 continue;
4348 }
4349 }
4350 gsi_next (&si);
4351 } /* stmts in BB */
4352 } /* BBs in loop */
4353
4354 slpeel_make_loop_iterate_ntimes (loop, ratio);
4355
4356 /* The memory tags and pointers in vectorized statements need to
4357 have their SSA forms updated. FIXME, why can't this be delayed
4358 until all the loops have been transformed? */
4359 update_ssa (TODO_update_ssa);
4360
4361 if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4362 fprintf (vect_dump, "LOOP VECTORIZED.");
4363 if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS))
4364 fprintf (vect_dump, "OUTER LOOP VECTORIZED.");
4365 }