gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2013 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "ggc.h"
  28 #include "tree.h"
  29 #include "basic-block.h"
  30 #include "gimple-pretty-print.h"
  31 #include "tree-flow.h"
  32 #include "tree-pass.h"
  33 #include "cfgloop.h"
  34 #include "expr.h"
  35 #include "recog.h"
  36 #include "optabs.h"
  37 #include "params.h"
  38 #include "diagnostic-core.h"
  39 #include "tree-chrec.h"
  40 #include "tree-scalar-evolution.h"
  41 #include "tree-vectorizer.h"
  42 #include "target.h"
  43
  44 /* Loop Vectorization Pass.
  45
  46    This pass tries to vectorize loops.
  47
  48    For example, the vectorizer transforms the following simple loop:
  49
  50         short a[N]; short b[N]; short c[N]; int i;
  51
  52         for (i=0; i<N; i++){
  53           a[i] = b[i] + c[i];
  54         }
  55
  56    as if it was manually vectorized by rewriting the source code into:
  57
  58         typedef int __attribute__((mode(V8HI))) v8hi;
  59         short a[N];  short b[N]; short c[N];   int i;
  60         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  61         v8hi va, vb, vc;
  62
  63         for (i=0; i<N/8; i++){
  64           vb = pb[i];
  65           vc = pc[i];
  66           va = vb + vc;
  67           pa[i] = va;
  68         }
  69
  70         The main entry to this pass is vectorize_loops(), in which
  71    the vectorizer applies a set of analyses on a given set of loops,
  72    followed by the actual vectorization transformation for the loops that
  73    had successfully passed the analysis phase.
  74         Throughout this pass we make a distinction between two types of
  75    data: scalars (which are represented by SSA_NAMES), and memory references
  76    ("data-refs").  These two types of data require different handling both
  77    during analysis and transformation. The types of data-refs that the
   78    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  79    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  80    accesses are required to have a simple (consecutive) access pattern.
  81
  82    Analysis phase:
  83    ===============
  84         The driver for the analysis phase is vect_analyze_loop().
  85    It applies a set of analyses, some of which rely on the scalar evolution
  86    analyzer (scev) developed by Sebastian Pop.
  87
  88         During the analysis phase the vectorizer records some information
  89    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  90    loop, as well as general information about the loop as a whole, which is
  91    recorded in a "loop_vec_info" struct attached to each loop.
  92
  93    Transformation phase:
  94    =====================
  95         The loop transformation phase scans all the stmts in the loop, and
  96    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
  97    the loop that needs to be vectorized.  It inserts the vector code sequence
  98    just before the scalar stmt S, and records a pointer to the vector code
  99    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 100    attached to S).  This pointer will be used for the vectorization of following
 101    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 102    otherwise, we rely on dead code elimination for removing it.
 103
 104         For example, say stmt S1 was vectorized into stmt VS1:
 105
 106    VS1: vb = px[i];
 107    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 108    S2:  a = b;
 109
 110    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 111    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 112    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 113    resulting sequence would be:
 114
 115    VS1: vb = px[i];
 116    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 117    VS2: va = vb;
 118    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 119
 120         Operands that are not SSA_NAMEs, are data-refs that appear in
 121    load/store operations (like 'x[i]' in S1), and are handled differently.
 122
 123    Target modeling:
 124    =================
 125         Currently the only target specific information that is used is the
 126    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 127    Targets that can support different sizes of vectors, for now will need
 128    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 129    flexibility will be added in the future.
 130
  131         Since we only vectorize operations whose vector form can be
  132    expressed using existing tree codes, to verify that an operation is
  133    supported, the vectorizer checks the relevant optab at the relevant
  134    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 135    the value found is CODE_FOR_nothing, then there's no target support, and
 136    we can't vectorize the stmt.
 137
 138    For additional information on this project see:
 139    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 140 */
 141
 142 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 143
  144 /* Function vect_determine_vectorization_factor
  145
  146    Determine the vectorization factor (VF).  VF is the number of data elements
  147    that are operated upon in parallel in a single iteration of the vectorized
  148    loop.  For example, when vectorizing a loop that operates on 4byte elements,
  149    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
  150    elements can fit in a single vector register.
  151
  152    Each relevant PHI, and each relevant or live stmt (or the pattern stmt and
  153    pattern def stmts analyzed in its place), gets a vectype recorded in
  154    STMT_VINFO_VECTYPE.  VF is the largest subpart count among the PHI vectypes
  155    and the vectypes of each stmt's smallest scalar type.  Return false if a
  156    vectype cannot be found, a stmt has no lhs or already has a vector mode,
  157    those two vectypes of a stmt differ in mode size, or the final VF is <= 1.
  158    On success VF is stored in LOOP_VINFO_VECT_FACTOR and true is returned.
  159    VF is also the factor by which the loop iterations are strip-mined, e.g.:
  160    original loop:
  161         for (i=0; i<N; i++){
  162           a[i] = b[i] + c[i];
  163         }
  164    vectorized loop:
  165         for (i=0; i<N; i+=VF){
  166           a[i:VF] = b[i:VF] + c[i:VF];
  167         }  */
  168
  169 static bool
  170 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
  171 {
  172   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  173   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  174   int nbbs = loop->num_nodes;
  175   gimple_stmt_iterator si;
  176   unsigned int vectorization_factor = 0;
  177   tree scalar_type;
  178   gimple phi;
  179   tree vectype;
  180   unsigned int nunits;
  181   stmt_vec_info stmt_info;
  182   int i;
  183   HOST_WIDE_INT dummy;  /* Receives outputs that are never read.  */
  184   gimple stmt, pattern_stmt = NULL;
  185   gimple_seq pattern_def_seq = NULL;
  186   gimple_stmt_iterator pattern_def_si = gsi_none ();
  187   bool analyze_pattern_stmt = false;  /* Pattern stmt still to visit.  */
  188
  189   if (dump_enabled_p ())
  190     dump_printf_loc (MSG_NOTE, vect_location,
  191                      "=== vect_determine_vectorization_factor ===");
  192
  193   for (i = 0; i < nbbs; i++)
  194     {
  195       basic_block bb = bbs[i];
  196       /* PHIs first; only the relevant ones take part.  */
  197       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
  198         {
  199           phi = gsi_stmt (si);
  200           stmt_info = vinfo_for_stmt (phi);
  201           if (dump_enabled_p ())
  202             {
  203               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
  204               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
  205             }
  206
  207           gcc_assert (stmt_info);
  208
  209           if (STMT_VINFO_RELEVANT_P (stmt_info))
  210             {
  211               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
  212               scalar_type = TREE_TYPE (PHI_RESULT (phi));
  213
  214               if (dump_enabled_p ())
  215                 {
  216                   dump_printf_loc (MSG_NOTE, vect_location,
  217                                    "get vectype for scalar type:  ");
  218                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  219                 }
  220
  221               vectype = get_vectype_for_scalar_type (scalar_type);
  222               if (!vectype)
  223                 {
  224                   if (dump_enabled_p ())
  225                     {
  226                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  227                                        "not vectorized: unsupported "
  228                                        "data-type ");
  229                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  230                                          scalar_type);
  231                     }
  232                   return false;
  233                 }
  234               STMT_VINFO_VECTYPE (stmt_info) = vectype;
  235
  236               if (dump_enabled_p ())
  237                 {
  238                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  239                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
  240                 }
  241
  242               nunits = TYPE_VECTOR_SUBPARTS (vectype);
  243               if (dump_enabled_p ())
  244                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
  245
  246               if (!vectorization_factor
  247                   || (nunits > vectorization_factor))
  248                 vectorization_factor = nunits;
  249             }
  250         }
  251       /* Then the stmts.  One stmt may be visited over several iterations.  */
  252       for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
  253         {
  254           tree vf_vectype;
  255
  256           if (analyze_pattern_stmt)
  257             stmt = pattern_stmt;
  258           else
  259             stmt = gsi_stmt (si);
  260
  261           stmt_info = vinfo_for_stmt (stmt);
  262
  263           if (dump_enabled_p ())
  264             {
  265               dump_printf_loc (MSG_NOTE, vect_location,
  266                                "==> examining statement: ");
  267               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
  268             }
  269
  270           gcc_assert (stmt_info);
  271
  272           /* Skip stmts which do not need to be vectorized.  */
  273           if (!STMT_VINFO_RELEVANT_P (stmt_info)
  274               && !STMT_VINFO_LIVE_P (stmt_info))
  275             {
  276               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
  277                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
  278                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
  279                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
  280                 {
  281                   stmt = pattern_stmt;
  282                   stmt_info = vinfo_for_stmt (pattern_stmt);
  283                   if (dump_enabled_p ())
  284                     {
  285                       dump_printf_loc (MSG_NOTE, vect_location,
  286                                        "==> examining pattern statement: ");
  287                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
  288                     }
  289                 }
  290               else
  291                 {
  292                   if (dump_enabled_p ())
  293                     dump_printf_loc (MSG_NOTE, vect_location, "skip.");
  294                   gsi_next (&si);
  295                   continue;
  296                 }
  297             }
  298           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
  299                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
  300                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
  301                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
  302             analyze_pattern_stmt = true;  /* STMT now, pattern stmt next.  */
  303
  304           /* If a pattern statement has def stmts, analyze them too.  */
  305           if (is_pattern_stmt_p (stmt_info))
  306             {
  307               if (pattern_def_seq == NULL)
  308                 {
  309                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
  310                   pattern_def_si = gsi_start (pattern_def_seq);
  311                 }
  312               else if (!gsi_end_p (pattern_def_si))
  313                 gsi_next (&pattern_def_si);
  314               if (pattern_def_seq != NULL)
  315                 {
  316                   gimple pattern_def_stmt = NULL;
  317                   stmt_vec_info pattern_def_stmt_info = NULL;
  318                   /* Find the next relevant or live def stmt, if any.  */
  319                   while (!gsi_end_p (pattern_def_si))
  320                     {
  321                       pattern_def_stmt = gsi_stmt (pattern_def_si);
  322                       pattern_def_stmt_info
  323                         = vinfo_for_stmt (pattern_def_stmt);
  324                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
  325                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
  326                         break;
  327                       gsi_next (&pattern_def_si);
  328                     }
  329
  330                   if (!gsi_end_p (pattern_def_si))
  331                     {
  332                       if (dump_enabled_p ())
  333                         {
  334                           dump_printf_loc (MSG_NOTE, vect_location,
  335                                            "==> examining pattern def stmt: ");
  336                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
  337                                             pattern_def_stmt, 0);
  338                         }
  339
  340                       stmt = pattern_def_stmt;
  341                       stmt_info = pattern_def_stmt_info;
  342                     }
  343                   else
  344                     {
  345                       pattern_def_si = gsi_none ();
  346                       analyze_pattern_stmt = false;
  347                     }
  348                 }
  349               else
  350                 analyze_pattern_stmt = false;
  351             }
  352           /* A stmt without an lhs is not handled.  */
  353           if (gimple_get_lhs (stmt) == NULL_TREE)
  354             {
  355               if (dump_enabled_p ())
  356                 {
  357                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  358                                    "not vectorized: irregular stmt.");
  359                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
  360                                     0);
  361                 }
  362               return false;
  363             }
  364           /* Neither is a stmt whose type already has a vector mode.  */
  365           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
  366             {
  367               if (dump_enabled_p ())
  368                 {
  369                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  370                                    "not vectorized: vector stmt in loop:");
  371                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
  372                 }
  373               return false;
  374             }
  375           /* Get VECTYPE, the vector type recorded for the stmt itself.  */
  376           if (STMT_VINFO_VECTYPE (stmt_info))
  377             {
  378               /* The only case when a vectype had been already set is for stmts
  379                  that contain a dataref, or for "pattern-stmts" (stmts
  380                  generated by the vectorizer to represent/replace a certain
  381                  idiom).  */
  382               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
  383                           || is_pattern_stmt_p (stmt_info)
  384                           || !gsi_end_p (pattern_def_si));
  385               vectype = STMT_VINFO_VECTYPE (stmt_info);
  386             }
  387           else
  388             {
  389               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
  390               scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
  391               if (dump_enabled_p ())
  392                 {
  393                   dump_printf_loc (MSG_NOTE, vect_location,
  394                                    "get vectype for scalar type:  ");
  395                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  396                 }
  397               vectype = get_vectype_for_scalar_type (scalar_type);
  398               if (!vectype)
  399                 {
  400                   if (dump_enabled_p ())
  401                     {
  402                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  403                                        "not vectorized: unsupported "
  404                                        "data-type ");
  405                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  406                                          scalar_type);
  407                     }
  408                   return false;
  409                 }
  410
  411               STMT_VINFO_VECTYPE (stmt_info) = vectype;
  412             }
  413
  414           /* The vectorization factor is according to the smallest
  415              scalar type (or the largest vector size, but we only
  416              support one vector size per loop).  */
  417           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
  418                                                        &dummy);
  419           if (dump_enabled_p ())
  420             {
  421               dump_printf_loc (MSG_NOTE, vect_location,
  422                                "get vectype for scalar type:  ");
  423               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  424             }
  425           vf_vectype = get_vectype_for_scalar_type (scalar_type);
  426           if (!vf_vectype)
  427             {
  428               if (dump_enabled_p ())
  429                 {
  430                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  431                                    "not vectorized: unsupported data-type ");
  432                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  433                                      scalar_type);
  434                 }
  435               return false;
  436             }
  437           /* Both vectypes must have the same mode size.  */
  438           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
  439                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
  440             {
  441               if (dump_enabled_p ())
  442                 {
  443                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  444                                    "not vectorized: different sized vector "
  445                                    "types in statement, ");
  446                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  447                                      vectype);
  448                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
  449                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  450                                      vf_vectype);
  451                 }
  452               return false;
  453             }
  454
  455           if (dump_enabled_p ())
  456             {
  457               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  458               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
  459             }
  460
  461           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
  462           if (dump_enabled_p ())
  463             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d", nunits);
  464           if (!vectorization_factor
  465               || (nunits > vectorization_factor))
  466             vectorization_factor = nunits;
  467           /* Advance SI only when no pattern (def) stmt is still pending.  */
  468           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
  469             {
  470               pattern_def_seq = NULL;
  471               gsi_next (&si);
  472             }
  473         }
  474     }
  475
  476   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
  477   if (dump_enabled_p ())
  478     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d",
  479                      vectorization_factor);
  480   if (vectorization_factor <= 1)  /* Still 0 if nothing set it.  */
  481     {
  482       if (dump_enabled_p ())
  483         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  484                          "not vectorized: unsupported data-type");
  485       return false;
  486     }
  487   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  488
  489   return true;
  490 }
 491
 492
 493 /* Function vect_is_simple_iv_evolution.
 494
 495    FORNOW: A simple evolution of an induction variables in the loop is
 496    considered a polynomial evolution with constant step.  */
 497
 498 static bool
 499 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 500                              tree * step)
 501 {
 502   tree init_expr;
 503   tree step_expr;
 504   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 505
 506   /* When there is no evolution in this loop, the evolution function
 507      is not "simple".  */
 508   if (evolution_part == NULL_TREE)
 509     return false;
 510
 511   /* When the evolution is a polynomial of degree >= 2
 512      the evolution function is not "simple".  */
 513   if (tree_is_chrec (evolution_part))
 514     return false;
 515
 516   step_expr = evolution_part;
 517   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 518
 519   if (dump_enabled_p ())
 520     {
 521       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 522       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 523       dump_printf (MSG_NOTE, ",  init: ");
 524       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 525     }
 526
 527   *init = init_expr;
 528   *step = step_expr;
 529
 530   if (TREE_CODE (step_expr) != INTEGER_CST)
 531     {
 532       if (dump_enabled_p ())
 533         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 534                          "step unknown.");
 535       return false;
 536     }
 537
 538   return true;
 539 }
 540
 541 /* Function vect_analyze_scalar_cycles_1.
 542
 543    Examine the cross iteration def-use cycles of scalar variables
 544    in LOOP.  LOOP_VINFO represents the loop that is now being
 545    considered for vectorization (can be LOOP, or an outer-loop
 546    enclosing LOOP).  */
 547
 548 static void
 549 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 550 {
 551   basic_block bb = loop->header;
 552   tree dumy;
 553   vec<gimple> worklist;
 554   worklist.create (64);
 555   gimple_stmt_iterator gsi;
 556   bool double_reduc;
 557
 558   if (dump_enabled_p ())
 559     dump_printf_loc (MSG_NOTE, vect_location,
 560                      "=== vect_analyze_scalar_cycles ===");
 561
 562   /* First - identify all inductions.  Reduction detection assumes that all the
 563      inductions have been identified, therefore, this order must not be
 564      changed.  */
 565   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 566     {
 567       gimple phi = gsi_stmt (gsi);
 568       tree access_fn = NULL;
 569       tree def = PHI_RESULT (phi);
 570       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 571
 572       if (dump_enabled_p ())
 573         {
 574           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 575           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 576         }
 577
 578       /* Skip virtual phi's.  The data dependences that are associated with
 579          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 580       if (virtual_operand_p (def))
 581         continue;
 582
 583       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 584
 585       /* Analyze the evolution function.  */
 586       access_fn = analyze_scalar_evolution (loop, def);
 587       if (access_fn)
 588         {
 589           STRIP_NOPS (access_fn);
 590           if (dump_enabled_p ())
 591             {
 592               dump_printf_loc (MSG_NOTE, vect_location,
 593                                "Access function of PHI: ");
 594               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 595             }
 596           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 597             = evolution_part_in_loop_num (access_fn, loop->num);
 598         }
 599
 600       if (!access_fn
 601           || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy))
 602         {
 603           worklist.safe_push (phi);
 604           continue;
 605         }
 606
 607       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 608
 609       if (dump_enabled_p ())
 610         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.");
 611       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 612     }
 613
 614
 615   /* Second - identify all reductions and nested cycles.  */
 616   while (worklist.length () > 0)
 617     {
 618       gimple phi = worklist.pop ();
 619       tree def = PHI_RESULT (phi);
 620       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 621       gimple reduc_stmt;
 622       bool nested_cycle;
 623
 624       if (dump_enabled_p ())
 625         {
 626           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 627           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 628         }
 629
 630       gcc_assert (!virtual_operand_p (def)
 631                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 632
 633       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 634       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 635                                                 &double_reduc);
 636       if (reduc_stmt)
 637         {
 638           if (double_reduc)
 639             {
 640               if (dump_enabled_p ())
 641                 dump_printf_loc (MSG_NOTE, vect_location,
 642                                  "Detected double reduction.");
 643
 644               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 645               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 646                                                     vect_double_reduction_def;
 647             }
 648           else
 649             {
 650               if (nested_cycle)
 651                 {
 652                   if (dump_enabled_p ())
 653                     dump_printf_loc (MSG_NOTE, vect_location,
 654                                      "Detected vectorizable nested cycle.");
 655
 656                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 657                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 658                                                              vect_nested_cycle;
 659                 }
 660               else
 661                 {
 662                   if (dump_enabled_p ())
 663                     dump_printf_loc (MSG_NOTE, vect_location,
 664                                      "Detected reduction.");
 665
 666                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 667                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 668                                                            vect_reduction_def;
 669                   /* Store the reduction cycles for possible vectorization in
 670                      loop-aware SLP.  */
 671                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 672                 }
 673             }
 674         }
 675       else
 676         if (dump_enabled_p ())
 677           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 678                            "Unknown def-use cycle pattern.");
 679     }
 680
 681   worklist.release ();
 682 }
 683
 684
 685 /* Function vect_analyze_scalar_cycles.
 686
 687    Examine the cross iteration def-use cycles of scalar variables, by
 688    analyzing the loop-header PHIs of scalar variables.  Classify each
 689    cycle as one of the following: invariant, induction, reduction, unknown.
 690    We do that for the loop represented by LOOP_VINFO, and also to its
 691    inner-loop, if exists.
 692    Examples for scalar cycles:
 693
 694    Example1: reduction:
 695
 696               loop1:
 697               for (i=0; i<N; i++)
 698                  sum += a[i];
 699
 700    Example2: induction:
 701
 702               loop2:
 703               for (i=0; i<N; i++)
 704                  a[i] = i;  */
 705
 706 static void
 707 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 708 {
 709   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 710
 711   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 712
 713   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 714      Reductions in such inner-loop therefore have different properties than
 715      the reductions in the nest that gets vectorized:
 716      1. When vectorized, they are executed in the same order as in the original
 717         scalar loop, so we can't change the order of computation when
 718         vectorizing them.
 719      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 720         current checks are too strict.  */
 721
 722   if (loop->inner)
 723     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 724 }
 725
 726 /* Function vect_get_loop_niters.
 727
 728    Determine how many iterations the loop is executed.
 729    If an expression that represents the number of iterations
 730    can be constructed, place it in NUMBER_OF_ITERATIONS.
 731    Return the loop exit condition.  */
 732
 733 static gimple
 734 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations)
 735 {
 736   tree niters;
 737
 738   if (dump_enabled_p ())
 739     dump_printf_loc (MSG_NOTE, vect_location,
 740                      "=== get_loop_niters ===");
 741   niters = number_of_exit_cond_executions (loop);
 742
 743   if (niters != NULL_TREE
 744       && niters != chrec_dont_know)
 745     {
 746       *number_of_iterations = niters;
 747
 748       if (dump_enabled_p ())
 749         {
 750           dump_printf_loc (MSG_NOTE, vect_location, "==> get_loop_niters:");
 751           dump_generic_expr (MSG_NOTE, TDF_SLIM, *number_of_iterations);
 752         }
 753     }
 754
 755   return get_loop_exit_condition (loop);
 756 }
 757
 758
 759 /* Function bb_in_loop_p
 760
 761    Used as predicate for dfs order traversal of the loop bbs.  */
 762
 763 static bool
 764 bb_in_loop_p (const_basic_block bb, const void *data)
 765 {
 766   const struct loop *const loop = (const struct loop *)data;
 767   if (flow_bb_inside_loop_p (loop, bb))
 768     return true;
 769   return false;
 770 }
 771
 772
 773 /* Function new_loop_vec_info.
 774
 775    Create and initialize a new loop_vec_info struct for LOOP, as well as
 776    stmt_vec_info structs for all the stmts in LOOP.  */
 777
 778 static loop_vec_info
 779 new_loop_vec_info (struct loop *loop)
 780 {
 781   loop_vec_info res;
 782   basic_block *bbs;
 783   gimple_stmt_iterator si;
 784   unsigned int i, nbbs;
 785
 786   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 787   LOOP_VINFO_LOOP (res) = loop;
 788
 789   bbs = get_loop_body (loop);
 790
 791   /* Create/Update stmt_info for all stmts in the loop.  */
 792   for (i = 0; i < loop->num_nodes; i++)
 793     {
 794       basic_block bb = bbs[i];
 795
 796       /* BBs in a nested inner-loop will have been already processed (because
 797          we will have called vect_analyze_loop_form for any nested inner-loop).
 798          Therefore, for stmts in an inner-loop we just want to update the
 799          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 800          loop_info of the outer-loop we are currently considering to vectorize
 801          (instead of the loop_info of the inner-loop).
 802          For stmts in other BBs we need to create a stmt_info from scratch.  */
 803       if (bb->loop_father != loop)
 804         {
 805           /* Inner-loop bb.  */
 806           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 807           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 808             {
 809               gimple phi = gsi_stmt (si);
 810               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 811               loop_vec_info inner_loop_vinfo =
 812                 STMT_VINFO_LOOP_VINFO (stmt_info);
 813               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 814               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 815             }
 816           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 817            {
 818               gimple stmt = gsi_stmt (si);
 819               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 820               loop_vec_info inner_loop_vinfo =
 821                  STMT_VINFO_LOOP_VINFO (stmt_info);
 822               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 823               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 824            }
 825         }
 826       else
 827         {
 828           /* bb in current nest.  */
 829           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 830             {
 831               gimple phi = gsi_stmt (si);
 832               gimple_set_uid (phi, 0);
 833               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 834             }
 835
 836           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 837             {
 838               gimple stmt = gsi_stmt (si);
 839               gimple_set_uid (stmt, 0);
 840               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 841             }
 842         }
 843     }
 844
 845   /* CHECKME: We want to visit all BBs before their successors (except for
 846      latch blocks, for which this assertion wouldn't hold).  In the simple
 847      case of the loop forms we allow, a dfs order of the BBs would the same
 848      as reversed postorder traversal, so we are safe.  */
 849
 850    free (bbs);
 851    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 852    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 853                               bbs, loop->num_nodes, loop);
 854    gcc_assert (nbbs == loop->num_nodes);
 855
 856   LOOP_VINFO_BBS (res) = bbs;
 857   LOOP_VINFO_NITERS (res) = NULL;
 858   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 859   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 860   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 861   LOOP_PEELING_FOR_ALIGNMENT (res) = 0;
 862   LOOP_VINFO_VECT_FACTOR (res) = 0;
 863   LOOP_VINFO_LOOP_NEST (res).create (3);
 864   LOOP_VINFO_DATAREFS (res).create (10);
 865   LOOP_VINFO_DDRS (res).create (10 * 10);
 866   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 867   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
 868              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 869   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
 870              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 871   LOOP_VINFO_GROUPED_STORES (res).create (10);
 872   LOOP_VINFO_REDUCTIONS (res).create (10);
 873   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
 874   LOOP_VINFO_SLP_INSTANCES (res).create (10);
 875   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 876   LOOP_VINFO_PEELING_HTAB (res) = NULL;
 877   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
 878   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 879   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
 880
 881   return res;
 882 }
 883
 884
 885 /* Function destroy_loop_vec_info.
 886
 887    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 888    stmts in the loop.  */
 889
 890 void
 891 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 892 {
 893   struct loop *loop;
 894   basic_block *bbs;
 895   int nbbs;
 896   gimple_stmt_iterator si;
 897   int j;
 898   vec<slp_instance> slp_instances;
 899   slp_instance instance;
 900   bool swapped;
 901
 902   if (!loop_vinfo)
 903     return;
 904
 905   loop = LOOP_VINFO_LOOP (loop_vinfo);
 906
 907   bbs = LOOP_VINFO_BBS (loop_vinfo);
 908   nbbs = loop->num_nodes;
 909   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
 910
 911   if (!clean_stmts)
 912     {
 913       free (LOOP_VINFO_BBS (loop_vinfo));
 914       free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 915       free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 916       LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
 917       LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
 918       LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
 919
 920       free (loop_vinfo);
 921       loop->aux = NULL;
 922       return;
 923     }
 924
 925   for (j = 0; j < nbbs; j++)
 926     {
 927       basic_block bb = bbs[j];
 928       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 929         free_stmt_vec_info (gsi_stmt (si));
 930
 931       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 932         {
 933           gimple stmt = gsi_stmt (si);
 934
 935           /* We may have broken canonical form by moving a constant
 936              into RHS1 of a commutative op.  Fix such occurrences.  */
 937           if (swapped && is_gimple_assign (stmt))
 938             {
 939               enum tree_code code = gimple_assign_rhs_code (stmt);
 940
 941               if ((code == PLUS_EXPR
 942                    || code == POINTER_PLUS_EXPR
 943                    || code == MULT_EXPR)
 944                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
 945                 swap_tree_operands (stmt,
 946                                     gimple_assign_rhs1_ptr (stmt),
 947                                     gimple_assign_rhs2_ptr (stmt));
 948             }
 949
 950           /* Free stmt_vec_info.  */
 951           free_stmt_vec_info (stmt);
 952           gsi_next (&si);
 953         }
 954     }
 955
 956   free (LOOP_VINFO_BBS (loop_vinfo));
 957   free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo));
 958   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
 959   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
 960   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
 961   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
 962   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
 963   FOR_EACH_VEC_ELT (slp_instances, j, instance)
 964     vect_free_slp_instance (instance);
 965
 966   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
 967   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
 968   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
 969   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
 970
 971   if (LOOP_VINFO_PEELING_HTAB (loop_vinfo))
 972     htab_delete (LOOP_VINFO_PEELING_HTAB (loop_vinfo));
 973
 974   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
 975
 976   free (loop_vinfo);
 977   loop->aux = NULL;
 978 }
 979
 980
 981 /* Function vect_analyze_loop_1.
 982
 983    Apply a set of analyses on LOOP, and create a loop_vec_info struct
 984    for it. The different analyses will record information in the
 985    loop_vec_info struct.  This is a subset of the analyses applied in
 986    vect_analyze_loop, to be applied on an inner-loop nested in the loop
 987    that is now considered for (outer-loop) vectorization.  */
 988
 989 static loop_vec_info
 990 vect_analyze_loop_1 (struct loop *loop)
 991 {
 992   loop_vec_info loop_vinfo;
 993
 994   if (dump_enabled_p ())
 995     dump_printf_loc (MSG_NOTE, vect_location,
 996                      "===== analyze_loop_nest_1 =====");
 997
 998   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
 999
1000   loop_vinfo = vect_analyze_loop_form (loop);
1001   if (!loop_vinfo)
1002     {
1003       if (dump_enabled_p ())
1004         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1005                          "bad inner-loop form.");
1006       return NULL;
1007     }
1008
1009   return loop_vinfo;
1010 }
1011
1012
1013 /* Function vect_analyze_loop_form.
1014
1015    Verify that certain CFG restrictions hold, including:
1016    - the loop has a pre-header
1017    - the loop has a single entry and exit
1018    - the loop exit condition is simple enough, and the number of iterations
1019      can be analyzed (a countable loop).  */
1020
1021 loop_vec_info
1022 vect_analyze_loop_form (struct loop *loop)
1023 {
1024   loop_vec_info loop_vinfo;
1025   gimple loop_cond;
1026   tree number_of_iterations = NULL;
1027   loop_vec_info inner_loop_vinfo = NULL;
1028
1029   if (dump_enabled_p ())
1030     dump_printf_loc (MSG_NOTE, vect_location,
1031                      "=== vect_analyze_loop_form ===");
1032
1033   /* Different restrictions apply when we are considering an inner-most loop,
1034      vs. an outer (nested) loop.
1035      (FORNOW. May want to relax some of these restrictions in the future).  */
1036
1037   if (!loop->inner)
1038     {
1039       /* Inner-most loop.  We currently require that the number of BBs is
1040          exactly 2 (the header and latch).  Vectorizable inner-most loops
1041          look like this:
1042
1043                         (pre-header)
1044                            |
1045                           header <--------+
1046                            | |            |
1047                            | +--> latch --+
1048                            |
1049                         (exit-bb)  */
1050
1051       if (loop->num_nodes != 2)
1052         {
1053           if (dump_enabled_p ())
1054             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1055                              "not vectorized: control flow in loop.");
1056           return NULL;
1057         }
1058
1059       if (empty_block_p (loop->header))
1060     {
1061           if (dump_enabled_p ())
1062             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1063                              "not vectorized: empty loop.");
1064       return NULL;
1065     }
1066     }
1067   else
1068     {
1069       struct loop *innerloop = loop->inner;
1070       edge entryedge;
1071
1072       /* Nested loop. We currently require that the loop is doubly-nested,
1073          contains a single inner loop, and the number of BBs is exactly 5.
1074          Vectorizable outer-loops look like this:
1075
1076                         (pre-header)
1077                            |
1078                           header <---+
1079                            |         |
1080                           inner-loop |
1081                            |         |
1082                           tail ------+
1083                            |
1084                         (exit-bb)
1085
1086          The inner-loop has the properties expected of inner-most loops
1087          as described above.  */
1088
1089       if ((loop->inner)->inner || (loop->inner)->next)
1090         {
1091           if (dump_enabled_p ())
1092             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1093                              "not vectorized: multiple nested loops.");
1094           return NULL;
1095         }
1096
1097       /* Analyze the inner-loop.  */
1098       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1099       if (!inner_loop_vinfo)
1100         {
1101           if (dump_enabled_p ())
1102             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1103                              "not vectorized: Bad inner loop.");
1104           return NULL;
1105         }
1106
1107       if (!expr_invariant_in_loop_p (loop,
1108                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1109         {
1110           if (dump_enabled_p ())
1111             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1112                              "not vectorized: inner-loop count not invariant.");
1113           destroy_loop_vec_info (inner_loop_vinfo, true);
1114           return NULL;
1115         }
1116
1117       if (loop->num_nodes != 5)
1118         {
1119           if (dump_enabled_p ())
1120             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1121                              "not vectorized: control flow in loop.");
1122           destroy_loop_vec_info (inner_loop_vinfo, true);
1123           return NULL;
1124         }
1125
1126       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1127       entryedge = EDGE_PRED (innerloop->header, 0);
1128       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1129         entryedge = EDGE_PRED (innerloop->header, 1);
1130
1131       if (entryedge->src != loop->header
1132           || !single_exit (innerloop)
1133           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1134         {
1135           if (dump_enabled_p ())
1136             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1137                              "not vectorized: unsupported outerloop form.");
1138           destroy_loop_vec_info (inner_loop_vinfo, true);
1139           return NULL;
1140         }
1141
1142       if (dump_enabled_p ())
1143         dump_printf_loc (MSG_NOTE, vect_location,
1144                          "Considering outer-loop vectorization.");
1145     }
1146
1147   if (!single_exit (loop)
1148       || EDGE_COUNT (loop->header->preds) != 2)
1149     {
1150       if (dump_enabled_p ())
1151         {
1152           if (!single_exit (loop))
1153             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1154                              "not vectorized: multiple exits.");
1155           else if (EDGE_COUNT (loop->header->preds) != 2)
1156             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1157                              "not vectorized: too many incoming edges.");
1158         }
1159       if (inner_loop_vinfo)
1160         destroy_loop_vec_info (inner_loop_vinfo, true);
1161       return NULL;
1162     }
1163
1164   /* We assume that the loop exit condition is at the end of the loop. i.e,
1165      that the loop is represented as a do-while (with a proper if-guard
1166      before the loop if needed), where the loop header contains all the
1167      executable statements, and the latch is empty.  */
1168   if (!empty_block_p (loop->latch)
1169       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1170     {
1171       if (dump_enabled_p ())
1172         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1173                          "not vectorized: latch block not empty.");
1174       if (inner_loop_vinfo)
1175         destroy_loop_vec_info (inner_loop_vinfo, true);
1176       return NULL;
1177     }
1178
1179   /* Make sure there exists a single-predecessor exit bb:  */
1180   if (!single_pred_p (single_exit (loop)->dest))
1181     {
1182       edge e = single_exit (loop);
1183       if (!(e->flags & EDGE_ABNORMAL))
1184         {
1185           split_loop_exit_edge (e);
1186           if (dump_enabled_p ())
1187             dump_printf (MSG_NOTE, "split exit edge.");
1188         }
1189       else
1190         {
1191           if (dump_enabled_p ())
1192             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1193                              "not vectorized: abnormal loop exit edge.");
1194           if (inner_loop_vinfo)
1195             destroy_loop_vec_info (inner_loop_vinfo, true);
1196           return NULL;
1197         }
1198     }
1199
1200   loop_cond = vect_get_loop_niters (loop, &number_of_iterations);
1201   if (!loop_cond)
1202     {
1203       if (dump_enabled_p ())
1204         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1205                          "not vectorized: complicated exit condition.");
1206       if (inner_loop_vinfo)
1207         destroy_loop_vec_info (inner_loop_vinfo, true);
1208       return NULL;
1209     }
1210
1211   if (!number_of_iterations)
1212     {
1213       if (dump_enabled_p ())
1214         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1215                          "not vectorized: number of iterations cannot be "
1216                          "computed.");
1217       if (inner_loop_vinfo)
1218         destroy_loop_vec_info (inner_loop_vinfo, true);
1219       return NULL;
1220     }
1221
1222   if (chrec_contains_undetermined (number_of_iterations))
1223     {
1224       if (dump_enabled_p ())
1225             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1226                              "Infinite number of iterations.");
1227       if (inner_loop_vinfo)
1228         destroy_loop_vec_info (inner_loop_vinfo, true);
1229       return NULL;
1230     }
1231
1232   if (!NITERS_KNOWN_P (number_of_iterations))
1233     {
1234       if (dump_enabled_p ())
1235         {
1236           dump_printf_loc (MSG_NOTE, vect_location,
1237                            "Symbolic number of iterations is ");
1238           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1239         }
1240     }
1241   else if (TREE_INT_CST_LOW (number_of_iterations) == 0)
1242     {
1243       if (dump_enabled_p ())
1244         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1245                          "not vectorized: number of iterations = 0.");
1246       if (inner_loop_vinfo)
1247         destroy_loop_vec_info (inner_loop_vinfo, false);
1248       return NULL;
1249     }
1250
1251   loop_vinfo = new_loop_vec_info (loop);
1252   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1253   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1254
1255   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1256
1257   /* CHECKME: May want to keep it around it in the future.  */
1258   if (inner_loop_vinfo)
1259     destroy_loop_vec_info (inner_loop_vinfo, false);
1260
1261   gcc_assert (!loop->aux);
1262   loop->aux = loop_vinfo;
1263   return loop_vinfo;
1264 }
1265
1266
1267 /* Function vect_analyze_loop_operations.
1268
1269    Scan the loop stmts and make sure they are all vectorizable.  */
1270
1271 static bool
1272 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1273 {
1274   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1275   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1276   int nbbs = loop->num_nodes;
1277   gimple_stmt_iterator si;
1278   unsigned int vectorization_factor = 0;
1279   int i;
1280   gimple phi;
1281   stmt_vec_info stmt_info;
1282   bool need_to_vectorize = false;
1283   int min_profitable_iters;
1284   int min_scalar_loop_bound;
1285   unsigned int th;
1286   bool only_slp_in_loop = true, ok;
1287   HOST_WIDE_INT max_niter;
1288   HOST_WIDE_INT estimated_niter;
1289   int min_profitable_estimate;
1290
1291   if (dump_enabled_p ())
1292     dump_printf_loc (MSG_NOTE, vect_location,
1293                      "=== vect_analyze_loop_operations ===");
1294
1295   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1296   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1297   if (slp)
1298     {
1299       /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1300          vectorization factor of the loop is the unrolling factor required by
1301          the SLP instances.  If that unrolling factor is 1, we say, that we
1302          perform pure SLP on loop - cross iteration parallelism is not
1303          exploited.  */
1304       for (i = 0; i < nbbs; i++)
1305         {
1306           basic_block bb = bbs[i];
1307           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1308             {
1309               gimple stmt = gsi_stmt (si);
1310               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1311               gcc_assert (stmt_info);
1312               if ((STMT_VINFO_RELEVANT_P (stmt_info)
1313                    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1314                   && !PURE_SLP_STMT (stmt_info))
1315                 /* STMT needs both SLP and loop-based vectorization.  */
1316                 only_slp_in_loop = false;
1317             }
1318         }
1319
1320       if (only_slp_in_loop)
1321         vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1322       else
1323         vectorization_factor = least_common_multiple (vectorization_factor,
1324                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1325
1326       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1327       if (dump_enabled_p ())
1328         dump_printf_loc (MSG_NOTE, vect_location,
1329                          "Updating vectorization factor to %d ",
1330                          vectorization_factor);
1331     }
1332
1333   for (i = 0; i < nbbs; i++)
1334     {
1335       basic_block bb = bbs[i];
1336
1337       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1338         {
1339           phi = gsi_stmt (si);
1340           ok = true;
1341
1342           stmt_info = vinfo_for_stmt (phi);
1343           if (dump_enabled_p ())
1344             {
1345               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1346               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1347             }
1348
1349           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1350              (i.e., a phi in the tail of the outer-loop).  */
1351           if (! is_loop_header_bb_p (bb))
1352             {
1353               /* FORNOW: we currently don't support the case that these phis
1354                  are not used in the outerloop (unless it is double reduction,
1355                  i.e., this phi is vect_reduction_def), cause this case
1356                  requires to actually do something here.  */
1357               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1358                    || STMT_VINFO_LIVE_P (stmt_info))
1359                   && STMT_VINFO_DEF_TYPE (stmt_info)
1360                      != vect_double_reduction_def)
1361                 {
1362                   if (dump_enabled_p ())
1363                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1364                                      "Unsupported loop-closed phi in "
1365                                      "outer-loop.");
1366                   return false;
1367                 }
1368
1369               /* If PHI is used in the outer loop, we check that its operand
1370                  is defined in the inner loop.  */
1371               if (STMT_VINFO_RELEVANT_P (stmt_info))
1372                 {
1373                   tree phi_op;
1374                   gimple op_def_stmt;
1375
1376                   if (gimple_phi_num_args (phi) != 1)
1377                     return false;
1378
1379                   phi_op = PHI_ARG_DEF (phi, 0);
1380                   if (TREE_CODE (phi_op) != SSA_NAME)
1381                     return false;
1382
1383                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1384                   if (!op_def_stmt
1385                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1386                       || !vinfo_for_stmt (op_def_stmt))
1387                     return false;
1388
1389                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1390                         != vect_used_in_outer
1391                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1392                            != vect_used_in_outer_by_reduction)
1393                     return false;
1394                 }
1395
1396               continue;
1397             }
1398
1399           gcc_assert (stmt_info);
1400
1401           if (STMT_VINFO_LIVE_P (stmt_info))
1402             {
1403               /* FORNOW: not yet supported.  */
1404               if (dump_enabled_p ())
1405                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1406                                  "not vectorized: value used after loop.");
1407               return false;
1408             }
1409
1410           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1411               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1412             {
1413               /* A scalar-dependence cycle that we don't support.  */
1414               if (dump_enabled_p ())
1415                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1416                                  "not vectorized: scalar dependence cycle.");
1417               return false;
1418             }
1419
1420           if (STMT_VINFO_RELEVANT_P (stmt_info))
1421             {
1422               need_to_vectorize = true;
1423               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1424                 ok = vectorizable_induction (phi, NULL, NULL);
1425             }
1426
1427           if (!ok)
1428             {
1429               if (dump_enabled_p ())
1430                 {
1431                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1432                                    "not vectorized: relevant phi not "
1433                                    "supported: ");
1434                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1435                 }
1436               return false;
1437             }
1438         }
1439
1440       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1441         {
1442           gimple stmt = gsi_stmt (si);
1443           if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1444             return false;
1445         }
1446     } /* bbs */
1447
1448   /* All operations in the loop are either irrelevant (deal with loop
1449      control, or dead), or only used outside the loop and can be moved
1450      out of the loop (e.g. invariants, inductions).  The loop can be
1451      optimized away by scalar optimizations.  We're better off not
1452      touching this loop.  */
1453   if (!need_to_vectorize)
1454     {
1455       if (dump_enabled_p ())
1456         dump_printf_loc (MSG_NOTE, vect_location,
1457                          "All the computation can be taken out of the loop.");
1458       if (dump_enabled_p ())
1459         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1460                          "not vectorized: redundant loop. no profit to "
1461                          "vectorize.");
1462       return false;
1463     }
1464
1465   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1466     dump_printf_loc (MSG_NOTE, vect_location,
1467                      "vectorization_factor = %d, niters = "
1468                      HOST_WIDE_INT_PRINT_DEC, vectorization_factor,
1469                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1470
1471   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1472        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1473       || ((max_niter = max_stmt_executions_int (loop)) != -1
1474           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1475     {
1476       if (dump_enabled_p ())
1477         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1478                          "not vectorized: iteration count too small.");
1479       if (dump_enabled_p ())
1480         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1481                          "not vectorized: iteration count smaller than "
1482                          "vectorization factor.");
1483       return false;
1484     }
1485
1486   /* Analyze cost.  Decide if worth while to vectorize.  */
1487
1488   /* Once VF is set, SLP costs should be updated since the number of created
1489      vector stmts depends on VF.  */
1490   vect_update_slp_costs_according_to_vf (loop_vinfo);
1491
1492   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1493                                       &min_profitable_estimate);
1494   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1495
1496   if (min_profitable_iters < 0)
1497     {
1498       if (dump_enabled_p ())
1499         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1500                          "not vectorized: vectorization not profitable.");
1501       if (dump_enabled_p ())
1502         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1503                          "not vectorized: vector version will never be "
1504                          "profitable.");
1505       return false;
1506     }
1507
1508   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1509                             * vectorization_factor) - 1);
1510
1511
1512   /* Use the cost model only if it is more conservative than user specified
1513      threshold.  */
1514
1515   th = (unsigned) min_scalar_loop_bound;
1516   if (min_profitable_iters
1517       && (!min_scalar_loop_bound
1518           || min_profitable_iters > min_scalar_loop_bound))
1519     th = (unsigned) min_profitable_iters;
1520
1521   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1522       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1523     {
1524       if (dump_enabled_p ())
1525         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1526                          "not vectorized: vectorization not profitable.");
1527       if (dump_enabled_p ())
1528         dump_printf_loc (MSG_NOTE, vect_location,
1529                          "not vectorized: iteration count smaller than user "
1530                          "specified loop bound parameter or minimum profitable "
1531                          "iterations (whichever is more conservative).");
1532       return false;
1533     }
1534
1535   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1536       && ((unsigned HOST_WIDE_INT) estimated_niter
1537           <= MAX (th, (unsigned)min_profitable_estimate)))
1538     {
1539       if (dump_enabled_p ())
1540         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1541                          "not vectorized: estimated iteration count too "
1542                          "small.");
1543       if (dump_enabled_p ())
1544         dump_printf_loc (MSG_NOTE, vect_location,
1545                          "not vectorized: estimated iteration count smaller "
1546                          "than specified loop bound parameter or minimum "
1547                          "profitable iterations (whichever is more "
1548                          "conservative).");
1549       return false;
1550     }
1551
1552   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1553       || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0
1554       || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
1555     {
1556       if (dump_enabled_p ())
1557         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required.");
1558       if (!vect_can_advance_ivs_p (loop_vinfo))
1559         {
1560           if (dump_enabled_p ())
1561             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1562                              "not vectorized: can't create epilog loop 1.");
1563           return false;
1564         }
1565       if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1566         {
1567           if (dump_enabled_p ())
1568             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1569                              "not vectorized: can't create epilog loop 2.");
1570           return false;
1571         }
1572     }
1573
1574   return true;
1575 }
1576
1577
1578 /* Function vect_analyze_loop_2.
1579
1580    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1581    for it.  The different analyses will record information in the
1582    loop_vec_info struct.  */
1583 static bool
1584 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1585 {
1586   bool ok, slp = false;
1587   int max_vf = MAX_VECTORIZATION_FACTOR;
1588   int min_vf = 2;
1589
1590   /* Find all data references in the loop (which correspond to vdefs/vuses)
1591      and analyze their evolution in the loop.  Also adjust the minimal
1592      vectorization factor according to the loads and stores.
1593
1594      FORNOW: Handle only simple, array references, which
1595      alignment can be forced, and aligned pointer-references.  */
1596
1597   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf);
1598   if (!ok)
1599     {
1600       if (dump_enabled_p ())
1601         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1602                          "bad data references.");
1603       return false;
1604     }
1605
1606   /* Classify all cross-iteration scalar data-flow cycles.
1607      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1608
1609   vect_analyze_scalar_cycles (loop_vinfo);
1610
1611   vect_pattern_recog (loop_vinfo, NULL);
1612
1613   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1614
1615   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1616   if (!ok)
1617     {
1618       if (dump_enabled_p ())
1619         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1620                          "unexpected pattern.");
1621       return false;
1622     }
1623
1624   /* Analyze data dependences between the data-refs in the loop
1625      and adjust the maximum vectorization factor according to
1626      the dependences.
1627      FORNOW: fail at the first data dependence that we encounter.  */
1628
1629   ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL, &max_vf);
1630   if (!ok
1631       || max_vf < min_vf)
1632     {
1633       if (dump_enabled_p ())
1634             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1635                              "bad data dependence.");
1636       return false;
1637     }
1638
1639   ok = vect_determine_vectorization_factor (loop_vinfo);
1640   if (!ok)
1641     {
1642       if (dump_enabled_p ())
1643         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1644                          "can't determine vectorization factor.");
1645       return false;
1646     }
1647   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1648     {
1649       if (dump_enabled_p ())
1650         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1651                          "bad data dependence.");
1652       return false;
1653     }
1654
1655   /* Analyze the alignment of the data-refs in the loop.
1656      Fail if a data reference is found that cannot be vectorized.  */
1657
1658   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1659   if (!ok)
1660     {
1661       if (dump_enabled_p ())
1662         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1663                          "bad data alignment.");
1664       return false;
1665     }
1666
1667   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1668      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1669
1670   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1671   if (!ok)
1672     {
1673       if (dump_enabled_p ())
1674         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1675                          "bad data access.");
1676       return false;
1677     }
1678
1679   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1680      It is important to call pruning after vect_analyze_data_ref_accesses,
1681      since we use grouping information gathered by interleaving analysis.  */
1682   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1683   if (!ok)
1684     {
1685       if (dump_enabled_p ())
1686         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1687                          "too long list of versioning for alias "
1688                          "run-time tests.");
1689       return false;
1690     }
1691
1692   /* This pass will decide on using loop versioning and/or loop peeling in
1693      order to enhance the alignment of data references in the loop.  */
1694
1695   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1696   if (!ok)
1697     {
1698       if (dump_enabled_p ())
1699         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1700                          "bad data alignment.");
1701       return false;
1702     }
1703
1704   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1705   ok = vect_analyze_slp (loop_vinfo, NULL);
1706   if (ok)
1707     {
1708       /* Decide which possible SLP instances to SLP.  */
1709       slp = vect_make_slp_decision (loop_vinfo);
1710
1711       /* Find stmts that need to be both vectorized and SLPed.  */
1712       vect_detect_hybrid_slp (loop_vinfo);
1713     }
1714   else
1715     return false;
1716
1717   /* Scan all the operations in the loop and make sure they are
1718      vectorizable.  */
1719
1720   ok = vect_analyze_loop_operations (loop_vinfo, slp);
1721   if (!ok)
1722     {
1723       if (dump_enabled_p ())
1724         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1725                          "bad operation or unsupported loop bound.");
1726       return false;
1727     }
1728
1729   return true;
1730 }
1731
1732 /* Function vect_analyze_loop.
1733
1734    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1735    for it.  The different analyses will record information in the
1736    loop_vec_info struct.  */
1737 loop_vec_info
1738 vect_analyze_loop (struct loop *loop)
1739 {
1740   loop_vec_info loop_vinfo;
1741   unsigned int vector_sizes;
1742
1743   /* Autodetect first vector size we try.  */
1744   current_vector_size = 0;
1745   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1746
1747   if (dump_enabled_p ())
1748     dump_printf_loc (MSG_NOTE, vect_location,
1749                      "===== analyze_loop_nest =====");
1750
1751   if (loop_outer (loop)
1752       && loop_vec_info_for_loop (loop_outer (loop))
1753       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1754     {
1755       if (dump_enabled_p ())
1756         dump_printf_loc (MSG_NOTE, vect_location,
1757                          "outer-loop already vectorized.");
1758       return NULL;
1759     }
1760
1761   while (1)
1762     {
1763       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1764       loop_vinfo = vect_analyze_loop_form (loop);
1765       if (!loop_vinfo)
1766         {
1767           if (dump_enabled_p ())
1768             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1769                              "bad loop form.");
1770           return NULL;
1771         }
1772
1773       if (vect_analyze_loop_2 (loop_vinfo))
1774         {
1775           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1776
1777           return loop_vinfo;
1778         }
1779
1780       destroy_loop_vec_info (loop_vinfo, true);
1781
1782       vector_sizes &= ~current_vector_size;
1783       if (vector_sizes == 0
1784           || current_vector_size == 0)
1785         return NULL;
1786
1787       /* Try the next biggest vector size.  */
1788       current_vector_size = 1 << floor_log2 (vector_sizes);
1789       if (dump_enabled_p ())
1790         dump_printf_loc (MSG_NOTE, vect_location,
1791                          "***** Re-trying analysis with "
1792                          "vector size %d\n", current_vector_size);
1793     }
1794 }
1795
1796
1797 /* Function reduction_code_for_scalar_code
1798
1799    Input:
1800    CODE - tree_code of a reduction operations.
1801
1802    Output:
1803    REDUC_CODE - the corresponding tree-code to be used to reduce the
1804       vector of partial results into a single scalar result (which
1805       will also reside in a vector) or ERROR_MARK if the operation is
1806       a supported reduction operation, but does not have such tree-code.
1807
1808    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1809
1810 static bool
1811 reduction_code_for_scalar_code (enum tree_code code,
1812                                 enum tree_code *reduc_code)
1813 {
1814   switch (code)
1815     {
1816       case MAX_EXPR:
1817         *reduc_code = REDUC_MAX_EXPR;
1818         return true;
1819
1820       case MIN_EXPR:
1821         *reduc_code = REDUC_MIN_EXPR;
1822         return true;
1823
1824       case PLUS_EXPR:
1825         *reduc_code = REDUC_PLUS_EXPR;
1826         return true;
1827
1828       case MULT_EXPR:
1829       case MINUS_EXPR:
1830       case BIT_IOR_EXPR:
1831       case BIT_XOR_EXPR:
1832       case BIT_AND_EXPR:
1833         *reduc_code = ERROR_MARK;
1834         return true;
1835
1836       default:
1837        return false;
1838     }
1839 }
1840
1841
1842 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
1843    STMT is printed with a message MSG. */
1844
1845 static void
1846 report_vect_op (int msg_type, gimple stmt, const char *msg)
1847 {
1848   dump_printf_loc (msg_type, vect_location, "%s", msg);
1849   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1850 }
1851
1852
1853 /* Detect SLP reduction of the form:
1854
1855    #a1 = phi <a5, a0>
1856    a2 = operation (a1)
1857    a3 = operation (a2)
1858    a4 = operation (a3)
1859    a5 = operation (a4)
1860
1861    #a = phi <a5>
1862
1863    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1864    FIRST_STMT is the first reduction stmt in the chain
1865    (a2 = operation (a1)).
1866
1867    Return TRUE if a reduction chain was detected.  */
1868
1869 static bool
1870 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1871 {
1872   struct loop *loop = (gimple_bb (phi))->loop_father;
1873   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1874   enum tree_code code;
1875   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1876   stmt_vec_info use_stmt_info, current_stmt_info;
1877   tree lhs;
1878   imm_use_iterator imm_iter;
1879   use_operand_p use_p;
1880   int nloop_uses, size = 0, n_out_of_loop_uses;
1881   bool found = false;
1882
1883   if (loop != vect_loop)
1884     return false;
1885
1886   lhs = PHI_RESULT (phi);
1887   code = gimple_assign_rhs_code (first_stmt);
1888   while (1)
1889     {
1890       nloop_uses = 0;
1891       n_out_of_loop_uses = 0;
1892       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1893         {
1894           gimple use_stmt = USE_STMT (use_p);
1895           if (is_gimple_debug (use_stmt))
1896             continue;
1897
1898           use_stmt = USE_STMT (use_p);
1899
1900           /* Check if we got back to the reduction phi.  */
1901           if (use_stmt == phi)
1902             {
1903               loop_use_stmt = use_stmt;
1904               found = true;
1905               break;
1906             }
1907
1908           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
1909             {
1910               if (vinfo_for_stmt (use_stmt)
1911                   && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
1912                 {
1913                   loop_use_stmt = use_stmt;
1914                   nloop_uses++;
1915                 }
1916             }
1917            else
1918              n_out_of_loop_uses++;
1919
1920            /* There are can be either a single use in the loop or two uses in
1921               phi nodes.  */
1922            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
1923              return false;
1924         }
1925
1926       if (found)
1927         break;
1928
1929       /* We reached a statement with no loop uses.  */
1930       if (nloop_uses == 0)
1931         return false;
1932
1933       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
1934       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
1935         return false;
1936
1937       if (!is_gimple_assign (loop_use_stmt)
1938           || code != gimple_assign_rhs_code (loop_use_stmt)
1939           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
1940         return false;
1941
1942       /* Insert USE_STMT into reduction chain.  */
1943       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
1944       if (current_stmt)
1945         {
1946           current_stmt_info = vinfo_for_stmt (current_stmt);
1947           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
1948           GROUP_FIRST_ELEMENT (use_stmt_info)
1949             = GROUP_FIRST_ELEMENT (current_stmt_info);
1950         }
1951       else
1952         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
1953
1954       lhs = gimple_assign_lhs (loop_use_stmt);
1955       current_stmt = loop_use_stmt;
1956       size++;
1957    }
1958
1959   if (!found || loop_use_stmt != phi || size < 2)
1960     return false;
1961
1962   /* Swap the operands, if needed, to make the reduction operand be the second
1963      operand.  */
1964   lhs = PHI_RESULT (phi);
1965   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
1966   while (next_stmt)
1967     {
1968       if (gimple_assign_rhs2 (next_stmt) == lhs)
1969         {
1970           tree op = gimple_assign_rhs1 (next_stmt);
1971           gimple def_stmt = NULL;
1972
1973           if (TREE_CODE (op) == SSA_NAME)
1974             def_stmt = SSA_NAME_DEF_STMT (op);
1975
1976           /* Check that the other def is either defined in the loop
1977              ("vect_internal_def"), or it's an induction (defined by a
1978              loop-header phi-node).  */
1979           if (def_stmt
1980               && gimple_bb (def_stmt)
1981               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
1982               && (is_gimple_assign (def_stmt)
1983                   || is_gimple_call (def_stmt)
1984                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1985                            == vect_induction_def
1986                   || (gimple_code (def_stmt) == GIMPLE_PHI
1987                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
1988                                   == vect_internal_def
1989                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
1990             {
1991               lhs = gimple_assign_lhs (next_stmt);
1992               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
1993               continue;
1994             }
1995
1996           return false;
1997         }
1998       else
1999         {
2000           tree op = gimple_assign_rhs2 (next_stmt);
2001           gimple def_stmt = NULL;
2002
2003           if (TREE_CODE (op) == SSA_NAME)
2004             def_stmt = SSA_NAME_DEF_STMT (op);
2005
2006           /* Check that the other def is either defined in the loop
2007             ("vect_internal_def"), or it's an induction (defined by a
2008             loop-header phi-node).  */
2009           if (def_stmt
2010               && gimple_bb (def_stmt)
2011               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2012               && (is_gimple_assign (def_stmt)
2013                   || is_gimple_call (def_stmt)
2014                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2015                               == vect_induction_def
2016                   || (gimple_code (def_stmt) == GIMPLE_PHI
2017                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2018                                   == vect_internal_def
2019                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2020             {
2021               if (dump_enabled_p ())
2022                 {
2023                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2024                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2025                 }
2026
2027               swap_tree_operands (next_stmt,
2028                                   gimple_assign_rhs1_ptr (next_stmt),
2029                                   gimple_assign_rhs2_ptr (next_stmt));
2030               update_stmt (next_stmt);
2031
2032               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2033                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2034             }
2035           else
2036             return false;
2037         }
2038
2039       lhs = gimple_assign_lhs (next_stmt);
2040       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2041     }
2042
2043   /* Save the chain for further analysis in SLP detection.  */
2044   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2045   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2046   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2047
2048   return true;
2049 }
2050
2051
2052 /* Function vect_is_simple_reduction_1
2053
2054    (1) Detect a cross-iteration def-use cycle that represents a simple
2055    reduction computation.  We look for the following pattern:
2056
2057    loop_header:
2058      a1 = phi < a0, a2 >
2059      a3 = ...
2060      a2 = operation (a3, a1)
2061
2062    such that:
2063    1. operation is commutative and associative and it is safe to
2064       change the order of the computation (if CHECK_REDUCTION is true)
2065    2. no uses for a2 in the loop (a2 is used out of the loop)
2066    3. no uses of a1 in the loop besides the reduction operation
2067    4. no uses of a1 outside the loop.
2068
2069    Conditions 1,4 are tested here.
2070    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2071
2072    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2073    nested cycles, if CHECK_REDUCTION is false.
2074
2075    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2076    reductions:
2077
2078      a1 = phi < a0, a2 >
2079      inner loop (def of a3)
2080      a2 = phi < a3 >
2081
2082    If MODIFY is true it tries also to rework the code in-place to enable
2083    detection of more reduction patterns.  For the time being we rewrite
2084    "res -= RHS" into "res += -RHS" when it seems worthwhile.
2085 */
2086
2087 static gimple
2088 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2089                             bool check_reduction, bool *double_reduc,
2090                             bool modify)
2091 {
2092   struct loop *loop = (gimple_bb (phi))->loop_father;
2093   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2094   edge latch_e = loop_latch_edge (loop);
2095   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2096   gimple def_stmt, def1 = NULL, def2 = NULL;
2097   enum tree_code orig_code, code;
2098   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2099   tree type;
2100   int nloop_uses;
2101   tree name;
2102   imm_use_iterator imm_iter;
2103   use_operand_p use_p;
2104   bool phi_def;
2105
2106   *double_reduc = false;
2107
2108   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2109      otherwise, we assume outer loop vectorization.  */
2110   gcc_assert ((check_reduction && loop == vect_loop)
2111               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2112
2113   name = PHI_RESULT (phi);
2114   nloop_uses = 0;
2115   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2116     {
2117       gimple use_stmt = USE_STMT (use_p);
2118       if (is_gimple_debug (use_stmt))
2119         continue;
2120
2121       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2122         {
2123           if (dump_enabled_p ())
2124             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2125                              "intermediate value used outside loop.");
2126
2127           return NULL;
2128         }
2129
2130       if (vinfo_for_stmt (use_stmt)
2131           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2132         nloop_uses++;
2133       if (nloop_uses > 1)
2134         {
2135           if (dump_enabled_p ())
2136             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2137                              "reduction used in loop.");
2138           return NULL;
2139         }
2140     }
2141
2142   if (TREE_CODE (loop_arg) != SSA_NAME)
2143     {
2144       if (dump_enabled_p ())
2145         {
2146           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147                            "reduction: not ssa_name: ");
2148           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2149         }
2150       return NULL;
2151     }
2152
2153   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2154   if (!def_stmt)
2155     {
2156       if (dump_enabled_p ())
2157         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2158                          "reduction: no def_stmt.");
2159       return NULL;
2160     }
2161
2162   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2163     {
2164       if (dump_enabled_p ())
2165         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2166       return NULL;
2167     }
2168
2169   if (is_gimple_assign (def_stmt))
2170     {
2171       name = gimple_assign_lhs (def_stmt);
2172       phi_def = false;
2173     }
2174   else
2175     {
2176       name = PHI_RESULT (def_stmt);
2177       phi_def = true;
2178     }
2179
2180   nloop_uses = 0;
2181   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2182     {
2183       gimple use_stmt = USE_STMT (use_p);
2184       if (is_gimple_debug (use_stmt))
2185         continue;
2186       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2187           && vinfo_for_stmt (use_stmt)
2188           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2189         nloop_uses++;
2190       if (nloop_uses > 1)
2191         {
2192           if (dump_enabled_p ())
2193             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2194                              "reduction used in loop.");
2195           return NULL;
2196         }
2197     }
2198
2199   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2200      defined in the inner loop.  */
2201   if (phi_def)
2202     {
2203       op1 = PHI_ARG_DEF (def_stmt, 0);
2204
2205       if (gimple_phi_num_args (def_stmt) != 1
2206           || TREE_CODE (op1) != SSA_NAME)
2207         {
2208           if (dump_enabled_p ())
2209             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2210                              "unsupported phi node definition.");
2211
2212           return NULL;
2213         }
2214
2215       def1 = SSA_NAME_DEF_STMT (op1);
2216       if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2217           && loop->inner
2218           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2219           && is_gimple_assign (def1))
2220         {
2221           if (dump_enabled_p ())
2222             report_vect_op (MSG_NOTE, def_stmt,
2223                             "detected double reduction: ");
2224
2225           *double_reduc = true;
2226           return def_stmt;
2227         }
2228
2229       return NULL;
2230     }
2231
2232   code = orig_code = gimple_assign_rhs_code (def_stmt);
2233
2234   /* We can handle "res -= x[i]", which is non-associative by
2235      simply rewriting this into "res += -x[i]".  Avoid changing
2236      gimple instruction for the first simple tests and only do this
2237      if we're allowed to change code at all.  */
2238   if (code == MINUS_EXPR
2239       && modify
2240       && (op1 = gimple_assign_rhs1 (def_stmt))
2241       && TREE_CODE (op1) == SSA_NAME
2242       && SSA_NAME_DEF_STMT (op1) == phi)
2243     code = PLUS_EXPR;
2244
2245   if (check_reduction
2246       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2247     {
2248       if (dump_enabled_p ())
2249         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2250                         "reduction: not commutative/associative: ");
2251       return NULL;
2252     }
2253
2254   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2255     {
2256       if (code != COND_EXPR)
2257         {
2258           if (dump_enabled_p ())
2259             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2260                             "reduction: not binary operation: ");
2261
2262           return NULL;
2263         }
2264
2265       op3 = gimple_assign_rhs1 (def_stmt);
2266       if (COMPARISON_CLASS_P (op3))
2267         {
2268           op4 = TREE_OPERAND (op3, 1);
2269           op3 = TREE_OPERAND (op3, 0);
2270         }
2271
2272       op1 = gimple_assign_rhs2 (def_stmt);
2273       op2 = gimple_assign_rhs3 (def_stmt);
2274
2275       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2276         {
2277           if (dump_enabled_p ())
2278             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2279                             "reduction: uses not ssa_names: ");
2280
2281           return NULL;
2282         }
2283     }
2284   else
2285     {
2286       op1 = gimple_assign_rhs1 (def_stmt);
2287       op2 = gimple_assign_rhs2 (def_stmt);
2288
2289       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2290         {
2291           if (dump_enabled_p ())
2292             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2293                             "reduction: uses not ssa_names: ");
2294
2295           return NULL;
2296         }
2297    }
2298
2299   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2300   if ((TREE_CODE (op1) == SSA_NAME
2301        && !types_compatible_p (type,TREE_TYPE (op1)))
2302       || (TREE_CODE (op2) == SSA_NAME
2303           && !types_compatible_p (type, TREE_TYPE (op2)))
2304       || (op3 && TREE_CODE (op3) == SSA_NAME
2305           && !types_compatible_p (type, TREE_TYPE (op3)))
2306       || (op4 && TREE_CODE (op4) == SSA_NAME
2307           && !types_compatible_p (type, TREE_TYPE (op4))))
2308     {
2309       if (dump_enabled_p ())
2310         {
2311           dump_printf_loc (MSG_NOTE, vect_location,
2312                            "reduction: multiple types: operation type: ");
2313           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2314           dump_printf (MSG_NOTE, ", operands types: ");
2315           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2316                              TREE_TYPE (op1));
2317           dump_printf (MSG_NOTE, ",");
2318           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2319                              TREE_TYPE (op2));
2320           if (op3)
2321             {
2322               dump_printf (MSG_NOTE, ",");
2323               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2324                                  TREE_TYPE (op3));
2325             }
2326
2327           if (op4)
2328             {
2329               dump_printf (MSG_NOTE, ",");
2330               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2331                                  TREE_TYPE (op4));
2332             }
2333         }
2334
2335       return NULL;
2336     }
2337
2338   /* Check that it's ok to change the order of the computation.
2339      Generally, when vectorizing a reduction we change the order of the
2340      computation.  This may change the behavior of the program in some
2341      cases, so we need to check that this is ok.  One exception is when
2342      vectorizing an outer-loop: the inner-loop is executed sequentially,
2343      and therefore vectorizing reductions in the inner-loop during
2344      outer-loop vectorization is safe.  */
2345
2346   /* CHECKME: check for !flag_finite_math_only too?  */
2347   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2348       && check_reduction)
2349     {
2350       /* Changing the order of operations changes the semantics.  */
2351       if (dump_enabled_p ())
2352         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2353                         "reduction: unsafe fp math optimization: ");
2354       return NULL;
2355     }
2356   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2357            && check_reduction)
2358     {
2359       /* Changing the order of operations changes the semantics.  */
2360       if (dump_enabled_p ())
2361         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2362                         "reduction: unsafe int math optimization: ");
2363       return NULL;
2364     }
2365   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2366     {
2367       /* Changing the order of operations changes the semantics.  */
2368       if (dump_enabled_p ())
2369         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2370                         "reduction: unsafe fixed-point math optimization: ");
2371       return NULL;
2372     }
2373
2374   /* If we detected "res -= x[i]" earlier, rewrite it into
2375      "res += -x[i]" now.  If this turns out to be useless reassoc
2376      will clean it up again.  */
2377   if (orig_code == MINUS_EXPR)
2378     {
2379       tree rhs = gimple_assign_rhs2 (def_stmt);
2380       tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2381       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2382                                                          rhs, NULL);
2383       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2384       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2385                                                           loop_info, NULL));
2386       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2387       gimple_assign_set_rhs2 (def_stmt, negrhs);
2388       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2389       update_stmt (def_stmt);
2390     }
2391
2392   /* Reduction is safe. We're dealing with one of the following:
2393      1) integer arithmetic and no trapv
2394      2) floating point arithmetic, and special flags permit this optimization
2395      3) nested cycle (i.e., outer loop vectorization).  */
2396   if (TREE_CODE (op1) == SSA_NAME)
2397     def1 = SSA_NAME_DEF_STMT (op1);
2398
2399   if (TREE_CODE (op2) == SSA_NAME)
2400     def2 = SSA_NAME_DEF_STMT (op2);
2401
2402   if (code != COND_EXPR
2403       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2404     {
2405       if (dump_enabled_p ())
2406         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2407       return NULL;
2408     }
2409
2410   /* Check that one def is the reduction def, defined by PHI,
2411      the other def is either defined in the loop ("vect_internal_def"),
2412      or it's an induction (defined by a loop-header phi-node).  */
2413
2414   if (def2 && def2 == phi
2415       && (code == COND_EXPR
2416           || !def1 || gimple_nop_p (def1)
2417           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2418               && (is_gimple_assign (def1)
2419                   || is_gimple_call (def1)
2420                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2421                       == vect_induction_def
2422                   || (gimple_code (def1) == GIMPLE_PHI
2423                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2424                           == vect_internal_def
2425                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2426     {
2427       if (dump_enabled_p ())
2428         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2429       return def_stmt;
2430     }
2431
2432   if (def1 && def1 == phi
2433       && (code == COND_EXPR
2434           || !def2 || gimple_nop_p (def2)
2435           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2436               && (is_gimple_assign (def2)
2437                   || is_gimple_call (def2)
2438                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2439                       == vect_induction_def
2440                   || (gimple_code (def2) == GIMPLE_PHI
2441                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2442                           == vect_internal_def
2443                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2444     {
2445       if (check_reduction)
2446         {
2447           /* Swap operands (just for simplicity - so that the rest of the code
2448              can assume that the reduction variable is always the last (second)
2449              argument).  */
2450           if (dump_enabled_p ())
2451             report_vect_op (MSG_NOTE, def_stmt,
2452                             "detected reduction: need to swap operands: ");
2453
2454           swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2455                               gimple_assign_rhs2_ptr (def_stmt));
2456
2457           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2458             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2459         }
2460       else
2461         {
2462           if (dump_enabled_p ())
2463             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2464         }
2465
2466       return def_stmt;
2467     }
2468
2469   /* Try to find SLP reduction chain.  */
2470   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2471     {
2472       if (dump_enabled_p ())
2473         report_vect_op (MSG_NOTE, def_stmt,
2474                         "reduction: detected reduction chain: ");
2475
2476       return def_stmt;
2477     }
2478
2479   if (dump_enabled_p ())
2480     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2481                     "reduction: unknown pattern: ");
2482
2483   return NULL;
2484 }
2485
2486 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2487    in-place.  Arguments as there.  */
2488
2489 static gimple
2490 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2491                           bool check_reduction, bool *double_reduc)
2492 {
2493   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2494                                      double_reduc, false);
2495 }
2496
2497 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2498    in-place if it enables detection of more reductions.  Arguments
2499    as there.  */
2500
2501 gimple
2502 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2503                           bool check_reduction, bool *double_reduc)
2504 {
2505   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2506                                      double_reduc, true);
2507 }
2508
2509 /* Calculate the cost of one scalar iteration of the loop.  */
2510 int
2511 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2512 {
2513   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2514   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2515   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2516   int innerloop_iters, i, stmt_cost;
2517
2518   /* Count statements in scalar loop.  Using this as scalar cost for a single
2519      iteration for now.
2520
2521      TODO: Add outer loop support.
2522
2523      TODO: Consider assigning different costs to different scalar
2524      statements.  */
2525
2526   /* FORNOW.  */
2527   innerloop_iters = 1;
2528   if (loop->inner)
2529     innerloop_iters = 50; /* FIXME */
2530
2531   for (i = 0; i < nbbs; i++)
2532     {
2533       gimple_stmt_iterator si;
2534       basic_block bb = bbs[i];
2535
2536       if (bb->loop_father == loop->inner)
2537         factor = innerloop_iters;
2538       else
2539         factor = 1;
2540
2541       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2542         {
2543           gimple stmt = gsi_stmt (si);
2544           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2545
2546           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2547             continue;
2548
2549           /* Skip stmts that are not vectorized inside the loop.  */
2550           if (stmt_info
2551               && !STMT_VINFO_RELEVANT_P (stmt_info)
2552               && (!STMT_VINFO_LIVE_P (stmt_info)
2553                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2554               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2555             continue;
2556
2557           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2558             {
2559               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2560                stmt_cost = vect_get_stmt_cost (scalar_load);
2561              else
2562                stmt_cost = vect_get_stmt_cost (scalar_store);
2563             }
2564           else
2565             stmt_cost = vect_get_stmt_cost (scalar_stmt);
2566
2567           scalar_single_iter_cost += stmt_cost * factor;
2568         }
2569     }
2570   return scalar_single_iter_cost;
2571 }
2572
2573 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2574 int
2575 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2576                              int *peel_iters_epilogue,
2577                              int scalar_single_iter_cost,
2578                              stmt_vector_for_cost *prologue_cost_vec,
2579                              stmt_vector_for_cost *epilogue_cost_vec)
2580 {
2581   int retval = 0;
2582   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2583
2584   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2585     {
2586       *peel_iters_epilogue = vf/2;
2587       if (dump_enabled_p ())
2588         dump_printf_loc (MSG_NOTE, vect_location,
2589                          "cost model: epilogue peel iters set to vf/2 "
2590                          "because loop iterations are unknown .");
2591
2592       /* If peeled iterations are known but number of scalar loop
2593          iterations are unknown, count a taken branch per peeled loop.  */
2594       retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2595                                  NULL, 0, vect_prologue);
2596     }
2597   else
2598     {
2599       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2600       peel_iters_prologue = niters < peel_iters_prologue ?
2601                             niters : peel_iters_prologue;
2602       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2603       /* If we need to peel for gaps, but no peeling is required, we have to
2604          peel VF iterations.  */
2605       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2606         *peel_iters_epilogue = vf;
2607     }
2608
2609   if (peel_iters_prologue)
2610     retval += record_stmt_cost (prologue_cost_vec,
2611                                 peel_iters_prologue * scalar_single_iter_cost,
2612                                 scalar_stmt, NULL, 0, vect_prologue);
2613   if (*peel_iters_epilogue)
2614     retval += record_stmt_cost (epilogue_cost_vec,
2615                                 *peel_iters_epilogue * scalar_single_iter_cost,
2616                                 scalar_stmt, NULL, 0, vect_epilogue);
2617   return retval;
2618 }
2619
2620 /* Function vect_estimate_min_profitable_iters
2621
2622    Return the number of iterations required for the vector version of the
2623    loop to be profitable relative to the cost of the scalar version of the
2624    loop.  */
2625
2626 static void
2627 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2628                                     int *ret_min_profitable_niters,
2629                                     int *ret_min_profitable_estimate)
2630 {
2631   int min_profitable_iters;
2632   int min_profitable_estimate;
2633   int peel_iters_prologue;
2634   int peel_iters_epilogue;
2635   unsigned vec_inside_cost = 0;
2636   int vec_outside_cost = 0;
2637   unsigned vec_prologue_cost = 0;
2638   unsigned vec_epilogue_cost = 0;
2639   int scalar_single_iter_cost = 0;
2640   int scalar_outside_cost = 0;
2641   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2642   int npeel = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
2643   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2644
2645   /* Cost model disabled.  */
2646   if (!flag_vect_cost_model)
2647     {
2648       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.");
2649       *ret_min_profitable_niters = 0;
2650       *ret_min_profitable_estimate = 0;
2651       return;
2652     }
2653
2654   /* Requires loop versioning tests to handle misalignment.  */
2655   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2656     {
2657       /*  FIXME: Make cost depend on complexity of individual check.  */
2658       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2659       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2660                             vect_prologue);
2661       dump_printf (MSG_NOTE,
2662                    "cost model: Adding cost of checks for loop "
2663                    "versioning to treat misalignment.\n");
2664     }
2665
2666   /* Requires loop versioning with alias checks.  */
2667   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2668     {
2669       /*  FIXME: Make cost depend on complexity of individual check.  */
2670       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2671       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2672                             vect_prologue);
2673       dump_printf (MSG_NOTE,
2674                    "cost model: Adding cost of checks for loop "
2675                    "versioning aliasing.\n");
2676     }
2677
2678   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2679       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2680     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2681                           vect_prologue);
2682
2683   /* Count statements in scalar loop.  Using this as scalar cost for a single
2684      iteration for now.
2685
2686      TODO: Add outer loop support.
2687
2688      TODO: Consider assigning different costs to different scalar
2689      statements.  */
2690
2691   scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2692
2693   /* Add additional cost for the peeled instructions in prologue and epilogue
2694      loop.
2695
2696      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2697      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2698
2699      TODO: Build an expression that represents peel_iters for prologue and
2700      epilogue to be used in a run-time test.  */
2701
2702   if (npeel  < 0)
2703     {
2704       peel_iters_prologue = vf/2;
2705       dump_printf (MSG_NOTE, "cost model: "
2706                    "prologue peel iters set to vf/2.");
2707
2708       /* If peeling for alignment is unknown, loop bound of main loop becomes
2709          unknown.  */
2710       peel_iters_epilogue = vf/2;
2711       dump_printf (MSG_NOTE, "cost model: "
2712                    "epilogue peel iters set to vf/2 because "
2713                    "peeling for alignment is unknown.");
2714
2715       /* If peeled iterations are unknown, count a taken branch and a not taken
2716          branch per peeled loop. Even if scalar loop iterations are known,
2717          vector iterations are not known since peeled prologue iterations are
2718          not known. Hence guards remain the same.  */
2719       (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2720                             NULL, 0, vect_prologue);
2721       (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2722                             NULL, 0, vect_prologue);
2723       /* FORNOW: Don't attempt to pass individual scalar instructions to
2724          the model; just assume linear cost for scalar iterations.  */
2725       (void) add_stmt_cost (target_cost_data,
2726                             peel_iters_prologue * scalar_single_iter_cost,
2727                             scalar_stmt, NULL, 0, vect_prologue);
2728       (void) add_stmt_cost (target_cost_data,
2729                             peel_iters_epilogue * scalar_single_iter_cost,
2730                             scalar_stmt, NULL, 0, vect_epilogue);
2731     }
2732   else
2733     {
2734       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2735       stmt_info_for_cost *si;
2736       int j;
2737       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2738
2739       prologue_cost_vec.create (2);
2740       epilogue_cost_vec.create (2);
2741       peel_iters_prologue = npeel;
2742
2743       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2744                                           &peel_iters_epilogue,
2745                                           scalar_single_iter_cost,
2746                                           &prologue_cost_vec,
2747                                           &epilogue_cost_vec);
2748
2749       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2750         {
2751           struct _stmt_vec_info *stmt_info
2752             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2753           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2754                                 si->misalign, vect_prologue);
2755         }
2756
2757       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2758         {
2759           struct _stmt_vec_info *stmt_info
2760             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2761           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2762                                 si->misalign, vect_epilogue);
2763         }
2764
2765       prologue_cost_vec.release ();
2766       epilogue_cost_vec.release ();
2767     }
2768
2769   /* FORNOW: The scalar outside cost is incremented in one of the
2770      following ways:
2771
2772      1. The vectorizer checks for alignment and aliasing and generates
2773      a condition that allows dynamic vectorization.  A cost model
2774      check is ANDED with the versioning condition.  Hence scalar code
2775      path now has the added cost of the versioning check.
2776
2777        if (cost > th & versioning_check)
2778          jmp to vector code
2779
2780      Hence run-time scalar is incremented by not-taken branch cost.
2781
2782      2. The vectorizer then checks if a prologue is required.  If the
2783      cost model check was not done before during versioning, it has to
2784      be done before the prologue check.
2785
2786        if (cost <= th)
2787          prologue = scalar_iters
2788        if (prologue == 0)
2789          jmp to vector code
2790        else
2791          execute prologue
2792        if (prologue == num_iters)
2793          go to exit
2794
2795      Hence the run-time scalar cost is incremented by a taken branch,
2796      plus a not-taken branch, plus a taken branch cost.
2797
2798      3. The vectorizer then checks if an epilogue is required.  If the
2799      cost model check was not done before during prologue check, it
2800      has to be done with the epilogue check.
2801
2802        if (prologue == 0)
2803          jmp to vector code
2804        else
2805          execute prologue
2806        if (prologue == num_iters)
2807          go to exit
2808        vector code:
2809          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2810            jmp to epilogue
2811
2812      Hence the run-time scalar cost should be incremented by 2 taken
2813      branches.
2814
2815      TODO: The back end may reorder the BBS's differently and reverse
2816      conditions/branch directions.  Change the estimates below to
2817      something more reasonable.  */
2818
2819   /* If the number of iterations is known and we do not do versioning, we can
2820      decide whether to vectorize at compile time.  Hence the scalar version
2821      do not carry cost model guard costs.  */
2822   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2823       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2824       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2825     {
2826       /* Cost model check occurs at versioning.  */
2827       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2828           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2829         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2830       else
2831         {
2832           /* Cost model check occurs at prologue generation.  */
2833           if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2834             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2835               + vect_get_stmt_cost (cond_branch_not_taken);
2836           /* Cost model check occurs at epilogue generation.  */
2837           else
2838             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2839         }
2840     }
2841
2842   /* Complete the target-specific cost calculations.  */
2843   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2844                &vec_inside_cost, &vec_epilogue_cost);
2845
2846   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2847
2848   /* Calculate number of iterations required to make the vector version
2849      profitable, relative to the loop bodies only.  The following condition
2850      must hold true:
2851      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2852      where
2853      SIC = scalar iteration cost, VIC = vector iteration cost,
2854      VOC = vector outside cost, VF = vectorization factor,
2855      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2856      SOC = scalar outside cost for run time cost model check.  */
2857
2858   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2859     {
2860       if (vec_outside_cost <= 0)
2861         min_profitable_iters = 1;
2862       else
2863         {
2864           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2865                                   - vec_inside_cost * peel_iters_prologue
2866                                   - vec_inside_cost * peel_iters_epilogue)
2867                                  / ((scalar_single_iter_cost * vf)
2868                                     - vec_inside_cost);
2869
2870           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2871               <= (((int) vec_inside_cost * min_profitable_iters)
2872                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2873             min_profitable_iters++;
2874         }
2875     }
2876   /* vector version will never be profitable.  */
2877   else
2878     {
2879       if (dump_enabled_p ())
2880         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2881                          "cost model: the vector iteration cost = %d "
2882                          "divided by the scalar iteration cost = %d "
2883                          "is greater or equal to the vectorization factor = %d.",
2884                          vec_inside_cost, scalar_single_iter_cost, vf);
2885       *ret_min_profitable_niters = -1;
2886       *ret_min_profitable_estimate = -1;
2887       return;
2888     }
2889
2890   if (dump_enabled_p ())
2891     {
2892       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
2893       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
2894                    vec_inside_cost);
2895       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
2896                    vec_prologue_cost);
2897       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
2898                    vec_epilogue_cost);
2899       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
2900                    scalar_single_iter_cost);
2901       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
2902                    scalar_outside_cost);
2903       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
2904                    vec_outside_cost);
2905       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
2906                    peel_iters_prologue);
2907       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
2908                    peel_iters_epilogue);
2909       dump_printf (MSG_NOTE,
2910                    "  Calculated minimum iters for profitability: %d\n",
2911                    min_profitable_iters);
2912     }
2913
2914   min_profitable_iters =
2915         min_profitable_iters < vf ? vf : min_profitable_iters;
2916
2917   /* Because the condition we create is:
2918      if (niters <= min_profitable_iters)
2919        then skip the vectorized loop.  */
2920   min_profitable_iters--;
2921
2922   if (dump_enabled_p ())
2923     dump_printf_loc (MSG_NOTE, vect_location,
2924                      "  Runtime profitability threshold = %d\n", min_profitable_iters);
2925
2926   *ret_min_profitable_niters = min_profitable_iters;
2927
2928   /* Calculate number of iterations required to make the vector version
2929      profitable, relative to the loop bodies only.
2930
2931      Non-vectorized variant is SIC * niters and it must win over vector
2932      variant on the expected loop trip count.  The following condition must hold true:
2933      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
2934
2935   if (vec_outside_cost <= 0)
2936     min_profitable_estimate = 1;
2937   else
2938     {
2939       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
2940                                  - vec_inside_cost * peel_iters_prologue
2941                                  - vec_inside_cost * peel_iters_epilogue)
2942                                  / ((scalar_single_iter_cost * vf)
2943                                    - vec_inside_cost);
2944     }
2945   min_profitable_estimate --;
2946   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
2947   if (dump_enabled_p ())
2948     dump_printf_loc (MSG_NOTE, vect_location,
2949                      "  Static estimate profitability threshold = %d\n",
2950                       min_profitable_iters);
2951
2952   *ret_min_profitable_estimate = min_profitable_estimate;
2953 }
2954
2955
2956 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
2957    functions. Design better to avoid maintenance issues.  */
2958
2959 /* Function vect_model_reduction_cost.
2960
2961    Models cost for a reduction operation, including the vector ops
2962    generated within the strip-mine loop, the initial definition before
2963    the loop, and the epilogue code that must be generated.  */
2964
2965 static bool
2966 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
2967                            int ncopies)
2968 {
2969   int prologue_cost = 0, epilogue_cost = 0;
2970   enum tree_code code;
2971   optab optab;
2972   tree vectype;
2973   gimple stmt, orig_stmt;
2974   tree reduction_op;
2975   enum machine_mode mode;
2976   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2977   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2978   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2979
2980   /* Cost of reduction op inside loop.  */
2981   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
2982                                         stmt_info, 0, vect_body);
2983   stmt = STMT_VINFO_STMT (stmt_info);
2984
2985   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
2986     {
2987     case GIMPLE_SINGLE_RHS:
2988       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
2989       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
2990       break;
2991     case GIMPLE_UNARY_RHS:
2992       reduction_op = gimple_assign_rhs1 (stmt);
2993       break;
2994     case GIMPLE_BINARY_RHS:
2995       reduction_op = gimple_assign_rhs2 (stmt);
2996       break;
2997     case GIMPLE_TERNARY_RHS:
2998       reduction_op = gimple_assign_rhs3 (stmt);
2999       break;
3000     default:
3001       gcc_unreachable ();
3002     }
3003
3004   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3005   if (!vectype)
3006     {
3007       if (dump_enabled_p ())
3008         {
3009           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3010                            "unsupported data-type ");
3011           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3012                              TREE_TYPE (reduction_op));
3013         }
3014       return false;
3015    }
3016
3017   mode = TYPE_MODE (vectype);
3018   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3019
3020   if (!orig_stmt)
3021     orig_stmt = STMT_VINFO_STMT (stmt_info);
3022
3023   code = gimple_assign_rhs_code (orig_stmt);
3024
3025   /* Add in cost for initial definition.  */
3026   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3027                                   stmt_info, 0, vect_prologue);
3028
3029   /* Determine cost of epilogue code.
3030
3031      We have a reduction operator that will reduce the vector in one statement.
3032      Also requires scalar extract.  */
3033
3034   if (!nested_in_vect_loop_p (loop, orig_stmt))
3035     {
3036       if (reduc_code != ERROR_MARK)
3037         {
3038           epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3039                                           stmt_info, 0, vect_epilogue);
3040           epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3041                                           stmt_info, 0, vect_epilogue);
3042         }
3043       else
3044         {
3045           int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
3046           tree bitsize =
3047             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3048           int element_bitsize = tree_low_cst (bitsize, 1);
3049           int nelements = vec_size_in_bits / element_bitsize;
3050
3051           optab = optab_for_tree_code (code, vectype, optab_default);
3052
3053           /* We have a whole vector shift available.  */
3054           if (VECTOR_MODE_P (mode)
3055               && optab_handler (optab, mode) != CODE_FOR_nothing
3056               && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3057             {
3058               /* Final reduction via vector shifts and the reduction operator.
3059                  Also requires scalar extract.  */
3060               epilogue_cost += add_stmt_cost (target_cost_data,
3061                                               exact_log2 (nelements) * 2,
3062                                               vector_stmt, stmt_info, 0,
3063                                               vect_epilogue);
3064               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3065                                               vec_to_scalar, stmt_info, 0,
3066                                               vect_epilogue);
3067             }
3068           else
3069             /* Use extracts and reduction op for final reduction.  For N
3070                elements, we have N extracts and N-1 reduction ops.  */
3071             epilogue_cost += add_stmt_cost (target_cost_data,
3072                                             nelements + nelements - 1,
3073                                             vector_stmt, stmt_info, 0,
3074                                             vect_epilogue);
3075         }
3076     }
3077
3078   if (dump_enabled_p ())
3079     dump_printf (MSG_NOTE,
3080                  "vect_model_reduction_cost: inside_cost = %d, "
3081                  "prologue_cost = %d, epilogue_cost = %d .", inside_cost,
3082                  prologue_cost, epilogue_cost);
3083
3084   return true;
3085 }
3086
3087
3088 /* Function vect_model_induction_cost.
3089
3090    Models cost for induction operations.  */
3091
3092 static void
3093 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3094 {
3095   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3096   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3097   unsigned inside_cost, prologue_cost;
3098
3099   /* loop cost for vec_loop.  */
3100   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3101                                stmt_info, 0, vect_body);
3102
3103   /* prologue cost for vec_init and vec_step.  */
3104   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3105                                  stmt_info, 0, vect_prologue);
3106
3107   if (dump_enabled_p ())
3108     dump_printf_loc (MSG_NOTE, vect_location,
3109                      "vect_model_induction_cost: inside_cost = %d, "
3110                      "prologue_cost = %d .", inside_cost, prologue_cost);
3111 }
3112
3113
/* Function get_initial_def_for_induction

   Input:
   IV_PHI - the phi node of the induction variable, located either in the
   loop being vectorized or in a loop nested in it.

   Output:
   Return a vector variable, initialized with the first VF values of
   the induction variable.  E.g., for an iv whose initial value is X and
   whose evolution step is S, for a vector of 4 units, we want to return:
   [X, X + S, X + 2*S, X + 3*S].  */
3125
3126 static tree
3127 get_initial_def_for_induction (gimple iv_phi)
3128 {
3129   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3130   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3131   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3132   tree scalar_type;
3133   tree vectype;
3134   int nunits;
3135   edge pe = loop_preheader_edge (loop);
3136   struct loop *iv_loop;
3137   basic_block new_bb;
3138   tree new_vec, vec_init, vec_step, t;
3139   tree access_fn;
3140   tree new_var;
3141   tree new_name;
3142   gimple init_stmt, induction_phi, new_stmt;
3143   tree induc_def, vec_def, vec_dest;
3144   tree init_expr, step_expr;
3145   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3146   int i;
3147   bool ok;
3148   int ncopies;
3149   tree expr;
3150   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3151   bool nested_in_vect_loop = false;
3152   gimple_seq stmts = NULL;
3153   imm_use_iterator imm_iter;
3154   use_operand_p use_p;
3155   gimple exit_phi;
3156   edge latch_e;
3157   tree loop_arg;
3158   gimple_stmt_iterator si;
3159   basic_block bb = gimple_bb (iv_phi);
3160   tree stepvectype;
3161   tree resvectype;
3162
3163   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3164   if (nested_in_vect_loop_p (loop, iv_phi))
3165     {
3166       nested_in_vect_loop = true;
3167       iv_loop = loop->inner;
3168     }
3169   else
3170     iv_loop = loop;
3171   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3172
3173   latch_e = loop_latch_edge (iv_loop);
3174   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3175
3176   access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi));
3177   gcc_assert (access_fn);
3178   STRIP_NOPS (access_fn);
3179   ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn,
3180                                     &init_expr, &step_expr);
3181   gcc_assert (ok);
3182   pe = loop_preheader_edge (iv_loop);
3183
3184   scalar_type = TREE_TYPE (init_expr);
3185   vectype = get_vectype_for_scalar_type (scalar_type);
3186   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3187   gcc_assert (vectype);
3188   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3189   ncopies = vf / nunits;
3190
3191   gcc_assert (phi_info);
3192   gcc_assert (ncopies >= 1);
3193
3194   /* Find the first insertion point in the BB.  */
3195   si = gsi_after_labels (bb);
3196
3197   /* Create the vector that holds the initial_value of the induction.  */
3198   if (nested_in_vect_loop)
3199     {
3200       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3201          been created during vectorization of previous stmts.  We obtain it
3202          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3203       tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3204                                            loop_preheader_edge (iv_loop));
3205       vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL);
3206       /* If the initial value is not of proper type, convert it.  */
3207       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3208         {
3209           new_stmt = gimple_build_assign_with_ops
3210               (VIEW_CONVERT_EXPR,
3211                vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3212                build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3213           vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3214           gimple_assign_set_lhs (new_stmt, vec_init);
3215           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3216                                                  new_stmt);
3217           gcc_assert (!new_bb);
3218           set_vinfo_for_stmt (new_stmt,
3219                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3220         }
3221     }
3222   else
3223     {
3224       vec<constructor_elt, va_gc> *v;
3225
3226       /* iv_loop is the loop to be vectorized. Create:
3227          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3228       new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_");
3229       new_name = force_gimple_operand (init_expr, &stmts, false, new_var);
3230       if (stmts)
3231         {
3232           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3233           gcc_assert (!new_bb);
3234         }
3235
3236       vec_alloc (v, nunits);
3237       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3238       for (i = 1; i < nunits; i++)
3239         {
3240           /* Create: new_name_i = new_name + step_expr  */
3241           enum tree_code code = POINTER_TYPE_P (scalar_type)
3242                                 ? POINTER_PLUS_EXPR : PLUS_EXPR;
3243           init_stmt = gimple_build_assign_with_ops (code, new_var,
3244                                                     new_name, step_expr);
3245           new_name = make_ssa_name (new_var, init_stmt);
3246           gimple_assign_set_lhs (init_stmt, new_name);
3247
3248           new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3249           gcc_assert (!new_bb);
3250
3251           if (dump_enabled_p ())
3252             {
3253               dump_printf_loc (MSG_NOTE, vect_location,
3254                                "created new init_stmt: ");
3255               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3256             }
3257           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3258         }
3259       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3260       new_vec = build_constructor (vectype, v);
3261       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3262     }
3263
3264
3265   /* Create the vector that holds the step of the induction.  */
3266   if (nested_in_vect_loop)
3267     /* iv_loop is nested in the loop to be vectorized. Generate:
3268        vec_step = [S, S, S, S]  */
3269     new_name = step_expr;
3270   else
3271     {
3272       /* iv_loop is the loop to be vectorized. Generate:
3273           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3274       expr = build_int_cst (TREE_TYPE (step_expr), vf);
3275       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3276                               expr, step_expr);
3277     }
3278
3279   t = unshare_expr (new_name);
3280   gcc_assert (CONSTANT_CLASS_P (new_name));
3281   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3282   gcc_assert (stepvectype);
3283   new_vec = build_vector_from_val (stepvectype, t);
3284   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3285
3286
3287   /* Create the following def-use cycle:
3288      loop prolog:
3289          vec_init = ...
3290          vec_step = ...
3291      loop:
3292          vec_iv = PHI <vec_init, vec_loop>
3293          ...
3294          STMT
3295          ...
3296          vec_loop = vec_iv + vec_step;  */
3297
3298   /* Create the induction-phi that defines the induction-operand.  */
3299   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3300   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3301   set_vinfo_for_stmt (induction_phi,
3302                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3303   induc_def = PHI_RESULT (induction_phi);
3304
3305   /* Create the iv update inside the loop  */
3306   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3307                                            induc_def, vec_step);
3308   vec_def = make_ssa_name (vec_dest, new_stmt);
3309   gimple_assign_set_lhs (new_stmt, vec_def);
3310   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3311   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3312                                                    NULL));
3313
3314   /* Set the arguments of the phi node:  */
3315   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3316   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3317                UNKNOWN_LOCATION);
3318
3319
3320   /* In case that vectorization factor (VF) is bigger than the number
3321      of elements that we can fit in a vectype (nunits), we have to generate
3322      more than one vector stmt - i.e - we need to "unroll" the
3323      vector stmt by a factor VF/nunits.  For more details see documentation
3324      in vectorizable_operation.  */
3325
3326   if (ncopies > 1)
3327     {
3328       stmt_vec_info prev_stmt_vinfo;
3329       /* FORNOW. This restriction should be relaxed.  */
3330       gcc_assert (!nested_in_vect_loop);
3331
3332       /* Create the vector that holds the step of the induction.  */
3333       expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3334       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3335                               expr, step_expr);
3336       t = unshare_expr (new_name);
3337       gcc_assert (CONSTANT_CLASS_P (new_name));
3338       new_vec = build_vector_from_val (stepvectype, t);
3339       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3340
3341       vec_def = induc_def;
3342       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3343       for (i = 1; i < ncopies; i++)
3344         {
3345           /* vec_i = vec_prev + vec_step  */
3346           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3347                                                    vec_def, vec_step);
3348           vec_def = make_ssa_name (vec_dest, new_stmt);
3349           gimple_assign_set_lhs (new_stmt, vec_def);
3350
3351           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3352           if (!useless_type_conversion_p (resvectype, vectype))
3353             {
3354               new_stmt = gimple_build_assign_with_ops
3355                   (VIEW_CONVERT_EXPR,
3356                    vect_get_new_vect_var (resvectype, vect_simple_var,
3357                                           "vec_iv_"),
3358                    build1 (VIEW_CONVERT_EXPR, resvectype,
3359                            gimple_assign_lhs (new_stmt)), NULL_TREE);
3360               gimple_assign_set_lhs (new_stmt,
3361                                      make_ssa_name
3362                                        (gimple_assign_lhs (new_stmt), new_stmt));
3363               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3364             }
3365           set_vinfo_for_stmt (new_stmt,
3366                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3367           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3368           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3369         }
3370     }
3371
3372   if (nested_in_vect_loop)
3373     {
3374       /* Find the loop-closed exit-phi of the induction, and record
3375          the final vector of induction results:  */
3376       exit_phi = NULL;
3377       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3378         {
3379           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p))))
3380             {
3381               exit_phi = USE_STMT (use_p);
3382               break;
3383             }
3384         }
3385       if (exit_phi)
3386         {
3387           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3388           /* FORNOW. Currently not supporting the case that an inner-loop induction
3389              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3390           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3391                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3392
3393           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3394           if (dump_enabled_p ())
3395             {
3396               dump_printf_loc (MSG_NOTE, vect_location,
3397                                "vector of inductions after inner-loop:");
3398               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3399             }
3400         }
3401     }
3402
3403
3404   if (dump_enabled_p ())
3405     {
3406       dump_printf_loc (MSG_NOTE, vect_location,
3407                        "transform induction: created def-use cycle: ");
3408       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3409       dump_printf (MSG_NOTE, "\n");
3410       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3411                         SSA_NAME_DEF_STMT (vec_def), 0);
3412     }
3413
3414   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3415   if (!useless_type_conversion_p (resvectype, vectype))
3416     {
3417       new_stmt = gimple_build_assign_with_ops
3418          (VIEW_CONVERT_EXPR,
3419           vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3420           build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3421       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3422       gimple_assign_set_lhs (new_stmt, induc_def);
3423       si = gsi_after_labels (bb);
3424       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3425       set_vinfo_for_stmt (new_stmt,
3426                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3427       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3428         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3429     }
3430
3431   return induc_def;
3432 }
3433
3434
3435 /* Function get_initial_def_for_reduction
3436
3437    Input:
3438    STMT - a stmt that performs a reduction operation in the loop.
3439    INIT_VAL - the initial value of the reduction variable
3440
3441    Output:
3442    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3443         of the reduction (used for adjusting the epilog - see below).
3444    Return a vector variable, initialized according to the operation that STMT
3445         performs. This vector will be used as the initial value of the
3446         vector of partial results.
3447
3448    Option1 (adjust in epilog): Initialize the vector as follows:
3449      add/bit or/xor:    [0,0,...,0,0]
3450      mult/bit and:      [1,1,...,1,1]
3451      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3452    and when necessary (e.g. add/mult case) let the caller know
3453    that it needs to adjust the result by init_val.
3454
3455    Option2: Initialize the vector as follows:
3456      add/bit or/xor:    [init_val,0,0,...,0]
3457      mult/bit and:      [init_val,1,1,...,1]
3458      min/max/cond_expr: [init_val,init_val,...,init_val]
3459    and no adjustments are needed.
3460
3461    For example, for the following code:
3462
3463    s = init_val;
3464    for (i=0;i<n;i++)
3465      s = s + a[i];
3466
3467    STMT is 's = s + a[i]', and the reduction variable is 's'.
3468    For a vector of 4 units, we want to return either [0,0,0,init_val],
3469    or [0,0,0,0] and let the caller know that it needs to adjust
3470    the result at the end by 'init_val'.
3471
3472    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3473    initialization vector is simpler (same element in all entries), if
3474    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3475
3476    A cost model should help decide between these two schemes.  */
3477
3478 tree
3479 get_initial_def_for_reduction (gimple stmt, tree init_val,
3480                                tree *adjustment_def)
3481 {
3482   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3483   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3484   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3485   tree scalar_type = TREE_TYPE (init_val);
3486   tree vectype = get_vectype_for_scalar_type (scalar_type);
3487   int nunits;
3488   enum tree_code code = gimple_assign_rhs_code (stmt);
3489   tree def_for_init;
3490   tree init_def;
3491   tree *elts;
3492   int i;
3493   bool nested_in_vect_loop = false;
3494   tree init_value;
3495   REAL_VALUE_TYPE real_init_val = dconst0;
3496   int int_init_val = 0;
3497   gimple def_stmt = NULL;
3498
3499   gcc_assert (vectype);
3500   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3501
3502   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3503               || SCALAR_FLOAT_TYPE_P (scalar_type));
3504
3505   if (nested_in_vect_loop_p (loop, stmt))
3506     nested_in_vect_loop = true;
3507   else
3508     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3509
3510   /* In case of double reduction we only create a vector variable to be put
3511      in the reduction phi node.  The actual statement creation is done in
3512      vect_create_epilog_for_reduction.  */
3513   if (adjustment_def && nested_in_vect_loop
3514       && TREE_CODE (init_val) == SSA_NAME
3515       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3516       && gimple_code (def_stmt) == GIMPLE_PHI
3517       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3518       && vinfo_for_stmt (def_stmt)
3519       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3520           == vect_double_reduction_def)
3521     {
3522       *adjustment_def = NULL;
3523       return vect_create_destination_var (init_val, vectype);
3524     }
3525
3526   if (TREE_CONSTANT (init_val))
3527     {
3528       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3529         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3530       else
3531         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3532     }
3533   else
3534     init_value = init_val;
3535
3536   switch (code)
3537     {
3538       case WIDEN_SUM_EXPR:
3539       case DOT_PROD_EXPR:
3540       case PLUS_EXPR:
3541       case MINUS_EXPR:
3542       case BIT_IOR_EXPR:
3543       case BIT_XOR_EXPR:
3544       case MULT_EXPR:
3545       case BIT_AND_EXPR:
3546         /* ADJUSMENT_DEF is NULL when called from
3547            vect_create_epilog_for_reduction to vectorize double reduction.  */
3548         if (adjustment_def)
3549           {
3550             if (nested_in_vect_loop)
3551               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3552                                                               NULL);
3553             else
3554               *adjustment_def = init_val;
3555           }
3556
3557         if (code == MULT_EXPR)
3558           {
3559             real_init_val = dconst1;
3560             int_init_val = 1;
3561           }
3562
3563         if (code == BIT_AND_EXPR)
3564           int_init_val = -1;
3565
3566         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3567           def_for_init = build_real (scalar_type, real_init_val);
3568         else
3569           def_for_init = build_int_cst (scalar_type, int_init_val);
3570
3571         /* Create a vector of '0' or '1' except the first element.  */
3572         elts = XALLOCAVEC (tree, nunits);
3573         for (i = nunits - 2; i >= 0; --i)
3574           elts[i + 1] = def_for_init;
3575
3576         /* Option1: the first element is '0' or '1' as well.  */
3577         if (adjustment_def)
3578           {
3579             elts[0] = def_for_init;
3580             init_def = build_vector (vectype, elts);
3581             break;
3582           }
3583
3584         /* Option2: the first element is INIT_VAL.  */
3585         elts[0] = init_val;
3586         if (TREE_CONSTANT (init_val))
3587           init_def = build_vector (vectype, elts);
3588         else
3589           {
3590             vec<constructor_elt, va_gc> *v;
3591             vec_alloc (v, nunits);
3592             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3593             for (i = 1; i < nunits; ++i)
3594               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3595             init_def = build_constructor (vectype, v);
3596           }
3597
3598         break;
3599
3600       case MIN_EXPR:
3601       case MAX_EXPR:
3602       case COND_EXPR:
3603         if (adjustment_def)
3604           {
3605             *adjustment_def = NULL_TREE;
3606             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3607             break;
3608           }
3609
3610         init_def = build_vector_from_val (vectype, init_value);
3611         break;
3612
3613       default:
3614         gcc_unreachable ();
3615     }
3616
3617   return init_def;
3618 }
3619
3620
3621 /* Function vect_create_epilog_for_reduction
3622
3623    Create code at the loop-epilog to finalize the result of a reduction
3624    computation.
3625
3626    VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of the
3627      vector reduction statements.
3628    STMT is the scalar reduction stmt that is being vectorized.
3629    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3630      number of elements that we can fit in a vectype (nunits).  In this case
3631      we have to generate more than one vector stmt - i.e - we need to "unroll"
3632      the vector stmt by a factor VF/nunits.  For more details see documentation
3633      in vectorizable_operation.
3634    REDUC_CODE is the tree-code for the epilog reduction.
3635    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3636      computation.
3637    REDUC_INDEX is the index of the operand in the right hand side of the
3638      statement that is defined by REDUCTION_PHI.
3639    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3640    SLP_NODE is an SLP node containing a group of reduction statements. The
3641      first one in this group is STMT.
3642
3643    This function:
3644    1. Creates the reduction def-use cycles: sets the arguments for
3645       REDUCTION_PHIS:
3646       The loop-entry argument is the vectorized initial-value of the reduction.
3647       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3648       sums.
3649    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3650       by applying the operation specified by REDUC_CODE if available, or by
3651       other means (whole-vector shifts or a scalar loop).
3652       The function also creates a new phi node at the loop exit to preserve
3653       loop-closed form, as illustrated below.
3654
3655      The flow at the entry to this function:
3656
3657         loop:
3658           vec_def = phi <null, null>            # REDUCTION_PHI
3659           VECT_DEF = vector_stmt                # vectorized form of STMT
3660           s_loop = scalar_stmt                  # (scalar) STMT
3661         loop_exit:
3662           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3663           use <s_out0>
3664           use <s_out0>
3665
3666      The above is transformed by this function into:
3667
3668         loop:
3669           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3670           VECT_DEF = vector_stmt                # vectorized form of STMT
3671           s_loop = scalar_stmt                  # (scalar) STMT
3672         loop_exit:
3673           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3674           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3675           v_out2 = reduce <v_out1>
3676           s_out3 = extract_field <v_out2, 0>
3677           s_out4 = adjust_result <s_out3>
3678           use <s_out4>
3679           use <s_out4>
3680 */
3681
3682 static void
3683 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3684                                   int ncopies, enum tree_code reduc_code,
3685                                   vec<gimple> reduction_phis,
3686                                   int reduc_index, bool double_reduc,
3687                                   slp_tree slp_node)
3688 {
3689   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3690   stmt_vec_info prev_phi_info;
3691   tree vectype;
3692   enum machine_mode mode;
3693   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3694   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3695   basic_block exit_bb;
3696   tree scalar_dest;
3697   tree scalar_type;
3698   gimple new_phi = NULL, phi;
3699   gimple_stmt_iterator exit_gsi;
3700   tree vec_dest;
3701   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3702   gimple epilog_stmt = NULL;
3703   enum tree_code code = gimple_assign_rhs_code (stmt);
3704   gimple exit_phi;
3705   tree bitsize, bitpos;
3706   tree adjustment_def = NULL;
3707   tree vec_initial_def = NULL;
3708   tree reduction_op, expr, def;
3709   tree orig_name, scalar_result;
3710   imm_use_iterator imm_iter, phi_imm_iter;
3711   use_operand_p use_p, phi_use_p;
3712   bool extract_scalar_result = false;
3713   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3714   bool nested_in_vect_loop = false;
3715   vec<gimple> new_phis = vNULL;
3716   vec<gimple> inner_phis = vNULL;
3717   enum vect_def_type dt = vect_unknown_def_type;
3718   int j, i;
3719   vec<tree> scalar_results = vNULL;
3720   unsigned int group_size = 1, k, ratio;
3721   vec<tree> vec_initial_defs = vNULL;
3722   vec<gimple> phis;
3723   bool slp_reduc = false;
3724   tree new_phi_result;
3725   gimple inner_phi = NULL;
3726
3727   if (slp_node)
3728     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3729
3730   if (nested_in_vect_loop_p (loop, stmt))
3731     {
3732       outer_loop = loop;
3733       loop = loop->inner;
3734       nested_in_vect_loop = true;
3735       gcc_assert (!slp_node);
3736     }
3737
3738   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3739     {
3740     case GIMPLE_SINGLE_RHS:
3741       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3742                   == ternary_op);
3743       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3744       break;
3745     case GIMPLE_UNARY_RHS:
3746       reduction_op = gimple_assign_rhs1 (stmt);
3747       break;
3748     case GIMPLE_BINARY_RHS:
3749       reduction_op = reduc_index ?
3750                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3751       break;
3752     case GIMPLE_TERNARY_RHS:
3753       reduction_op = gimple_op (stmt, reduc_index + 1);
3754       break;
3755     default:
3756       gcc_unreachable ();
3757     }
3758
3759   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3760   gcc_assert (vectype);
3761   mode = TYPE_MODE (vectype);
3762
3763   /* 1. Create the reduction def-use cycle:
3764      Set the arguments of REDUCTION_PHIS, i.e., transform
3765
3766         loop:
3767           vec_def = phi <null, null>            # REDUCTION_PHI
3768           VECT_DEF = vector_stmt                # vectorized form of STMT
3769           ...
3770
3771      into:
3772
3773         loop:
3774           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3775           VECT_DEF = vector_stmt                # vectorized form of STMT
3776           ...
3777
3778      (in case of SLP, do it for all the phis). */
3779
3780   /* Get the loop-entry arguments.  */
3781   if (slp_node)
3782     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3783                        NULL, slp_node, reduc_index);
3784   else
3785     {
3786       vec_initial_defs.create (1);
3787      /* For the case of reduction, vect_get_vec_def_for_operand returns
3788         the scalar def before the loop, that defines the initial value
3789         of the reduction variable.  */
3790       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3791                                                       &adjustment_def);
3792       vec_initial_defs.quick_push (vec_initial_def);
3793     }
3794
3795   /* Set phi nodes arguments.  */
3796   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3797     {
3798       tree vec_init_def = vec_initial_defs[i];
3799       tree def = vect_defs[i];
3800       for (j = 0; j < ncopies; j++)
3801         {
3802           /* Set the loop-entry arg of the reduction-phi.  */
3803           add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3804                        UNKNOWN_LOCATION);
3805
3806           /* Set the loop-latch arg for the reduction-phi.  */
3807           if (j > 0)
3808             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3809
3810           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3811
3812           if (dump_enabled_p ())
3813             {
3814               dump_printf_loc (MSG_NOTE, vect_location,
3815                                "transform reduction: created def-use cycle: ");
3816               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3817               dump_printf (MSG_NOTE, "\n");
3818               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3819             }
3820
3821           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3822         }
3823     }
3824
3825   vec_initial_defs.release ();
3826
3827   /* 2. Create epilog code.
3828         The reduction epilog code operates across the elements of the vector
3829         of partial results computed by the vectorized loop.
3830         The reduction epilog code consists of:
3831
3832         step 1: compute the scalar result in a vector (v_out2)
3833         step 2: extract the scalar result (s_out3) from the vector (v_out2)
3834         step 3: adjust the scalar result (s_out3) if needed.
3835
3836         Step 1 can be accomplished using one the following three schemes:
3837           (scheme 1) using reduc_code, if available.
3838           (scheme 2) using whole-vector shifts, if available.
3839           (scheme 3) using a scalar loop. In this case steps 1+2 above are
3840                      combined.
3841
3842           The overall epilog code looks like this:
3843
3844           s_out0 = phi <s_loop>         # original EXIT_PHI
3845           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
3846           v_out2 = reduce <v_out1>              # step 1
3847           s_out3 = extract_field <v_out2, 0>    # step 2
3848           s_out4 = adjust_result <s_out3>       # step 3
3849
3850           (step 3 is optional, and steps 1 and 2 may be combined).
3851           Lastly, the uses of s_out0 are replaced by s_out4.  */
3852
3853
3854   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
3855          v_out1 = phi <VECT_DEF>
3856          Store them in NEW_PHIS.  */
3857
3858   exit_bb = single_exit (loop)->dest;
3859   prev_phi_info = NULL;
3860   new_phis.create (vect_defs.length ());
3861   FOR_EACH_VEC_ELT (vect_defs, i, def)
3862     {
3863       for (j = 0; j < ncopies; j++)
3864         {
3865           tree new_def = copy_ssa_name (def, NULL);
3866           phi = create_phi_node (new_def, exit_bb);
3867           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
3868           if (j == 0)
3869             new_phis.quick_push (phi);
3870           else
3871             {
3872               def = vect_get_vec_def_for_stmt_copy (dt, def);
3873               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
3874             }
3875
3876           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
3877           prev_phi_info = vinfo_for_stmt (phi);
3878         }
3879     }
3880
3881   /* The epilogue is created for the outer-loop, i.e., for the loop being
3882      vectorized.  Create exit phis for the outer loop.  */
3883   if (double_reduc)
3884     {
3885       loop = outer_loop;
3886       exit_bb = single_exit (loop)->dest;
3887       inner_phis.create (vect_defs.length ());
3888       FOR_EACH_VEC_ELT (new_phis, i, phi)
3889         {
3890           tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3891           gimple outer_phi = create_phi_node (new_result, exit_bb);
3892           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3893                            PHI_RESULT (phi));
3894           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3895                                                             loop_vinfo, NULL));
3896           inner_phis.quick_push (phi);
3897           new_phis[i] = outer_phi;
3898           prev_phi_info = vinfo_for_stmt (outer_phi);
3899           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
3900             {
3901               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3902               new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
3903               outer_phi = create_phi_node (new_result, exit_bb);
3904               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
3905                                PHI_RESULT (phi));
3906               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
3907                                                         loop_vinfo, NULL));
3908               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
3909               prev_phi_info = vinfo_for_stmt (outer_phi);
3910             }
3911         }
3912     }
3913
3914   exit_gsi = gsi_after_labels (exit_bb);
3915
3916   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
3917          (i.e. when reduc_code is not available) and in the final adjustment
3918          code (if needed).  Also get the original scalar reduction variable as
3919          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
3920          represents a reduction pattern), the tree-code and scalar-def are
3921          taken from the original stmt that the pattern-stmt (STMT) replaces.
3922          Otherwise (it is a regular reduction) - the tree-code and scalar-def
3923          are taken from STMT.  */
3924
3925   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3926   if (!orig_stmt)
3927     {
3928       /* Regular reduction  */
3929       orig_stmt = stmt;
3930     }
3931   else
3932     {
3933       /* Reduction pattern  */
3934       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
3935       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
3936       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
3937     }
3938
3939   code = gimple_assign_rhs_code (orig_stmt);
3940   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
3941      partial results are added and not subtracted.  */
3942   if (code == MINUS_EXPR)
3943     code = PLUS_EXPR;
3944
3945   scalar_dest = gimple_assign_lhs (orig_stmt);
3946   scalar_type = TREE_TYPE (scalar_dest);
3947   scalar_results.create (group_size);
3948   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
3949   bitsize = TYPE_SIZE (scalar_type);
3950
3951   /* In case this is a reduction in an inner-loop while vectorizing an outer
3952      loop - we don't need to extract a single scalar result at the end of the
3953      inner-loop (unless it is double reduction, i.e., the use of reduction is
3954      outside the outer-loop).  The final vector of partial results will be used
3955      in the vectorized outer-loop, or reduced to a scalar result at the end of
3956      the outer-loop.  */
3957   if (nested_in_vect_loop && !double_reduc)
3958     goto vect_finalize_reduction;
3959
3960   /* SLP reduction without reduction chain, e.g.,
3961      # a1 = phi <a2, a0>
3962      # b1 = phi <b2, b0>
3963      a2 = operation (a1)
3964      b2 = operation (b1)  */
3965   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
3966
3967   /* In case of reduction chain, e.g.,
3968      # a1 = phi <a3, a0>
3969      a2 = operation (a1)
3970      a3 = operation (a2),
3971
3972      we may end up with more than one vector result.  Here we reduce them to
3973      one vector.  */
3974   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
3975     {
3976       tree first_vect = PHI_RESULT (new_phis[0]);
3977       tree tmp;
3978       gimple new_vec_stmt = NULL;
3979
3980       vec_dest = vect_create_destination_var (scalar_dest, vectype);
3981       for (k = 1; k < new_phis.length (); k++)
3982         {
3983           gimple next_phi = new_phis[k];
3984           tree second_vect = PHI_RESULT (next_phi);
3985
3986           tmp = build2 (code, vectype,  first_vect, second_vect);
3987           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
3988           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
3989           gimple_assign_set_lhs (new_vec_stmt, first_vect);
3990           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
3991         }
3992
3993       new_phi_result = first_vect;
3994       if (new_vec_stmt)
3995         {
3996           new_phis.truncate (0);
3997           new_phis.safe_push (new_vec_stmt);
3998         }
3999     }
4000   else
4001     new_phi_result = PHI_RESULT (new_phis[0]);
4002
4003   /* 2.3 Create the reduction code, using one of the three schemes described
4004          above. In SLP we simply need to extract all the elements from the
4005          vector (without reducing them), so we use scalar shifts.  */
4006   if (reduc_code != ERROR_MARK && !slp_reduc)
4007     {
4008       tree tmp;
4009
4010       /*** Case 1:  Create:
4011            v_out2 = reduc_expr <v_out1>  */
4012
4013       if (dump_enabled_p ())
4014         dump_printf_loc (MSG_NOTE, vect_location,
4015                          "Reduce using direct vector reduction.");
4016
4017       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4018       tmp = build1 (reduc_code, vectype, new_phi_result);
4019       epilog_stmt = gimple_build_assign (vec_dest, tmp);
4020       new_temp = make_ssa_name (vec_dest, epilog_stmt);
4021       gimple_assign_set_lhs (epilog_stmt, new_temp);
4022       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4023
4024       extract_scalar_result = true;
4025     }
4026   else
4027     {
4028       enum tree_code shift_code = ERROR_MARK;
4029       bool have_whole_vector_shift = true;
4030       int bit_offset;
4031       int element_bitsize = tree_low_cst (bitsize, 1);
4032       int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4033       tree vec_temp;
4034
4035       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4036         shift_code = VEC_RSHIFT_EXPR;
4037       else
4038         have_whole_vector_shift = false;
4039
4040       /* Regardless of whether we have a whole vector shift, if we're
4041          emulating the operation via tree-vect-generic, we don't want
4042          to use it.  Only the first round of the reduction is likely
4043          to still be profitable via emulation.  */
4044       /* ??? It might be better to emit a reduction tree code here, so that
4045          tree-vect-generic can expand the first round via bit tricks.  */
4046       if (!VECTOR_MODE_P (mode))
4047         have_whole_vector_shift = false;
4048       else
4049         {
4050           optab optab = optab_for_tree_code (code, vectype, optab_default);
4051           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4052             have_whole_vector_shift = false;
4053         }
4054
4055       if (have_whole_vector_shift && !slp_reduc)
4056         {
4057           /*** Case 2: Create:
4058              for (offset = VS/2; offset >= element_size; offset/=2)
4059                 {
4060                   Create:  va' = vec_shift <va, offset>
4061                   Create:  va = vop <va, va'>
4062                 }  */
4063
4064           if (dump_enabled_p ())
4065             dump_printf_loc (MSG_NOTE, vect_location,
4066                              "Reduce using vector shifts");
4067
4068           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4069           new_temp = new_phi_result;
4070           for (bit_offset = vec_size_in_bits/2;
4071                bit_offset >= element_bitsize;
4072                bit_offset /= 2)
4073             {
4074               tree bitpos = size_int (bit_offset);
4075
4076               epilog_stmt = gimple_build_assign_with_ops (shift_code,
4077                                                vec_dest, new_temp, bitpos);
4078               new_name = make_ssa_name (vec_dest, epilog_stmt);
4079               gimple_assign_set_lhs (epilog_stmt, new_name);
4080               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4081
4082               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4083                                                           new_name, new_temp);
4084               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4085               gimple_assign_set_lhs (epilog_stmt, new_temp);
4086               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4087             }
4088
4089           extract_scalar_result = true;
4090         }
4091       else
4092         {
4093           tree rhs;
4094
4095           /*** Case 3: Create:
4096              s = extract_field <v_out2, 0>
4097              for (offset = element_size;
4098                   offset < vector_size;
4099                   offset += element_size;)
4100                {
4101                  Create:  s' = extract_field <v_out2, offset>
4102                  Create:  s = op <s, s'>  // For non SLP cases
4103                }  */
4104
4105           if (dump_enabled_p ())
4106             dump_printf_loc (MSG_NOTE, vect_location,
4107                              "Reduce using scalar code. ");
4108
4109           vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
4110           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4111             {
4112               if (gimple_code (new_phi) == GIMPLE_PHI)
4113                 vec_temp = PHI_RESULT (new_phi);
4114               else
4115                 vec_temp = gimple_assign_lhs (new_phi);
4116               rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4117                             bitsize_zero_node);
4118               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4119               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4120               gimple_assign_set_lhs (epilog_stmt, new_temp);
4121               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4122
4123               /* In SLP we don't need to apply reduction operation, so we just
4124                  collect s' values in SCALAR_RESULTS.  */
4125               if (slp_reduc)
4126                 scalar_results.safe_push (new_temp);
4127
4128               for (bit_offset = element_bitsize;
4129                    bit_offset < vec_size_in_bits;
4130                    bit_offset += element_bitsize)
4131                 {
4132                   tree bitpos = bitsize_int (bit_offset);
4133                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4134                                      bitsize, bitpos);
4135
4136                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4137                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4138                   gimple_assign_set_lhs (epilog_stmt, new_name);
4139                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4140
4141                   if (slp_reduc)
4142                     {
4143                       /* In SLP we don't need to apply reduction operation, so
4144                          we just collect s' values in SCALAR_RESULTS.  */
4145                       new_temp = new_name;
4146                       scalar_results.safe_push (new_name);
4147                     }
4148                   else
4149                     {
4150                       epilog_stmt = gimple_build_assign_with_ops (code,
4151                                           new_scalar_dest, new_name, new_temp);
4152                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4153                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4154                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4155                     }
4156                 }
4157             }
4158
4159           /* The only case where we need to reduce scalar results in SLP, is
4160              unrolling.  If the size of SCALAR_RESULTS is greater than
4161              GROUP_SIZE, we reduce them combining elements modulo
4162              GROUP_SIZE.  */
4163           if (slp_reduc)
4164             {
4165               tree res, first_res, new_res;
4166               gimple new_stmt;
4167
4168               /* Reduce multiple scalar results in case of SLP unrolling.  */
4169               for (j = group_size; scalar_results.iterate (j, &res);
4170                    j++)
4171                 {
4172                   first_res = scalar_results[j % group_size];
4173                   new_stmt = gimple_build_assign_with_ops (code,
4174                                               new_scalar_dest, first_res, res);
4175                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4176                   gimple_assign_set_lhs (new_stmt, new_res);
4177                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4178                   scalar_results[j % group_size] = new_res;
4179                 }
4180             }
4181           else
4182             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4183             scalar_results.safe_push (new_temp);
4184
4185           extract_scalar_result = false;
4186         }
4187     }
4188
4189   /* 2.4  Extract the final scalar result.  Create:
4190           s_out3 = extract_field <v_out2, bitpos>  */
4191
4192   if (extract_scalar_result)
4193     {
4194       tree rhs;
4195
4196       if (dump_enabled_p ())
4197         dump_printf_loc (MSG_NOTE, vect_location,
4198                          "extract scalar result");
4199
4200       if (BYTES_BIG_ENDIAN)
4201         bitpos = size_binop (MULT_EXPR,
4202                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4203                              TYPE_SIZE (scalar_type));
4204       else
4205         bitpos = bitsize_zero_node;
4206
4207       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4208       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4209       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4210       gimple_assign_set_lhs (epilog_stmt, new_temp);
4211       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4212       scalar_results.safe_push (new_temp);
4213     }
4214
4215 vect_finalize_reduction:
4216
4217   if (double_reduc)
4218     loop = loop->inner;
4219
4220   /* 2.5 Adjust the final result by the initial value of the reduction
4221          variable. (When such adjustment is not needed, then
4222          'adjustment_def' is zero).  For example, if code is PLUS we create:
4223          new_temp = loop_exit_def + adjustment_def  */
4224
4225   if (adjustment_def)
4226     {
4227       gcc_assert (!slp_reduc);
4228       if (nested_in_vect_loop)
4229         {
4230           new_phi = new_phis[0];
4231           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4232           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4233           new_dest = vect_create_destination_var (scalar_dest, vectype);
4234         }
4235       else
4236         {
4237           new_temp = scalar_results[0];
4238           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4239           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4240           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4241         }
4242
4243       epilog_stmt = gimple_build_assign (new_dest, expr);
4244       new_temp = make_ssa_name (new_dest, epilog_stmt);
4245       gimple_assign_set_lhs (epilog_stmt, new_temp);
4246       SSA_NAME_DEF_STMT (new_temp) = epilog_stmt;
4247       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4248       if (nested_in_vect_loop)
4249         {
4250           set_vinfo_for_stmt (epilog_stmt,
4251                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4252                                                  NULL));
4253           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4254                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4255
4256           if (!double_reduc)
4257             scalar_results.quick_push (new_temp);
4258           else
4259             scalar_results[0] = new_temp;
4260         }
4261       else
4262         scalar_results[0] = new_temp;
4263
4264       new_phis[0] = epilog_stmt;
4265     }
4266
4267   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4268           phis with new adjusted scalar results, i.e., replace use <s_out0>
4269           with use <s_out4>.
4270
4271      Transform:
4272         loop_exit:
4273           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4274           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4275           v_out2 = reduce <v_out1>
4276           s_out3 = extract_field <v_out2, 0>
4277           s_out4 = adjust_result <s_out3>
4278           use <s_out0>
4279           use <s_out0>
4280
4281      into:
4282
4283         loop_exit:
4284           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4285           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4286           v_out2 = reduce <v_out1>
4287           s_out3 = extract_field <v_out2, 0>
4288           s_out4 = adjust_result <s_out3>
4289           use <s_out4>
4290           use <s_out4> */
4291
4292
4293   /* In SLP reduction chain we reduce vector results into one vector if
4294      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4295      the last stmt in the reduction chain, since we are looking for the loop
4296      exit phi node.  */
4297   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4298     {
4299       scalar_dest = gimple_assign_lhs (
4300                         SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4301       group_size = 1;
4302     }
4303
4304   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4305      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4306      need to match SCALAR_RESULTS with corresponding statements.  The first
4307      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4308      the first vector stmt, etc.
4309      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4310   if (group_size > new_phis.length ())
4311     {
4312       ratio = group_size / new_phis.length ();
4313       gcc_assert (!(group_size % new_phis.length ()));
4314     }
4315   else
4316     ratio = 1;
4317
4318   for (k = 0; k < group_size; k++)
4319     {
4320       if (k % ratio == 0)
4321         {
4322           epilog_stmt = new_phis[k / ratio];
4323           reduction_phi = reduction_phis[k / ratio];
4324           if (double_reduc)
4325             inner_phi = inner_phis[k / ratio];
4326         }
4327
4328       if (slp_reduc)
4329         {
4330           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4331
4332           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4333           /* SLP statements can't participate in patterns.  */
4334           gcc_assert (!orig_stmt);
4335           scalar_dest = gimple_assign_lhs (current_stmt);
4336         }
4337
4338       phis.create (3);
4339       /* Find the loop-closed-use at the loop exit of the original scalar
4340          result.  (The reduction result is expected to have two immediate uses -
4341          one at the latch block, and one at the loop exit).  */
4342       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4343         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4344           phis.safe_push (USE_STMT (use_p));
4345
4346       /* We expect to have found an exit_phi because of loop-closed-ssa
4347          form.  */
4348       gcc_assert (!phis.is_empty ());
4349
4350       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4351         {
4352           if (outer_loop)
4353             {
4354               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4355               gimple vect_phi;
4356
4357               /* FORNOW. Currently not supporting the case that an inner-loop
4358                  reduction is not used in the outer-loop (but only outside the
4359                  outer-loop), unless it is double reduction.  */
4360               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4361                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4362                           || double_reduc);
4363
4364               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4365               if (!double_reduc
4366                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4367                       != vect_double_reduction_def)
4368                 continue;
4369
4370               /* Handle double reduction:
4371
4372                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4373                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4374                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4375                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4376
4377                  At that point the regular reduction (stmt2 and stmt3) is
4378                  already vectorized, as well as the exit phi node, stmt4.
4379                  Here we vectorize the phi node of double reduction, stmt1, and
4380                  update all relevant statements.  */
4381
4382               /* Go through all the uses of s2 to find double reduction phi
4383                  node, i.e., stmt1 above.  */
4384               orig_name = PHI_RESULT (exit_phi);
4385               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4386                 {
4387                   stmt_vec_info use_stmt_vinfo;
4388                   stmt_vec_info new_phi_vinfo;
4389                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4390                   basic_block bb = gimple_bb (use_stmt);
4391                   gimple use;
4392
4393                   /* Check that USE_STMT is really double reduction phi
4394                      node.  */
4395                   if (gimple_code (use_stmt) != GIMPLE_PHI
4396                       || gimple_phi_num_args (use_stmt) != 2
4397                       || bb->loop_father != outer_loop)
4398                     continue;
4399                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4400                   if (!use_stmt_vinfo
4401                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4402                           != vect_double_reduction_def)
4403                     continue;
4404
4405                   /* Create vector phi node for double reduction:
4406                      vs1 = phi <vs0, vs2>
4407                      vs1 was created previously in this function by a call to
4408                        vect_get_vec_def_for_operand and is stored in
4409                        vec_initial_def;
4410                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4411                      vs0 is created here.  */
4412
4413                   /* Create vector phi node.  */
4414                   vect_phi = create_phi_node (vec_initial_def, bb);
4415                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4416                                     loop_vec_info_for_loop (outer_loop), NULL);
4417                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4418
4419                   /* Create vs0 - initial def of the double reduction phi.  */
4420                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4421                                              loop_preheader_edge (outer_loop));
4422                   init_def = get_initial_def_for_reduction (stmt,
4423                                                           preheader_arg, NULL);
4424                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4425                                                     vectype, NULL);
4426
4427                   /* Update phi node arguments with vs0 and vs2.  */
4428                   add_phi_arg (vect_phi, vect_phi_init,
4429                                loop_preheader_edge (outer_loop),
4430                                UNKNOWN_LOCATION);
4431                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4432                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4433                   if (dump_enabled_p ())
4434                     {
4435                       dump_printf_loc (MSG_NOTE, vect_location,
4436                                        "created double reduction phi node: ");
4437                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4438                     }
4439
4440                   vect_phi_res = PHI_RESULT (vect_phi);
4441
4442                   /* Replace the use, i.e., set the correct vs1 in the regular
4443                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4444                      loop is redundant.  */
4445                   use = reduction_phi;
4446                   for (j = 0; j < ncopies; j++)
4447                     {
4448                       edge pr_edge = loop_preheader_edge (loop);
4449                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4450                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4451                     }
4452                 }
4453             }
4454         }
4455
4456       phis.release ();
4457       if (nested_in_vect_loop)
4458         {
4459           if (double_reduc)
4460             loop = outer_loop;
4461           else
4462             continue;
4463         }
4464
4465       phis.create (3);
4466       /* Find the loop-closed-use at the loop exit of the original scalar
4467          result.  (The reduction result is expected to have two immediate uses,
4468          one at the latch block, and one at the loop exit).  For double
4469          reductions we are looking for exit phis of the outer loop.  */
4470       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4471         {
4472           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4473             phis.safe_push (USE_STMT (use_p));
4474           else
4475             {
4476               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4477                 {
4478                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4479
4480                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4481                     {
4482                       if (!flow_bb_inside_loop_p (loop,
4483                                              gimple_bb (USE_STMT (phi_use_p))))
4484                         phis.safe_push (USE_STMT (phi_use_p));
4485                     }
4486                 }
4487             }
4488         }
4489
4490       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4491         {
4492           /* Replace the uses:  */
4493           orig_name = PHI_RESULT (exit_phi);
4494           scalar_result = scalar_results[k];
4495           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4496             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4497               SET_USE (use_p, scalar_result);
4498         }
4499
4500       phis.release ();
4501     }
4502
4503   scalar_results.release ();
4504   new_phis.release ();
4505 }
4506
4507
4508 /* Function vectorizable_reduction.
4509
4510    Check if STMT performs a reduction operation that can be vectorized.
4511    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4512    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4513    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4514
4515    This function also handles reduction idioms (patterns) that have been
4516    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4517    of this form:
4518      X = pattern_expr (arg0, arg1, ..., X)
4519    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4520    sequence that had been detected and replaced by the pattern-stmt (STMT).
4521
4522    In some cases of reduction patterns, the type of the reduction variable X is
4523    different than the type of the other arguments of STMT.
4524    In such cases, the vectype that is used when transforming STMT into a vector
4525    stmt is different than the vectype that is used to determine the
4526    vectorization factor, because it consists of a different number of elements
4527    than the actual number of elements that are being operated upon in parallel.
4528
4529    For example, consider an accumulation of shorts into an int accumulator.
4530    On some targets it's possible to vectorize this pattern operating on 8
4531    shorts at a time (hence, the vectype for purposes of determining the
4532    vectorization factor should be V8HI); on the other hand, the vectype that
4533    is used to create the vector form is actually V4SI (the type of the result).
4534
4535    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4536    indicates what is the actual level of parallelism (V8HI in the example), so
4537    that the right vectorization factor would be derived.  This vectype
4538    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4539    be used to create the vectorized stmt.  The right vectype for the vectorized
4540    stmt is obtained from the type of the result X:
4541         get_vectype_for_scalar_type (TREE_TYPE (X))
4542
4543    This means that, contrary to "regular" reductions (or "regular" stmts in
4544    general), the following equation:
4545       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4546    does *NOT* necessarily hold for reduction patterns.  */
4547
4548 bool
4549 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4550                         gimple *vec_stmt, slp_tree slp_node)
4551 {
4552   tree vec_dest;
4553   tree scalar_dest;
4554   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4555   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4556   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4557   tree vectype_in = NULL_TREE;
4558   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4559   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4560   enum tree_code code, orig_code, epilog_reduc_code;
4561   enum machine_mode vec_mode;
4562   int op_type;
4563   optab optab, reduc_optab;
4564   tree new_temp = NULL_TREE;
4565   tree def;
4566   gimple def_stmt;
4567   enum vect_def_type dt;
4568   gimple new_phi = NULL;
4569   tree scalar_type;
4570   bool is_simple_use;
4571   gimple orig_stmt;
4572   stmt_vec_info orig_stmt_info;
4573   tree expr = NULL_TREE;
4574   int i;
4575   int ncopies;
4576   int epilog_copies;
4577   stmt_vec_info prev_stmt_info, prev_phi_info;
4578   bool single_defuse_cycle = false;
4579   tree reduc_def = NULL_TREE;
4580   gimple new_stmt = NULL;
4581   int j;
4582   tree ops[3];
4583   bool nested_cycle = false, found_nested_cycle_def = false;
4584   gimple reduc_def_stmt = NULL;
4585   /* The default is that the reduction variable is the last in statement.  */
4586   int reduc_index = 2;
4587   bool double_reduc = false, dummy;
4588   basic_block def_bb;
4589   struct loop * def_stmt_loop, *outer_loop = NULL;
4590   tree def_arg;
4591   gimple def_arg_stmt;
4592   vec<tree> vec_oprnds0 = vNULL;
4593   vec<tree> vec_oprnds1 = vNULL;
4594   vec<tree> vect_defs = vNULL;
4595   vec<gimple> phis = vNULL;
4596   int vec_num;
4597   tree def0, def1, tem, op0, op1 = NULL_TREE;
4598
4599   /* In case of reduction chain we switch to the first stmt in the chain, but
4600      we don't update STMT_INFO, since only the last stmt is marked as reduction
4601      and has reduction properties.  */
4602   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4603     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4604
4605   if (nested_in_vect_loop_p (loop, stmt))
4606     {
4607       outer_loop = loop;
4608       loop = loop->inner;
4609       nested_cycle = true;
4610     }
4611
4612   /* 1. Is vectorizable reduction?  */
4613   /* Not supportable if the reduction variable is used in the loop, unless
4614      it's a reduction chain.  */
4615   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4616       && !GROUP_FIRST_ELEMENT (stmt_info))
4617     return false;
4618
4619   /* Reductions that are not used even in an enclosing outer-loop,
4620      are expected to be "live" (used out of the loop).  */
4621   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4622       && !STMT_VINFO_LIVE_P (stmt_info))
4623     return false;
4624
4625   /* Make sure it was already recognized as a reduction computation.  */
4626   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4627       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4628     return false;
4629
4630   /* 2. Has this been recognized as a reduction pattern?
4631
4632      Check if STMT represents a pattern that has been recognized
4633      in earlier analysis stages.  For stmts that represent a pattern,
4634      the STMT_VINFO_RELATED_STMT field records the last stmt in
4635      the original sequence that constitutes the pattern.  */
4636
4637   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4638   if (orig_stmt)
4639     {
4640       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4641       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4642       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4643     }
4644
4645   /* 3. Check the operands of the operation.  The first operands are defined
4646         inside the loop body. The last operand is the reduction variable,
4647         which is defined by the loop-header-phi.  */
4648
4649   gcc_assert (is_gimple_assign (stmt));
4650
4651   /* Flatten RHS.  */
4652   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4653     {
4654     case GIMPLE_SINGLE_RHS:
4655       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4656       if (op_type == ternary_op)
4657         {
4658           tree rhs = gimple_assign_rhs1 (stmt);
4659           ops[0] = TREE_OPERAND (rhs, 0);
4660           ops[1] = TREE_OPERAND (rhs, 1);
4661           ops[2] = TREE_OPERAND (rhs, 2);
4662           code = TREE_CODE (rhs);
4663         }
4664       else
4665         return false;
4666       break;
4667
4668     case GIMPLE_BINARY_RHS:
4669       code = gimple_assign_rhs_code (stmt);
4670       op_type = TREE_CODE_LENGTH (code);
4671       gcc_assert (op_type == binary_op);
4672       ops[0] = gimple_assign_rhs1 (stmt);
4673       ops[1] = gimple_assign_rhs2 (stmt);
4674       break;
4675
4676     case GIMPLE_TERNARY_RHS:
4677       code = gimple_assign_rhs_code (stmt);
4678       op_type = TREE_CODE_LENGTH (code);
4679       gcc_assert (op_type == ternary_op);
4680       ops[0] = gimple_assign_rhs1 (stmt);
4681       ops[1] = gimple_assign_rhs2 (stmt);
4682       ops[2] = gimple_assign_rhs3 (stmt);
4683       break;
4684
4685     case GIMPLE_UNARY_RHS:
4686       return false;
4687
4688     default:
4689       gcc_unreachable ();
4690     }
4691
4692   if (code == COND_EXPR && slp_node)
4693     return false;
4694
4695   scalar_dest = gimple_assign_lhs (stmt);
4696   scalar_type = TREE_TYPE (scalar_dest);
4697   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4698       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4699     return false;
4700
4701   /* Do not try to vectorize bit-precision reductions.  */
4702   if ((TYPE_PRECISION (scalar_type)
4703        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4704     return false;
4705
4706   /* All uses but the last are expected to be defined in the loop.
4707      The last use is the reduction variable.  In case of nested cycle this
4708      assumption is not true: we use reduc_index to record the index of the
4709      reduction variable.  */
4710   for (i = 0; i < op_type - 1; i++)
4711     {
4712       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4713       if (i == 0 && code == COND_EXPR)
4714         continue;
4715
4716       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4717                                             &def_stmt, &def, &dt, &tem);
4718       if (!vectype_in)
4719         vectype_in = tem;
4720       gcc_assert (is_simple_use);
4721
4722       if (dt != vect_internal_def
4723           && dt != vect_external_def
4724           && dt != vect_constant_def
4725           && dt != vect_induction_def
4726           && !(dt == vect_nested_cycle && nested_cycle))
4727         return false;
4728
4729       if (dt == vect_nested_cycle)
4730         {
4731           found_nested_cycle_def = true;
4732           reduc_def_stmt = def_stmt;
4733           reduc_index = i;
4734         }
4735     }
4736
4737   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4738                                         &def_stmt, &def, &dt, &tem);
4739   if (!vectype_in)
4740     vectype_in = tem;
4741   gcc_assert (is_simple_use);
4742   if (!(dt == vect_reduction_def
4743         || dt == vect_nested_cycle
4744         || ((dt == vect_internal_def || dt == vect_external_def
4745              || dt == vect_constant_def || dt == vect_induction_def)
4746             && nested_cycle && found_nested_cycle_def)))
4747     {
4748       /* For pattern recognized stmts, orig_stmt might be a reduction,
4749          but some helper statements for the pattern might not, or
4750          might be COND_EXPRs with reduction uses in the condition.  */
4751       gcc_assert (orig_stmt);
4752       return false;
4753     }
4754   if (!found_nested_cycle_def)
4755     reduc_def_stmt = def_stmt;
4756
4757   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4758   if (orig_stmt)
4759     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4760                                                        reduc_def_stmt,
4761                                                        !nested_cycle,
4762                                                        &dummy));
4763   else
4764     {
4765       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4766                                              !nested_cycle, &dummy);
4767       /* We changed STMT to be the first stmt in reduction chain, hence we
4768          check that in this case the first element in the chain is STMT.  */
4769       gcc_assert (stmt == tmp
4770                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4771     }
4772
4773   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4774     return false;
4775
4776   if (slp_node || PURE_SLP_STMT (stmt_info))
4777     ncopies = 1;
4778   else
4779     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4780                / TYPE_VECTOR_SUBPARTS (vectype_in));
4781
4782   gcc_assert (ncopies >= 1);
4783
4784   vec_mode = TYPE_MODE (vectype_in);
4785
4786   if (code == COND_EXPR)
4787     {
4788       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4789         {
4790           if (dump_enabled_p ())
4791             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4792                              "unsupported condition in reduction");
4793
4794             return false;
4795         }
4796     }
4797   else
4798     {
4799       /* 4. Supportable by target?  */
4800
4801       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4802           || code == LROTATE_EXPR || code == RROTATE_EXPR)
4803         {
4804           /* Shifts and rotates are only supported by vectorizable_shifts,
4805              not vectorizable_reduction.  */
4806           if (dump_enabled_p ())
4807             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4808                              "unsupported shift or rotation.");
4809           return false;
4810         }
4811
4812       /* 4.1. check support for the operation in the loop  */
4813       optab = optab_for_tree_code (code, vectype_in, optab_default);
4814       if (!optab)
4815         {
4816           if (dump_enabled_p ())
4817             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4818                              "no optab.");
4819
4820           return false;
4821         }
4822
4823       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4824         {
4825           if (dump_enabled_p ())
4826             dump_printf (MSG_NOTE, "op not supported by target.");
4827
4828           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4829               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4830                   < vect_min_worthwhile_factor (code))
4831             return false;
4832
4833           if (dump_enabled_p ())
4834             dump_printf (MSG_NOTE, "proceeding using word mode.");
4835         }
4836
4837       /* Worthwhile without SIMD support?  */
4838       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
4839           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4840              < vect_min_worthwhile_factor (code))
4841         {
4842           if (dump_enabled_p ())
4843             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4844                              "not worthwhile without SIMD support.");
4845
4846           return false;
4847         }
4848     }
4849
4850   /* 4.2. Check support for the epilog operation.
4851
4852           If STMT represents a reduction pattern, then the type of the
4853           reduction variable may be different than the type of the rest
4854           of the arguments.  For example, consider the case of accumulation
4855           of shorts into an int accumulator; The original code:
4856                         S1: int_a = (int) short_a;
4857           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
4858
4859           was replaced with:
4860                         STMT: int_acc = widen_sum <short_a, int_acc>
4861
4862           This means that:
4863           1. The tree-code that is used to create the vector operation in the
4864              epilog code (that reduces the partial results) is not the
4865              tree-code of STMT, but is rather the tree-code of the original
4866              stmt from the pattern that STMT is replacing.  I.e, in the example
4867              above we want to use 'widen_sum' in the loop, but 'plus' in the
4868              epilog.
4869           2. The type (mode) we use to check available target support
4870              for the vector operation to be created in the *epilog*, is
4871              determined by the type of the reduction variable (in the example
4872              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
4873              However the type (mode) we use to check available target support
4874              for the vector operation to be created *inside the loop*, is
4875              determined by the type of the other arguments to STMT (in the
4876              example we'd check this: optab_handler (widen_sum_optab,
4877              vect_short_mode)).
4878
4879           This is contrary to "regular" reductions, in which the types of all
4880           the arguments are the same as the type of the reduction variable.
4881           For "regular" reductions we can therefore use the same vector type
4882           (and also the same tree-code) when generating the epilog code and
4883           when generating the code inside the loop.  */
4884
4885   if (orig_stmt)
4886     {
4887       /* This is a reduction pattern: get the vectype from the type of the
4888          reduction variable, and get the tree-code from orig_stmt.  */
4889       orig_code = gimple_assign_rhs_code (orig_stmt);
4890       gcc_assert (vectype_out);
4891       vec_mode = TYPE_MODE (vectype_out);
4892     }
4893   else
4894     {
4895       /* Regular reduction: use the same vectype and tree-code as used for
4896          the vector code inside the loop can be used for the epilog code. */
4897       orig_code = code;
4898     }
4899
4900   if (nested_cycle)
4901     {
4902       def_bb = gimple_bb (reduc_def_stmt);
4903       def_stmt_loop = def_bb->loop_father;
4904       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4905                                        loop_preheader_edge (def_stmt_loop));
4906       if (TREE_CODE (def_arg) == SSA_NAME
4907           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
4908           && gimple_code (def_arg_stmt) == GIMPLE_PHI
4909           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
4910           && vinfo_for_stmt (def_arg_stmt)
4911           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
4912               == vect_double_reduction_def)
4913         double_reduc = true;
4914     }
4915
4916   epilog_reduc_code = ERROR_MARK;
4917   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
4918     {
4919       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
4920                                          optab_default);
4921       if (!reduc_optab)
4922         {
4923           if (dump_enabled_p ())
4924             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4925                              "no optab for reduction.");
4926
4927           epilog_reduc_code = ERROR_MARK;
4928         }
4929
4930       if (reduc_optab
4931           && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
4932         {
4933           if (dump_enabled_p ())
4934             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4935                              "reduc op not supported by target.");
4936
4937           epilog_reduc_code = ERROR_MARK;
4938         }
4939     }
4940   else
4941     {
4942       if (!nested_cycle || double_reduc)
4943         {
4944           if (dump_enabled_p ())
4945             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4946                              "no reduc code for scalar code.");
4947
4948           return false;
4949         }
4950     }
4951
4952   if (double_reduc && ncopies > 1)
4953     {
4954       if (dump_enabled_p ())
4955         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4956                          "multiple types in double reduction");
4957
4958       return false;
4959     }
4960
4961   /* In case of widening multiplication by a constant, we update the type
4962      of the constant to be the type of the other operand.  We check that the
4963      constant fits the type in the pattern recognition pass.  */
4964   if (code == DOT_PROD_EXPR
4965       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
4966     {
4967       if (TREE_CODE (ops[0]) == INTEGER_CST)
4968         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
4969       else if (TREE_CODE (ops[1]) == INTEGER_CST)
4970         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
4971       else
4972         {
4973           if (dump_enabled_p ())
4974             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4975                              "invalid types in dot-prod");
4976
4977           return false;
4978         }
4979     }
4980
4981   if (!vec_stmt) /* transformation not required.  */
4982     {
4983       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
4984         return false;
4985       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
4986       return true;
4987     }
4988
4989   /** Transform.  **/
4990
4991   if (dump_enabled_p ())
4992     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.");
4993
4994   /* FORNOW: Multiple types are not supported for condition.  */
4995   if (code == COND_EXPR)
4996     gcc_assert (ncopies == 1);
4997
4998   /* Create the destination vector  */
4999   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5000
5001   /* In case the vectorization factor (VF) is bigger than the number
5002      of elements that we can fit in a vectype (nunits), we have to generate
5003      more than one vector stmt - i.e - we need to "unroll" the
5004      vector stmt by a factor VF/nunits.  For more details see documentation
5005      in vectorizable_operation.  */
5006
5007   /* If the reduction is used in an outer loop we need to generate
5008      VF intermediate results, like so (e.g. for ncopies=2):
5009         r0 = phi (init, r0)
5010         r1 = phi (init, r1)
5011         r0 = x0 + r0;
5012         r1 = x1 + r1;
5013     (i.e. we generate VF results in 2 registers).
5014     In this case we have a separate def-use cycle for each copy, and therefore
5015     for each copy we get the vector def for the reduction variable from the
5016     respective phi node created for this copy.
5017
5018     Otherwise (the reduction is unused in the loop nest), we can combine
5019     together intermediate results, like so (e.g. for ncopies=2):
5020         r = phi (init, r)
5021         r = x0 + r;
5022         r = x1 + r;
5023    (i.e. we generate VF/2 results in a single register).
5024    In this case for each copy we get the vector def for the reduction variable
5025    from the vectorized reduction operation generated in the previous iteration.
5026   */
5027
5028   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5029     {
5030       single_defuse_cycle = true;
5031       epilog_copies = 1;
5032     }
5033   else
5034     epilog_copies = ncopies;
5035
5036   prev_stmt_info = NULL;
5037   prev_phi_info = NULL;
5038   if (slp_node)
5039     {
5040       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5041       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5042                   == TYPE_VECTOR_SUBPARTS (vectype_in));
5043     }
5044   else
5045     {
5046       vec_num = 1;
5047       vec_oprnds0.create (1);
5048       if (op_type == ternary_op)
5049         vec_oprnds1.create (1);
5050     }
5051
5052   phis.create (vec_num);
5053   vect_defs.create (vec_num);
5054   if (!slp_node)
5055     vect_defs.quick_push (NULL_TREE);
5056
5057   for (j = 0; j < ncopies; j++)
5058     {
5059       if (j == 0 || !single_defuse_cycle)
5060         {
5061           for (i = 0; i < vec_num; i++)
5062             {
5063               /* Create the reduction-phi that defines the reduction
5064                  operand.  */
5065               new_phi = create_phi_node (vec_dest, loop->header);
5066               set_vinfo_for_stmt (new_phi,
5067                                   new_stmt_vec_info (new_phi, loop_vinfo,
5068                                                      NULL));
5069                if (j == 0 || slp_node)
5070                  phis.quick_push (new_phi);
5071             }
5072         }
5073
5074       if (code == COND_EXPR)
5075         {
5076           gcc_assert (!slp_node);
5077           vectorizable_condition (stmt, gsi, vec_stmt,
5078                                   PHI_RESULT (phis[0]),
5079                                   reduc_index, NULL);
5080           /* Multiple types are not supported for condition.  */
5081           break;
5082         }
5083
5084       /* Handle uses.  */
5085       if (j == 0)
5086         {
5087           op0 = ops[!reduc_index];
5088           if (op_type == ternary_op)
5089             {
5090               if (reduc_index == 0)
5091                 op1 = ops[2];
5092               else
5093                 op1 = ops[1];
5094             }
5095
5096           if (slp_node)
5097             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5098                                slp_node, -1);
5099           else
5100             {
5101               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5102                                                             stmt, NULL);
5103               vec_oprnds0.quick_push (loop_vec_def0);
5104               if (op_type == ternary_op)
5105                {
5106                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5107                                                                NULL);
5108                  vec_oprnds1.quick_push (loop_vec_def1);
5109                }
5110             }
5111         }
5112       else
5113         {
5114           if (!slp_node)
5115             {
5116               enum vect_def_type dt;
5117               gimple dummy_stmt;
5118               tree dummy;
5119
5120               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5121                                   &dummy_stmt, &dummy, &dt);
5122               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5123                                                               loop_vec_def0);
5124               vec_oprnds0[0] = loop_vec_def0;
5125               if (op_type == ternary_op)
5126                 {
5127                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5128                                       &dummy, &dt);
5129                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5130                                                                 loop_vec_def1);
5131                   vec_oprnds1[0] = loop_vec_def1;
5132                 }
5133             }
5134
5135           if (single_defuse_cycle)
5136             reduc_def = gimple_assign_lhs (new_stmt);
5137
5138           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5139         }
5140
5141       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5142         {
5143           if (slp_node)
5144             reduc_def = PHI_RESULT (phis[i]);
5145           else
5146             {
5147               if (!single_defuse_cycle || j == 0)
5148                 reduc_def = PHI_RESULT (new_phi);
5149             }
5150
5151           def1 = ((op_type == ternary_op)
5152                   ? vec_oprnds1[i] : NULL);
5153           if (op_type == binary_op)
5154             {
5155               if (reduc_index == 0)
5156                 expr = build2 (code, vectype_out, reduc_def, def0);
5157               else
5158                 expr = build2 (code, vectype_out, def0, reduc_def);
5159             }
5160           else
5161             {
5162               if (reduc_index == 0)
5163                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5164               else
5165                 {
5166                   if (reduc_index == 1)
5167                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5168                   else
5169                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5170                 }
5171             }
5172
5173           new_stmt = gimple_build_assign (vec_dest, expr);
5174           new_temp = make_ssa_name (vec_dest, new_stmt);
5175           gimple_assign_set_lhs (new_stmt, new_temp);
5176           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5177
5178           if (slp_node)
5179             {
5180               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5181               vect_defs.quick_push (new_temp);
5182             }
5183           else
5184             vect_defs[0] = new_temp;
5185         }
5186
5187       if (slp_node)
5188         continue;
5189
5190       if (j == 0)
5191         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5192       else
5193         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5194
5195       prev_stmt_info = vinfo_for_stmt (new_stmt);
5196       prev_phi_info = vinfo_for_stmt (new_phi);
5197     }
5198
5199   /* Finalize the reduction-phi (set its arguments) and create the
5200      epilog reduction code.  */
5201   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5202     {
5203       new_temp = gimple_assign_lhs (*vec_stmt);
5204       vect_defs[0] = new_temp;
5205     }
5206
5207   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5208                                     epilog_reduc_code, phis, reduc_index,
5209                                     double_reduc, slp_node);
5210
5211   phis.release ();
5212   vect_defs.release ();
5213   vec_oprnds0.release ();
5214   vec_oprnds1.release ();
5215
5216   return true;
5217 }
5218
5219 /* Function vect_min_worthwhile_factor.
5220
5221    For a loop where we could vectorize the operation indicated by CODE,
5222    return the minimum vectorization factor that makes it worthwhile
5223    to use generic vectors.

        The result is 4 for PLUS_EXPR, MINUS_EXPR and NEGATE_EXPR, 2 for the
        bitwise codes (BIT_AND_EXPR, BIT_IOR_EXPR, BIT_XOR_EXPR and
        BIT_NOT_EXPR), and INT_MAX for every other code.  A caller that
        tests "vectorization factor < result" therefore rejects any factor
        below INT_MAX for codes that are not listed here.  */
5224 int
5225 vect_min_worthwhile_factor (enum tree_code code)
5226 {
5227   switch (code)
5228     {
       /* Additive operations.  */
5229     case PLUS_EXPR:
5230     case MINUS_EXPR:
5231     case NEGATE_EXPR:
5232       return 4;
5233
       /* Bitwise operations.  */
5234     case BIT_AND_EXPR:
5235     case BIT_IOR_EXPR:
5236     case BIT_XOR_EXPR:
5237     case BIT_NOT_EXPR:
5238       return 2;
5239
       /* Any other code: no smaller factor qualifies.  */
5240     default:
5241       return INT_MAX;
5242     }
5243 }
5244
5245
5246 /* Function vectorizable_induction
5247
5248    Check if PHI performs an induction computation that can be vectorized.
5249    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5250    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5251    Return FALSE if not a vectorizable STMT, TRUE otherwise.

        PHI is the statement to analyze; GSI is not used by this function.
        When VEC_STMT is NULL only the analysis is performed: the statement
        is tagged with induc_vec_info_type and its cost is recorded through
        vect_model_induction_cost.  Otherwise *VEC_STMT is set to the
        statement that defines the value returned by
        get_initial_def_for_induction.  */
5252
5253 bool
5254 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5255                         gimple *vec_stmt)
5256 {
5257   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5258   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5259   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5260   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5261   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
       /* Vectorization factor divided by the number of elements in VECTYPE.  */
5262   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5263   tree vec_def;
5264
5265   gcc_assert (ncopies >= 1);
5266   /* FORNOW. These restrictions should be relaxed.  */
5267   if (nested_in_vect_loop_p (loop, phi))
5268     {
5269       imm_use_iterator imm_iter;
5270       use_operand_p use_p;
5271       gimple exit_phi;
5272       edge latch_e;
5273       tree loop_arg;
5274
           /* More than one vector copy is not handled in the nested case.  */
5275       if (ncopies > 1)
5276         {
5277           if (dump_enabled_p ())
5278             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5279                              "multiple types in nested loop.");
5280           return false;
5281         }
5282
           /* Take the value PHI receives over the latch edge of LOOP->inner
              and look for a use of it whose statement lies outside
              LOOP->inner.  Only the first such use found is recorded.  */
5283       exit_phi = NULL;
5284       latch_e = loop_latch_edge (loop->inner);
5285       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5286       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5287         {
5288           if (!flow_bb_inside_loop_p (loop->inner,
5289                                       gimple_bb (USE_STMT (use_p))))
5290             {
5291               exit_phi = USE_STMT (use_p);
5292               break;
5293             }
5294         }
           /* If such a use exists it must be relevant and must not be live;
              otherwise the induction is rejected.  */
5295       if (exit_phi)
5296         {
5297           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5298           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5299                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5300             {
5301               if (dump_enabled_p ())
5302                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5303                                  "inner-loop induction only used outside "
5304                                  "of the outer vectorized loop.");
5305               return false;
5306             }
5307         }
5308     }
5309
5310   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5311     return false;
5312
5313   /* FORNOW: SLP not supported.  */
5314   if (STMT_SLP_TYPE (stmt_info))
5315     return false;
5316
       /* NOTE(review): this assert, like the vectype/ncopies computation at
          the top, runs before PHI is confirmed to be a GIMPLE_PHI just
          below; presumably callers only pass induction phis -- confirm.  */
5317   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5318
5319   if (gimple_code (phi) != GIMPLE_PHI)
5320     return false;
5321
5322   if (!vec_stmt) /* transformation not required.  */
5323     {
5324       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5325       if (dump_enabled_p ())
5326         dump_printf_loc (MSG_NOTE, vect_location,
5327                          "=== vectorizable_induction ===");
5328       vect_model_induction_cost (stmt_info, ncopies);
5329       return true;
5330     }
5331
5332   /** Transform.  **/
5333
5334   if (dump_enabled_p ())
5335     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.");
5336
       /* The vectorized statement handed back to the caller is the one
          defining the SSA name returned by get_initial_def_for_induction.  */
5337   vec_def = get_initial_def_for_induction (phi);
5338   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5339   return true;
5340 }
5341
5342 /* Function vectorizable_live_operation.
5343
5344    STMT computes a value that is used outside the loop.  Check if
5345    it can be supported.

        STMT must already be marked live (this is asserted).  Return TRUE
        only when STMT is not a reduction def, is an assignment whose lhs is
        an SSA_NAME, is not reported as nested by nested_in_vect_loop_p, and
        every operand examined is an external or a constant def.  Return
        FALSE otherwise.  GSI and VEC_STMT are unused: this function only
        performs the check and generates no code.  */
5346
5347 bool
5348 vectorizable_live_operation (gimple stmt,
5349                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5350                              gimple *vec_stmt ATTRIBUTE_UNUSED)
5351 {
5352   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5353   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5354   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5355   int i;
5356   int op_type;
5357   tree op;
5358   tree def;
5359   gimple def_stmt;
5360   enum vect_def_type dt;
5361   enum tree_code code;
5362   enum gimple_rhs_class rhs_class;
5363
5364   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5365
       /* Statements classified as reduction defs are not accepted here.  */
5366   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5367     return false;
5368
5369   if (!is_gimple_assign (stmt))
5370     return false;
5371
5372   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5373     return false;
5374
5375   /* FORNOW. CHECKME. */
5376   if (nested_in_vect_loop_p (loop, stmt))
5377     return false;
5378
       /* OP_TYPE is the operand count of the rhs code; for unary and binary
          rhs classes it must agree with the class.  */
5379   code = gimple_assign_rhs_code (stmt);
5380   op_type = TREE_CODE_LENGTH (code);
5381   rhs_class = get_gimple_rhs_class (code);
5382   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5383   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5384
5385   /* FORNOW: support only if all uses are invariant.  This means
5386      that the scalar operations can remain in place, unvectorized.
5387      The original last scalar value that they compute will be used.  */
5388
5389   for (i = 0; i < op_type; i++)
5390     {
           /* For a single-rhs assignment the operands are taken from the
              rhs tree itself (gimple operand 1); otherwise they are gimple
              operands 1 .. OP_TYPE of STMT.  */
5391       if (rhs_class == GIMPLE_SINGLE_RHS)
5392         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5393       else
5394         op = gimple_op (stmt, i + 1);
5395       if (op
5396           && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5397                                   &dt))
5398         {
5399           if (dump_enabled_p ())
5400             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5401                              "use not simple.");
5402           return false;
5403         }
5404
           /* NOTE(review): when OP is NULL the call above is skipped, so DT
              keeps whatever value it had; on the first iteration that value
              is uninitialized.  Confirm whether a NULL operand can reach
              this point.  */
5405       if (dt != vect_external_def && dt != vect_constant_def)
5406         return false;
5407     }
5408
5409   /* No transformation is required for the cases we currently support.  */
5410   return true;
5411 }
5412
5413 /* Kill any debug uses outside LOOP of SSA names defined in STMT.

        Every def of STMT (STMT may be a PHI or an ordinary statement) is
        visited, and for each one all statements using it are examined.
        Uses that are not debug statements, and debug statements located in
        a basic block inside LOOP, are left untouched.  A debug bind outside
        LOOP has its bound value reset and is then updated.  */
5414
5415 static void
5416 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5417 {
5418   ssa_op_iter op_iter;
5419   imm_use_iterator imm_iter;
5420   def_operand_p def_p;
5421   gimple ustmt;
5422
5423   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5424     {
5425       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5426         {
5427           basic_block bb;
5428
               /* Only debug statements are of interest.  */
5429           if (!is_gimple_debug (ustmt))
5430             continue;
5431
5432           bb = gimple_bb (ustmt);
5433
5434           if (!flow_bb_inside_loop_p (loop, bb))
5435             {
5436               if (gimple_debug_bind_p (ustmt))
5437                 {
5438                   if (dump_enabled_p ())
5439                     dump_printf_loc (MSG_NOTE, vect_location,
5440                                      "killing debug use");
5441
5442                   gimple_debug_bind_reset_value (ustmt);
5443                   update_stmt (ustmt);
5444                 }
5445               else
                     /* A debug statement that is not a bind is not expected
                        to use these defs.  */
5446                 gcc_unreachable ();
5447             }
5448         }
5449     }
5450 }
5451
5452 /* Function vect_transform_loop.
5453
5454    The analysis phase has determined that the loop is vectorizable.
5455    Vectorize the loop - create vectorized stmts to replace the scalar
5456    stmts in the loop, and update the loop exit condition.  */
5457
5458 void
5459 vect_transform_loop (loop_vec_info loop_vinfo)
5460 {
5461   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5462   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5463   int nbbs = loop->num_nodes;
5464   gimple_stmt_iterator si;
5465   int i;
5466   tree ratio = NULL;
5467   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5468   bool grouped_store;
5469   bool slp_scheduled = false;
5470   unsigned int nunits;
5471   gimple stmt, pattern_stmt;
5472   gimple_seq pattern_def_seq = NULL;
5473   gimple_stmt_iterator pattern_def_si = gsi_none ();
5474   bool transform_pattern_stmt = false;
5475   bool check_profitability = false;
5476   int th;
5477   /* Record number of iterations before we started tampering with the profile. */
5478   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5479
5480   if (dump_enabled_p ())
5481     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===");
5482
5483   /* If the profile is imprecise, we have a chance to fix it up.  */
5484   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5485     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5486
5487   /* Use the more conservative vectorization threshold.  If the number
5488      of iterations is constant assume the cost check has been performed
5489      by our caller.  If the threshold makes all loops profitable that
5490      run at least the vectorization factor number of times checking
5491      is pointless, too.  */
5492   th = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
5493          * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1);
5494   th = MAX (th, LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo));
5495   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5496       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5497     {
5498       if (dump_enabled_p ())
5499         dump_printf_loc (MSG_NOTE, vect_location,
5500                          "Profitability threshold is %d loop iterations.", th);
5501       check_profitability = true;
5502     }
5503
5504   /* Peel the loop if there are data refs with unknown alignment.
5505      Only one data ref with unknown store is allowed.  */
5506
5507   if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo))
5508     {
5509       vect_do_peeling_for_alignment (loop_vinfo, th, check_profitability);
5510       check_profitability = false;
5511     }
5512
5513   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5514       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5515     {
5516       vect_loop_versioning (loop_vinfo, th, check_profitability);
5517       check_profitability = false;
5518     }
5519
5520   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5521      compile time constant), or it is a constant that doesn't divide by the
5522      vectorization factor, then an epilog loop needs to be created.
5523      We therefore duplicate the loop: the original loop will be vectorized,
5524      and will compute the first (n/VF) iterations.  The second copy of the loop
5525      will remain scalar and will compute the remaining (n%VF) iterations.
5526      (VF is the vectorization factor).  */
5527
5528   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5529        || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
5530            && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)
5531        || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5532     vect_do_peeling_for_loop_bound (loop_vinfo, &ratio,
5533                                     th, check_profitability);
5534   else
5535     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5536                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5537
5538   /* 1) Make sure the loop header has exactly two entries
5539      2) Make sure we have a preheader basic block.  */
5540
5541   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5542
5543   split_edge (loop_preheader_edge (loop));
5544
5545   /* FORNOW: the vectorizer supports only loops which body consist
5546      of one basic block (header + empty latch). When the vectorizer will
5547      support more involved loop forms, the order by which the BBs are
5548      traversed need to be reconsidered.  */
5549
5550   for (i = 0; i < nbbs; i++)
5551     {
5552       basic_block bb = bbs[i];
5553       stmt_vec_info stmt_info;
5554       gimple phi;
5555
5556       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5557         {
5558           phi = gsi_stmt (si);
5559           if (dump_enabled_p ())
5560             {
5561               dump_printf_loc (MSG_NOTE, vect_location,
5562                                "------>vectorizing phi: ");
5563               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5564             }
5565           stmt_info = vinfo_for_stmt (phi);
5566           if (!stmt_info)
5567             continue;
5568
5569           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5570             vect_loop_kill_debug_uses (loop, phi);
5571
5572           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5573               && !STMT_VINFO_LIVE_P (stmt_info))
5574             continue;
5575
5576           if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5577                 != (unsigned HOST_WIDE_INT) vectorization_factor)
5578               && dump_enabled_p ())
5579             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.");
5580
5581           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5582             {
5583               if (dump_enabled_p ())
5584                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.");
5585               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5586             }
5587         }
5588
5589       pattern_stmt = NULL;
5590       for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5591         {
5592           bool is_store;
5593
5594           if (transform_pattern_stmt)
5595             stmt = pattern_stmt;
5596           else
5597             stmt = gsi_stmt (si);
5598
5599           if (dump_enabled_p ())
5600             {
5601               dump_printf_loc (MSG_NOTE, vect_location,
5602                                "------>vectorizing statement: ");
5603               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5604             }
5605
5606           stmt_info = vinfo_for_stmt (stmt);
5607
5608           /* vector stmts created in the outer-loop during vectorization of
5609              stmts in an inner-loop may not have a stmt_info, and do not
5610              need to be vectorized.  */
5611           if (!stmt_info)
5612             {
5613               gsi_next (&si);
5614               continue;
5615             }
5616
5617           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5618             vect_loop_kill_debug_uses (loop, stmt);
5619
5620           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5621               && !STMT_VINFO_LIVE_P (stmt_info))
5622             {
5623               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5624                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5625                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5626                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5627                 {
5628                   stmt = pattern_stmt;
5629                   stmt_info = vinfo_for_stmt (stmt);
5630                 }
5631               else
5632                 {
5633                   gsi_next (&si);
5634                   continue;
5635                 }
5636             }
5637           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5638                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5639                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5640                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5641             transform_pattern_stmt = true;
5642
5643           /* If pattern statement has def stmts, vectorize them too.  */
5644           if (is_pattern_stmt_p (stmt_info))
5645             {
5646               if (pattern_def_seq == NULL)
5647                 {
5648                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5649                   pattern_def_si = gsi_start (pattern_def_seq);
5650                 }
5651               else if (!gsi_end_p (pattern_def_si))
5652                 gsi_next (&pattern_def_si);
5653               if (pattern_def_seq != NULL)
5654                 {
5655                   gimple pattern_def_stmt = NULL;
5656                   stmt_vec_info pattern_def_stmt_info = NULL;
5657
5658                   while (!gsi_end_p (pattern_def_si))
5659                     {
5660                       pattern_def_stmt = gsi_stmt (pattern_def_si);
5661                       pattern_def_stmt_info
5662                         = vinfo_for_stmt (pattern_def_stmt);
5663                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
5664                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
5665                         break;
5666                       gsi_next (&pattern_def_si);
5667                     }
5668
5669                   if (!gsi_end_p (pattern_def_si))
5670                     {
5671                       if (dump_enabled_p ())
5672                         {
5673                           dump_printf_loc (MSG_NOTE, vect_location,
5674                                            "==> vectorizing pattern def "
5675                                            "stmt: ");
5676                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
5677                                             pattern_def_stmt, 0);
5678                         }
5679
5680                       stmt = pattern_def_stmt;
5681                       stmt_info = pattern_def_stmt_info;
5682                     }
5683                   else
5684                     {
5685                       pattern_def_si = gsi_none ();
5686                       transform_pattern_stmt = false;
5687                     }
5688                 }
5689               else
5690                 transform_pattern_stmt = false;
5691             }
5692
5693           gcc_assert (STMT_VINFO_VECTYPE (stmt_info));
5694           nunits = (unsigned int) TYPE_VECTOR_SUBPARTS (
5695                                                STMT_VINFO_VECTYPE (stmt_info));
5696           if (!STMT_SLP_TYPE (stmt_info)
5697               && nunits != (unsigned int) vectorization_factor
5698               && dump_enabled_p ())
5699             /* For SLP VF is set according to unrolling factor, and not to
5700                vector size, hence for SLP this print is not valid.  */
5701             dump_printf_loc (MSG_NOTE, vect_location,
5702                              "multiple-types.");
5703
5704           /* SLP. Schedule all the SLP instances when the first SLP stmt is
5705              reached.  */
5706           if (STMT_SLP_TYPE (stmt_info))
5707             {
5708               if (!slp_scheduled)
5709                 {
5710                   slp_scheduled = true;
5711
5712                   if (dump_enabled_p ())
5713                     dump_printf_loc (MSG_NOTE, vect_location,
5714                                      "=== scheduling SLP instances ===");
5715
5716                   vect_schedule_slp (loop_vinfo, NULL);
5717                 }
5718
5719               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
5720               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
5721                 {
5722                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5723                     {
5724                       pattern_def_seq = NULL;
5725                       gsi_next (&si);
5726                     }
5727                   continue;
5728                 }
5729             }
5730
5731           /* -------- vectorize statement ------------ */
5732           if (dump_enabled_p ())
5733             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.");
5734
5735           grouped_store = false;
5736           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
5737           if (is_store)
5738             {
5739               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
5740                 {
5741                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
5742                      interleaving chain was completed - free all the stores in
5743                      the chain.  */
5744                   gsi_next (&si);
5745                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
5746                   continue;
5747                 }
5748               else
5749                 {
5750                   /* Free the attached stmt_vec_info and remove the stmt.  */
5751                   gimple store = gsi_stmt (si);
5752                   free_stmt_vec_info (store);
5753                   unlink_stmt_vdef (store);
5754                   gsi_remove (&si, true);
5755                   release_defs (store);
5756                   continue;
5757                 }
5758             }
5759
5760           if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
5761             {
5762               pattern_def_seq = NULL;
5763               gsi_next (&si);
5764             }
5765         }                       /* stmts in BB */
5766     }                           /* BBs in loop */
5767
5768   slpeel_make_loop_iterate_ntimes (loop, ratio);
5769
5770   /* Reduce loop iterations by the vectorization factor.  */
5771   scale_loop_profile (loop, RDIV (REG_BR_PROB_BASE , vectorization_factor),
5772                       expected_iterations / vectorization_factor);
5773   loop->nb_iterations_upper_bound
5774     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (vectorization_factor),
5775                                             FLOOR_DIV_EXPR);
5776   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5777       && loop->nb_iterations_upper_bound != double_int_zero)
5778     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - double_int_one;
5779   if (loop->any_estimate)
5780     {
5781       loop->nb_iterations_estimate
5782         = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (vectorization_factor),
5783                                              FLOOR_DIV_EXPR);
5784        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
5785            && loop->nb_iterations_estimate != double_int_zero)
5786          loop->nb_iterations_estimate = loop->nb_iterations_estimate - double_int_one;
5787     }
5788
5789   /* The memory tags and pointers in vectorized statements need to
5790      have their SSA forms updated.  FIXME, why can't this be delayed
5791      until all the loops have been transformed?  */
5792   update_ssa (TODO_update_ssa);
5793
5794   if (dump_enabled_p ())
5795     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location, "LOOP VECTORIZED.");
5796   if (loop->inner && dump_enabled_p ())
5797     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
5798                      "OUTER LOOP VECTORIZED.");
5799 }